From e9f9ee63447e65994b09952e17dfc3a30b8ee30c Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Thu, 29 Feb 2024 09:00:42 +0000 Subject: [PATCH 01/85] A few wrinkles to sort out, some changes to pyfive needed, but this is a pre-train interim commit --- activestorage/active.py | 187 ++++++----------- activestorage/active_tools.py | 351 -------------------------------- activestorage/netcdf_to_zarr.py | 126 ------------ 3 files changed, 60 insertions(+), 604 deletions(-) delete mode 100644 activestorage/active_tools.py delete mode 100644 activestorage/netcdf_to_zarr.py diff --git a/activestorage/active.py b/activestorage/active.py index 1cf03eef..04deb549 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -4,19 +4,15 @@ import numpy as np import pathlib import urllib +import pyfive +from pyfive import ZarrArrayStub, OrthogonalIndexer -import h5netcdf import s3fs -#FIXME: Consider using h5py throughout, for more generality -from netCDF4 import Dataset -from zarr.indexing import ( - OrthogonalIndexer, -) from activestorage.config import * from activestorage import reductionist from activestorage.storage import reduce_chunk -from activestorage import netcdf_to_zarr as nz + @contextlib.contextmanager @@ -44,11 +40,12 @@ def load_from_s3(uri, storage_options=None): else: fs = s3fs.S3FileSystem(**storage_options) # use passed-in dictionary with fs.open(uri, 'rb') as s3file: - ds = h5netcdf.File(s3file, 'r', invalid_netcdf=True) + ds = pyfive.File(s3file, 'r', invalid_netcdf=True) print(f"Dataset loaded from S3 via h5netcdf: {ds}") yield ds + class Active: """ Instantiates an interface to active storage which contains either zarr files @@ -83,7 +80,7 @@ def __init__( active_storage_url=None ): """ - Instantiate with a NetCDF4 dataset and the variable of interest within that file. + Instantiate with a NetCDF4 dataset URI and the variable of interest within that file. (We need the variable, because we need variable specific metadata from within that file, however, if that information is available at instantiation, it can be provided using keywords and avoid a metadata read.) @@ -113,57 +110,48 @@ def __init__( self.ncvar = ncvar if self.ncvar is None: raise ValueError("Must set a netCDF variable name to slice") - self.zds = None self._version = 1 self._components = False self._method = None - self._lock = False self._max_threads = max_threads def __getitem__(self, index): """ Provides support for a standard get item. + #FIXME-BNL: Why is the argument index? """ # In version one this is done by explicitly looping over each chunk in the file # and returning the requested slice ourselves. In version 2, we can pass this # through to the default method. ncvar = self.ncvar + # in all casese we need an open netcdf file to get at attributes + # FIXME. 
We then need to monkey patch the "filename" as rfile onto the dataset + if self.storage_type is None: + nc = pyfive.FILE(self.uri) + elif self.storage_type == "s3": + nc = load_from_s3(self.uri, self.storage_options) + if self.method is None and self._version == 0: # No active operation - lock = self.lock - if lock: - lock.acquire() - if self.storage_type is None: - nc = Dataset(self.uri) data = nc[ncvar][index] nc.close() elif self.storage_type == "s3": - with load_from_s3(self.uri, self.storage_options) as nc: - data = nc[ncvar][index] - data = self._mask_data(data, nc[ncvar]) - - if lock: - lock.release() + + data = nc[ncvar][index] + data = self._mask_data(data, nc[ncvar]) + nc.close() return data elif self._version == 1: - return self._via_kerchunk(index) + return self._get_selection(nc[ncvar], index) elif self._version == 2: - # No active operation either - lock = self.lock - if lock: - lock.acquire() - - data = self._via_kerchunk(index) - - if lock: - lock.release() + data = self._get_selection(nc[ncvar], index) return data else: @@ -223,27 +211,8 @@ def ncvar(self): def ncvar(self, value): self._ncvar = value - @property - def lock(self): - """Return or set a lock that prevents concurrent file reads when accessing the data locally. - - The lock is either a `threading.Lock` instance, an object with - same API and functionality (such as - `dask.utils.SerializableLock`), or is `False` if no lock is - required. - To be effective, the same lock instance must be used across - all process threads. - """ - return self._lock - - @lock.setter - def lock(self, value): - if not value: - value = False - - self._lock = value def _get_active(self, method, *args): """ @@ -255,54 +224,21 @@ def _get_active(self, method, *args): """ raise NotImplementedError - def _via_kerchunk(self, index): - """ - The objective is to use kerchunk to read the slices ourselves. - """ - # FIXME: Order of calls is hardcoded' - if self.zds is None: - print(f"Kerchunking file {self.uri} with variable " - f"{self.ncvar} for storage type {self.storage_type}") - ds, zarray, zattrs = nz.load_netcdf_zarr_generic( - self.uri, - self.ncvar, - self.storage_type, - self.storage_options, - ) - # The following is a hangove from exploration - # and is needed if using the original doing it ourselves - # self.zds = make_an_array_instance_active(ds) - self.zds = ds - - # Retain attributes and other information - if zarray.get('fill_value') is not None: - zattrs['_FillValue'] = zarray['fill_value'] - - self.zarray = zarray - self.zattrs = zattrs - - # FIXME: We do not get the correct byte order on the Zarr - # Array's dtype when using S3, so capture it here. - self._dtype = np.dtype(zarray['dtype']) - - return self._get_selection(index) - - def _get_selection(self, *args): + + + def _get_selection(self, ds, *args): """ - First we need to convert the selection into chunk coordinates, - steps etc, via the Zarr machinery, then we get everything else we can - from zarr and friends and use simple dictionaries and tuples, then - we can go to the storage layer with no zarr. + We need to load the b-tree index first. The current machinery is + using a temporary branch in a fork of pyfive. It will get cleaner + when we tidy that up. 
""" - compressor = self.zds._compressor - filters = self.zds._filters # Get missing values - _FillValue = self.zattrs.get('_FillValue') - missing_value = self.zattrs.get('missing_value') - valid_min = self.zattrs.get('valid_min') - valid_max = self.zattrs.get('valid_max') - valid_range = self.zattrs.get('valid_range') + _FillValue = ds.attrs.get('_FillValue') + missing_value = ds.attrs.get('missing_value') + valid_min = ds.attrs.get('valid_min') + valid_max = ds.attrs.get('valid_max') + valid_range = ds.attrs.get('valid_range') if valid_max is not None or valid_min is not None: if valid_range is not None: raise ValueError( @@ -319,31 +255,25 @@ def _get_selection(self, *args): valid_min, valid_max, ) + array = ZarrArrayStub(ds.shape, ds.chunks) - indexer = OrthogonalIndexer(*args, self.zds) + + indexer = OrthogonalIndexer(*args, array) out_shape = indexer.shape out_dtype = self.zds._dtype - stripped_indexer = [(a, b, c) for a,b,c in indexer] - drop_axes = indexer.drop_axes # not sure what this does and why, yet. + #stripped_indexer = [(a, b, c) for a,b,c in indexer] + #drop_axes = indexer.drop_axes # not sure what this does and why, yet. - # yes this next line is bordering on voodoo ... - # this returns a nested dictionary with the full file FS reference - # ie all the gubbins: chunks, data structure, types, etc - # if using zarr<=2.13.3 call with _mutable_mapping ie - # fsref = self.zds.chunk_store._mutable_mapping.fs.references - fsref = self.zds.chunk_store.fs.references - return self._from_storage(stripped_indexer, drop_axes, out_shape, - out_dtype, compressor, filters, missing, fsref) + return self._from_storage(ds, indexer, out_shape, out_dtype, missing) - def _from_storage(self, stripped_indexer, drop_axes, out_shape, out_dtype, - compressor, filters, missing, fsref): + def _from_storage(self, ds, indexer, out_shape, out_dtype, missing): method = self.method if method is not None: out = [] counts = [] else: - out = np.empty(out_shape, dtype=out_dtype, order=self.zds._order) + out = np.empty(out_shape, dtype=out_dtype, order=ds.order) counts = None # should never get touched with no method! # Create a shared session object. @@ -370,13 +300,12 @@ def _from_storage(self, stripped_indexer, drop_axes, out_shape, out_dtype, with concurrent.futures.ThreadPoolExecutor(max_workers=self._max_threads) as executor: futures = [] # Submit chunks for processing. - for chunk_coords, chunk_selection, out_selection in stripped_indexer: + for chunk_coords, chunk_selection, out_selection in indexer: future = executor.submit( self._process_chunk, - session, fsref, chunk_coords, chunk_selection, - counts, out_selection, - compressor, filters, missing, - drop_axes=drop_axes) + session, ds, chunk_coords, chunk_selection, + counts, out_selection, missing) + #drop_axes=drop_axes) futures.append(future) # Wait for completion. for future in concurrent.futures.as_completed(futures): @@ -446,9 +375,9 @@ def _get_endpoint_url(self): return f"http://{urllib.parse.urlparse(self.filename).netloc}" - def _process_chunk(self, session, fsref, chunk_coords, chunk_selection, counts, - out_selection, compressor, filters, missing, - drop_axes=None): + def _process_chunk(self, session, ds, chunk_coords, chunk_selection, counts, + out_selection, missing) + #drop_axes=None): """ Obtain part or whole of a chunk. @@ -456,11 +385,15 @@ def _process_chunk(self, session, fsref, chunk_coords, chunk_selection, counts, the output array. Note the need to use counts for some methods + #FIXME: Do, we, it's not actually used? 
""" - coord = '.'.join([str(c) for c in chunk_coords]) - key = f"{self.ncvar}/{coord}" - rfile, offset, size = tuple(fsref[key]) + + offset, size, filter_mask = ds.get_chunk_details[chunk_coords] + rfile = ds.rfile + #FIXME: need compressor and filters from ds + compressor = None + filters=None # S3: pass in pre-configured storage options (credentials) if self.storage_type == "s3": @@ -485,8 +418,8 @@ def _process_chunk(self, session, fsref, chunk_coords, chunk_selection, counts, bucket, object, offset, size, compressor, filters, missing, dtype, - self.zds._chunks, - self.zds._order, + ds.chunks, + ds.order, chunk_selection, operation=self._method) else: @@ -504,8 +437,8 @@ def _process_chunk(self, session, fsref, chunk_coords, chunk_selection, counts, bucket, object, offset, size, compressor, filters, missing, dtype, - self.zds._chunks, - self.zds._order, + ds.chunks, + ds.order, chunk_selection, operation=self._method) else: @@ -515,14 +448,14 @@ def _process_chunk(self, session, fsref, chunk_coords, chunk_selection, counts, # although we will version changes. tmp, count = reduce_chunk(rfile, offset, size, compressor, filters, missing, self.zds._dtype, - self.zds._chunks, self.zds._order, + ds.chunks, ds.order, chunk_selection, method=self.method) if self.method is not None: return tmp, count else: - if drop_axes: - tmp = np.squeeze(tmp, axis=drop_axes) + #if drop_axes: + # tmp = np.squeeze(tmp, axis=drop_axes) return tmp, out_selection def _mask_data(self, data, ds_var): diff --git a/activestorage/active_tools.py b/activestorage/active_tools.py deleted file mode 100644 index 59c05508..00000000 --- a/activestorage/active_tools.py +++ /dev/null @@ -1,351 +0,0 @@ -""" -Module to hold Zarr lift-up code. - -We're effectively subclassing zarr.core.Array, but not actually doing so, -instead we're providing tools to hack instances of it -""" -import numpy as np -import zarr - -from packaging import version - -from zarr.core import Array -# import other zarr gubbins used in the methods we override -from zarr.indexing import ( - OrthogonalIndexer, - PartialChunkIterator, - check_fields, - is_contiguous_selection, - -) -from zarr.util import ( - check_array_shape, - is_total_slice, - PartialReadBuffer, -) - -from numcodecs.compat import ensure_ndarray -from zarr.errors import ArrayIndexError - - -def make_an_array_instance_active(instance): - """ - Given a zarr array instance, override some key methods so - we can do active storage things. Note this only works for - normal and _ methods and would not work on __ methods. - - This an ugly hack for development to avoid having to hack - zarr internal in a fork. - """ - - instance.get_orthogonal_selection = as_get_orthogonal_selection.__get__(instance, Array) - instance._get_selection = as_get_selection.__get__(instance, Array) - instance._chunk_getitem = as_chunk_getitem.__get__(instance, Array) - instance._process_chunk = as_process_chunk.__get__(instance, Array) - instance._process_chunk_V = as_process_chunk_V.__get__(instance, Array) - - return instance - -def as_get_orthogonal_selection(self, selection, out=None, - fields=None): - """ - Retrieve data by making a selection for each dimension of the array. For - example, if an array has 2 dimensions, allows selecting specific rows and/or - columns. The selection for each dimension can be either an integer (indexing a - single item), a slice, an array of integers, or a Boolean array where True - values indicate a selection. 
- Parameters - ---------- - selection : tuple - A selection for each dimension of the array. May be any combination of int, - slice, integer array or Boolean array. - out : ndarray, optional - If given, load the selected data directly into this array. - fields : str or sequence of str, optional - For arrays with a structured dtype, one or more fields can be specified to - extract data for. - Returns - ------- - out : ndarray - A NumPy array containing the data for the requested selection. - Examples - -------- - Setup a 2-dimensional array:: - >>> import zarr - >>> import numpy as np - >>> z = zarr.array(np.arange(100).reshape(10, 10)) - Retrieve rows and columns via any combination of int, slice, integer array and/or - Boolean array:: - >>> z.get_orthogonal_selection(([1, 4], slice(None))) - array([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19], - [40, 41, 42, 43, 44, 45, 46, 47, 48, 49]]) - >>> z.get_orthogonal_selection((slice(None), [1, 4])) - array([[ 1, 4], - [11, 14], - [21, 24], - [31, 34], - [41, 44], - [51, 54], - [61, 64], - [71, 74], - [81, 84], - [91, 94]]) - >>> z.get_orthogonal_selection(([1, 4], [1, 4])) - array([[11, 14], - [41, 44]]) - >>> sel = np.zeros(z.shape[0], dtype=bool) - >>> sel[1] = True - >>> sel[4] = True - >>> z.get_orthogonal_selection((sel, sel)) - array([[11, 14], - [41, 44]]) - For convenience, the orthogonal selection functionality is also available via the - `oindex` property, e.g.:: - >>> z.oindex[[1, 4], :] - array([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19], - [40, 41, 42, 43, 44, 45, 46, 47, 48, 49]]) - >>> z.oindex[:, [1, 4]] - array([[ 1, 4], - [11, 14], - [21, 24], - [31, 34], - [41, 44], - [51, 54], - [61, 64], - [71, 74], - [81, 84], - [91, 94]]) - >>> z.oindex[[1, 4], [1, 4]] - array([[11, 14], - [41, 44]]) - >>> sel = np.zeros(z.shape[0], dtype=bool) - >>> sel[1] = True - >>> sel[4] = True - >>> z.oindex[sel, sel] - array([[11, 14], - [41, 44]]) - Notes - ----- - Orthogonal indexing is also known as outer indexing. - Slices with step > 1 are supported, but slices with negative step are not. - See Also - -------- - get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, - get_coordinate_selection, set_coordinate_selection, set_orthogonal_selection, - vindex, oindex, __getitem__, __setitem__ - """ - - # refresh metadata - if not self._cache_metadata: - self._load_metadata() - - # check args - check_fields(fields, self._dtype) - - # setup indexer - indexer = OrthogonalIndexer(selection, self) - - return self._get_selection(indexer=indexer, out=out, - fields=fields) - - -def as_get_selection(self, indexer, out=None, - fields=None): - - # We iterate over all chunks which overlap the selection and thus contain data - # that needs to be extracted. Each chunk is processed in turn, extracting the - # necessary data and storing into the correct location in the output array. - - # N.B., it is an important optimisation that we only visit chunks which overlap - # the selection. This minimises the number of iterations in the main for loop. 
- - # check fields are sensible - out_dtype = check_fields(fields, self._dtype) - - # determine output shape - out_shape = indexer.shape - - # setup output array - if out is None: - out = np.empty(out_shape, dtype=out_dtype, order=self._order) - else: - check_array_shape('out', out, out_shape) - - chunks_info = [] - chunks_locs = [] - - # note: Zarr API has changed with zarr=2.15 - # hasattr(self.chunk_store, "getitems") = True for zarr >= 2.15 - zarr_version = zarr.__version__ - zarr_api_change_version = "2.15" - if version.parse(zarr_version) < version.parse(zarr_api_change_version): - att_getitems = not hasattr(self.chunk_store, "getitems") - elif version.parse(zarr_version) >= version.parse(zarr_api_change_version): - att_getitems = hasattr(self.chunk_store, "getitems") - # iterate over chunks - if att_getitems or any(map(lambda x: x == 0, self.shape)): - # sequentially get one key at a time from storage - for chunk_coords, chunk_selection, out_selection in indexer: - - if version.parse(zarr_version) < version.parse(zarr_api_change_version): - # load chunk selection into output array - pci = self._chunk_getitem(chunk_coords, chunk_selection, out, out_selection, - drop_axes=indexer.drop_axes, fields=fields) - chunks_info.append(pci) - chunks_locs.append(chunk_coords) - elif version.parse(zarr_version) >= version.parse(zarr_api_change_version): - chunk_coords = [chunk_coords] - # load chunk selection into output array - pci = self._chunk_getitems(chunk_coords, chunk_selection, out, out_selection, - drop_axes=indexer.drop_axes, fields=fields) - chunks_info.append(pci) - chunks_locs.append(chunk_coords[0]) - else: - # allow storage to get multiple items at once - lchunk_coords, lchunk_selection, lout_selection = zip(*indexer) - self._chunk_getitems(lchunk_coords, lchunk_selection, out, lout_selection, - drop_axes=indexer.drop_axes, fields=fields) - - if out.shape: - return out, chunks_info, chunks_locs - else: - return out[()], chunks_info, chunks_locs - - -def as_chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, - drop_axes=None, fields=None): - """Obtain part or whole of a chunk. - Parameters - ---------- - chunk_coords : tuple of ints - Indices of the chunk. - chunk_selection : selection - Location of region within the chunk to extract. - out : ndarray - Array to store result in. - out_selection : selection - Location of region within output array to store results in. - drop_axes : tuple of ints - Axes to squeeze out of the chunk. 
- fields - TODO - """ - out_is_ndarray = True - try: - out = ensure_ndarray(out) - except TypeError: - out_is_ndarray = False - - assert len(chunk_coords) == len(self._cdata_shape) - - try: - pci_info = self._process_chunk_V(chunk_selection) - except KeyError: - # chunk not initialized - if self._fill_value is not None: - if fields: - fill_value = self._fill_value[fields] - else: - fill_value = self._fill_value - out[out_selection] = fill_value - pci_info = self._process_chunk_V(chunk_selection) - - return pci_info - - -def as_process_chunk( - self, - out, - cdata, - chunk_selection, - drop_axes, - out_is_ndarray, - fields, - out_selection, - partial_read_decode=False, -): - """Take binary data from storage and fill output array""" - if (out_is_ndarray and - not fields and - is_contiguous_selection(out_selection) and - is_total_slice(chunk_selection, self._chunks) and - not self._filters and - self._dtype != object): - - dest = out[out_selection] - write_direct = ( - dest.flags.writeable and - ( - (self._order == 'C' and dest.flags.c_contiguous) or - (self._order == 'F' and dest.flags.f_contiguous) - ) - ) - - if write_direct: - - # optimization: we want the whole chunk, and the destination is - # contiguous, so we can decompress directly from the chunk - # into the destination array - if self._compressor: - if isinstance(cdata, PartialReadBuffer): - cdata = cdata.read_full() - self._compressor.decode(cdata, dest) - else: - chunk = ensure_ndarray(cdata).view(self._dtype) - if np.prod(chunk.shape) > np.prod(self._chunks): - raise ValueError(f"Chunk shape {chunk.shape} exceeds " - f"data chunks shape {self._chunks}") - chunk = chunk.reshape(self._chunks, order=self._order) - np.copyto(dest, chunk) - return - - # decode chunk - try: - if partial_read_decode: - cdata.prepare_chunk() - # size of chunk - tmp = np.empty(self._chunks, dtype=self.dtype) - index_selection = PartialChunkIterator(chunk_selection, self.chunks) - for start, nitems, partial_out_selection in index_selection: - expected_shape = [ - len( - range(*partial_out_selection[i].indices(self.chunks[0] + 1)) - ) - if i < len(partial_out_selection) - else dim - for i, dim in enumerate(self.chunks) - ] - cdata.read_part(start, nitems) - chunk_partial = self._decode_chunk( - cdata.buff, - start=start, - nitems=nitems, - expected_shape=expected_shape, - ) - tmp[partial_out_selection] = chunk_partial - out[out_selection] = tmp[chunk_selection] - return - except ArrayIndexError: - cdata = cdata.read_full() - if self._compressor and isinstance(cdata, np.ndarray): - raise TypeError(f"cdata {cdata} is an ndarray, can not decompress.") - chunk = self._decode_chunk(cdata) - - # select data from chunk - if fields: - chunk = chunk[fields] - tmp = chunk[chunk_selection] - if drop_axes: - tmp = np.squeeze(tmp, axis=drop_axes) - - # store selected data in output - if np.prod(tmp.shape) > np.prod(out.shape): - raise ValueError(f"Storage chunk shape {tmp.shape} exceeds permitted " - f"output data shape {out.shape}.") - out[out_selection] = tmp - -def as_process_chunk_V(self, chunk_selection): - """Run an instance of PCI inside the engine.""" - index_selection = PartialChunkIterator(chunk_selection, self.chunks) - for _, _, _ in index_selection: - return self.chunks, chunk_selection, index_selection diff --git a/activestorage/netcdf_to_zarr.py b/activestorage/netcdf_to_zarr.py deleted file mode 100644 index 4bd0bb82..00000000 --- a/activestorage/netcdf_to_zarr.py +++ /dev/null @@ -1,126 +0,0 @@ -import os -import numpy as np -import zarr -import ujson 
-import fsspec -import s3fs -import tempfile - -from activestorage.config import * -from kerchunk.hdf import SingleHdf5ToZarr - - -def gen_json(file_url, varname, outf, storage_type, storage_options): - """Generate a json file that contains the kerchunk-ed data for Zarr.""" - # S3 configuration presets - if storage_type == "s3" and storage_options is None: - fs = s3fs.S3FileSystem(key=S3_ACCESS_KEY, - secret=S3_SECRET_KEY, - client_kwargs={'endpoint_url': S3_URL}, - default_fill_cache=False, - default_cache_type="none" - ) - fs2 = fsspec.filesystem('') - with fs.open(file_url, 'rb') as s3file: - h5chunks = SingleHdf5ToZarr(s3file, file_url, - inline_threshold=0) - with fs2.open(outf, 'wb') as f: - content = h5chunks.translate() - f.write(ujson.dumps(content).encode()) - - # S3 passed-in configuration - elif storage_type == "s3" and storage_options is not None: - storage_options = storage_options.copy() - storage_options['default_fill_cache'] = False - storage_options['default_cache_type'] = "none" - fs = s3fs.S3FileSystem(**storage_options) - fs2 = fsspec.filesystem('') - with fs.open(file_url, 'rb') as s3file: - h5chunks = SingleHdf5ToZarr(s3file, file_url, - inline_threshold=0) - with fs2.open(outf, 'wb') as f: - content = h5chunks.translate() - f.write(ujson.dumps(content).encode()) - # not S3 - else: - fs = fsspec.filesystem('') - with fs.open(file_url, 'rb') as local_file: - try: - h5chunks = SingleHdf5ToZarr(local_file, file_url, - inline_threshold=0) - except OSError as exc: - raiser_1 = f"Unable to open file {file_url}. " - raiser_2 = "Check if file is netCDF3 or netCDF-classic" - print(raiser_1 + raiser_2) - raise exc - - # inline threshold adjusts the Size below which binary blocks are - # included directly in the output - # a higher inline threshold can result in a larger json file but - # faster loading time - # for active storage, we don't want anything inline - with fs.open(outf, 'wb') as f: - content = h5chunks.translate() - f.write(ujson.dumps(content).encode()) - - zarray = ujson.loads(content['refs'][f"{varname}/.zarray"]) - zattrs = ujson.loads(content['refs'][f"{varname}/.zattrs"]) - - return outf, zarray, zattrs - - -def open_zarr_group(out_json, varname): - """ - Do the magic opening - - Open a json file read and saved by the reference file system - into a Zarr Group, then extract the Zarr Array you need. - That Array is in the 'data' attribute. - """ - fs = fsspec.filesystem("reference", fo=out_json) - mapper = fs.get_mapper("") # local FS mapper - #mapper.fs.reference has the kerchunk mapping, how does this propagate into the Zarr array? - zarr_group = zarr.open_group(mapper) - - try: - zarr_array = getattr(zarr_group, varname) - except AttributeError as attrerr: - print(f"Zarr Group does not contain variable {varname}. " - f"Zarr Group info: {zarr_group.info}") - raise attrerr - #print("Zarr array info:", zarr_array.info) - - return zarr_array - - -def load_netcdf_zarr_generic(fileloc, varname, storage_type, storage_options, build_dummy=True): - """Pass a netCDF4 file to be shaped as Zarr file by kerchunk.""" - print(f"Storage type {storage_type}") - - # Write the Zarr group JSON to a temporary file. 
- with tempfile.NamedTemporaryFile() as out_json: - _, zarray, zattrs = gen_json(fileloc, - varname, - out_json.name, - storage_type, - storage_options) - - # open this monster - print(f"Attempting to open and convert {fileloc}.") - ref_ds = open_zarr_group(out_json.name, varname) - - return ref_ds, zarray, zattrs - - -#d = {'version': 1, -# 'refs': { -# '.zgroup': '{"zarr_format":2}', -# '.zattrs': '{"Conventions":"CF-1.6","access-list":"grenvillelister simonwilson jeffcole","awarning":"**** THIS SUITE WILL ARCHIVE NON-DUPLEXED DATA TO MOOSE. FOR CRITICAL MODEL RUNS SWITCH TO DUPLEXED IN: postproc --> Post Processing - common settings --> Moose Archiving --> non_duplexed_set. Follow guidance in http:\\/\\/www-twiki\\/Main\\/MassNonDuplexPolicy","branch-date":"1950-01-01","calendar":"360_day","code-version":"UM 11.6, NEMO vn3.6","creation_time":"2022-10-28 12:28","decription":"Initialised from EN4 climatology","description":"Copy of u-ar696\\/trunk@77470","email":"r.k.schieman@reading.ac.uk","end-date":"2015-01-01","experiment-id":"historical","forcing":"AA,BC,CO2","forcing-info":"blah, blah, blah","institution":"NCAS","macro-parent-experiment-id":"historical","macro-parent-experiment-mip":"CMIP","macro-parent-variant-id":"r1i1p1f3","model-id":"HadGEM3-CG31-MM","name":"\\/work\\/n02\\/n02\\/grenvill\\/cylc-run\\/u-cn134\\/share\\/cycle\\/19500101T0000Z\\/3h_","owner":"rosalynhatcher","project":"Coupled Climate","timeStamp":"2022-Oct-28 12:20:33 GMT","title":"[CANARI] GC3.1 N216 ORCA025 UM11.6","uuid":"51e5ef20-d376-4aa6-938e-4c242886b7b1"}', -# 'lat/.zarray': '{"chunks":[324],"compressor":{"id":"zlib","level":1},"dtype":" Date: Sun, 3 Mar 2024 13:55:55 +0000 Subject: [PATCH 02/85] bnl_test works with pyfive. The tests are borked because they're riddled with zarr dependencies --- activestorage/active.py | 45 ++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/activestorage/active.py b/activestorage/active.py index 04deb549..c70ddd33 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -5,7 +5,6 @@ import pathlib import urllib import pyfive -from pyfive import ZarrArrayStub, OrthogonalIndexer import s3fs @@ -129,7 +128,7 @@ def __getitem__(self, index): # in all casese we need an open netcdf file to get at attributes # FIXME. We then need to monkey patch the "filename" as rfile onto the dataset if self.storage_type is None: - nc = pyfive.FILE(self.uri) + nc = pyfive.File(self.uri) elif self.storage_type == "s3": nc = load_from_s3(self.uri, self.storage_options) @@ -228,11 +227,16 @@ def _get_active(self, method, *args): def _get_selection(self, ds, *args): """ - We need to load the b-tree index first. The current machinery is - using a temporary branch in a fork of pyfive. It will get cleaner - when we tidy that up. + At this point we have a Dataset object, but all the important information about + how to use it is in the attribute DataoobjectDataset class. Here we gather + metadata from the dataset instance and then continue with the dataobjects instance. 
""" + + + # stick this here for later, to discuss with David + keepdims = True + # Get missing values _FillValue = ds.attrs.get('_FillValue') missing_value = ds.attrs.get('missing_value') @@ -255,19 +259,20 @@ def _get_selection(self, ds, *args): valid_min, valid_max, ) - array = ZarrArrayStub(ds.shape, ds.chunks) - indexer = OrthogonalIndexer(*args, array) + ds = ds._dataobjects + array = pyfive.ZarrArrayStub(ds.shape, ds.chunks) + indexer = pyfive.OrthogonalIndexer(*args, array) + out_shape = indexer.shape - out_dtype = self.zds._dtype + out_dtype =ds.dtype #stripped_indexer = [(a, b, c) for a,b,c in indexer] - #drop_axes = indexer.drop_axes # not sure what this does and why, yet. - + drop_axes = indexer.drop_axes and keepdims - return self._from_storage(ds, indexer, out_shape, out_dtype, missing) + return self._from_storage(ds, indexer, out_shape, out_dtype, missing, drop_axes) - def _from_storage(self, ds, indexer, out_shape, out_dtype, missing): + def _from_storage(self, ds, indexer, out_shape, out_dtype, missing, drop_axes): method = self.method if method is not None: out = [] @@ -304,8 +309,7 @@ def _from_storage(self, ds, indexer, out_shape, out_dtype, missing): future = executor.submit( self._process_chunk, session, ds, chunk_coords, chunk_selection, - counts, out_selection, missing) - #drop_axes=drop_axes) + counts, out_selection, missing, drop_axes=drop_axes) futures.append(future) # Wait for completion. for future in concurrent.futures.as_completed(futures): @@ -376,8 +380,7 @@ def _get_endpoint_url(self): return f"http://{urllib.parse.urlparse(self.filename).netloc}" def _process_chunk(self, session, ds, chunk_coords, chunk_selection, counts, - out_selection, missing) - #drop_axes=None): + out_selection, missing, drop_axes=None): """ Obtain part or whole of a chunk. @@ -389,8 +392,8 @@ def _process_chunk(self, session, ds, chunk_coords, chunk_selection, counts, """ - offset, size, filter_mask = ds.get_chunk_details[chunk_coords] - rfile = ds.rfile + offset, size, filter_mask = ds.get_chunk_details(chunk_coords) + rfile = ds.fh.name #FIXME: need compressor and filters from ds compressor = None filters=None @@ -447,15 +450,15 @@ def _process_chunk(self, session, ds, chunk_coords, chunk_selection, counts, # so neither the returned data or the interface should be considered stable # although we will version changes. tmp, count = reduce_chunk(rfile, offset, size, compressor, filters, - missing, self.zds._dtype, + missing, ds.dtype, ds.chunks, ds.order, chunk_selection, method=self.method) if self.method is not None: return tmp, count else: - #if drop_axes: - # tmp = np.squeeze(tmp, axis=drop_axes) + if drop_axes: + tmp = np.squeeze(tmp, axis=drop_axes) return tmp, out_selection def _mask_data(self, data, ds_var): From 0d7bf5582ef394d9c8a57ff80679aa180b4d8989 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 4 Mar 2024 11:18:15 +0000 Subject: [PATCH 03/85] Gzip decompression working. 
Will document issues remaining in #188 --- activestorage/active.py | 25 +++++------ activestorage/hdf2numcodec.py | 78 +++++++++++++++++++++++++++++++++++ bnl/bnl_test.py | 45 ++++++++++++++++++++ 3 files changed, 136 insertions(+), 12 deletions(-) create mode 100644 activestorage/hdf2numcodec.py create mode 100644 bnl/bnl_test.py diff --git a/activestorage/active.py b/activestorage/active.py index c70ddd33..d11ba4eb 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -11,6 +11,7 @@ from activestorage.config import * from activestorage import reductionist from activestorage.storage import reduce_chunk +from activestorage.hdf2numcodec import decode_filters @@ -260,7 +261,7 @@ def _get_selection(self, ds, *args): valid_max, ) - + nfilters = decode_filters(ds._dataobjects.filter_pipeline ,ds.name) ds = ds._dataobjects array = pyfive.ZarrArrayStub(ds.shape, ds.chunks) indexer = pyfive.OrthogonalIndexer(*args, array) @@ -270,10 +271,11 @@ def _get_selection(self, ds, *args): #stripped_indexer = [(a, b, c) for a,b,c in indexer] drop_axes = indexer.drop_axes and keepdims - return self._from_storage(ds, indexer, out_shape, out_dtype, missing, drop_axes) + return self._from_storage(ds, indexer, out_shape, out_dtype, missing, nfilters, drop_axes) - def _from_storage(self, ds, indexer, out_shape, out_dtype, missing, drop_axes): + def _from_storage(self, ds, indexer, out_shape, out_dtype, missing, nfilters, drop_axes): method = self.method + if method is not None: out = [] counts = [] @@ -309,7 +311,7 @@ def _from_storage(self, ds, indexer, out_shape, out_dtype, missing, drop_axes): future = executor.submit( self._process_chunk, session, ds, chunk_coords, chunk_selection, - counts, out_selection, missing, drop_axes=drop_axes) + counts, out_selection, missing, nfilters, drop_axes=drop_axes) futures.append(future) # Wait for completion. for future in concurrent.futures.as_completed(futures): @@ -380,7 +382,7 @@ def _get_endpoint_url(self): return f"http://{urllib.parse.urlparse(self.filename).netloc}" def _process_chunk(self, session, ds, chunk_coords, chunk_selection, counts, - out_selection, missing, drop_axes=None): + out_selection, missing, nfilters,drop_axes=None): """ Obtain part or whole of a chunk. @@ -394,9 +396,10 @@ def _process_chunk(self, session, ds, chunk_coords, chunk_selection, counts, offset, size, filter_mask = ds.get_chunk_details(chunk_coords) rfile = ds.fh.name - #FIXME: need compressor and filters from ds + #The compressor is none, as compression is in the filter pipeline + #Note that the filter_pipeline here is the _numcodecs_ one, not the native ds one! compressor = None - filters=None + filters=nfilters # S3: pass in pre-configured storage options (credentials) if self.storage_type == "s3": @@ -404,9 +407,7 @@ def _process_chunk(self, session, ds, chunk_coords, chunk_selection, counts, parsed_url = urllib.parse.urlparse(rfile) bucket = parsed_url.netloc object = parsed_url.path - # FIXME: We do not get the correct byte order on the Zarr Array's dtype - # when using S3, so use the value captured earlier. 
- dtype = self._dtype + # for certain S3 servers rfile needs to contain the bucket eg "bucket/filename" # as a result the parser above finds empty string bucket if bucket == "": @@ -420,7 +421,7 @@ def _process_chunk(self, session, ds, chunk_coords, chunk_selection, counts, S3_URL, bucket, object, offset, size, compressor, filters, - missing, dtype, + missing, ds.dtype, ds.chunks, ds.order, chunk_selection, @@ -439,7 +440,7 @@ def _process_chunk(self, session, ds, chunk_coords, chunk_selection, counts, self._get_endpoint_url(), bucket, object, offset, size, compressor, filters, - missing, dtype, + missing, ds.dtype, ds.chunks, ds.order, chunk_selection, diff --git a/activestorage/hdf2numcodec.py b/activestorage/hdf2numcodec.py new file mode 100644 index 00000000..1a007b9b --- /dev/null +++ b/activestorage/hdf2numcodec.py @@ -0,0 +1,78 @@ +import numcodecs + +def decode_filters(filter_pipeline, name): + """ + + Convert HDF5 filter and compression instructions into instructions understood + by numcodecs. Input is a pyfive filter_pipeline object + + We can only support things that are supported by numcodecs, which one might + hope to be a superset of what native pyfive can support. + + See aslso zarr kerchunk _decode_filters. We may need to add much more + support for BLOSC than is currently supported by pyfive. + + """ + filters = [] + + for filter in filter_pipeline: + + filter_id=filter['filter_id'] + properties = filter['client_data_values'] + + if filter_id == GZIP_DEFLATE_FILTER: + filters.append(numcodecs.Zlib(level=properties[0])) + elif filter_id == SHUFFLE_FILTER: + pass + #FIXME: inherited the pass by inspection from Zarr, pretty sure that's wrong. + elif filter_id == 32001: + blosc_compressors = ( + "blosclz", + "lz4", + "lz4hc", + "snappy", + "zlib", + "zstd", + ) + ( + _1, + _2, + bytes_per_num, + total_bytes, + clevel, + shuffle, + compressor, + ) = properties + pars = dict( + blocksize=total_bytes, + clevel=clevel, + shuffle=shuffle, + cname=blosc_compressors[compressor], + ) + filters.append(numcodecs.Blosc(**pars)) + elif filter_id == 32015: + filters.append(numcodecs.Zstd(level=properties[0])) + elif filter_id == 32004: + raise RuntimeError( + f"{name} uses lz4 compression - not supported as yet" + ) + elif filter_id == 32008: + raise RuntimeError( + f"{name} uses bitshuffle compression - not supported as yet" + ) + else: + raise RuntimeError( + f"{name} uses filter id {filter_id} with properties {properties}," + f" not supported as yet" + ) + return filters + +# These are from pyfive's HDF5 filter definitions +# IV.A.2.l The Data Storage - Filter Pipeline message +RESERVED_FILTER = 0 +GZIP_DEFLATE_FILTER = 1 +SHUFFLE_FILTER = 2 +FLETCH32_FILTER = 3 +SZIP_FILTER = 4 +NBIT_FILTER = 5 +SCALEOFFSET_FILTER = 6 diff --git a/bnl/bnl_test.py b/bnl/bnl_test.py new file mode 100644 index 00000000..7354d1b3 --- /dev/null +++ b/bnl/bnl_test.py @@ -0,0 +1,45 @@ +import os +import pytest + +import numpy as np +from netCDF4 import Dataset +from pathlib import Path +import s3fs + +from activestorage.active import Active +from activestorage.config import * + + +def mytest(): + """ + Test again a native model, this time around netCDF4 loadable with h5py + Also, this has _FillValue and missing_value + """ + ncfile, v = "cesm2_native.nc","TREFHT" + ncfile, v = "CMIP6-test.nc", 'tas' + #ncfile, v = "chunked.hdf5", "dataset1" + mypath = Path(__file__).parent + uri = str(mypath/ncfile) + active = Active(uri, v, None) + active._version = 0 + if v == "dataset1": + d = active[2,:] + else: + d = 
active[4:5, 1:2] + mean_result = np.mean(d) + + active = Active(uri, v, None) + active._version = 2 + active.method = "mean" + active.components = True + if v == "dataset1": + result2 = active[2,:] + else: + result2 = active[4:5, 1:2] + print(result2, ncfile) + # check for active + np.testing.assert_allclose(mean_result, result2["sum"]/result2["n"], rtol=1e-6) + + +if __name__=="__main__": + mytest() \ No newline at end of file From d73fcd84c94b237a98df71183c8d9c18ee3f2ce6 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 4 Mar 2024 12:54:07 +0000 Subject: [PATCH 04/85] Split compression and filters --- activestorage/__init__.py | 5 +- activestorage/active.py | 17 ++--- activestorage/hdf2numcodec.py | 119 ++++++++++++++++++++-------------- 3 files changed, 80 insertions(+), 61 deletions(-) diff --git a/activestorage/__init__.py b/activestorage/__init__.py index 7e632ea9..f5a6f53d 100644 --- a/activestorage/__init__.py +++ b/activestorage/__init__.py @@ -1,4 +1,5 @@ from .active import Active - -__version__ = "0.0.1" +# 0.0.1 is the initial version using kerchunk +# 0.0.2 is testing out the use of pyfive instead +__version__ = "0.0.2" diff --git a/activestorage/active.py b/activestorage/active.py index d11ba4eb..57080e57 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -233,8 +233,6 @@ def _get_selection(self, ds, *args): metadata from the dataset instance and then continue with the dataobjects instance. """ - - # stick this here for later, to discuss with David keepdims = True @@ -261,7 +259,8 @@ def _get_selection(self, ds, *args): valid_max, ) - nfilters = decode_filters(ds._dataobjects.filter_pipeline ,ds.name) + # hopefully fix pyfive to get a dtype directly + compressor, filters = decode_filters(ds._dataobjects.filter_pipeline , np.dtype(ds.dtype).itemsize, ds.name) ds = ds._dataobjects array = pyfive.ZarrArrayStub(ds.shape, ds.chunks) indexer = pyfive.OrthogonalIndexer(*args, array) @@ -271,9 +270,9 @@ def _get_selection(self, ds, *args): #stripped_indexer = [(a, b, c) for a,b,c in indexer] drop_axes = indexer.drop_axes and keepdims - return self._from_storage(ds, indexer, out_shape, out_dtype, missing, nfilters, drop_axes) + return self._from_storage(ds, indexer, out_shape, out_dtype, missing, compressor, filters, drop_axes) - def _from_storage(self, ds, indexer, out_shape, out_dtype, missing, nfilters, drop_axes): + def _from_storage(self, ds, indexer, out_shape, out_dtype, missing, compressor, filters, drop_axes): method = self.method if method is not None: @@ -311,7 +310,7 @@ def _from_storage(self, ds, indexer, out_shape, out_dtype, missing, nfilters, dr future = executor.submit( self._process_chunk, session, ds, chunk_coords, chunk_selection, - counts, out_selection, missing, nfilters, drop_axes=drop_axes) + counts, out_selection, missing, compressor, filters, drop_axes=drop_axes) futures.append(future) # Wait for completion. for future in concurrent.futures.as_completed(futures): @@ -382,7 +381,7 @@ def _get_endpoint_url(self): return f"http://{urllib.parse.urlparse(self.filename).netloc}" def _process_chunk(self, session, ds, chunk_coords, chunk_selection, counts, - out_selection, missing, nfilters,drop_axes=None): + out_selection, missing, compressor, filters, drop_axes=None): """ Obtain part or whole of a chunk. 
@@ -396,10 +395,6 @@ def _process_chunk(self, session, ds, chunk_coords, chunk_selection, counts, offset, size, filter_mask = ds.get_chunk_details(chunk_coords) rfile = ds.fh.name - #The compressor is none, as compression is in the filter pipeline - #Note that the filter_pipeline here is the _numcodecs_ one, not the native ds one! - compressor = None - filters=nfilters # S3: pass in pre-configured storage options (credentials) if self.storage_type == "s3": diff --git a/activestorage/hdf2numcodec.py b/activestorage/hdf2numcodec.py index 1a007b9b..faac282e 100644 --- a/activestorage/hdf2numcodec.py +++ b/activestorage/hdf2numcodec.py @@ -1,71 +1,94 @@ import numcodecs -def decode_filters(filter_pipeline, name): +def decode_filters(filter_pipeline, itemsize, name): """ Convert HDF5 filter and compression instructions into instructions understood by numcodecs. Input is a pyfive filter_pipeline object - We can only support things that are supported by numcodecs, which one might - hope to be a superset of what native pyfive can support. + We can only support things that are supported by our storage backends, which + is a rather limited range right now: + - gzip compression, and + - shuffle filter See aslso zarr kerchunk _decode_filters. We may need to add much more - support for BLOSC than is currently supported by pyfive. + support for BLOSC than currently supported. + + Useful notes on filters are here: + - https://docs.unidata.ucar.edu/netcdf-c/current/filters.html and + - https://docs.hdfgroup.org/hdf5/v1_8/group___h5_z.html + + In particular, note that we can only support things that go beyond native HDF5 + if _we_ support them directly. """ - filters = [] + compressors, filters = [], [] for filter in filter_pipeline: filter_id=filter['filter_id'] properties = filter['client_data_values'] + + # We suppor the following if filter_id == GZIP_DEFLATE_FILTER: - filters.append(numcodecs.Zlib(level=properties[0])) + compressors.append(numcodecs.Zlib(level=properties[0])) elif filter_id == SHUFFLE_FILTER: - pass - #FIXME: inherited the pass by inspection from Zarr, pretty sure that's wrong. - elif filter_id == 32001: - blosc_compressors = ( - "blosclz", - "lz4", - "lz4hc", - "snappy", - "zlib", - "zstd", - ) - ( - _1, - _2, - bytes_per_num, - total_bytes, - clevel, - shuffle, - compressor, - ) = properties - pars = dict( - blocksize=total_bytes, - clevel=clevel, - shuffle=shuffle, - cname=blosc_compressors[compressor], - ) - filters.append(numcodecs.Blosc(**pars)) - elif filter_id == 32015: - filters.append(numcodecs.Zstd(level=properties[0])) - elif filter_id == 32004: - raise RuntimeError( - f"{name} uses lz4 compression - not supported as yet" - ) - elif filter_id == 32008: - raise RuntimeError( - f"{name} uses bitshuffle compression - not supported as yet" - ) + filters.append(numcodecs.Shuffle(elementsize=itemsize)) else: - raise RuntimeError( - f"{name} uses filter id {filter_id} with properties {properties}," - f" not supported as yet" - ) - return filters + raise NotImplementedError('We cannot yet support filter id ',filter_id) + + # We might be able, in the future, to support the following + # At the moment the following code cannot be implemented, but we can move + # the loops up as we develop backend support. 
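+    # (As a hypothetical illustration of the supported path above, not of the
+    # disabled block below: a dataset written with the shuffle filter and gzip
+    # level 1 would come back from this function as, roughly,
+    # numcodecs.Zlib(level=1) for the compressor and
+    # [numcodecs.Shuffle(elementsize=itemsize)] as the filters list.)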
+ + if 0: + + + if filter_id == 32001: + blosc_compressors = ( + "blosclz", + "lz4", + "lz4hc", + "snappy", + "zlib", + "zstd", + ) + ( + _1, + _2, + bytes_per_num, + total_bytes, + clevel, + shuffle, + compressor, + ) = properties + pars = dict( + blocksize=total_bytes, + clevel=clevel, + shuffle=shuffle, + cname=blosc_compressors[compressor], + ) + filters.append(numcodecs.Blosc(**pars)) + elif filter_id == 32015: + filters.append(numcodecs.Zstd(level=properties[0])) + elif filter_id == 32004: + raise RuntimeError( + f"{name} uses lz4 compression - not supported as yet" + ) + elif filter_id == 32008: + raise RuntimeError( + f"{name} uses bitshuffle compression - not supported as yet" + ) + else: + raise RuntimeError( + f"{name} uses filter id {filter_id} with properties {properties}," + f" not supported as yet" + ) + + if len(compressors) > 1: + raise ValueError('We only expected one compression algorithm') + return compressors[0], filters # These are from pyfive's HDF5 filter definitions # IV.A.2.l The Data Storage - Filter Pipeline message From 0c4272f61bc07b1b55696b1df0d13120c2886391 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 4 Mar 2024 15:19:05 +0000 Subject: [PATCH 05/85] Slight doc improvement in hdf2numcodec --- activestorage/hdf2numcodec.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/activestorage/hdf2numcodec.py b/activestorage/hdf2numcodec.py index faac282e..96796855 100644 --- a/activestorage/hdf2numcodec.py +++ b/activestorage/hdf2numcodec.py @@ -4,7 +4,8 @@ def decode_filters(filter_pipeline, itemsize, name): """ Convert HDF5 filter and compression instructions into instructions understood - by numcodecs. Input is a pyfive filter_pipeline object + by numcodecs. Input is a pyfive filter_pipeline object, the itemsize, and the + dataset name for error messages. We can only support things that are supported by our storage backends, which is a rather limited range right now: From c20b25916b3034885790d211e5e88a43e737312f Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 4 Mar 2024 15:26:51 +0000 Subject: [PATCH 06/85] Need to turn off decoding filter pipeline if there is none. --- activestorage/active.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/activestorage/active.py b/activestorage/active.py index 57080e57..23d8ba3c 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -260,7 +260,10 @@ def _get_selection(self, ds, *args): ) # hopefully fix pyfive to get a dtype directly - compressor, filters = decode_filters(ds._dataobjects.filter_pipeline , np.dtype(ds.dtype).itemsize, ds.name) + if ds._dataobjects.filter_pipeline is None: + compressor, filters = None, None + else: + compressor, filters = decode_filters(ds._dataobjects.filter_pipeline , np.dtype(ds.dtype).itemsize, ds.name) ds = ds._dataobjects array = pyfive.ZarrArrayStub(ds.shape, ds.chunks) indexer = pyfive.OrthogonalIndexer(*args, array) From 68e49d023968db7448289c2f0202253b551f7773 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 5 Mar 2024 08:12:23 +0000 Subject: [PATCH 07/85] Support for contiguous data. Passing lots of tests now. 
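In outline, contiguous (unchunked) variables are treated as a single chunk
spanning the whole array. A minimal sketch of the idea, assuming the
ZarrArrayStub substitutes the full array shape when ds.chunks is None (as the
comment in the diff below says), not the implementation itself:

    # one "chunk" covering the entire variable when the data is contiguous
    chunks = ds.chunks if ds.chunks is not None else ds.shape
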
--- activestorage/active.py | 29 +++++++++++++++++------------ bnl/bnl_test.py | 1 + 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/activestorage/active.py b/activestorage/active.py index 23d8ba3c..3825e099 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -259,23 +259,28 @@ def _get_selection(self, ds, *args): valid_max, ) + name = ds.name + dtype = np.dtype(ds.dtype) # hopefully fix pyfive to get a dtype directly - if ds._dataobjects.filter_pipeline is None: + array = pyfive.ZarrArrayStub(ds.shape, ds.chunks) + ds = ds._dataobjects + + if ds.filter_pipeline is None: compressor, filters = None, None else: - compressor, filters = decode_filters(ds._dataobjects.filter_pipeline , np.dtype(ds.dtype).itemsize, ds.name) - ds = ds._dataobjects - array = pyfive.ZarrArrayStub(ds.shape, ds.chunks) + compressor, filters = decode_filters(ds.filter_pipeline , dtype.itemsize, name) + indexer = pyfive.OrthogonalIndexer(*args, array) - out_shape = indexer.shape out_dtype =ds.dtype #stripped_indexer = [(a, b, c) for a,b,c in indexer] drop_axes = indexer.drop_axes and keepdims - return self._from_storage(ds, indexer, out_shape, out_dtype, missing, compressor, filters, drop_axes) + # we use array._chunks rather than ds.chunks, as the latter is none in the case of + # unchunked data, and we need to tell the storage the array dimensions in this case. + return self._from_storage(ds, indexer, array._chunks, out_shape, out_dtype, missing, compressor, filters, drop_axes) - def _from_storage(self, ds, indexer, out_shape, out_dtype, missing, compressor, filters, drop_axes): + def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, missing, compressor, filters, drop_axes): method = self.method if method is not None: @@ -312,7 +317,7 @@ def _from_storage(self, ds, indexer, out_shape, out_dtype, missing, compressor, for chunk_coords, chunk_selection, out_selection in indexer: future = executor.submit( self._process_chunk, - session, ds, chunk_coords, chunk_selection, + session, ds, chunks, chunk_coords, chunk_selection, counts, out_selection, missing, compressor, filters, drop_axes=drop_axes) futures.append(future) # Wait for completion. @@ -383,7 +388,7 @@ def _get_endpoint_url(self): return f"http://{urllib.parse.urlparse(self.filename).netloc}" - def _process_chunk(self, session, ds, chunk_coords, chunk_selection, counts, + def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, counts, out_selection, missing, compressor, filters, drop_axes=None): """ Obtain part or whole of a chunk. @@ -420,7 +425,7 @@ def _process_chunk(self, session, ds, chunk_coords, chunk_selection, counts, bucket, object, offset, size, compressor, filters, missing, ds.dtype, - ds.chunks, + chunks, ds.order, chunk_selection, operation=self._method) @@ -439,7 +444,7 @@ def _process_chunk(self, session, ds, chunk_coords, chunk_selection, counts, bucket, object, offset, size, compressor, filters, missing, ds.dtype, - ds.chunks, + chunks, ds.order, chunk_selection, operation=self._method) @@ -450,7 +455,7 @@ def _process_chunk(self, session, ds, chunk_coords, chunk_selection, counts, # although we will version changes. 
tmp, count = reduce_chunk(rfile, offset, size, compressor, filters, missing, ds.dtype, - ds.chunks, ds.order, + chunks, ds.order, chunk_selection, method=self.method) if self.method is not None: diff --git a/bnl/bnl_test.py b/bnl/bnl_test.py index 7354d1b3..c5bb3c3d 100644 --- a/bnl/bnl_test.py +++ b/bnl/bnl_test.py @@ -18,6 +18,7 @@ def mytest(): ncfile, v = "cesm2_native.nc","TREFHT" ncfile, v = "CMIP6-test.nc", 'tas' #ncfile, v = "chunked.hdf5", "dataset1" + ncfile, v = 'daily_data.nc', 'ta' mypath = Path(__file__).parent uri = str(mypath/ncfile) active = Active(uri, v, None) From b745d5e5c521d7eb3ffe1a3c795ce3d8495bd367 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 5 Mar 2024 12:43:45 +0000 Subject: [PATCH 08/85] Remove the file context manager for now --- activestorage/active.py | 35 +++++++++++--------- bnl/bnl_s3test.py | 72 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 16 deletions(-) create mode 100644 bnl/bnl_s3test.py diff --git a/activestorage/active.py b/activestorage/active.py index 3825e099..428a65cd 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -14,8 +14,6 @@ from activestorage.hdf2numcodec import decode_filters - -@contextlib.contextmanager def load_from_s3(uri, storage_options=None): """ Load a netCDF4-like object from S3. @@ -39,10 +37,11 @@ def load_from_s3(uri, storage_options=None): client_kwargs={'endpoint_url': S3_URL}) # eg "http://localhost:9000" for Minio else: fs = s3fs.S3FileSystem(**storage_options) # use passed-in dictionary - with fs.open(uri, 'rb') as s3file: - ds = pyfive.File(s3file, 'r', invalid_netcdf=True) - print(f"Dataset loaded from S3 via h5netcdf: {ds}") - yield ds + + s3file = fs.open(uri, 'rb') + ds = pyfive.File(s3file) + print(f"Dataset loaded from S3 via h5netcdf: {uri}") + return ds @@ -127,25 +126,28 @@ def __getitem__(self, index): ncvar = self.ncvar # in all casese we need an open netcdf file to get at attributes - # FIXME. 
We then need to monkey patch the "filename" as rfile onto the dataset + # we keep it open because we need it's b-tree if self.storage_type is None: nc = pyfive.File(self.uri) elif self.storage_type == "s3": nc = load_from_s3(self.uri, self.storage_options) + self.filename = self.uri + if self.method is None and self._version == 0: # No active operation if self.storage_type is None: data = nc[ncvar][index] - nc.close() + elif self.storage_type == "s3": data = nc[ncvar][index] data = self._mask_data(data, nc[ncvar]) - nc.close() - + + data.filename = self.uri return data + elif self._version == 1: return self._get_selection(nc[ncvar], index) @@ -402,12 +404,12 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou """ offset, size, filter_mask = ds.get_chunk_details(chunk_coords) - rfile = ds.fh.name + # S3: pass in pre-configured storage options (credentials) if self.storage_type == "s3": - print("S3 rfile is:", rfile) - parsed_url = urllib.parse.urlparse(rfile) + print("S3 rfile is:", self.filename) + parsed_url = urllib.parse.urlparse(self.filename) bucket = parsed_url.netloc object = parsed_url.path @@ -419,12 +421,13 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou print("S3 bucket:", bucket) print("S3 file:", object) if self.storage_options is None: + # for the moment we need to force ds.dtype to be a numpy type tmp, count = reductionist.reduce_chunk(session, S3_ACTIVE_STORAGE_URL, S3_URL, bucket, object, offset, size, compressor, filters, - missing, ds.dtype, + missing, np.dtype(ds.dtype), chunks, ds.order, chunk_selection, @@ -443,7 +446,7 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou self._get_endpoint_url(), bucket, object, offset, size, compressor, filters, - missing, ds.dtype, + missing, np.dtype(ds.dtype), chunks, ds.order, chunk_selection, @@ -453,7 +456,7 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou # see https://github.com/valeriupredoi/PyActiveStorage/issues/33 # so neither the returned data or the interface should be considered stable # although we will version changes. - tmp, count = reduce_chunk(rfile, offset, size, compressor, filters, + tmp, count = reduce_chunk(self.filename, offset, size, compressor, filters, missing, ds.dtype, chunks, ds.order, chunk_selection, method=self.method) diff --git a/bnl/bnl_s3test.py b/bnl/bnl_s3test.py new file mode 100644 index 00000000..dd7a72ba --- /dev/null +++ b/bnl/bnl_s3test.py @@ -0,0 +1,72 @@ + +from activestorage.active import Active +import os +from pathlib import Path +from netCDF4 import Dataset +import numpy as np +import pyfive +import s3fs + +from activestorage.active import load_from_s3 + +S3_BUCKET = "bnl" + +def simple(filename, var): + + S3_URL = 'https://uor-aces-o.s3-ext.jc.rl.ac.uk/' + fs = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': S3_URL}) + uri = 'bnl/'+filename + + with fs.open(uri,'rb') as s3file2: + f2 = pyfive.File(s3file2) + print(f2[var]) + + with fs.open(uri, 'rb') as s3file: + ds = pyfive.File(s3file) + print(ds[var]) + + #f2 = load_from_s3(uri, storage_options={'client_kwargs':{"endpoint_url":S3_URL}}) + +def test_compression_and_filters_cmip6_forced_s3_from_local_2(ncfile, var): + """ + Test use of datasets with compression and filters applied for a real + CMIP6 dataset (CMIP6-test.nc) - an IPSL file. 
+ + This is for a special anon=True bucket connected to via valid key.secret + """ + storage_options = { + 'key': "f2d55c6dcfc7618b2c34e00b58df3cef", + 'secret': "$/'#M{0{/4rVhp%n^(XeX$q@y#&(NM3W1->~N.Q6VP.5[@bLpi='nt]AfH)>78pT", + 'client_kwargs': {'endpoint_url': "https://uor-aces-o.s3-ext.jc.rl.ac.uk"} + } + active_storage_url = "https://192.171.169.248:8080" + + + mypath = Path(__file__).parent + uri = str(mypath/ncfile) + with Dataset(uri) as nc_data: + nc_min = np.min(nc_data[var][0:2,4:6,7:9]) + print(f"Numpy min from compressed file {nc_min}") + + ofile = os.path.basename(uri) + test_file_uri = os.path.join( + S3_BUCKET, + ofile + ) + print("S3 Test file path:", test_file_uri) + active = Active(test_file_uri, var, storage_type="s3", + storage_options=storage_options, + active_storage_url=active_storage_url) + + active._version = 1 + active._method = "min" + + result = active[0:2,4:6,7:9] + assert nc_min == result + assert result == 239.25946044921875 + + +if __name__=="__main__": + ncfile, var = 'CMIP6-test.nc','tas' + simple(ncfile, var) + test_compression_and_filters_cmip6_forced_s3_from_local_2(ncfile, var) \ No newline at end of file From 74f86620c821b0ba5b48a6611fa3f376be458745 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 5 Mar 2024 13:40:39 +0000 Subject: [PATCH 09/85] A couple more h5py lurkers and one more bug squashed. --- activestorage/active.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/activestorage/active.py b/activestorage/active.py index 428a65cd..c5b3c477 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -143,8 +143,7 @@ def __getitem__(self, index): data = nc[ncvar][index] data = self._mask_data(data, nc[ncvar]) - - data.filename = self.uri + return data From b628a68f1bd1dcf3f5e7839f1add03690805dc42 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Tue, 5 Mar 2024 14:46:46 +0000 Subject: [PATCH 10/85] removed redundant test file --- tests/unit/test_active_tools.py | 191 -------------------------------- 1 file changed, 191 deletions(-) delete mode 100644 tests/unit/test_active_tools.py diff --git a/tests/unit/test_active_tools.py b/tests/unit/test_active_tools.py deleted file mode 100644 index 2e1102f5..00000000 --- a/tests/unit/test_active_tools.py +++ /dev/null @@ -1,191 +0,0 @@ -import os -import numpy as np -import pytest -import zarr - -from activestorage import active_tools as at -from numcodecs import Blosc -from zarr.indexing import ( - OrthogonalIndexer, - is_contiguous_selection, -) -from zarr.util import is_total_slice - - -def assemble_zarr(): - """Create a test zarr object.""" - compressor = Blosc(cname='zstd', clevel=1, shuffle=Blosc.BITSHUFFLE) - z = zarr.create((10000, 10000), chunks=(1000, 1000), dtype='i1', order='C', - compressor=compressor) - - return z - - -def assemble_zarr_uncompressed(): - """Create a test zarr object.""" - z = zarr.create((1000, 1000), chunks=(2, 8), dtype='i1', order='C', - compressor=None) - - return z - - -def assemble_zarr_with_fillvalue(): - """Create a test zarr object.""" - compressor = Blosc(cname='zstd', clevel=1, shuffle=Blosc.BITSHUFFLE) - z = zarr.create((10000, 10000), chunks=(1000, 1000), dtype='i1', order='C', - compressor=compressor) - z._fill_value = -99 - - return z - - -def assemble_zarr_uncompressed_2(): - """Create a test zarr object.""" - z = zarr.create((100, 100), chunks=(10, 10), dtype='i1', order='C', - compressor=None) - - return z - - -def test_as_get_orthogonal_selection(): - """Test Zarr Orthogonal selection.""" - z = assemble_zarr() - 
z._cache_metadata = None - selection = (slice(0, 2, 1), slice(4, 6, 1)) - sel = at.as_get_orthogonal_selection(z, selection=selection, out=None, - fields=None) - expected_shape = (2, 2) - np.testing.assert_array_equal(sel.shape, expected_shape) - expected_elem = [0, 0] - np.testing.assert_array_equal(sel[0], expected_elem) - - -def test_as_get_selection(): - """Test chunk iterator.""" - z = assemble_zarr() - selection = (slice(0, 2, 1), slice(4, 6, 1)) - indexer = OrthogonalIndexer(selection, z) - ch = at.as_get_selection(z, indexer, out=None, - fields=None) - np.testing.assert_array_equal(ch[0][0], [0, 0]) - np.testing.assert_array_equal(ch[1][0], None) - np.testing.assert_array_equal(ch[2][0], (0, 0)) - - -def test_as_chunk_getitem(): - """Test chunk get item.""" - z = assemble_zarr() - z = at.make_an_array_instance_active(z) - chunk_coords = (0, 3) - chunk_selection = [slice(0, 2, 1)] - out = np.array((2, 2, 2)) - out_selection = slice(0, 2, 1) - ch = at.as_chunk_getitem(z, chunk_coords, chunk_selection, out, out_selection, - drop_axes=None, fields=None) - np.testing.assert_array_equal(ch[0], (1000, 1000)) - np.testing.assert_array_equal(ch[1], [slice(0, 2, 1)]) - # PCI - assert list(ch[2]) == [(0, 2000, (slice(0, 2, 1),))] - - -def test_as_chunk_getitem_with_fillvalue(): - """Test chunk get item.""" - z = assemble_zarr_with_fillvalue() - z = at.make_an_array_instance_active(z) - chunk_coords = (0, 3) - chunk_selection = [slice(0, 2, 1)] - out = np.array((2, 2, 2)) - out_selection = slice(0, 2, 1) - ch = at.as_chunk_getitem(z, chunk_coords, chunk_selection, out, out_selection, - drop_axes=None, fields=None) - np.testing.assert_array_equal(ch[0], (1000, 1000)) - np.testing.assert_array_equal(ch[1], [slice(0, 2, 1)]) - # PCI - assert list(ch[2]) == [(0, 2000, (slice(0, 2, 1),))] - - -def test_process_chunk_uncompressed(): - """Test for processing chunk engine for uncompressed data""" - z = assemble_zarr_uncompressed() - z = at.make_an_array_instance_active(z) - out = np.ones((1, 8)) - cdata = np.ones((1, 2)) - chunk_selection = slice(0, 1, 1) - out_selection = np.array((0)) - ch = at.as_process_chunk(z, - out, - cdata, - chunk_selection, - drop_axes=False, - out_is_ndarray=True, - fields=None, - out_selection=out_selection, - partial_read_decode=False) - - assert is_contiguous_selection(out_selection) - assert not is_total_slice(chunk_selection, z._chunks) - assert out.any() - assert out.shape == (1, 8) - - -def test_process_chunk_uncompressed_write_direct(): - """Test for processing chunk engine for uncompressed data""" - z = assemble_zarr_uncompressed_2() - z = at.make_an_array_instance_active(z) - out = np.ones((10, 10)) - cdata = np.ones((10, 10)) - chunk_selection = (slice(0, 10, 1), slice(10, 20, 1)) - out_selection = np.array((0)) - with pytest.raises(ValueError) as exc: - ch = at.as_process_chunk(z, - out, - cdata, - chunk_selection, - drop_axes=False, - out_is_ndarray=True, - fields=None, - out_selection=out_selection, - partial_read_decode=False) - assert str(exc.value) == "Chunk shape (10, 80) exceeds data chunks shape (10, 10)" - - assert is_contiguous_selection(out_selection) - assert is_total_slice(chunk_selection, z._chunks) - - z._chunks = (2, 8) - out = np.ones((1, 8)) - cdata = np.ones((1, 2)) - chunk_selection = slice(0, 8, 1) - out_selection = np.array((0)) - with pytest.raises(ValueError) as exc: - ch = at.as_process_chunk(z, - out, - cdata, - chunk_selection, - drop_axes=False, - out_is_ndarray=True, - fields=None, - out_selection=out_selection, - 
partial_read_decode=False) - assert str(exc.value) == "Storage chunk shape (2, 8) exceeds permitted output data shape (1, 8)." - - -def test_process_chunk_uncompressed_with_compressor(): - """Test for processing chunk engine for uncompressed data""" - z = assemble_zarr() - z = at.make_an_array_instance_active(z) - out = np.ones((1, 8)) - cdata = np.ones((1, 2)) - chunk_selection = slice(0, 1, 1) - out_selection = np.array((0)) - raised = f'cdata {cdata} is an ndarray, can not decompress.' - with pytest.raises(TypeError) as exc: - ch = at.as_process_chunk(z, - out, - cdata, - chunk_selection, - drop_axes=False, - out_is_ndarray=True, - fields=None, - out_selection=out_selection, - partial_read_decode=False) - assert str(exc.value) == raised From 1803e7f78ce5ca3f7ec5c1a825cc31b7fb8e5b76 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Tue, 5 Mar 2024 14:47:06 +0000 Subject: [PATCH 11/85] removed redundant test cases --- tests/unit/test_storage_types.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/tests/unit/test_storage_types.py b/tests/unit/test_storage_types.py index 561c36ff..4a3a4ccb 100644 --- a/tests/unit/test_storage_types.py +++ b/tests/unit/test_storage_types.py @@ -16,17 +16,13 @@ from activestorage.active import Active from activestorage.config import * from activestorage.dummy_data import make_vanilla_ncdata -from activestorage import netcdf_to_zarr import activestorage.reductionist import activestorage.storage -# Capture the real function before it is mocked. -old_netcdf_to_zarr = netcdf_to_zarr.load_netcdf_zarr_generic @mock.patch.object(activestorage.active, "load_from_s3") -@mock.patch.object(activestorage.netcdf_to_zarr, "load_netcdf_zarr_generic") @mock.patch.object(activestorage.active.reductionist, "reduce_chunk") def test_s3(mock_reduce, mock_nz, mock_load, tmp_path): """Test stack when call to Active contains storage_type == s3.""" @@ -39,9 +35,6 @@ def test_s3(mock_reduce, mock_nz, mock_load, tmp_path): def load_from_s3(uri): yield h5netcdf.File(test_file, 'r', invalid_netcdf=True) - def load_netcdf_zarr_generic(uri, ncvar, storage_type, storage_options=None): - return old_netcdf_to_zarr(test_file, ncvar, None, None) - def reduce_chunk( session, server, @@ -149,7 +142,6 @@ def test_s3_load_failure(mock_load): @mock.patch.object(activestorage.active, "load_from_s3") -@mock.patch.object(activestorage.netcdf_to_zarr, "load_netcdf_zarr_generic") @mock.patch.object(activestorage.active.reductionist, "reduce_chunk") def test_reductionist_connection(mock_reduce, mock_nz, mock_load, tmp_path): """Test stack when call to Active contains storage_type == s3.""" @@ -158,11 +150,7 @@ def test_reductionist_connection(mock_reduce, mock_nz, mock_load, tmp_path): def load_from_s3(uri): yield h5netcdf.File(test_file, 'r', invalid_netcdf=True) - def load_netcdf_zarr_generic(uri, ncvar, storage_type, storage_options=None): - return old_netcdf_to_zarr(test_file, ncvar, None, None) - mock_load.side_effect = load_from_s3 - mock_nz.side_effect = load_netcdf_zarr_generic mock_reduce.side_effect = requests.exceptions.ConnectTimeout() uri = "s3://fake-bucket/fake-object" @@ -178,7 +166,6 @@ def load_netcdf_zarr_generic(uri, ncvar, storage_type, storage_options=None): @mock.patch.object(activestorage.active, "load_from_s3") -@mock.patch.object(activestorage.netcdf_to_zarr, "load_netcdf_zarr_generic") @mock.patch.object(activestorage.active.reductionist, "reduce_chunk") def test_reductionist_bad_request(mock_reduce, mock_nz, mock_load, tmp_path): """Test stack when call to 
Active contains storage_type == s3.""" @@ -187,9 +174,6 @@ def test_reductionist_bad_request(mock_reduce, mock_nz, mock_load, tmp_path): def load_from_s3(uri): yield h5netcdf.File(test_file, 'r', invalid_netcdf=True) - def load_netcdf_zarr_generic(uri, ncvar, storage_type, storage_options=None): - return old_netcdf_to_zarr(test_file, ncvar, None, None) - mock_load.side_effect = load_from_s3 mock_nz.side_effect = load_netcdf_zarr_generic mock_reduce.side_effect = activestorage.reductionist.ReductionistError(400, "Bad request") From f5a375c5f385d8cf57d1851b9ba5b51640ab0133 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Tue, 5 Mar 2024 14:47:24 +0000 Subject: [PATCH 12/85] making sure we know Pyfive is run haha --- activestorage/active.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/activestorage/active.py b/activestorage/active.py index c5b3c477..b37ba932 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -40,7 +40,7 @@ def load_from_s3(uri, storage_options=None): s3file = fs.open(uri, 'rb') ds = pyfive.File(s3file) - print(f"Dataset loaded from S3 via h5netcdf: {uri}") + print(f"Dataset loaded from S3 via h5netcdf: {uri} with Pyfive File handler") return ds From d1c928230a20b38d723e34b0dd5d61f647522a91 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Tue, 5 Mar 2024 15:01:33 +0000 Subject: [PATCH 13/85] fix bigger data test with native pyfive exception --- tests/test_bigger_data.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/tests/test_bigger_data.py b/tests/test_bigger_data.py index 2f38bddb..2173558d 100644 --- a/tests/test_bigger_data.py +++ b/tests/test_bigger_data.py @@ -9,6 +9,7 @@ from activestorage.active import Active from activestorage.config import * +from pyfive.core import InvalidHDF5File as InvalidHDF5Err import utils @@ -162,28 +163,17 @@ def test_native_emac_model_fails(test_data_path): """ ncfile = str(test_data_path / "emac.nc") uri = utils.write_to_storage(ncfile) - # Use local file to avoid h5py - active = Active(ncfile, "aps_ave") - active._version = 0 - d = active[4:5, 1:2] - if len(d): - mean_result = np.mean(d) - else: - # as it happens it is is possible for a slice to be - # all missing, so for the purpose of this test we - # ignore it, but the general case should not. - pass if USE_S3: active = Active(uri, "aps_ave", utils.get_storage_type()) - with pytest.raises(OSError): + with pytest.raises(InvalidHDF5Err): active[...] 
else: active = Active(uri, "aps_ave") active._version = 2 active.method = "mean" active.components = True - with pytest.raises(OSError): + with pytest.raises(InvalidHDF5Err): result2 = active[4:5, 1:2] From 69c3dcacbba22759e3714ce81b9d40a93a7533c5 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Tue, 5 Mar 2024 16:26:37 +0000 Subject: [PATCH 14/85] fix package test --- tests/test_package.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_package.py b/tests/test_package.py index 3ee2f41c..e6f00c37 100644 --- a/tests/test_package.py +++ b/tests/test_package.py @@ -6,7 +6,7 @@ # test version def test_version(): assert hasattr(activestorage, "__version__") - assert activestorage.__version__ == "0.0.1" + assert activestorage.__version__ == "0.0.2" print(activestorage.__version__) # check activestorage class @@ -14,7 +14,6 @@ def test_activestorage_class_attrs(): assert hasattr(activestorage, "Active") assert hasattr(activestorage, "active") assert hasattr(activestorage, "storage") - assert hasattr(activestorage, "netcdf_to_zarr") # check Active class def test_active_class_attrs(): @@ -25,7 +24,6 @@ def test_active_class_attrs(): assert hasattr(act, "_get_active") assert hasattr(act, "_get_selection") assert hasattr(act, "_process_chunk") - assert hasattr(act, "_via_kerchunk") assert hasattr(act, "components") assert hasattr(act, "method") assert hasattr(act, "ncvar") From c50c3d2edc0d4b4f010b8b36e47747761a73da94 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Tue, 5 Mar 2024 16:32:05 +0000 Subject: [PATCH 15/85] mark lock test as xfail --- tests/unit/test_active.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/test_active.py b/tests/unit/test_active.py index 2f95fdee..25f988cc 100644 --- a/tests/unit/test_active.py +++ b/tests/unit/test_active.py @@ -81,6 +81,7 @@ def test_active(): init = active.__init__(uri=uri, ncvar=ncvar) +@pytest.mark.xfail(reason="We don't employ locks with Pyfive anymore, yet.") def test_lock(): """Unit test for class:Active.""" uri = "tests/test_data/cesm2_native.nc" From 09b4b3fa3124988150774bdf8eb04977f466dc68 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 5 Mar 2024 16:47:13 +0000 Subject: [PATCH 16/85] toy for testing missing test --- bnl/test_missing_gubbins.py | 145 ++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 bnl/test_missing_gubbins.py diff --git a/bnl/test_missing_gubbins.py b/bnl/test_missing_gubbins.py new file mode 100644 index 00000000..38fdcaec --- /dev/null +++ b/bnl/test_missing_gubbins.py @@ -0,0 +1,145 @@ + +from activestorage import dummy_data as dd +from activestorage import Active +import pyfive +import numpy.ma as ma +import numpy as np +import os +from activestorage.config import * +from pathlib import Path +import s3fs + + +def get_masked_data(ds): + """ This is buried in active itself we should pull it out for wider usage in our testing etc""" + attrs = ds.attrs + missing_value = attrs.get('missing_value') + _FillValue = attrs.get('_FillValue') + valid_min = attrs.get('valid_min') + valid_max = attrs.get('valid_max') + valid_range = attrs.get('valid_range') + + if valid_max is not None or valid_min is not None: + if valid_range is not None: + raise ValueError( + "Invalid combination in the file of valid_min, " + "valid_max, valid_range: " + f"{valid_min}, {valid_max}, {valid_range}" + ) + elif valid_range is not None: + valid_min, valid_max = valid_range + + data = ds[:] + + if _FillValue is not None: + data = np.ma.masked_equal(data, 
_FillValue) + + if missing_value is not None: + data = np.ma.masked_equal(data, missing_value) + + if valid_max is not None: + data = np.ma.masked_greater(data, valid_max) + + if valid_min is not None: + data = np.ma.masked_less(data, valid_min) + + return data + + + + +def upload_to_s3(server, username, password, bucket, object, rfile): + """Upload a file to an S3 object store.""" + s3_fs = s3fs.S3FileSystem(key=username, secret=password, client_kwargs={'endpoint_url': server}) + # Make sure s3 bucket exists + try: + s3_fs.mkdir(bucket) + except FileExistsError: + pass + + s3_fs.put_file(rfile, os.path.join(bucket, object)) + + +def get_storage_type(): + if USE_S3: + return "s3" + else: + return None + +def write_to_storage(ncfile): + """Write a file to storage and return an appropriate URI or path to access it.""" + if USE_S3: + object = os.path.basename(ncfile) + upload_to_s3(S3_URL, S3_ACCESS_KEY, S3_SECRET_KEY, S3_BUCKET, object, ncfile) + return os.path.join("s3://", S3_BUCKET, object) + else: + return ncfile + + +def active_zero(testfile): + """Run Active with no active storage (version=0).""" + active = Active(testfile, "data", get_storage_type()) + active._version = 0 + d = active[0:2, 4:6, 7:9] + assert ma.is_masked(d) + + # FIXME: For the S3 backend, h5netcdf is used to read the metadata. It does + # not seem to load the missing data attributes (missing_value, _FillValue, + # valid_min, valid_max, valid_range, etc). + assert ma.is_masked(d) + + return np.mean(d) + + +def active_two(testfile): + """Run Active with active storage (version=2).""" + active = Active(testfile, "data", get_storage_type()) + active._version = 2 + active.method = "mean" + active.components = True + result2 = active[0:2, 4:6, 7:9] + + active_mean = result2["sum"] / result2["n"] + + return active_mean + + +def load_dataset(testfile): + """Load data as netCDF4.Dataset.""" + ds = pyfive.File(testfile) + actual_data = get_masked_data(ds["data"]) + + ds.close() + + assert ma.is_masked(actual_data) + + return actual_data + + +def test_partially_missing_data(tmp_path): + testfile = str(tmp_path / 'test_partially_missing_data.nc') + r = dd.make_partially_missing_ncdata(testfile) + + # retrieve the actual numpy-ed result + actual_data = load_dataset(testfile) + unmasked_numpy_mean = actual_data[0:2, 4:6, 7:9].data.mean() + masked_numpy_mean = actual_data[0:2, 4:6, 7:9].mean() + assert unmasked_numpy_mean != masked_numpy_mean + print("Numpy masked result (mean)", masked_numpy_mean) + + # write file to storage + testfile = write_to_storage(testfile) + + # numpy masked to check for correct Active behaviour + no_active_mean = active_zero(testfile) + print("No active storage result (mean)", no_active_mean) + + active_mean = active_two(testfile) + print("Active storage result (mean)", active_mean) + + np.testing.assert_array_equal(masked_numpy_mean, active_mean) + np.testing.assert_array_equal(no_active_mean, active_mean) + + +mypath = Path(__file__).parent +test_partially_missing_data(mypath) \ No newline at end of file From 8a7e35bde1ad3fe9281a9bc4f85cd6d40c00a36f Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Tue, 5 Mar 2024 17:05:38 +0000 Subject: [PATCH 17/85] turn off some printing for now --- activestorage/active.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/activestorage/active.py b/activestorage/active.py index b37ba932..9be07251 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -407,7 +407,7 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, 
chunk_selection, cou # S3: pass in pre-configured storage options (credentials) if self.storage_type == "s3": - print("S3 rfile is:", self.filename) + # print("S3 rfile is:", self.filename) parsed_url = urllib.parse.urlparse(self.filename) bucket = parsed_url.netloc object = parsed_url.path @@ -417,8 +417,8 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou if bucket == "": bucket = os.path.dirname(object) object = os.path.basename(object) - print("S3 bucket:", bucket) - print("S3 file:", object) + # print("S3 bucket:", bucket) + # print("S3 file:", object) if self.storage_options is None: # for the moment we need to force ds.dtype to be a numpy type tmp, count = reductionist.reduce_chunk(session, From cacb94e442fe591be3e860cf0629c324be821721 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Tue, 5 Mar 2024 17:08:43 +0000 Subject: [PATCH 18/85] starting to understand the test flakiness --- tests/unit/test_storage_types.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/tests/unit/test_storage_types.py b/tests/unit/test_storage_types.py index 4a3a4ccb..30c82b87 100644 --- a/tests/unit/test_storage_types.py +++ b/tests/unit/test_storage_types.py @@ -7,6 +7,7 @@ import os import h5netcdf import numpy as np +import pyfive import pytest import requests.exceptions from unittest import mock @@ -24,16 +25,15 @@ @mock.patch.object(activestorage.active, "load_from_s3") @mock.patch.object(activestorage.active.reductionist, "reduce_chunk") -def test_s3(mock_reduce, mock_nz, mock_load, tmp_path): +def test_s3(mock_reduce, mock_load, tmp_path): """Test stack when call to Active contains storage_type == s3.""" # Since this is a unit test, we can't assume that an S3 object store or # active storage server is available. Therefore, we mock out the remote # service interaction and replace with local file operations. - @contextlib.contextmanager - def load_from_s3(uri): - yield h5netcdf.File(test_file, 'r', invalid_netcdf=True) + def load_from_s3(uri, storage_options=None): + return pyfive.File(test_file) def reduce_chunk( session, @@ -67,7 +67,6 @@ def reduce_chunk( ) mock_load.side_effect = load_from_s3 - mock_nz.side_effect = load_netcdf_zarr_generic mock_reduce.side_effect = reduce_chunk uri = "s3://fake-bucket/fake-object" @@ -78,6 +77,10 @@ def reduce_chunk( active._version = 1 active._method = "max" + print("This test has severe flakiness:") + print("Either fails with AssestionError - bTREE stuff,") + print("or it fails with a multitude of KeyErrors.") + print(active) result = active[::] assert result == 999.0 @@ -85,7 +88,6 @@ def reduce_chunk( # S3 loading is not done from Active anymore mock_load.assert_not_called() - mock_nz.assert_called_once_with(uri, "data", "s3", None) # NOTE: This gets called multiple times with various arguments. Match on # the common ones. 
mock_reduce.assert_called_with( @@ -143,11 +145,11 @@ def test_s3_load_failure(mock_load): @mock.patch.object(activestorage.active, "load_from_s3") @mock.patch.object(activestorage.active.reductionist, "reduce_chunk") -def test_reductionist_connection(mock_reduce, mock_nz, mock_load, tmp_path): +def test_reductionist_connection(mock_reduce, mock_load, tmp_path): """Test stack when call to Active contains storage_type == s3.""" @contextlib.contextmanager - def load_from_s3(uri): + def load_from_s3(uri, storage_options=None): yield h5netcdf.File(test_file, 'r', invalid_netcdf=True) mock_load.side_effect = load_from_s3 @@ -167,15 +169,14 @@ def load_from_s3(uri): @mock.patch.object(activestorage.active, "load_from_s3") @mock.patch.object(activestorage.active.reductionist, "reduce_chunk") -def test_reductionist_bad_request(mock_reduce, mock_nz, mock_load, tmp_path): +def test_reductionist_bad_request(mock_reduce, mock_load, tmp_path): """Test stack when call to Active contains storage_type == s3.""" @contextlib.contextmanager - def load_from_s3(uri): + def load_from_s3(uri, storage_options=None): yield h5netcdf.File(test_file, 'r', invalid_netcdf=True) mock_load.side_effect = load_from_s3 - mock_nz.side_effect = load_netcdf_zarr_generic mock_reduce.side_effect = activestorage.reductionist.ReductionistError(400, "Bad request") uri = "s3://fake-bucket/fake-object" From ce0479c135651d27f621d9cb39a96d1846c3eb14 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 5 Mar 2024 20:02:16 +0000 Subject: [PATCH 19/85] Addresses issues with missing data and with thread safety (basically we need to read the chunk cache before sending things off into the weeds). --- activestorage/active.py | 119 +++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 70 deletions(-) diff --git a/activestorage/active.py b/activestorage/active.py index b37ba932..6a0ab3b4 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -44,7 +44,6 @@ def load_from_s3(uri, storage_options=None): return ds - class Active: """ Instantiates an interface to active storage which contains either zarr files @@ -114,6 +113,7 @@ def __init__( self._components = False self._method = None self._max_threads = max_threads + self.missing = None def __getitem__(self, index): """ @@ -125,36 +125,50 @@ def __getitem__(self, index): # through to the default method. 
ncvar = self.ncvar - # in all casese we need an open netcdf file to get at attributes + # in all cases we need an open netcdf file to get at attributes # we keep it open because we need it's b-tree if self.storage_type is None: nc = pyfive.File(self.uri) elif self.storage_type == "s3": nc = load_from_s3(self.uri, self.storage_options) self.filename = self.uri - - if self.method is None and self._version == 0: - # No active operation - if self.storage_type is None: - data = nc[ncvar][index] - - elif self.storage_type == "s3": + ds = nc[ncvar] + # Get missing values - data = nc[ncvar][index] - data = self._mask_data(data, nc[ncvar]) - - return data + _FillValue = ds.attrs.get('_FillValue') + missing_value = ds.attrs.get('missing_value') + valid_min = ds.attrs.get('valid_min') + valid_max = ds.attrs.get('valid_max') + valid_range = ds.attrs.get('valid_range') + if valid_max is not None or valid_min is not None: + if valid_range is not None: + raise ValueError( + "Invalid combination in the file of valid_min, " + "valid_max, valid_range: " + f"{valid_min}, {valid_max}, {valid_range}" + ) + elif valid_range is not None: + valid_min, valid_max = valid_range + self.missing = _FillValue, missing_value, valid_min, valid_max, + if self.method is None and self._version == 0: + + # No active operation + data = ds[index] + data = self._mask_data(data) + return data + elif self._version == 1: - return self._get_selection(nc[ncvar], index) + + #FIXME: is the difference between version 1 and 2 still honoured? + return self._get_selection(ds, index) elif self._version == 2: - data = self._get_selection(nc[ncvar], index) - return data - + return self._get_selection(ds, index) + else: raise ValueError(f'Version {self._version} not supported') @@ -213,8 +227,6 @@ def ncvar(self, value): self._ncvar = value - - def _get_active(self, method, *args): """ *args defines a slice of data. This method loops over each of the chunks @@ -224,7 +236,6 @@ def _get_active(self, method, *args): an array returned via getitem. """ raise NotImplementedError - def _get_selection(self, ds, *args): @@ -237,29 +248,6 @@ def _get_selection(self, ds, *args): # stick this here for later, to discuss with David keepdims = True - # Get missing values - _FillValue = ds.attrs.get('_FillValue') - missing_value = ds.attrs.get('missing_value') - valid_min = ds.attrs.get('valid_min') - valid_max = ds.attrs.get('valid_max') - valid_range = ds.attrs.get('valid_range') - if valid_max is not None or valid_min is not None: - if valid_range is not None: - raise ValueError( - "Invalid combination in the file of valid_min, " - "valid_max, valid_range: " - f"{valid_min}, {valid_max}, {valid_range}" - ) - elif valid_range is not None: - valid_min, valid_max = valid_range - - missing = ( - _FillValue, - missing_value, - valid_min, - valid_max, - ) - name = ds.name dtype = np.dtype(ds.dtype) # hopefully fix pyfive to get a dtype directly @@ -279,9 +267,9 @@ def _get_selection(self, ds, *args): # we use array._chunks rather than ds.chunks, as the latter is none in the case of # unchunked data, and we need to tell the storage the array dimensions in this case. 
- return self._from_storage(ds, indexer, array._chunks, out_shape, out_dtype, missing, compressor, filters, drop_axes) + return self._from_storage(ds, indexer, array._chunks, out_shape, out_dtype, compressor, filters, drop_axes) - def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, missing, compressor, filters, drop_axes): + def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, compressor, filters, drop_axes): method = self.method if method is not None: @@ -312,6 +300,10 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, missing, comp session = None # Process storage chunks using a thread pool. + # Because we do this, we need to read the dataset b-tree now, not as we go, so + # it is already in cache. If we remove the thread pool from here, we probably + # wouldn't need to do it before the first one. + ds._get_chunk_addresses() with concurrent.futures.ThreadPoolExecutor(max_workers=self._max_threads) as executor: futures = [] # Submit chunks for processing. @@ -319,7 +311,7 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, missing, comp future = executor.submit( self._process_chunk, session, ds, chunks, chunk_coords, chunk_selection, - counts, out_selection, missing, compressor, filters, drop_axes=drop_axes) + counts, out_selection, compressor, filters, drop_axes=drop_axes) futures.append(future) # Wait for completion. for future in concurrent.futures.as_completed(futures): @@ -390,7 +382,7 @@ def _get_endpoint_url(self): return f"http://{urllib.parse.urlparse(self.filename).netloc}" def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, counts, - out_selection, missing, compressor, filters, drop_axes=None): + out_selection, compressor, filters, drop_axes=None): """ Obtain part or whole of a chunk. @@ -404,7 +396,6 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou offset, size, filter_mask = ds.get_chunk_details(chunk_coords) - # S3: pass in pre-configured storage options (credentials) if self.storage_type == "s3": print("S3 rfile is:", self.filename) @@ -426,7 +417,7 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou S3_URL, bucket, object, offset, size, compressor, filters, - missing, np.dtype(ds.dtype), + self.missing, np.dtype(ds.dtype), chunks, ds.order, chunk_selection, @@ -445,7 +436,7 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou self._get_endpoint_url(), bucket, object, offset, size, compressor, filters, - missing, np.dtype(ds.dtype), + self.missing, np.dtype(ds.dtype), chunks, ds.order, chunk_selection, @@ -456,7 +447,7 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou # so neither the returned data or the interface should be considered stable # although we will version changes. tmp, count = reduce_chunk(self.filename, offset, size, compressor, filters, - missing, ds.dtype, + self.missing, ds.dtype, chunks, ds.order, chunk_selection, method=self.method) @@ -467,25 +458,13 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou tmp = np.squeeze(tmp, axis=drop_axes) return tmp, out_selection - def _mask_data(self, data, ds_var): - """ppp""" - # TODO: replace with cfdm.NetCDFIndexer, hopefully. 
- attrs = ds_var.attrs - missing_value = attrs.get('missing_value') - _FillValue = attrs.get('_FillValue') - valid_min = attrs.get('valid_min') - valid_max = attrs.get('valid_max') - valid_range = attrs.get('valid_range') - - if valid_max is not None or valid_min is not None: - if valid_range is not None: - raise ValueError( - "Invalid combination in the file of valid_min, " - "valid_max, valid_range: " - f"{valid_min}, {valid_max}, {valid_range}" - ) - elif valid_range is not None: - valid_min, valid_max = valid_range + def _mask_data(self, data): + """ + Missing values obtained at initial getitem, and are used here to + mask data, if necessary + """ + + _FillValue, missing_value, valid_min, valid_max = self.missing if _FillValue is not None: data = np.ma.masked_equal(data, _FillValue) From 02d6120dc99ffca2bdff9f4fbbdc379100d4af49 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 6 Mar 2024 08:25:43 +0000 Subject: [PATCH 20/85] Fixed the regression wrt files with no chunking introduced by thread safety. --- activestorage/active.py | 7 ++++--- bnl/bnl_test.py | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/activestorage/active.py b/activestorage/active.py index 4e730f0f..90b889f9 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -261,13 +261,12 @@ def _get_selection(self, ds, *args): indexer = pyfive.OrthogonalIndexer(*args, array) out_shape = indexer.shape - out_dtype =ds.dtype #stripped_indexer = [(a, b, c) for a,b,c in indexer] drop_axes = indexer.drop_axes and keepdims # we use array._chunks rather than ds.chunks, as the latter is none in the case of # unchunked data, and we need to tell the storage the array dimensions in this case. - return self._from_storage(ds, indexer, array._chunks, out_shape, out_dtype, compressor, filters, drop_axes) + return self._from_storage(ds, indexer, array._chunks, out_shape, dtype, compressor, filters, drop_axes) def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, compressor, filters, drop_axes): method = self.method @@ -303,7 +302,9 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, compressor, f # Because we do this, we need to read the dataset b-tree now, not as we go, so # it is already in cache. If we remove the thread pool from here, we probably # wouldn't need to do it before the first one. - ds._get_chunk_addresses() + + if ds.chunks is not None: + ds._get_chunk_addresses() with concurrent.futures.ThreadPoolExecutor(max_workers=self._max_threads) as executor: futures = [] # Submit chunks for processing. diff --git a/bnl/bnl_test.py b/bnl/bnl_test.py index c5bb3c3d..ae8e4a32 100644 --- a/bnl/bnl_test.py +++ b/bnl/bnl_test.py @@ -28,7 +28,6 @@ def mytest(): else: d = active[4:5, 1:2] mean_result = np.mean(d) - active = Active(uri, v, None) active._version = 2 active.method = "mean" From 2ffd644b9e1ca055e78d0c79c67f92de4e686b60 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 6 Mar 2024 09:26:31 +0000 Subject: [PATCH 21/85] Made some changes to Active just so we can use masked data as a standalone method which I don't like. We would be better to pull that out and do this differently. I also fixed the disgusting test ... 
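
The standalone use of masked data referred to here is the pattern the reworked test below exercises: read the raw values with pyfive, then let an Active instance supply the missing-value attributes and apply the mask. A minimal sketch of that pattern (the file path is a placeholder; Active, pyfive.File and _mask_data are used exactly as in the check_masking helper in the test diff that follows):

    import pyfive
    from activestorage.active import Active

    testfile = "test_partially_missing_data.nc"   # placeholder path

    # read the raw, unmasked values with pyfive
    ds = pyfive.File(testfile)
    raw = ds["data"][:]
    ds.close()

    # a fresh Active instance lazily picks up _FillValue/missing_value/valid_*
    # from the same file and applies them to the already-read array
    active = Active(testfile, "data")
    masked = active._mask_data(raw)

Nothing in this pattern needs the rest of Active beyond its missing-attribute handling, which is presumably why pulling it out into its own helper is suggested above.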
--- activestorage/active.py | 69 +++++++++++++++++++++++++---------------- tests/test_missing.py | 57 ++++++++++------------------------ 2 files changed, 58 insertions(+), 68 deletions(-) diff --git a/activestorage/active.py b/activestorage/active.py index 90b889f9..c5e9dc8f 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -114,17 +114,11 @@ def __init__( self._method = None self._max_threads = max_threads self.missing = None + self.ds = None - def __getitem__(self, index): - """ - Provides support for a standard get item. - #FIXME-BNL: Why is the argument index? - """ - # In version one this is done by explicitly looping over each chunk in the file - # and returning the requested slice ourselves. In version 2, we can pass this - # through to the default method. + def __load_nc_file(self): + """ Get the netcdf file and it's b-tree""" ncvar = self.ncvar - # in all cases we need an open netcdf file to get at attributes # we keep it open because we need it's b-tree if self.storage_type is None: @@ -133,14 +127,21 @@ def __getitem__(self, index): nc = load_from_s3(self.uri, self.storage_options) self.filename = self.uri - ds = nc[ncvar] - # Get missing values - - _FillValue = ds.attrs.get('_FillValue') - missing_value = ds.attrs.get('missing_value') - valid_min = ds.attrs.get('valid_min') - valid_max = ds.attrs.get('valid_max') - valid_range = ds.attrs.get('valid_range') + self.ds = nc[ncvar] + + def __get_missing_attributes(self): + """" + Load all the missing attributes we need from a netcdf file + """ + + if self.ds is None: + self.__load_nc_file() + + _FillValue = self.ds.attrs.get('_FillValue') + missing_value = self.ds.attrs.get('missing_value') + valid_min = self.ds.attrs.get('valid_min') + valid_max = self.ds.attrs.get('valid_max') + valid_range = self.ds.attrs.get('valid_range') if valid_max is not None or valid_min is not None: if valid_range is not None: raise ValueError( @@ -151,23 +152,36 @@ def __getitem__(self, index): elif valid_range is not None: valid_min, valid_max = valid_range - self.missing = _FillValue, missing_value, valid_min, valid_max, + return _FillValue, missing_value, valid_min, valid_max + + def __getitem__(self, index): + """ + Provides support for a standard get item. + #FIXME-BNL: Why is the argument index? + """ + if self.ds is None: + self.__load_nc_file() + # In version one this is done by explicitly looping over each chunk in the file + # and returning the requested slice ourselves. In version 2, we can pass this + # through to the default method. + + self.missing = self.__get_missing_attributes() if self.method is None and self._version == 0: # No active operation - data = ds[index] + data = self.ds[index] data = self._mask_data(data) return data elif self._version == 1: #FIXME: is the difference between version 1 and 2 still honoured? - return self._get_selection(ds, index) + return self._get_selection(index) elif self._version == 2: - return self._get_selection(ds, index) + return self._get_selection(index) else: raise ValueError(f'Version {self._version} not supported') @@ -238,7 +252,7 @@ def _get_active(self, method, *args): raise NotImplementedError - def _get_selection(self, ds, *args): + def _get_selection(self, *args): """ At this point we have a Dataset object, but all the important information about how to use it is in the attribute DataoobjectDataset class. 
Here we gather @@ -248,11 +262,11 @@ def _get_selection(self, ds, *args): # stick this here for later, to discuss with David keepdims = True - name = ds.name - dtype = np.dtype(ds.dtype) + name = self.ds.name + dtype = np.dtype(self.ds.dtype) # hopefully fix pyfive to get a dtype directly - array = pyfive.ZarrArrayStub(ds.shape, ds.chunks) - ds = ds._dataobjects + array = pyfive.ZarrArrayStub(self.ds.shape, self.ds.chunks) + ds = self.ds._dataobjects if ds.filter_pipeline is None: compressor, filters = None, None @@ -464,7 +478,8 @@ def _mask_data(self, data): Missing values obtained at initial getitem, and are used here to mask data, if necessary """ - + if self.missing is None: + self.missing = self.__get_missing_attributes() _FillValue, missing_value, valid_min, valid_max = self.missing if _FillValue is not None: diff --git a/tests/test_missing.py b/tests/test_missing.py index 722358af..4edc674a 100644 --- a/tests/test_missing.py +++ b/tests/test_missing.py @@ -7,7 +7,7 @@ import tempfile import unittest -import h5netcdf +import pyfive from netCDF4 import Dataset @@ -247,58 +247,33 @@ def test_validrange(tmp_path): def test_active_mask_data(tmp_path): testfile = str(tmp_path / 'test_partially_missing_data.nc') - # with valid min - r = dd.make_validmin_ncdata(testfile, valid_min=500) - - # retrieve the actual numpy-ed result - actual_data = load_dataset(testfile) + def check_masking(testfile, testname): - # dataset variable - ds = h5netcdf.File(testfile, 'r', invalid_netcdf=True) - dsvar = ds["data"] + valid_masked_data = load_dataset(testfile) + ds = pyfive.File(testfile) + dsvar = ds["data"] + dsdata = dsvar[:] + ds.close() + a = Active(testfile, "data") + data = a._mask_data(dsdata) + np.testing.assert_array_equal(data, valid_masked_data,f'Failed masking for {testname}') - # test the function - data = Active._mask_data(None, actual_data, dsvar) - ds.close() + # with valid min + r = dd.make_validmin_ncdata(testfile, valid_min=500) + check_masking(testfile, "valid min") # with valid range r = dd.make_validrange_ncdata(testfile, valid_range=[750., 850.]) + check_masking(testfile, "valid range") # retrieve the actual numpy-ed result actual_data = load_dataset(testfile) - # dataset variable - ds = h5netcdf.File(testfile, 'r', invalid_netcdf=True) - dsvar = ds["data"] - - # test the function - data = Active._mask_data(None, actual_data, dsvar) - ds.close() - # with missing r = dd.make_missing_ncdata(testfile) - - # retrieve the actual numpy-ed result - actual_data = load_dataset(testfile) - - # dataset variable - ds = h5netcdf.File(testfile, 'r', invalid_netcdf=True) - dsvar = ds["data"] - - # test the function - data = Active._mask_data(None, actual_data, dsvar) - ds.close() + check_masking(testfile,'missing') # with _FillValue r = dd.make_fillvalue_ncdata(testfile) + check_masking(testfile,"_FillValue") - # retrieve the actual numpy-ed result - actual_data = load_dataset(testfile) - - # dataset variable - ds = h5netcdf.File(testfile, 'r', invalid_netcdf=True) - dsvar = ds["data"] - - # test the function - data = Active._mask_data(None, actual_data, dsvar) - ds.close() From 408558f73858fb626fa3c0627a2c6417112d0d88 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Wed, 6 Mar 2024 13:28:55 +0000 Subject: [PATCH 22/85] fix test compression --- tests/test_compression.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_compression.py b/tests/test_compression.py index 1cba14b1..86982de9 100644 --- a/tests/test_compression.py +++ b/tests/test_compression.py @@ -91,6 +91,10 
@@ def test_compression_and_filters_cmip6_data(storage_options, active_storage_url) check_dataset_filters(test_file, "tas", "zlib", False) + print("Test file and storage options", test_file, storage_options) + if not utils.get_storage_type(): + storage_options = None + active_storage_url = None active = Active(test_file, 'tas', utils.get_storage_type(), storage_options=storage_options, active_storage_url=active_storage_url) @@ -117,6 +121,10 @@ def test_compression_and_filters_obs4mips_data(storage_options, active_storage_u check_dataset_filters(test_file, "rlut", "zlib", False) + print("Test file and storage options", test_file, storage_options) + if not utils.get_storage_type(): + storage_options = None + active_storage_url = None active = Active(test_file, 'rlut', utils.get_storage_type(), storage_options=storage_options, active_storage_url=active_storage_url) From 3294052033dba987ba45aa37d3d17d1d136ce7bb Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Wed, 6 Mar 2024 13:32:45 +0000 Subject: [PATCH 23/85] fix the other compression test --- tests/test_compression_remote_reductionist.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_compression_remote_reductionist.py b/tests/test_compression_remote_reductionist.py index 34e7bfaa..14eb722d 100644 --- a/tests/test_compression_remote_reductionist.py +++ b/tests/test_compression_remote_reductionist.py @@ -55,6 +55,10 @@ def test_compression_and_filters_cmip6_data(storage_options, active_storage_url) test_file_uri = os.path.join(S3_BUCKET, ofile) else: test_file_uri = test_file + print("Test file and storage options", test_file, storage_options) + if not utils.get_storage_type(): + storage_options = None + active_storage_url = None active = Active(test_file_uri, 'tas', utils.get_storage_type(), storage_options=storage_options, active_storage_url=active_storage_url) From ca30bb22f49a86f4f8539e1f083c68234c3e04c5 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Wed, 6 Mar 2024 13:45:13 +0000 Subject: [PATCH 24/85] fix mocker tests --- tests/unit/test_storage_types.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/tests/unit/test_storage_types.py b/tests/unit/test_storage_types.py index 30c82b87..764c138e 100644 --- a/tests/unit/test_storage_types.py +++ b/tests/unit/test_storage_types.py @@ -3,9 +3,7 @@ # interaction and replace with local file operations. import botocore -import contextlib import os -import h5netcdf import numpy as np import pyfive import pytest @@ -85,8 +83,9 @@ def reduce_chunk( assert result == 999.0 - # S3 loading is not done from Active anymore - mock_load.assert_not_called() + # S3 loading is done from Active + # but we should delegate that outside at some point + # mock_load.assert_not_called() # NOTE: This gets called multiple times with various arguments. Match on # the common ones. 
@@ -113,9 +112,8 @@ def reduce_chunk( def test_reductionist_version_0(mock_load, tmp_path): """Test stack when call to Active contains storage_type == s3 using version 0.""" - @contextlib.contextmanager def load_from_s3(uri, storage_options=None): - yield h5netcdf.File(test_file, 'r', invalid_netcdf=True) + return pyfive.File(test_file) mock_load.side_effect = load_from_s3 @@ -148,9 +146,8 @@ def test_s3_load_failure(mock_load): def test_reductionist_connection(mock_reduce, mock_load, tmp_path): """Test stack when call to Active contains storage_type == s3.""" - @contextlib.contextmanager def load_from_s3(uri, storage_options=None): - yield h5netcdf.File(test_file, 'r', invalid_netcdf=True) + return pyfive.File(test_file) mock_load.side_effect = load_from_s3 mock_reduce.side_effect = requests.exceptions.ConnectTimeout() @@ -172,9 +169,8 @@ def load_from_s3(uri, storage_options=None): def test_reductionist_bad_request(mock_reduce, mock_load, tmp_path): """Test stack when call to Active contains storage_type == s3.""" - @contextlib.contextmanager def load_from_s3(uri, storage_options=None): - yield h5netcdf.File(test_file, 'r', invalid_netcdf=True) + return pyfive.File(test_file) mock_load.side_effect = load_from_s3 mock_reduce.side_effect = activestorage.reductionist.ReductionistError(400, "Bad request") From c1b2823bd753b21085fed2bb5ff6373b657bff8c Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Wed, 6 Mar 2024 13:47:04 +0000 Subject: [PATCH 25/85] minor fix to active --- activestorage/active.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/activestorage/active.py b/activestorage/active.py index c5e9dc8f..601255ce 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -1,5 +1,4 @@ import concurrent.futures -import contextlib import os import numpy as np import pathlib @@ -40,7 +39,7 @@ def load_from_s3(uri, storage_options=None): s3file = fs.open(uri, 'rb') ds = pyfive.File(s3file) - print(f"Dataset loaded from S3 via h5netcdf: {uri} with Pyfive File handler") + print(f"Dataset loaded from S3 with s3fs and Pyfive: {uri}") return ds From 3c1b4208bc66a00e5ae2cedbc5b1b48ac4963e17 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Wed, 6 Mar 2024 13:49:45 +0000 Subject: [PATCH 26/85] ass test todo --- tests/test_harness.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_harness.py b/tests/test_harness.py index f7b9db47..cdead544 100644 --- a/tests/test_harness.py +++ b/tests/test_harness.py @@ -33,6 +33,8 @@ def test_read0(tmp_path): active = Active(test_file, 'data', utils.get_storage_type()) active._version = 0 d = active[0:2,4:6,7:9] + # TODO Bryan look into this + print(d.data, type(d.data)) nda = np.ndarray.flatten(d.data) assert np.array_equal(nda,np.array([740.,840.,750.,850.,741.,841.,751.,851.])) From 589c772610f572d86cea2ff9158c2ed5b292fc45 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Wed, 6 Mar 2024 13:59:21 +0000 Subject: [PATCH 27/85] fix last test --- tests/test_harness.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_harness.py b/tests/test_harness.py index cdead544..315a3b46 100644 --- a/tests/test_harness.py +++ b/tests/test_harness.py @@ -24,7 +24,7 @@ def create_test_dataset(tmp_path): return test_file -@pytest.mark.xfail(USE_S3, reason="descriptor 'flatten' for 'numpy.ndarray' objects doesn't apply to a 'memoryview' object") +# @pytest.mark.xfail(USE_S3, reason="descriptor 'flatten' for 'numpy.ndarray' objects doesn't apply to a 'memoryview' object") def 
test_read0(tmp_path): """ Test a normal read slicing the data an interesting way, using version 0 (native interface) @@ -32,10 +32,10 @@ def test_read0(tmp_path): test_file = create_test_dataset(tmp_path) active = Active(test_file, 'data', utils.get_storage_type()) active._version = 0 - d = active[0:2,4:6,7:9] - # TODO Bryan look into this - print(d.data, type(d.data)) - nda = np.ndarray.flatten(d.data) + d = active[0:2, 4:6, 7:9] + # d.data is a memoryview object in both local POSIX and remote S3 storages + # keep the current behaviour of the test to catch possible type changes + nda = np.ndarray.flatten(np.asarray(d.data)) assert np.array_equal(nda,np.array([740.,840.,750.,850.,741.,841.,751.,851.])) def test_read1(tmp_path): From c0220408812cfee9adb20c56fbe4249b56596b78 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Wed, 6 Mar 2024 14:23:48 +0000 Subject: [PATCH 28/85] checkout and install pyfive in dev mode --- .github/workflows/run-tests.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 3221a3a8..97aaa651 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -4,6 +4,7 @@ on: push: branches: - main + - pyfive schedule: - cron: '0 0 * * *' # nightly @@ -34,6 +35,13 @@ jobs: use-mamba: true - run: conda --version - run: python -V + - name: Install development version of bnlawrence/Pyfive:issue6 + run: | + cd .. + git clone https://github.com/bnlawrence/pyfive.git + cd pyfive + git checkout issue6 + pip install -e . - run: pip install -e . - run: pytest -n 2 From 016242e2eb752531522a5685b204ff175d47457d Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Wed, 6 Mar 2024 14:34:55 +0000 Subject: [PATCH 29/85] fix Bryans test --- bnl/test_missing_gubbins.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/bnl/test_missing_gubbins.py b/bnl/test_missing_gubbins.py index 38fdcaec..65f353a5 100644 --- a/bnl/test_missing_gubbins.py +++ b/bnl/test_missing_gubbins.py @@ -139,7 +139,3 @@ def test_partially_missing_data(tmp_path): np.testing.assert_array_equal(masked_numpy_mean, active_mean) np.testing.assert_array_equal(no_active_mean, active_mean) - - -mypath = Path(__file__).parent -test_partially_missing_data(mypath) \ No newline at end of file From 9bfb099b8c39f2d5c22f0cee78c97fac01b18cce Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Wed, 6 Mar 2024 14:39:30 +0000 Subject: [PATCH 30/85] install Pyfive in OSX GHA too --- .github/workflows/run-tests.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 97aaa651..f69d0eba 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -66,5 +66,12 @@ jobs: use-mamba: true - run: conda --version - run: python -V + - name: Install development version of bnlawrence/Pyfive:issue6 + run: | + cd .. + git clone https://github.com/bnlawrence/pyfive.git + cd pyfive + git checkout issue6 + pip install -e . - run: pip install -e . 
- run: pytest From 68827ebb8aa56d967e614bb1f89a9b4de090e9f4 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Wed, 6 Mar 2024 14:40:49 +0000 Subject: [PATCH 31/85] run the s3 tests too with Minio --- .github/workflows/test_s3_minio.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/test_s3_minio.yml b/.github/workflows/test_s3_minio.yml index c1399441..32d8e1cf 100644 --- a/.github/workflows/test_s3_minio.yml +++ b/.github/workflows/test_s3_minio.yml @@ -6,6 +6,7 @@ on: push: branches: - main # keep this at all times + - pyfive pull_request: schedule: - cron: '0 0 * * *' # nightly @@ -57,6 +58,13 @@ jobs: miniforge-version: "latest" miniforge-variant: Mambaforge use-mamba: true + - name: Install development version of bnlawrence/Pyfive:issue6 + run: | + cd .. + git clone https://github.com/bnlawrence/pyfive.git + cd pyfive + git checkout issue6 + pip install -e . - name: Install PyActiveStorage run: | conda --version From 1d925dcc29e667867de729d959633c2d9dc4a4f6 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Wed, 6 Mar 2024 14:42:02 +0000 Subject: [PATCH 32/85] run the s3 tests too with remote reductionist --- .github/workflows/test_s3_remote_reductionist.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/test_s3_remote_reductionist.yml b/.github/workflows/test_s3_remote_reductionist.yml index 1edc96c3..c2e9c457 100644 --- a/.github/workflows/test_s3_remote_reductionist.yml +++ b/.github/workflows/test_s3_remote_reductionist.yml @@ -7,6 +7,7 @@ on: push: branches: - main # keep this at all times + - pyfive pull_request: schedule: - cron: '0 0 * * *' # nightly @@ -52,6 +53,13 @@ jobs: miniforge-version: "latest" miniforge-variant: Mambaforge use-mamba: true + - name: Install development version of bnlawrence/Pyfive:issue6 + run: | + cd .. + git clone https://github.com/bnlawrence/pyfive.git + cd pyfive + git checkout issue6 + pip install -e . - name: Install PyActiveStorage run: | conda --version From 400e2303b7d0cf2db96bc1b745c4dffce4055320 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 6 Mar 2024 16:01:30 +0000 Subject: [PATCH 33/85] Fixed incorrect handling of missing data in reductionist encode_missing --- activestorage/reductionist.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/activestorage/reductionist.py b/activestorage/reductionist.py index 654b54ba..11a12851 100644 --- a/activestorage/reductionist.py +++ b/activestorage/reductionist.py @@ -116,6 +116,8 @@ def encode_missing(missing): if missing_value: if isinstance(missing_value, collections.abc.Sequence): return {"missing_values": [encode_dvalue(v) for v in missing_value]} + elif isinstance(missing_value, np.ndarray): + return {"missing_values": [encode_dvalue(v) for v in missing_value]} else: return {"missing_value": encode_dvalue(missing_value)} if valid_min and valid_max: From afe35466613034d157109900f5a7c6aa14f7f1b9 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 6 Mar 2024 16:08:48 +0000 Subject: [PATCH 34/85] it's not a test! 
--- bnl/bnl_s3test.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/bnl/bnl_s3test.py b/bnl/bnl_s3test.py index dd7a72ba..9ae4ab35 100644 --- a/bnl/bnl_s3test.py +++ b/bnl/bnl_s3test.py @@ -27,7 +27,7 @@ def simple(filename, var): #f2 = load_from_s3(uri, storage_options={'client_kwargs':{"endpoint_url":S3_URL}}) -def test_compression_and_filters_cmip6_forced_s3_from_local_2(ncfile, var): +def ex_test(ncfile, var): """ Test use of datasets with compression and filters applied for a real CMIP6 dataset (CMIP6-test.nc) - an IPSL file. @@ -58,7 +58,7 @@ def test_compression_and_filters_cmip6_forced_s3_from_local_2(ncfile, var): storage_options=storage_options, active_storage_url=active_storage_url) - active._version = 1 + active._version = 2 active._method = "min" result = active[0:2,4:6,7:9] @@ -66,7 +66,9 @@ def test_compression_and_filters_cmip6_forced_s3_from_local_2(ncfile, var): assert result == 239.25946044921875 + if __name__=="__main__": ncfile, var = 'CMIP6-test.nc','tas' + #ncfile, var = 'test_partially_missing_data.nc','data' simple(ncfile, var) - test_compression_and_filters_cmip6_forced_s3_from_local_2(ncfile, var) \ No newline at end of file + ex_test(ncfile, var) \ No newline at end of file From 44980b7877184a6baeb3dc706e04e001ccd87617 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 6 Mar 2024 16:51:23 +0000 Subject: [PATCH 35/85] Fixed HDF5 NetCDF attribute issue --- activestorage/active.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/activestorage/active.py b/activestorage/active.py index 601255ce..b15e6a69 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -133,6 +133,14 @@ def __get_missing_attributes(self): Load all the missing attributes we need from a netcdf file """ + def hfix(x): + ''' + return item if single element list/array + see https://github.com/h5netcdf/h5netcdf/issues/116 + ''' + if not np.isscalar(x) and len(x) == 1: + return x[0] + if self.ds is None: self.__load_nc_file() @@ -151,7 +159,7 @@ def __get_missing_attributes(self): elif valid_range is not None: valid_min, valid_max = valid_range - return _FillValue, missing_value, valid_min, valid_max + return hfix(_FillValue), hfix(missing_value), hfix(valid_min), hfix(valid_max) def __getitem__(self, index): """ From 98a74c539376c62f8c8d5c75a4e907bdd2e3b34a Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 6 Mar 2024 17:04:50 +0000 Subject: [PATCH 36/85] A little more care was needed ... 
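
For context, the extra care concerns the hfix helper introduced in the previous patch: as first written it only returns a value for the single-element case, so scalar attributes come back as None and absent attributes (None) hit len(None). A minimal sketch of that pre-fix behaviour, with the helper's logic reproduced from the PATCH 35 diff above:

    import numpy as np

    def hfix(x):
        # logic as added in PATCH 35/85, i.e. before this fix
        if not np.isscalar(x) and len(x) == 1:
            return x[0]

    print(hfix([500]))    # 500  -- the intended single-element case
    print(hfix(500))      # None -- a scalar attribute is silently lost
    try:
        hfix(None)        # an absent attribute ...
    except TypeError as err:
        print(err)        # ... object of type 'NoneType' has no len()

The hunk below adds the early return for None and a trailing return x, so scalars and absent attributes pass through unchanged.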
--- activestorage/active.py | 13 ++++++++----- activestorage/storage.py | 4 +++- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/activestorage/active.py b/activestorage/active.py index b15e6a69..81279945 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -138,17 +138,20 @@ def hfix(x): return item if single element list/array see https://github.com/h5netcdf/h5netcdf/issues/116 ''' + if x is None: + return x if not np.isscalar(x) and len(x) == 1: return x[0] + return x if self.ds is None: self.__load_nc_file() - _FillValue = self.ds.attrs.get('_FillValue') + _FillValue = hfix(self.ds.attrs.get('_FillValue')) missing_value = self.ds.attrs.get('missing_value') - valid_min = self.ds.attrs.get('valid_min') - valid_max = self.ds.attrs.get('valid_max') - valid_range = self.ds.attrs.get('valid_range') + valid_min = hfix(self.ds.attrs.get('valid_min')) + valid_max = hfix(self.ds.attrs.get('valid_max')) + valid_range = hfix(self.ds.attrs.get('valid_range')) if valid_max is not None or valid_min is not None: if valid_range is not None: raise ValueError( @@ -159,7 +162,7 @@ def hfix(x): elif valid_range is not None: valid_min, valid_max = valid_range - return hfix(_FillValue), hfix(missing_value), hfix(valid_min), hfix(valid_max) + return _FillValue, missing_value, valid_min, valid_max def __getitem__(self, index): """ diff --git a/activestorage/storage.py b/activestorage/storage.py index 28d91f35..38a7e35d 100644 --- a/activestorage/storage.py +++ b/activestorage/storage.py @@ -3,7 +3,9 @@ from numcodecs.compat import ensure_ndarray -def reduce_chunk(rfile, offset, size, compression, filters, missing, dtype, shape, order, chunk_selection, method=None): +def reduce_chunk(rfile, + offset, size, compression, filters, missing, dtype, shape, + order, chunk_selection, method=None): """ We do our own read of chunks and decoding etc rfile - the actual file with the data From 4bea4bbee5327bb1cc71698721ac7292a1e7a966 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Wed, 6 Mar 2024 17:15:48 +0000 Subject: [PATCH 37/85] new pyfive File API --- tests/test_compression.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_compression.py b/tests/test_compression.py index 86982de9..bd2c34ad 100644 --- a/tests/test_compression.py +++ b/tests/test_compression.py @@ -17,8 +17,8 @@ def check_dataset_filters(temp_file: str, ncvar: str, compression: str, shuffle: if USE_S3: with load_from_s3(temp_file) as test_data: # NOTE: h5netcdf thinks zlib is gzip - assert test_data.variables[ncvar].compression == "gzip" - assert test_data.variables[ncvar].shuffle == shuffle + assert test_data[ncvar].attrs.get('compression') == "gzip" + assert test_data[ncvar].attrs.get('shuffle') == shuffle else: with Dataset(temp_file) as test_data: test_data_filters = test_data.variables[ncvar].filters() From cf1ff2d735b982993977f96b068fc1e7baf1eb56 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Wed, 6 Mar 2024 17:44:26 +0000 Subject: [PATCH 38/85] print out the attrs --- tests/test_compression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_compression.py b/tests/test_compression.py index bd2c34ad..4040a57c 100644 --- a/tests/test_compression.py +++ b/tests/test_compression.py @@ -16,7 +16,7 @@ def check_dataset_filters(temp_file: str, ncvar: str, compression: str, shuffle: # Sanity check that test data is compressed and filtered as expected. 
if USE_S3: with load_from_s3(temp_file) as test_data: - # NOTE: h5netcdf thinks zlib is gzip + print("Variable attrs", test_data[ncvar].attrs) assert test_data[ncvar].attrs.get('compression') == "gzip" assert test_data[ncvar].attrs.get('shuffle') == shuffle else: From 2808dc93cc4bb99f8e093d2258291e894fdaabb4 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Wed, 6 Mar 2024 17:54:48 +0000 Subject: [PATCH 39/85] print out file attrs --- tests/test_compression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_compression.py b/tests/test_compression.py index 4040a57c..781fe3fe 100644 --- a/tests/test_compression.py +++ b/tests/test_compression.py @@ -16,7 +16,7 @@ def check_dataset_filters(temp_file: str, ncvar: str, compression: str, shuffle: # Sanity check that test data is compressed and filtered as expected. if USE_S3: with load_from_s3(temp_file) as test_data: - print("Variable attrs", test_data[ncvar].attrs) + print("File attrs", test_data.attrs) assert test_data[ncvar].attrs.get('compression') == "gzip" assert test_data[ncvar].attrs.get('shuffle') == shuffle else: From 38add11b5e036758f594f6acbccba99df1cdf701 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Thu, 7 Mar 2024 11:46:32 +0000 Subject: [PATCH 40/85] V made me do it --- activestorage/active.py | 2 +- tests/test_missing.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/activestorage/active.py b/activestorage/active.py index 81279945..4ff008ff 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -88,7 +88,7 @@ def __init__( # Assume NetCDF4 for now self.uri = uri if self.uri is None: - raise ValueError(f"Must use a valid file for uri. Got {self.uri}") + raise ValueError(f"Must use a valid file for uri. Got {uri}") # still allow for a passable storage_type # for special cases eg "special-POSIX" ie DDN diff --git a/tests/test_missing.py b/tests/test_missing.py index 4edc674a..5534b959 100644 --- a/tests/test_missing.py +++ b/tests/test_missing.py @@ -196,6 +196,18 @@ def test_validmax(tmp_path): # write file to storage testfile = utils.write_to_storage(testfile) + x = pyfive.File(testfile) + y = Dataset(testfile) + print('bnl',y['data'].getncattr('valid_max')) + print('bnl',x['data'].attrs.get('valid_max')) + import h5py + z = h5py.File(testfile) + print('bnl',z['data'].attrs.get('valid_max')) + import h5netcdf + a = h5netcdf.File(testfile) + print('bnl',a['data'].attrs.get('valid_max')) + + # numpy masked to check for correct Active behaviour no_active_mean = active_zero(testfile) From 277762d322b0de9636e0d709cbf3f0e02a5713e6 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Thu, 7 Mar 2024 12:29:51 +0000 Subject: [PATCH 41/85] use correct attributes compression and shuffle --- tests/test_compression.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_compression.py b/tests/test_compression.py index 781fe3fe..0db5ad55 100644 --- a/tests/test_compression.py +++ b/tests/test_compression.py @@ -17,8 +17,8 @@ def check_dataset_filters(temp_file: str, ncvar: str, compression: str, shuffle: if USE_S3: with load_from_s3(temp_file) as test_data: print("File attrs", test_data.attrs) - assert test_data[ncvar].attrs.get('compression') == "gzip" - assert test_data[ncvar].attrs.get('shuffle') == shuffle + assert test_data[ncvar].compression == "gzip" + assert test_data[ncvar].shuffle == shuffle else: with Dataset(temp_file) as test_data: test_data_filters = test_data.variables[ncvar].filters() From 
5bd0fe17b7a1722fe2bf1e8e3f8c8f222ca7cc87 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Thu, 7 Mar 2024 20:49:15 +0000 Subject: [PATCH 42/85] use issue60 branch --- .github/workflows/run-tests.yml | 8 ++++---- .github/workflows/test_s3_minio.yml | 4 ++-- .github/workflows/test_s3_remote_reductionist.yml | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index f69d0eba..f269f7b9 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -35,12 +35,12 @@ jobs: use-mamba: true - run: conda --version - run: python -V - - name: Install development version of bnlawrence/Pyfive:issue6 + - name: Install development version of bnlawrence/Pyfive:issue60 run: | cd .. git clone https://github.com/bnlawrence/pyfive.git cd pyfive - git checkout issue6 + git checkout issue60 pip install -e . - run: pip install -e . - run: pytest -n 2 @@ -66,12 +66,12 @@ jobs: use-mamba: true - run: conda --version - run: python -V - - name: Install development version of bnlawrence/Pyfive:issue6 + - name: Install development version of bnlawrence/Pyfive:issue60 run: | cd .. git clone https://github.com/bnlawrence/pyfive.git cd pyfive - git checkout issue6 + git checkout issue60 pip install -e . - run: pip install -e . - run: pytest diff --git a/.github/workflows/test_s3_minio.yml b/.github/workflows/test_s3_minio.yml index 32d8e1cf..51bb97b3 100644 --- a/.github/workflows/test_s3_minio.yml +++ b/.github/workflows/test_s3_minio.yml @@ -58,12 +58,12 @@ jobs: miniforge-version: "latest" miniforge-variant: Mambaforge use-mamba: true - - name: Install development version of bnlawrence/Pyfive:issue6 + - name: Install development version of bnlawrence/Pyfive:issue60 run: | cd .. git clone https://github.com/bnlawrence/pyfive.git cd pyfive - git checkout issue6 + git checkout issue60 pip install -e . - name: Install PyActiveStorage run: | diff --git a/.github/workflows/test_s3_remote_reductionist.yml b/.github/workflows/test_s3_remote_reductionist.yml index c2e9c457..55b388ec 100644 --- a/.github/workflows/test_s3_remote_reductionist.yml +++ b/.github/workflows/test_s3_remote_reductionist.yml @@ -53,12 +53,12 @@ jobs: miniforge-version: "latest" miniforge-variant: Mambaforge use-mamba: true - - name: Install development version of bnlawrence/Pyfive:issue6 + - name: Install development version of bnlawrence/Pyfive:issue60 run: | cd .. git clone https://github.com/bnlawrence/pyfive.git cd pyfive - git checkout issue6 + git checkout issue60 pip install -e . 
- name: Install PyActiveStorage run: | From 74beccd07167a3981456046233813cd49f8ea83c Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Fri, 8 Mar 2024 11:21:12 +0000 Subject: [PATCH 43/85] Unit test for createding reductionist json without s3 --- activestorage/active.py | 67 +++++++++++++++++---------------- tests/test_reductionist_json.py | 40 ++++++++++++++++++++ 2 files changed, 75 insertions(+), 32 deletions(-) create mode 100644 tests/test_reductionist_json.py diff --git a/activestorage/active.py b/activestorage/active.py index 4ff008ff..8c4db9d2 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -42,6 +42,40 @@ def load_from_s3(uri, storage_options=None): print(f"Dataset loaded from S3 with s3fs and Pyfive: {uri}") return ds +def get_missing_attributes(ds): + """" + Load all the missing attributes we need from a netcdf file + """ + + def hfix(x): + ''' + return item if single element list/array + see https://github.com/h5netcdf/h5netcdf/issues/116 + ''' + if x is None: + return x + if not np.isscalar(x) and len(x) == 1: + return x[0] + return x + + _FillValue = hfix(ds.attrs.get('_FillValue')) + missing_value = ds.attrs.get('missing_value') + valid_min = hfix(ds.attrs.get('valid_min')) + valid_max = hfix(ds.attrs.get('valid_max')) + valid_range = hfix(ds.attrs.get('valid_range')) + if valid_max is not None or valid_min is not None: + if valid_range is not None: + raise ValueError( + "Invalid combination in the file of valid_min, " + "valid_max, valid_range: " + f"{valid_min}, {valid_max}, {valid_range}" + ) + elif valid_range is not None: + valid_min, valid_max = valid_range + + return _FillValue, missing_value, valid_min, valid_max + + class Active: """ @@ -129,40 +163,9 @@ def __load_nc_file(self): self.ds = nc[ncvar] def __get_missing_attributes(self): - """" - Load all the missing attributes we need from a netcdf file - """ - - def hfix(x): - ''' - return item if single element list/array - see https://github.com/h5netcdf/h5netcdf/issues/116 - ''' - if x is None: - return x - if not np.isscalar(x) and len(x) == 1: - return x[0] - return x - if self.ds is None: self.__load_nc_file() - - _FillValue = hfix(self.ds.attrs.get('_FillValue')) - missing_value = self.ds.attrs.get('missing_value') - valid_min = hfix(self.ds.attrs.get('valid_min')) - valid_max = hfix(self.ds.attrs.get('valid_max')) - valid_range = hfix(self.ds.attrs.get('valid_range')) - if valid_max is not None or valid_min is not None: - if valid_range is not None: - raise ValueError( - "Invalid combination in the file of valid_min, " - "valid_max, valid_range: " - f"{valid_min}, {valid_max}, {valid_range}" - ) - elif valid_range is not None: - valid_min, valid_max = valid_range - - return _FillValue, missing_value, valid_min, valid_max + return get_missing_attributes(self.ds) def __getitem__(self, index): """ diff --git a/tests/test_reductionist_json.py b/tests/test_reductionist_json.py new file mode 100644 index 00000000..299ba6f9 --- /dev/null +++ b/tests/test_reductionist_json.py @@ -0,0 +1,40 @@ +from test_bigger_data import save_cl_file_with_a +import pyfive +from activestorage.active import Active, get_missing_attributes +from activestorage.hdf2numcodec import decode_filters +import numpy as np +from activestorage import reductionist +import json + +class MockActive: + def __init__(self, f,v): + self.f = pyfive.File(f) + ds = self.f[v] + self.dtype = np.dtype(ds.dtype) + self.array = pyfive.ZarrArrayStub(ds.shape, ds.chunks or ds.shape) + self.missing = get_missing_attributes(ds) + ds = 
ds._dataobjects + self.ds = ds + def __getitem__(self, args): + if self.ds.filter_pipeline is None: + compressor, filters = None, None + else: + compressor, filters = decode_filters(self.ds.filter_pipeline , self.dtype.itemsize, 'a') + if self.ds.chunks is not None: + self.ds._get_chunk_addresses() + + indexer = pyfive.OrthogonalIndexer(args, self.array) + for chunk_coords, chunk_selection, out_selection in indexer: + offset, size, filter_mask = self.ds.get_chunk_details(chunk_coords) + jd = reductionist.build_request_data('a','b','c', + offset, size, compressor, filters, self.missing, self.dtype, + self.array._chunks,self.ds.order,chunk_selection) + js = json.dumps(jd) + return None + +def test_build_request(tmp_path): + + ncfile, v = save_cl_file_with_a(tmp_path), 'cl' + A = MockActive(ncfile,v) + x = A[4:5, 1:2] + # not interested in what is returned, checking that the request builds ok From e91d02c06f461c715266a8d982db35fe9fa78436 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Fri, 8 Mar 2024 11:40:32 +0000 Subject: [PATCH 44/85] Reductionist was incorrectly encoding offset and size, and the test for it had defaults of None (which it can never be). Also modified tests to make easier to extract and debug standalone. --- activestorage/reductionist.py | 4 ++-- tests/test_bigger_data.py | 2 +- tests/test_reductionist_json.py | 2 +- tests/unit/test_reductionist.py | 6 ++++-- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/activestorage/reductionist.py b/activestorage/reductionist.py index 11a12851..e96d9982 100644 --- a/activestorage/reductionist.py +++ b/activestorage/reductionist.py @@ -139,8 +139,8 @@ def build_request_data(source: str, bucket: str, object: str, offset: int, 'object': object, 'dtype': dtype.name, 'byte_order': encode_byte_order(dtype), - 'offset': offset, - 'size': size, + 'offset': int(offset), + 'size': int(size), 'order': order, } if shape: diff --git a/tests/test_bigger_data.py b/tests/test_bigger_data.py index 2173558d..91adc9f9 100644 --- a/tests/test_bigger_data.py +++ b/tests/test_bigger_data.py @@ -11,7 +11,7 @@ from activestorage.config import * from pyfive.core import InvalidHDF5File as InvalidHDF5Err -import utils +from . 
import utils @pytest.fixture diff --git a/tests/test_reductionist_json.py b/tests/test_reductionist_json.py index 299ba6f9..8053165c 100644 --- a/tests/test_reductionist_json.py +++ b/tests/test_reductionist_json.py @@ -1,4 +1,4 @@ -from test_bigger_data import save_cl_file_with_a +from .test_bigger_data import save_cl_file_with_a import pyfive from activestorage.active import Active, get_missing_attributes from activestorage.hdf2numcodec import decode_filters diff --git a/tests/unit/test_reductionist.py b/tests/unit/test_reductionist.py index 417bad8b..043ccbd6 100644 --- a/tests/unit/test_reductionist.py +++ b/tests/unit/test_reductionist.py @@ -36,8 +36,8 @@ def test_reduce_chunk_defaults(mock_request): s3_url = "https://active.example.com" bucket = "fake-bucket" object = "fake-object" - offset = None - size = None + offset = 0 + size = 0 compression = None filters = None missing = (None, None, None, None) @@ -67,6 +67,8 @@ def test_reduce_chunk_defaults(mock_request): "bucket": bucket, "object": object, "dtype": "int32", + 'offset':0, + 'size':0, "byte_order": sys.byteorder, } mock_request.assert_called_once_with(session, expected_url, expected_data) From 3bb0b8b2c0dceb577cb775cd45c7c40ecf30d04a Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Fri, 8 Mar 2024 11:41:14 +0000 Subject: [PATCH 45/85] Will this break actions? --- bnl/import_test.py | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 bnl/import_test.py diff --git a/bnl/import_test.py b/bnl/import_test.py new file mode 100644 index 00000000..e805f77c --- /dev/null +++ b/bnl/import_test.py @@ -0,0 +1,7 @@ +from tests import test_reductionist_json +from pathlib import Path + +mypath = Path(__file__).parent + +test_reductionist_json.test_build_request(mypath) + From 7a03fa4e9c7d16bc583a1ca7f03ece7dd036a123 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Fri, 8 Mar 2024 13:41:43 +0000 Subject: [PATCH 46/85] ignore bnl for testing --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 728ce2d3..1cfa061e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,6 +3,7 @@ addopts = # --doctest-modules --ignore=old_code/ --ignore=tests/s3_exploratory + --ignore=bnl --cov=activestorage --cov-report=xml:test-reports/coverage.xml --cov-report=html:test-reports/coverage_html From aa1a4814c4be33a9b7f26624d6cf7b6459e0c0cf Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Fri, 8 Mar 2024 13:42:02 +0000 Subject: [PATCH 47/85] correct imports --- tests/test_bigger_data.py | 2 +- tests/test_reductionist_json.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_bigger_data.py b/tests/test_bigger_data.py index 91adc9f9..2173558d 100644 --- a/tests/test_bigger_data.py +++ b/tests/test_bigger_data.py @@ -11,7 +11,7 @@ from activestorage.config import * from pyfive.core import InvalidHDF5File as InvalidHDF5Err -from . 
import utils +import utils @pytest.fixture diff --git a/tests/test_reductionist_json.py b/tests/test_reductionist_json.py index 8053165c..a88ead96 100644 --- a/tests/test_reductionist_json.py +++ b/tests/test_reductionist_json.py @@ -1,9 +1,11 @@ -from .test_bigger_data import save_cl_file_with_a import pyfive from activestorage.active import Active, get_missing_attributes from activestorage.hdf2numcodec import decode_filters import numpy as np + from activestorage import reductionist +from test_bigger_data import save_cl_file_with_a + import json class MockActive: From e5430fdc6509f7f9b8a1049f938cd460fe2ea2e3 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Fri, 8 Mar 2024 14:48:08 +0000 Subject: [PATCH 48/85] fix test with correct loaders and imports --- tests/test_missing.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/tests/test_missing.py b/tests/test_missing.py index 5534b959..934d9039 100644 --- a/tests/test_missing.py +++ b/tests/test_missing.py @@ -7,11 +7,15 @@ import tempfile import unittest +# TODO remove in stable +import h5py +import h5netcdf + import pyfive from netCDF4 import Dataset -from activestorage.active import Active +from activestorage.active import Active, load_from_s3 from activestorage.config import * from activestorage import dummy_data as dd @@ -194,17 +198,24 @@ def test_validmax(tmp_path): assert unmasked_numpy_mean != masked_numpy_mean print("Numpy masked result (mean)", masked_numpy_mean) + # load files via external protocols + y = Dataset(testfile) + z = h5py.File(testfile) + a = h5netcdf.File(testfile) + # write file to storage testfile = utils.write_to_storage(testfile) - x = pyfive.File(testfile) - y = Dataset(testfile) + + # load file via our protocols + if USE_S3: + x = load_from_s3(testfile) + else: + x = pyfive.File(testfile) + + # print for Bryan print('bnl',y['data'].getncattr('valid_max')) print('bnl',x['data'].attrs.get('valid_max')) - import h5py - z = h5py.File(testfile) print('bnl',z['data'].attrs.get('valid_max')) - import h5netcdf - a = h5netcdf.File(testfile) print('bnl',a['data'].attrs.get('valid_max')) From dd5766496390a040d151230cf42a812bcf1ffcf8 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Fri, 8 Mar 2024 14:52:38 +0000 Subject: [PATCH 49/85] same for this one --- tests/test_reductionist_json.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_reductionist_json.py b/tests/test_reductionist_json.py index a88ead96..bcf18735 100644 --- a/tests/test_reductionist_json.py +++ b/tests/test_reductionist_json.py @@ -4,13 +4,18 @@ import numpy as np from activestorage import reductionist +from activestorage.active import load_from_s3 +from activestorage.config import * from test_bigger_data import save_cl_file_with_a import json class MockActive: - def __init__(self, f,v): - self.f = pyfive.File(f) + def __init__(self, f, v): + if USE_S3: + self.f = load_from_s3(f) + else: + self.f = pyfive.File(f) ds = self.f[v] self.dtype = np.dtype(ds.dtype) self.array = pyfive.ZarrArrayStub(ds.shape, ds.chunks or ds.shape) From f3171491a10f5fe3f3cc3660a84e70f42a664baa Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Fri, 8 Mar 2024 15:03:54 +0000 Subject: [PATCH 50/85] clean test module --- tests/s3_exploratory/test_s3_arrange_files.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/s3_exploratory/test_s3_arrange_files.py b/tests/s3_exploratory/test_s3_arrange_files.py index 18b1015e..8117ce17 100644 --- a/tests/s3_exploratory/test_s3_arrange_files.py 
+++ b/tests/s3_exploratory/test_s3_arrange_files.py @@ -8,10 +8,6 @@ from activestorage.active import Active from activestorage.dummy_data import make_vanilla_ncdata -import activestorage.storage as st -from activestorage.reductionist import reduce_chunk as reductionist_reduce_chunk -from activestorage.netcdf_to_zarr import gen_json -from kerchunk.hdf import SingleHdf5ToZarr from numpy.testing import assert_allclose, assert_array_equal from pathlib import Path From 4dafa3e73daca3cb29e66b482cdf33cc83101209 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Fri, 8 Mar 2024 15:07:49 +0000 Subject: [PATCH 51/85] reduce test activity --- tests/s3_exploratory/test_s3_performance.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/s3_exploratory/test_s3_performance.py b/tests/s3_exploratory/test_s3_performance.py index 41537c4b..c47d2ab7 100644 --- a/tests/s3_exploratory/test_s3_performance.py +++ b/tests/s3_exploratory/test_s3_performance.py @@ -4,10 +4,8 @@ import ujson from pathlib import Path -from kerchunk.hdf import SingleHdf5ToZarr from activestorage.active import Active -from activestorage.netcdf_to_zarr import open_zarr_group from config_minio import * @@ -17,6 +15,7 @@ def test_data_path(): return Path(__file__).resolve().parent / 'test_data' +@pytest.mark.xfail(reason="Pyfive don't use Kerchunk") def test_s3_SingleHdf5ToZarr(): """Check Kerchunk's SingleHdf5ToZarr when S3.""" # SingleHdf5ToZarr is VERY quick and MEM-light @@ -33,6 +32,7 @@ def test_s3_SingleHdf5ToZarr(): inline_threshold=0) +@pytest.mark.xfail(reason="Pyfive don't use Kerchunk") def test_local_SingleHdf5ToZarr(test_data_path): """Check Kerchunk's SingleHdf5ToZarr when NO S3.""" # SingleHdf5ToZarr is VERY quick and MEM-light @@ -44,6 +44,7 @@ def test_local_SingleHdf5ToZarr(test_data_path): inline_threshold=0) +@pytest.mark.xfail(reason="Pyfive don't use Kerchunk") def test_s3_kerchunk_to_json(): """Check Kerchunk's SingleHdf5ToZarr dumped to JSON, when S3.""" s3_file = "s3://pyactivestorage/s3_test_bizarre_large.nc" @@ -64,6 +65,7 @@ def test_s3_kerchunk_to_json(): f.write(ujson.dumps(h5chunks.translate()).encode()) +@pytest.mark.xfail(reason="Pyfive don't use Kerchunk") def test_local_kerchunk_to_json(test_data_path): """Check Kerchunk's SingleHdf5ToZarr dumped to JSON, when NO S3.""" local_file = str(test_data_path / "test_bizarre.nc") @@ -78,6 +80,7 @@ def test_local_kerchunk_to_json(test_data_path): f.write(ujson.dumps(h5chunks.translate()).encode()) +@pytest.mark.xfail(reason="Pyfive don't use Kerchunk") def test_s3_kerchunk_openZarrGroup(): """Check Kerchunk's SingleHdf5ToZarr dumped to JSON, when S3.""" s3_file = "s3://pyactivestorage/s3_test_bizarre_large.nc" @@ -99,6 +102,7 @@ def test_s3_kerchunk_openZarrGroup(): ref_ds = open_zarr_group(outf, "data") +@pytest.mark.xfail(reason="Pyfive don't use Kerchunk") def test_local_kerchunk_openZarrGroup(test_data_path): """Check Kerchunk's SingleHdf5ToZarr dumped to JSON, when NO S3.""" local_file = str(test_data_path / "test_bizarre.nc") From e9aff71f3f393a6d0a543516e630591bf0badd69 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Fri, 8 Mar 2024 15:12:39 +0000 Subject: [PATCH 52/85] plop Pyfive in the PR test wkflow too --- .github/workflows/run-test-push.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/run-test-push.yml b/.github/workflows/run-test-push.yml index db881ab4..718ca4ec 100644 --- a/.github/workflows/run-test-push.yml +++ b/.github/workflows/run-test-push.yml @@ -30,6 +30,13 @@ jobs: 
use-mamba: true - run: conda --version - run: python -V + - name: Install development version of bnlawrence/Pyfive:issue60 + run: | + cd .. + git clone https://github.com/bnlawrence/pyfive.git + cd pyfive + git checkout issue60 + pip install -e . - run: pip install -e . - run: pytest -n 2 --junitxml=report-1.xml - uses: codecov/codecov-action@v3 From 08c8b873e95eb888159677792d69afb6f7419a3a Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 11 Mar 2024 13:20:04 +0000 Subject: [PATCH 53/85] Adding the ability for Active to optionally self report metrics --- activestorage/active.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/activestorage/active.py b/activestorage/active.py index 8c4db9d2..36410450 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -4,6 +4,7 @@ import pathlib import urllib import pyfive +import time import s3fs @@ -148,6 +149,10 @@ def __init__( self._max_threads = max_threads self.missing = None self.ds = None + self.metrics = False + + def toggle_metrics(self): + self.metrics = not self.metrics def __load_nc_file(self): """ Get the netcdf file and it's b-tree""" @@ -331,7 +336,13 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, compressor, f # wouldn't need to do it before the first one. if ds.chunks is not None: + t1 = time.time() ds._get_chunk_addresses() + t2 = time.time() + if self.metrics: + chunk_count = 0 + print(f'Index time: {t2-t1:.1}s ({len(ds._zchunk_index)} chunk entries)') + t1 = time.time() with concurrent.futures.ThreadPoolExecutor(max_workers=self._max_threads) as executor: futures = [] # Submit chunks for processing. @@ -348,6 +359,8 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, compressor, f except Exception as exc: raise else: + if self.metrics: + chunk_count +=1 if method is not None: result, count = result out.append(result) @@ -393,6 +406,9 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, compressor, f # size. 
out = out / np.sum(counts).reshape(shape1) + if self.metrics: + t2 = time.time() + print(f'Reduction over {chunk_count} chunks took {t2-t1:.1}s') return out def _get_endpoint_url(self): From 20acfdd679a7309e54b786f74b93575e9421f520 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 12 Mar 2024 08:29:40 +0000 Subject: [PATCH 54/85] Adding some experimental data --- doc/data/home_experiments_bnl.txt | 248 ++++++++++++++++++++++++++++++ doc/figures/sequence.pu | 51 ++++++ 2 files changed, 299 insertions(+) create mode 100644 doc/data/home_experiments_bnl.txt create mode 100644 doc/figures/sequence.pu diff --git a/doc/data/home_experiments_bnl.txt b/doc/data/home_experiments_bnl.txt new file mode 100644 index 00000000..1d42b213 --- /dev/null +++ b/doc/data/home_experiments_bnl.txt @@ -0,0 +1,248 @@ +pyfive branch +default_fill_cache : False +default_cache_type : readahead +default_block_size : 1048576 + +r[0:3, 4:6, 7:9] +r[0:3, 400:600, 7:9] +r[0:3, 400:600, 800:1100] +r[5,:,800] +r[:,300:500,800:1000] +r[1,:,:] + +bnl/ch330a.pc19790301-def.nc, 64 chunk entries + +Reduction over 1 chunks took 0.4s +Overall time (version 1) - 5e+00s +6.5275819301605225 : Regular +6.11167573928833 : Active Regular +4.966563940048218 : Active Remote +5.777529239654541 : Regular +5.894297122955322 : Active Regular +4.513840913772583 : Active Remote +5.653490781784058 : Regular +5.954509019851685 : Active Regular +4.394351005554199 : Active Remote +5.651051044464111 : Regular +5.728168964385986 : Active Regular +4.569552898406982 : Active Remote +5.659423112869263 : Regular +5.652827024459839 : Active Regular +5.071002960205078 : Active Remote + +Reduction over 2 chunks took 0.4s +Overall time (version 1) - 4e+00s +7.908946990966797 : Regular +7.5908496379852295 : Active Regular +4.481859922409058 : Active Remote +8.732571840286255 : Regular +8.1283700466156 : Active Regular +4.558679103851318 : Active Remote +7.707890033721924 : Regular +7.721914052963257 : Active Regular +4.404280185699463 : Active Remote +7.809189796447754 : Regular +7.916913986206055 : Active Regular +4.436033010482788 : Active Remote +7.775968790054321 : Regular +7.735078811645508 : Active Regular +4.649733781814575 : Active Remote + +Reduction over 2 chunks took 0.5s +Overall time (version 1) - 5e+00s +7.754213809967041 : Regular +9.058350801467896 : Active Regular +4.2825469970703125 : Active Remote +7.6808648109436035 : Regular +7.936631202697754 : Active Regular +4.475581884384155 : Active Remote +7.986822843551636 : Regular +7.767880916595459 : Active Regular +5.64799165725708 : Active Remote +8.05326509475708 : Regular +8.558978080749512 : Active Regular +4.387536287307739 : Active Remote +8.05326509475708 : Regular +8.558978080749512 : Active Regular +4.387536287307739 : Active Remote + +Reduction over 4 chunks took 1e+00s +Overall time (version 1) - 5e+00s +11.737549781799316 : Regular +11.829929113388062 : Active Regular +5.3570029735565186 : Active Remote +12.96435260772705 : Regular +11.708273887634277 : Active Regular +4.427199125289917 : Active Remote +12.082044124603271 : Regular +11.663249969482422 : Active Regular +4.448379039764404 : Active Remote +12.160427808761597 : Regular +12.381748914718628 : Active Regular +4.864629030227661 : Active Remote +11.940709114074707 : Regular +12.424716234207153 : Active Regular +4.6097517013549805 : Active Remote + +Reduction over 8 chunks took 0.7s +Overall time (version 1) - 5e+00s +20.35632634162903 : Regular +18.701595783233643 : Active Regular +4.780594825744629 : Active Remote 
+18.91178870201111 : Regular +20.01336669921875 : Active Regular +4.806182861328125 : Active Remote +19.31538200378418 : Regular +19.239036321640015 : Active Regular +4.886173963546753 : Active Remote +18.862577199935913 : Regular +18.919509172439575 : Active Regular +4.527647972106934 : Active Remote +19.1742422580719 : Regular +19.013291835784912 : Active Regular +4.555526971817017 : Active Remote + +Reduction over 16 chunks took 1e+00s +Overall time (version 1) - 5e+00s +33.903298139572144 : Regular +33.95520091056824 : Active Regular +5.479094982147217 : Active Remote +33.47618818283081 : Regular +33.14509582519531 : Active Regular +4.9755659103393555 : Active Remote +34.320921182632446 : Regular +33.631829023361206 : Active Regular +4.777320861816406 : Active Remote +33.07414221763611 : Regular +33.10297989845276 : Active Regular +4.985634803771973 : Active Remote +33.271361112594604 : Regular +33.41872477531433 : Active Regular +5.1617209911346436 : Active Remote + +Reduction over 64 chunks took 3e+00s +Overall time (version 1) - 7e+00s +135.66737484931946 : Regular +125.51056480407715 : Active Regular +6.838850021362305 : Active Remote +125.06190586090088 : Regular +123.48610496520996 : Active Regular +7.4687440395355225 : Active Remote +124.92097496986389 : Regular +123.4994330406189 : Active Regular +6.4895899295806885 : Active Remote +127.3746292591095 : Regular +129.92956972122192 : Active Regular +7.572640895843506 : Active Remote +123.77195906639099 : Regular +124.58749127388 : Active Regular +7.2448132038116455 : Active Remote + +bnl/ch330a.pc19790301-bnl.nc, 3400 chunk entries + +Overall time (version 1) - 4e+01s +54.81114602088928 : Regular +40.206737995147705 : Active Regular +39.09910821914673 : Active Remote +39.46443700790405 : Regular +39.027403116226196 : Active Regular +37.75058889389038 : Active Remote +37.9528648853302 : Regular +41.495553731918335 : Active Regular +38.07294011116028 : Active Remote +37.73332500457764 : Regular +40.39740324020386 : Active Regular +38.49332809448242 : Active Remote +39.069754123687744 : Regular +40.393052101135254 : Active Regular +39.88633894920349 : Active Remote + +Reduction over 3 chunks took 0.4s +Overall time (version 1) - 4e+01s +40.984943151474 : Regular +39.61718392372131 : Active Regular +39.1941978931427 : Active Remote +39.66099715232849 : Regular +39.43976306915283 : Active Regular +38.039302825927734 : Active Remote +39.62373924255371 : Regular +39.98037099838257 : Active Regular +38.41215372085571 : Active Remote +39.09483599662781 : Regular +39.73953199386597 : Active Regular +41.10827708244324 : Active Remote +40.30459427833557 : Regular +39.665311098098755 : Active Regular +39.48872089385986 : Active Remote + +Reduction over 9 chunks took 0.5s +Overall time (version 1) - 4e+01s +45.93427586555481 : Regular +51.21455001831055 : Active Regular +42.55308794975281 : Active Remote +43.54172086715698 : Regular +43.66517210006714 : Active Regular +38.02682185173035 : Active Remote +43.521965980529785 : Regular +46.44435095787048 : Active Regular +38.40240478515625 : Active Remote +43.9306001663208 : Regular +43.83222007751465 : Active Regular +38.353585958480835 : Active Remote +43.72931981086731 : Regular +45.22025394439697 : Active Regular +39.67265009880066 : Active Remote + +Reduction over 17 chunks took 0.9s +Overall time (version 1) - 4e+01s +47.17348670959473 : Regular +48.371257066726685 : Active Regular +39.67051911354065 : Active Remote +46.177210092544556 : Regular +48.33781385421753 : Active Regular 
+38.0770800113678 : Active Remote +48.514657974243164 : Regular +49.18296504020691 : Active Regular +39.39418125152588 : Active Remote +50.68878698348999 : Regular +45.65637493133545 : Active Regular +38.83580207824707 : Active Remote +47.00814175605774 : Regular +46.89705014228821 : Active Regular +38.41860294342041 : Active Remote + +Reduction over 60 chunks took 2e+00s +Overall time (version 1) - 4e+01s +71.09002494812012 : Regular +72.1630642414093 : Active Regular +39.93671202659607 : Active Remote +71.39048480987549 : Regular +69.36404490470886 : Active Regular +40.83954119682312 : Active Remote +73.05522775650024 : Regular +70.12043809890747 : Active Regular +42.23869585990906 : Active Remote +71.30378913879395 : Regular +71.10070180892944 : Active Regular +43.04809808731079 : Active Remote +72.75630617141724 : Regular +71.60838198661804 : Active Regular +40.027408838272095 : Active Remote + +Reduction over 340 chunks took 8e+00s +Overall time (version 1) - 5e+01s +216.78450202941895 : Regular +208.11211395263672 : Active Regular +45.26900100708008 : Active Remote +222.73459100723267 : Regular +208.41155195236206 : Active Regular +48.21009302139282 : Active Remote +207.01012992858887 : Regular +212.10812401771545 : Active Regular +47.88032793998718 : Active Remote +209.2855679988861 : Regular +208.8525538444519 : Active Regular +46.665223836898804 : Active Remote +203.70479106903076 : Regular +213.17324495315552 : Active Regular +49.18446397781372 : Active Remote diff --git a/doc/figures/sequence.pu b/doc/figures/sequence.pu new file mode 100644 index 00000000..dfaeccfe --- /dev/null +++ b/doc/figures/sequence.pu @@ -0,0 +1,51 @@ +@startuml +skinparam backgroundColor #EEEBDC +'skinparam handwritten true +skinparam notebackgroundcolor white + +skinparam sequence { + participantBackgroundColor White + BackgroundColor White +} + + +hide footbox +title Key Actors in Active Storage + +box python #lightblue +participant Application +participant Active +end box +box server side #beige +participant Reductionist +participant S3 +end box +Application -> Active: Open File +activate Active +Active -> S3: Read file metadata +S3 -> Active: Metadata blocks +Application -> Active: Active(getitem)\ne.g. mean(X[1,:]) +Active -> S3: Read B-Tree +S3 -> Active: Chunk Index +activate Active +Active -> Active: Identify Chunks +loop +Active -> Reductionist: Reduce Chunk +Reductionist -> S3 : Read chunk +Reductionist -> Active: f(chunk) +end +Active -> Active: f(chunks) +Active -> Application: return\nresult=\nf(getitem) +note left of Application +Multiple getitems +and function calls +can reuse index, +until: +end note +Application -> Active: Close File +deactivate Active + + + + +@enduml \ No newline at end of file From 54967d2ad5157c8c213cbc81a11d80ed58eba5ef Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 12 Mar 2024 21:06:45 +0000 Subject: [PATCH 55/85] Plotting and data code. 
--- doc/data/home_experiments_bnl.txt | 2 + doc/data/plot_bnl.py | 130 +++++++++++++++ doc/data/work_experiments_bnl.txt | 266 ++++++++++++++++++++++++++++++ 3 files changed, 398 insertions(+) create mode 100644 doc/data/plot_bnl.py create mode 100644 doc/data/work_experiments_bnl.txt diff --git a/doc/data/home_experiments_bnl.txt b/doc/data/home_experiments_bnl.txt index 1d42b213..bc1af2f2 100644 --- a/doc/data/home_experiments_bnl.txt +++ b/doc/data/home_experiments_bnl.txt @@ -138,8 +138,10 @@ Overall time (version 1) - 7e+00s 124.58749127388 : Active Regular 7.2448132038116455 : Active Remote + bnl/ch330a.pc19790301-bnl.nc, 3400 chunk entries +Reduction over 1 chunks took 0.4s Overall time (version 1) - 4e+01s 54.81114602088928 : Regular 40.206737995147705 : Active Regular diff --git a/doc/data/plot_bnl.py b/doc/data/plot_bnl.py new file mode 100644 index 00000000..fb76301d --- /dev/null +++ b/doc/data/plot_bnl.py @@ -0,0 +1,130 @@ +from pathlib import Path +from matplotlib import pyplot as plt +import numpy as np + +dimensions = np.array([40, 1920, 2560]) +requests = 3*2*2, 3*199*2,3*199*299,1920,40*199*199,1920*2560 +big_chunks = np.array([10,480,640]) +small_chunks = np.array([4, 113, 128]) + + +def get_data(fname, big=True): + + big, small = {}, {} + block = False + with open(fname,'r') as f: + for line in f: + + if line.startswith('bnl'): + if '-def' in line: + inplay = big + else: + inplay = small + + if line.startswith('Reduction over'): + block=True + r1,r2,r3 = [],[],[] + key = line[line.find('over')+5:line.find('chunks')] + print(key) + continue + if line.startswith('Overall'): + continue + if block: + try: + v, t = tuple([x.strip() for x in line.split(':')]) + if t.startswith('Active Regular'): + r2.append(v) + elif t.startswith('Regular'): + r1.append(v) + elif t.startswith('Active Remote'): + r3.append(v) + except: + inplay[key]=r1,r2,r3 + print('Skipping End of Block') + block = False + else: + print('Skipping') + continue + inplay[key]=r1,r2,r3 + print(big) + print(small) + print(big.keys(), small.keys()) + return small, big + +def do_all(hs,hb,ws,wb): + + titles = ['Home - Small Chunks', + 'Home - Big Chunks', + 'Uni - Small Chunks', + 'Uni - Big Chunks'] + + fig, axes = plt.subplots(nrows=2, ncols=2) + fig.set_size_inches(8, 8) + axes = axes.flatten() + + for a, d, t in zip(axes, [hs,hb,ws,wb], titles): + do_plot(a, d, t) + plt.tight_layout() + plt.show() + +def do_plot(ax, data, t): + + def stats4(d1): + dd = np.array([float(v) for v in d1]) + return [np.mean(dd),np.min(dd),np.max(dd)] + + keys = list(data.keys()) + + x = [] + regular, local, remote = [], [], [] + for k in keys: + # three time series for each key + reg, loc, rem = data[k] + sreg, sloc, srem = stats4(reg), stats4(loc), stats4(rem) + regular.append(sreg) + local.append(sloc) + remote.append(srem) + x.append(float(k)) + + delta = 0 + all = True + for d,c in zip([regular, local, remote],['g','r','b']): + x=np.array(x)+delta*0.2 + y = [r[0] for r in d] + err = [[r[0]-r[1] for r in d],[r[2]-r[0] for r in d]] + if c == 'b' or all: + ax.errorbar(x, y, fmt='o', yerr=err, color=c) + delta+=1 + ax.set_title(t) + ax.set_xlabel('Chunks Processed') + ax.set_ylabel('s') + + if t.find('Small') > 0: + cv = np.prod(small_chunks)*4/1e6 + else: + cv = np.prod(big_chunks)*4/1e6 + v = np.prod(dimensions)*4/1e6 + r = v/cv + print(f'Chunk volume {cv}Mb, Dataset volume {v}Mb - ratio {r}') + + + def c2v(x): + return x*cv + def v2c(x): + return x/cv + + + tax = ax.secondary_xaxis(-0.2, functions=(c2v,v2c)) + 
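Note on [PATCH 53/85] to [PATCH 56/85]: this group of commits adds self-reported metrics to Active (first the toggle_metrics() switch in PATCH 53, then the metric_data dictionary populated on every getitem later in the series, PATCH 56), together with the raw wall-clock timings for "Regular", "Active Regular" and "Active Remote" reads and the plot_bnl.py script that aggregates them. A rough usage sketch of the metric_data interface follows, assuming the keys named in PATCH 56 and the CMIP6-test.nc/tas test data used elsewhere in this series opened locally; it is illustrative only, not part of the patches.

    from activestorage.active import Active

    # test file and variable as used in bnl/bnl_test.py; local POSIX access assumed
    active = Active("CMIP6-test.nc", "tas")
    active._version = 2        # route the read through the chunk-by-chunk reduction path
    active._method = "min"
    result = active[0:2, 4:6, 7:9]

    # After a getitem, metric_data holds per-request timings and sizes, e.g.
    # 'load nc time', 'indexing time (s)', 'chunk number',
    # 'reduction time (s)', 'chunks processed', 'storage read (B)'
    for key, value in active.metric_data.items():
        print(key, value)
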
tax.set_xlabel('Reductionist Read (MB)') + + + +if __name__=="__main__": + mypath = Path(__file__).parent + fname = mypath/'work_experiments_bnl.txt' + ws, wb = get_data(fname) + fname = mypath/'home_experiments_bnl.txt' + hs, hb = get_data(fname) + do_all(hs, hb, ws, wb) + + diff --git a/doc/data/work_experiments_bnl.txt b/doc/data/work_experiments_bnl.txt new file mode 100644 index 00000000..18e2f1d5 --- /dev/null +++ b/doc/data/work_experiments_bnl.txt @@ -0,0 +1,266 @@ +pyfive branch +default_fill_cache : False +default_cache_type : readahead +default_block_size : 1048576 + +r[0:3, 4:6, 7:9] +r[0:3, 400:600, 7:9] +r[0:3, 400:600, 800:1100] +r[5,:,800] +r[:,300:500,800:1000] +r[1,:,:] + +bnl/ch330a.pc19790301-def.nc, 64 chunk entries + +Reduction over 1 chunks took 0.6s +Overall time (version 1) - 3e+00s +8.344115972518921 : Regular +4.220659971237183 : Active Regular +3.1975197792053223 : Active Remote +3.014167070388794 : Regular +3.355847120285034 : Active Regular +2.804471015930176 : Active Remote +2.9698619842529297 : Regular +2.705400228500366 : Active Regular +3.429996967315674 : Active Remote +2.8196191787719727 : Regular +2.7991769313812256 : Active Regular +2.6798598766326904 : Active Remote +2.8660669326782227 : Regular +3.903325080871582 : Active Regular +2.599932909011841 : Active Remote + +Reduction over 2 chunks took 0.3s +Overall time (version 1) - 3e+00s +2.8660669326782227 : Regular +3.903325080871582 : Active Regular +2.599932909011841 : Active Remote +3.29656720161438 : Regular +3.525721788406372 : Active Regular +2.75189471244812 : Active Remote +3.4454238414764404 : Regular +3.665536880493164 : Active Regular +2.571530818939209 : Active Remote +3.538390874862671 : Regular +4.367408990859985 : Active Regular +2.61018705368042 : Active Remote +3.660421848297119 : Regular +3.500704050064087 : Active Regular +2.6997482776641846 : Active Remote + +Reduction over 2 chunks took 0.4s +Overall time (version 1) - 3e+00s +4.397391080856323 : Regular +3.7289319038391113 : Active Regular +2.8331010341644287 : Active Remote +3.8689942359924316 : Regular +3.6623740196228027 : Active Regular +2.6047868728637695 : Active Remote +3.592682123184204 : Regular +3.7491321563720703 : Active Regular +2.693451166152954 : Active Remote +3.557407855987549 : Regular +4.505606174468994 : Active Regular +3.362746000289917 : Active Remote +4.008091926574707 : Regular +3.4869918823242188 : Active Regular +2.7830257415771484 : Active Remote + +Reduction over 4 chunks took 0.5s +Overall time (version 1) - 3e+00s +4.989245176315308 : Regular +5.01286506652832 : Active Regular +2.7912440299987793 : Active Remote +5.019068002700806 : Regular +5.267661094665527 : Active Regular +2.693160057067871 : Active Remote +5.072666883468628 : Regular +4.841257095336914 : Active Regular +3.7721190452575684 : Active Remote +4.9821202754974365 : Regular +5.039299726486206 : Active Regular +3.1108968257904053 : Active Remote +5.485536098480225 : Regular +4.931081295013428 : Active Regular +2.8403587341308594 : Active Remote + +Reduction over 8 chunks took 0.6s +Overall time (version 1) - 3e+00s +8.201773881912231 : Regular +8.027722120285034 : Active Regular +3.062329053878784 : Active Remote +8.066512107849121 : Regular +7.513127326965332 : Active Regular +2.738662004470825 : Active Remote +7.728527069091797 : Regular +8.574406147003174 : Active Regular +3.0061259269714355 : Active Remote +8.332377195358276 : Regular +7.553836107254028 : Active Regular +2.7933387756347656 : Active Remote +9.098789930343628 : 
Regular +8.0898118019104 : Active Regular +2.934377908706665 : Active Remote + +Reduction over 16 chunks took 0.9s +Overall time (version 1) - 3e+00s +13.034955978393555 : Regular +12.481261968612671 : Active Regular +3.220118999481201 : Active Remote +12.732249736785889 : Regular +14.228748798370361 : Active Regular +3.1314401626586914 : Active Remote +13.192391157150269 : Regular +12.807947874069214 : Active Regular +2.9496419429779053 : Active Remote +13.663172721862793 : Regular +12.501967191696167 : Active Regular +3.0187788009643555 : Active Remote +13.663172721862793 : Regular +12.501967191696167 : Active Regular +3.0187788009643555 : Active Remote + +Reduction over 64 chunks took 4e+00s +Overall time (version 1) - 7e+00s +48.40218806266785 : Regular +46.85928988456726 : Active Regular +6.921466827392578 : Active Remote +48.40218806266785 : Regular +46.85928988456726 : Active Regular +6.921466827392578 : Active Remote +46.171929121017456 : Regular +47.95656108856201 : Active Regular +5.700792074203491 : Active Remote +46.47747492790222 : Regular +46.345592975616455 : Active Regular +5.459475040435791 : Active Remote +51.872729778289795 : Regular +68.54537105560303 : Active Regular +8.05989122390747 : Active Remote + + +bnl/ch330a.pc19790301-bnl.nc, 3400 chunk entries + +Reduction over 1 chunks took 0.4s +Overall time (version 1) - 2e+01s +45.14613080024719 : Regular +27.162260055541992 : Active Regular +23.115849256515503 : Active Remote +23.882447004318237 : Regular +23.903882026672363 : Active Regular +21.457104921340942 : Active Remote +25.383798837661743 : Regular +24.16863775253296 : Active Regular +23.5748450756073 : Active Remote +22.512950897216797 : Regular +23.25707483291626 : Active Regular +25.665203094482422 : Active Remote +26.175257205963135 : Regular +22.014382123947144 : Active Regular +20.443835020065308 : Active Remote + +Reduction over 3 chunks took 0.6s +Overall time (version 1) - 2e+01s +20.11542320251465 : Regular +20.76677107810974 : Active Regular +22.79468083381653 : Active Remote +27.61694622039795 : Regular +26.036365032196045 : Active Regular +24.752694129943848 : Active Remote +24.694957971572876 : Regular +23.442779064178467 : Active Regular +25.0538170337677 : Active Remote +25.87760305404663 : Regular +25.108362913131714 : Active Regular +20.546348094940186 : Active Remote +22.047937870025635 : Regular +21.188127040863037 : Active Regular +22.21766209602356 : Active Remote + +Reduction over 9 chunks took 0.4s +Overall time (version 1) - 2e+01s +25.24668002128601 : Regular +28.922791957855225 : Active Regular +22.928779125213623 : Active Remote +22.633398056030273 : Regular +23.74671173095703 : Active Regular +21.821603775024414 : Active Remote +24.363435983657837 : Regular +27.717058897018433 : Active Regular +21.293541193008423 : Active Remote +25.705342292785645 : Regular +26.323739051818848 : Active Regular +25.679578065872192 : Active Remote +23.84442114830017 : Regular +33.383033990859985 : Active Regular +36.04991698265076 : Active Remote + +Reduction over 17 chunks took 0.6s +Overall time (version 1) - 2e+01s +48.07362198829651 : Regular +37.93207597732544 : Active Regular +22.77363395690918 : Active Remote +28.11515712738037 : Regular +28.385287046432495 : Active Regular +22.595709085464478 : Active Remote +27.156584978103638 : Regular +27.49553680419922 : Active Regular +23.837636947631836 : Active Remote +25.69541597366333 : Regular +25.940948009490967 : Active Regular +24.014315128326416 : Active Remote +27.273848056793213 : Regular 
+35.16158103942871 : Active Regular +23.360612869262695 : Active Remote + +Reduction over 60 chunks took 3e+00s +Overall time (version 1) - 2e+01s +41.20513677597046 : Regular +38.56750798225403 : Active Regular +24.23797583580017 : Active Remote +40.17050385475159 : Regular +40.017329931259155 : Active Regular +26.986143827438354 : Active Remote +40.651535987854004 : Regular +42.30823493003845 : Active Regular +25.00485610961914 : Active Remote +40.953128814697266 : Regular +40.59316611289978 : Active Regular +23.82962989807129 : Active Remote +39.43385100364685 : Regular +40.663326263427734 : Active Regular +23.716075897216797 : Active Remote + +Reduction over 340 chunks took 8e+00s +Overall time (version 1) - 3e+01s +116.074697971344 : Regular +116.11710214614868 : Active Regular +30.019917964935303 : Active Remote +119.91504836082458 : Regular +116.83721113204956 : Active Regular +29.05119490623474 : Active Remote +114.2572910785675 : Regular +124.5028350353241 : Active Regular +34.20670700073242 : Active Remote +118.08085703849792 : Regular +123.08248209953308 : Active Regular +28.80124282836914 : Active Remote +125.51857686042786 : Regular +143.63352608680725 : Active Regular +35.165050983428955 : Active Remote + + + + + + + + + + + + + + + + From 20b36d173522a5922568945763b1e083a327a403 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Thu, 14 Mar 2024 10:09:37 +0000 Subject: [PATCH 56/85] Added more organised metrics to Active, available after a getitem from .metric_data attribute --- activestorage/active.py | 140 ++++++++++++++++++++++++---------------- 1 file changed, 84 insertions(+), 56 deletions(-) diff --git a/activestorage/active.py b/activestorage/active.py index 36410450..ae9efc0e 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -38,45 +38,62 @@ def load_from_s3(uri, storage_options=None): else: fs = s3fs.S3FileSystem(**storage_options) # use passed-in dictionary + t1=time.time() s3file = fs.open(uri, 'rb') + t2=time.time() ds = pyfive.File(s3file) - print(f"Dataset loaded from S3 with s3fs and Pyfive: {uri}") + t3=time.time() + print(f"Dataset loaded from S3 with s3fs and Pyfive: {uri} ({t2-t1:.2},{t3-t2:.2})") return ds -def get_missing_attributes(ds): - """" - Load all the missing attributes we need from a netcdf file - """ +def _metricise(method): + """ Decorator for class methods loads into metric_data""" + def timed(self, *args, **kw): + ts = time.time() + metric_name='' + if '__metric_name' in kw: + metric_name = kw['__metric_name'] + del kw['__metric_name'] + result = method(self,*args, **kw) + te = time.time() + if metric_name: + self.metric_data[metric_name] = te-ts + return result + return timed - def hfix(x): - ''' - return item if single element list/array - see https://github.com/h5netcdf/h5netcdf/issues/116 - ''' - if x is None: - return x - if not np.isscalar(x) and len(x) == 1: - return x[0] - return x - - _FillValue = hfix(ds.attrs.get('_FillValue')) - missing_value = ds.attrs.get('missing_value') - valid_min = hfix(ds.attrs.get('valid_min')) - valid_max = hfix(ds.attrs.get('valid_max')) - valid_range = hfix(ds.attrs.get('valid_range')) - if valid_max is not None or valid_min is not None: - if valid_range is not None: - raise ValueError( - "Invalid combination in the file of valid_min, " - "valid_max, valid_range: " - f"{valid_min}, {valid_max}, {valid_range}" - ) - elif valid_range is not None: - valid_min, valid_max = valid_range - - return _FillValue, missing_value, valid_min, valid_max +def get_missing_attributes(ds): + """" + 
Load all the missing attributes we need from a netcdf file + """ + def hfix(x): + ''' + return item if single element list/array + see https://github.com/h5netcdf/h5netcdf/issues/116 + ''' + if x is None: + return x + if not np.isscalar(x) and len(x) == 1: + return x[0] + return x + + _FillValue = hfix(ds.attrs.get('_FillValue')) + missing_value = ds.attrs.get('missing_value') + valid_min = hfix(ds.attrs.get('valid_min')) + valid_max = hfix(ds.attrs.get('valid_max')) + valid_range = hfix(ds.attrs.get('valid_range')) + if valid_max is not None or valid_min is not None: + if valid_range is not None: + raise ValueError( + "Invalid combination in the file of valid_min, " + "valid_max, valid_range: " + f"{valid_min}, {valid_max}, {valid_range}" + ) + elif valid_range is not None: + valid_min, valid_max = valid_range + + return _FillValue, missing_value, valid_min, valid_max class Active: """ @@ -149,11 +166,10 @@ def __init__( self._max_threads = max_threads self.missing = None self.ds = None - self.metrics = False - - def toggle_metrics(self): - self.metrics = not self.metrics + self.metric_data = {} + self.data_read = 0 + @_metricise def __load_nc_file(self): """ Get the netcdf file and it's b-tree""" ncvar = self.ncvar @@ -177,33 +193,41 @@ def __getitem__(self, index): Provides support for a standard get item. #FIXME-BNL: Why is the argument index? """ + self.metric_data = {} if self.ds is None: - self.__load_nc_file() - # In version one this is done by explicitly looping over each chunk in the file - # and returning the requested slice ourselves. In version 2, we can pass this - # through to the default method. + self.__load_nc_file(__metric_name='load nc time') + #self.__metricise('Load','__load_nc_file') self.missing = self.__get_missing_attributes() + + self.data_read = 0 if self.method is None and self._version == 0: - + # No active operation - data = self.ds[index] - data = self._mask_data(data) - return data + return self._get_vanilla(index, __metric_name='vanilla_time') elif self._version == 1: #FIXME: is the difference between version 1 and 2 still honoured? - return self._get_selection(index) + return self._get_selection(index, __metric_name='selection 1 time (s)') elif self._version == 2: - return self._get_selection(index) + return self._get_selection(index, __metric_name='selection 2 time (s)') else: raise ValueError(f'Version {self._version} not supported') + @_metricise + def _get_vanilla(self, index): + """ + Get the data without any active operation + """ + data = self.ds[index] + data = self._mask_data(data) + return data + @property def components(self): """Return or set the components flag. 
@@ -269,7 +293,7 @@ def _get_active(self, method, *args): """ raise NotImplementedError - + @_metricise def _get_selection(self, *args): """ At this point we have a Dataset object, but all the important information about @@ -286,6 +310,9 @@ def _get_selection(self, *args): array = pyfive.ZarrArrayStub(self.ds.shape, self.ds.chunks) ds = self.ds._dataobjects + self.metric_data['args'] = args + self.metric_data['dataset shape'] = self.ds.shape + self.metric_data['dataset chunks'] = self.ds.chunks if ds.filter_pipeline is None: compressor, filters = None, None else: @@ -338,10 +365,10 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, compressor, f if ds.chunks is not None: t1 = time.time() ds._get_chunk_addresses() - t2 = time.time() - if self.metrics: - chunk_count = 0 - print(f'Index time: {t2-t1:.1}s ({len(ds._zchunk_index)} chunk entries)') + t2 = time.time() - t1 + self.metric_data['indexing time (s)'] = t2 + self.metric_data['chunk number'] = len(ds._zchunk_index) + chunk_count = 0 t1 = time.time() with concurrent.futures.ThreadPoolExecutor(max_workers=self._max_threads) as executor: futures = [] @@ -359,8 +386,7 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, compressor, f except Exception as exc: raise else: - if self.metrics: - chunk_count +=1 + chunk_count +=1 if method is not None: result, count = result out.append(result) @@ -406,9 +432,10 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, compressor, f # size. out = out / np.sum(counts).reshape(shape1) - if self.metrics: - t2 = time.time() - print(f'Reduction over {chunk_count} chunks took {t2-t1:.1}s') + t2 = time.time() + self.metric_data['reduction time (s)'] = t2-t1 + self.metric_data['chunks processed'] = chunk_count + self.metric_data['storage read (B)'] = self.data_read return out def _get_endpoint_url(self): @@ -439,6 +466,7 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou """ offset, size, filter_mask = ds.get_chunk_details(chunk_coords) + self.data_read += size # S3: pass in pre-configured storage options (credentials) if self.storage_type == "s3": From ddcff5cba090508b9f778c1f375e8f04efeb4c83 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Thu, 14 Mar 2024 17:09:16 +0000 Subject: [PATCH 57/85] Active _version1 and _version2 differ again --- activestorage/active.py | 19 +++++++++++++++---- bnl/bnl_s3test.py | 23 +++++++++++++++-------- bnl/bnl_test.py | 39 +++++++++++++++++++++------------------ 3 files changed, 51 insertions(+), 30 deletions(-) diff --git a/activestorage/active.py b/activestorage/active.py index ae9efc0e..34cb9bb3 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -10,7 +10,7 @@ from activestorage.config import * from activestorage import reductionist -from activestorage.storage import reduce_chunk +from activestorage.storage import reduce_chunk, reduce_opens3_chunk from activestorage.hdf2numcodec import decode_filters @@ -338,7 +338,7 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, compressor, f counts = None # should never get touched with no method! # Create a shared session object. 
- if self.storage_type == "s3": + if self.storage_type == "s3" and self._version==2: if self.storage_options is not None: key, secret = None, None if "key" in self.storage_options: @@ -468,8 +468,16 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou offset, size, filter_mask = ds.get_chunk_details(chunk_coords) self.data_read += size - # S3: pass in pre-configured storage options (credentials) - if self.storage_type == "s3": + if self.storage_type == 'S3' and self._version == 1: + + tmp, count = reduce_opens3_chunk(self.ds._fh, offset, size, compressor, filters, + self.missing, ds.dtype, + chunks, ds.order, + chunk_selection, method=self.method + ) + + elif self.storage_type == "s3" and self._version==2: + # S3: pass in pre-configured storage options (credentials) # print("S3 rfile is:", self.filename) parsed_url = urllib.parse.urlparse(self.filename) bucket = parsed_url.netloc @@ -513,6 +521,9 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou ds.order, chunk_selection, operation=self._method) + elif self.storage_type=='ActivePosix' and self.version==2: + # This is where the DDN Fuse and Infinia wrappers go + raise NotImplementedError else: # note there is an ongoing discussion about this interface, and what it returns # see https://github.com/valeriupredoi/PyActiveStorage/issues/33 diff --git a/bnl/bnl_s3test.py b/bnl/bnl_s3test.py index 9ae4ab35..938e98c0 100644 --- a/bnl/bnl_s3test.py +++ b/bnl/bnl_s3test.py @@ -54,16 +54,23 @@ def ex_test(ncfile, var): ofile ) print("S3 Test file path:", test_file_uri) - active = Active(test_file_uri, var, storage_type="s3", - storage_options=storage_options, - active_storage_url=active_storage_url) - active._version = 2 - active._method = "min" + for av in [0,1,2]: - result = active[0:2,4:6,7:9] - assert nc_min == result - assert result == 239.25946044921875 + active = Active(test_file_uri, var, storage_type="s3", + storage_options=storage_options, + active_storage_url=active_storage_url) + + active._version = av + if av > 0: + active._method = "min" + + result = active[0:2,4:6,7:9] + print(active.metric_data) + if av == 0: + result = np.min(result) + assert nc_min == result + assert result == 239.25946044921875 diff --git a/bnl/bnl_test.py b/bnl/bnl_test.py index ae8e4a32..fd9e4d29 100644 --- a/bnl/bnl_test.py +++ b/bnl/bnl_test.py @@ -18,27 +18,30 @@ def mytest(): ncfile, v = "cesm2_native.nc","TREFHT" ncfile, v = "CMIP6-test.nc", 'tas' #ncfile, v = "chunked.hdf5", "dataset1" - ncfile, v = 'daily_data.nc', 'ta' + #ncfile, v = 'daily_data.nc', 'ta' mypath = Path(__file__).parent uri = str(mypath/ncfile) - active = Active(uri, v, None) - active._version = 0 - if v == "dataset1": - d = active[2,:] - else: - d = active[4:5, 1:2] - mean_result = np.mean(d) - active = Active(uri, v, None) - active._version = 2 - active.method = "mean" - active.components = True - if v == "dataset1": - result2 = active[2,:] - else: - result2 = active[4:5, 1:2] - print(result2, ncfile) + results = [] + for av in [0,1,2]: + + active = Active(uri, v, None) + active._version = av + if av > 0: + active.method="mean" + if v == "dataset1": + d = active[2,:] + else: + d = active[4:5, 1:2] + print(active.metric_data) + if av == 0: + d = np.mean(d) + results.append(d) + + # check for active - np.testing.assert_allclose(mean_result, result2["sum"]/result2["n"], rtol=1e-6) + np.testing.assert_allclose(results[0],results[1], rtol=1e-6) + np.testing.assert_allclose(results[1],results[2], rtol=1e-6) + if 
__name__=="__main__": From df6b5bbbbd72946092746b91898e04a00352b723 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 19 Mar 2024 11:57:52 +0000 Subject: [PATCH 58/85] Active v1 on S3 now works. Stopping default verbosity from reductionist --- activestorage/active.py | 8 ++++---- activestorage/reductionist.py | 5 ++++- activestorage/storage.py | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 5 deletions(-) diff --git a/activestorage/active.py b/activestorage/active.py index 34cb9bb3..78f56e3d 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -468,9 +468,9 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou offset, size, filter_mask = ds.get_chunk_details(chunk_coords) self.data_read += size - if self.storage_type == 'S3' and self._version == 1: + if self.storage_type == 's3' and self._version == 1: - tmp, count = reduce_opens3_chunk(self.ds._fh, offset, size, compressor, filters, + tmp, count = reduce_opens3_chunk(ds.fh, offset, size, compressor, filters, self.missing, ds.dtype, chunks, ds.order, chunk_selection, method=self.method @@ -535,11 +535,11 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou chunk_selection, method=self.method) if self.method is not None: - return tmp, count + return tmp, count else: if drop_axes: tmp = np.squeeze(tmp, axis=drop_axes) - return tmp, out_selection + return tmp, out_selection def _mask_data(self, data): """ diff --git a/activestorage/reductionist.py b/activestorage/reductionist.py index e96d9982..13c3974b 100644 --- a/activestorage/reductionist.py +++ b/activestorage/reductionist.py @@ -9,6 +9,8 @@ import sys import typing +DEBUG = 0 + def get_session(username: str, password: str, cacert: typing.Optional[str]) -> requests.Session: """Create and return a client session object. @@ -53,7 +55,8 @@ def reduce_chunk(session, server, source, bucket, object, """ request_data = build_request_data(source, bucket, object, offset, size, compression, filters, missing, dtype, shape, order, chunk_selection) - print("Reductionist request data dictionary:", request_data) + if DEBUG: + print(f"Reductionist request data dictionary: {request_data}") api_operation = "sum" if operation == "mean" else operation or "select" url = f'{server}/v1/{api_operation}/' response = request(session, url, request_data) diff --git a/activestorage/storage.py b/activestorage/storage.py index 38a7e35d..a6b1a4e4 100644 --- a/activestorage/storage.py +++ b/activestorage/storage.py @@ -100,3 +100,36 @@ def read_block(open_file, offset, size): data = open_file.read(size) open_file.seek(place) return data + + +def reduce_opens3_chunk(fh, + offset, size, compression, filters, missing, dtype, shape, + order, chunk_selection, method=None): + """ + Same function as reduce_chunk, but this mimics what is done + deep in the bowels of H5py/pyfive. 
The reason for doing this is + so we can get per chunk metrics + """ + fh.seek(offset) + chunk_buffer = fh.read(size) + chunk = filter_pipeline(chunk_buffer, compression, filters) + # make it a numpy array of bytes + chunk = ensure_ndarray(chunk) + # convert to the appropriate data type + chunk = chunk.view(dtype) + # sort out ordering and convert to the parent hyperslab dimensions + chunk = chunk.reshape(-1, order='A') + chunk = chunk.reshape(shape, order=order) + + tmp = chunk[chunk_selection] + if method: + if missing != (None, None, None, None): + tmp = remove_missing(tmp, missing) + # check on size of tmp; method(empty) returns nan + if tmp.any(): + return method(tmp), tmp.size + else: + return tmp, None + else: + return tmp, None + From ef001eee9e61a7afcf33deb3bb0bae4c05a2e1f2 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Sat, 23 Mar 2024 08:03:22 +0000 Subject: [PATCH 59/85] test runner --- bnl/bnl_timeseries.py | 68 ++++++++++++++++++++++++++++++++++++ bnl/bnl_timeseries_plot.py | 67 +++++++++++++++++++++++++++++++++++ bnl/bnl_timeseries_runner.py | 15 ++++++++ 3 files changed, 150 insertions(+) create mode 100644 bnl/bnl_timeseries.py create mode 100644 bnl/bnl_timeseries_plot.py create mode 100644 bnl/bnl_timeseries_runner.py diff --git a/bnl/bnl_timeseries.py b/bnl/bnl_timeseries.py new file mode 100644 index 00000000..92d64238 --- /dev/null +++ b/bnl/bnl_timeseries.py @@ -0,0 +1,68 @@ +# Simple code to get hemispheric mean of the lowest level of air temperature from +# float UM_m01s30i204_vn1106(time, air_pressure_3, latitude_1, longitude_1) ; +# UM_m01s30i204_vn1106:long_name = "TEMPERATURE ON P LEV/UV GRID" ;" ; +# UM_m01s30i204_vn1106:units = "K" ; +# UM_m01s30i204_vn1106:cell_methods = "time: point" ; +# with +# time = 40 ; +# latitude_1 = 1921 ; +# latitude = 1920 ; +# longitude_1 = 2560 ; +# air_pressure_3 = 11 ; + +# This variable is held in an 18 GB file on S3 storage + +from activestorage.active import Active +import numpy as np +from time import time + +S3_BUCKET = "bnl" +S3_URL = "https://uor-aces-o.s3-ext.jc.rl.ac.uk" +ACTIVE_URL = "https://192.171.169.248:8080" + +def timeseries(location='uni', blocks_MB=1, version=2, threads=100): + + invoke = time() + storage_options = { + 'key': "f2d55c6dcfc7618b2c34e00b58df3cef", + 'secret': "$/'#M{0{/4rVhp%n^(XeX$q@y#&(NM3W1->~N.Q6VP.5[@bLpi='nt]AfH)>78pT", + 'client_kwargs': {'endpoint_url': f"{S3_URL}"}, + 'default_fill_cache':False, + 'default_cache_type':"readahead", + 'default_block_size': blocks_MB * 2**20 + } + + filename = 'ch330a.pc19790301-def.nc' + uri = S3_BUCKET + '/' + filename + var = "UM_m01s30i204_vn1106" + + active = Active(uri, var, storage_type="s3", max_threads=threads, + storage_options=storage_options, + active_storage_url=ACTIVE_URL) + + # set active to use the remote reductionist. + # (we intend to change the invocation method + # to something more obvious and transparent.) 
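For readers following the benchmark, the `_version` switch set just below carries the meaning exercised throughout this series, summarised here as comments (the "Local"/"Active" labels are the ones printed in this script's summary line):

# _version as exercised in this patch series:
#   0 - plain read: the selection comes back unreduced and any statistic
#       (np.min / np.mean) is computed client-side afterwards
#   1 - "Local": PyActiveStorage fetches each chunk's bytes itself
#       (reduce_chunk / reduce_opens3_chunk) and reduces them in-process
#   2 - "Active": the per-chunk reduction is delegated to the remote
#       Reductionist server via activestorage.reductionist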
+ active._version = version + + # and set the operation, again, the API will change + active._method = "mean" + + # get hemispheric mean timeseries: + # (this would be more elegant in cf-python) + ts = [] + md = [] + for i in range(40): + ts.append(active[i,0,0:960,:][0]) + # get some performance diagnostics from pyactive + print(active.metric_data) + if i == 0: + nct = active.metric_data['load nc time'] + md.append(active.metric_data['reduction time (s)']) + + result = np.array(ts) + print(result) + complete = time() + method = {1:'Local',2:'Active'}[version] + titlestring = f"{location}:{method} (T{threads},BS{blocks_MB}): {nct:.3}s,{sum(md):.4}s,{complete-invoke:.4}s" + print('Summary: ',titlestring) diff --git a/bnl/bnl_timeseries_plot.py b/bnl/bnl_timeseries_plot.py new file mode 100644 index 00000000..c654411a --- /dev/null +++ b/bnl/bnl_timeseries_plot.py @@ -0,0 +1,67 @@ +from pathlib import Path +import numpy as np +from matplotlib import pyplot as plt +import os + + +plt.rcParams["figure.figsize"] = (8,12) + +data = {'timeseries1':'Hm1, Active: 5 MB Blks', + 'timeseries2':'Hm1, Active: 1 MB Blks', + 'timeseries2v1':'Hm1, Local: 1 MB Blks', + 'timeseries2-uor-t1-a':'Uni, Active: T1, 1 MB Blks', + 'timeseries2-uor-t1-b':'Uni, Active: T1, 1 MB Blks', + 'timeseries2-uor-t1-c':'Uni, Active: T1, 1 MB Blks', + 'timeseries2-uor-t100-a':'Uni, Active: T100, 1 MB Blks', + 'timeseries2-uor-t100-b':'Uni, Active: T100, 1 MB Blks', + 'timeseries1-uor-t100-a':'Uni, Local: T100, 1 MB Blks', + 'timeseries1-uor-t100-b':'Uni, Local: T100, 1 MB Blks', + 'timeseries2v1-uor-t1':'Uni, Local: T1, 1 MB Blks', + 'timeseries1-uor-t1':'Uni, Active: T1, 5 MB Blks', + } +tt = [] + +mypath = Path(__file__).parent + +logfiles = mypath.glob('timeseries3-H*.log') + +#for d,t in data.items(): + +for ff in logfiles: + + #fname1 = mypath/f'{d}.log' + #fname2 = mypath/f'{d}.metrics.txt' + #os.system(f'grep -a "dataset shape" {fname1} > {fname2}') + #with open(fname2,'r') as f: + with open(ff,'r') as f: + lines = f.readlines() + dicts = [eval(l) for l in lines if l.startswith('{') ] + nct = dicts[0]['load nc time'] + rt = [v['reduction time (s)'] for v in dicts] + summary = lines[-1][9:] + overall_time = float(summary.split(',')[-1][:-2]) + tt.append((overall_time, rt, summary)) + #os.system(f'rm {fname2}') + +tts = sorted(tt) +curve = {k:v for i,v,k in tts} + +for k in curve.keys(): + + if 'Local' in k: + ls = 'dashed' + else: + ls = 'solid' + + plt.plot(curve[k], label=k, linestyle=ls) + + +plt.legend(bbox_to_anchor=(0.7, -0.1), fontsize=8) +plt.title('Comparison of reduction parameters') +plt.ylabel('Time to reduce each timestep (s)') +plt.tight_layout() +plt.savefig('home.png') +plt.show() + + + diff --git a/bnl/bnl_timeseries_runner.py b/bnl/bnl_timeseries_runner.py new file mode 100644 index 00000000..dda35478 --- /dev/null +++ b/bnl/bnl_timeseries_runner.py @@ -0,0 +1,15 @@ +from bnl_timeseries import timeseries +import sys + +location = 'Hm1' +blocks = [1,5] +version = [1,2] +threads = [1,100] +iterations = ['a','b','c'] + +for b in blocks: + for v in version: + for t in threads: + for i in iterations: + with open(f'timeseries3-{location}-{b}-{v}-{t}-{i}.log','w') as sys.stdout: + timeseries(location, b, v, t) From 7c1e704a764f0ef0b825422b9d99ccfc262c9ab5 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 25 Mar 2024 11:31:27 +0000 Subject: [PATCH 60/85] all-zero chunks --- activestorage/storage.py | 5 +++-- tests/unit/test_storage.py | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 2 
deletions(-) diff --git a/activestorage/storage.py b/activestorage/storage.py index a6b1a4e4..65c648e7 100644 --- a/activestorage/storage.py +++ b/activestorage/storage.py @@ -44,8 +44,9 @@ def reduce_chunk(rfile, if method: if missing != (None, None, None, None): tmp = remove_missing(tmp, missing) - # check on size of tmp; method(empty) returns nan - if tmp.any(): + # Check on size of tmp; method(empty) fails or gives incorrect + # results + if tmp.size: return method(tmp), tmp.size else: return tmp, None diff --git a/tests/unit/test_storage.py b/tests/unit/test_storage.py index 22e6dfbb..1f041218 100644 --- a/tests/unit/test_storage.py +++ b/tests/unit/test_storage.py @@ -118,3 +118,22 @@ def test_reduced_chunk_fully_masked_data_vmax(): method=np.mean) assert rc[0].size == 0 assert rc[1] is None + + +def test_zero_data(): + """Test method with zero data.""" + rfile = "tests/test_data/zero_chunked.nc" + offset = 8760 + size = 48 + + # no compression + ch_sel = (slice(0, 3, 1), slice(0, 4, 1)) + rc = st.reduce_chunk(rfile, offset, size, + compression=None, filters=None, + missing=(None, None, None, None), + dtype="float32", shape=(3, 4), + order="C", chunk_selection=ch_sel, + method=np.mean) + assert rc[0].size == 1 + assert rc[0] == 0 + assert rc[1] is 12 From 209d3fa80465cb5146c078b9a822d49af9f628b5 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 25 Mar 2024 13:23:01 +0000 Subject: [PATCH 61/85] new test file --- tests/test_data/zero_chunked.nc | Bin 0 -> 8808 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/test_data/zero_chunked.nc diff --git a/tests/test_data/zero_chunked.nc b/tests/test_data/zero_chunked.nc new file mode 100644 index 0000000000000000000000000000000000000000..aad62297f3ef0b30fe34c1ba93302479ba2ccca0 GIT binary patch literal 8808 zcmeHLOKTHR6h1SP#34=}YBfFzj>TO|NZTk?h_p&lgD_xb|_e}KQB8xdUU!o8kz?>$XTThxtMyoYwqeVjS>yWf2GwA|-~{QN-jbkekK zVEe9YZA^}2)I?$H%4oTeub9qTQ%+fyn&MabebNMjLWYOw`8iHcfTnbi7@>v{6Cu=k zFm1}I#0fd0x?pY8JDX~;MeaWf2hc1((SZiFq z`Sr1#qMVj#!AWY6Hp^b&ntc>ISQ5Ab zIMhXR)@l0|3_*qvRQ!3r;+H(XCJOt8)NMO^vWERf>A`}R4j6GK+ z91`{{;cV`Oi1LvAJw%M~!?%IMgovxH#DP0__(e4x^~ncip{bR;>z($!pc94ov8h6h z;V;p>5p<}$Gy|Fe&A`9Lz*5Ec v4ZO_ai!=6_UjUtGy|Fe&46a$KVskqE1pO* literal 0 HcmV?d00001 From 620b4031c623dd483224944583b19561d3fcd7b6 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 25 Mar 2024 13:33:38 +0000 Subject: [PATCH 62/85] fix previously failing Anon bkt test --- tests/test_compression_remote_reductionist.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tests/test_compression_remote_reductionist.py b/tests/test_compression_remote_reductionist.py index 14eb722d..f7d527df 100644 --- a/tests/test_compression_remote_reductionist.py +++ b/tests/test_compression_remote_reductionist.py @@ -103,13 +103,8 @@ def test_compression_and_filters_cmip6_forced_s3_from_local(storage_options, act active._version = 1 active._method = "min" - # for now anon=True S3 buckets are not supported by Reductionist - with pytest.raises(RedErr) as rederr: - result = active[0:2,4:6,7:9] - access_denied_err = 'code: \\"AccessDenied\\"' - assert access_denied_err in str(rederr.value) - # assert nc_min == result - # assert result == 239.25946044921875 + result = active[0:2,4:6,7:9] + assert result == 239.25946044921875 def test_compression_and_filters_cmip6_forced_s3_from_local_2(): From b49ffa06b516cf5661558cc01e18f626c55c27bc Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 25 Mar 2024 13:38:57 +0000 Subject: [PATCH 63/85] timeOut 
for v2, v1 does the pancakes now --- tests/unit/test_storage_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_storage_types.py b/tests/unit/test_storage_types.py index 764c138e..09e775be 100644 --- a/tests/unit/test_storage_types.py +++ b/tests/unit/test_storage_types.py @@ -157,7 +157,7 @@ def load_from_s3(uri, storage_options=None): make_vanilla_ncdata(test_file) active = Active(uri, "data", "s3") - active._version = 1 + active._version = 2 active._method = "max" with pytest.raises(requests.exceptions.ConnectTimeout): From e99a509f4f774b1135191004d56f993fbf3878bb Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 25 Mar 2024 13:40:17 +0000 Subject: [PATCH 64/85] timeOut for v2, v1 does the pancakes now --- tests/unit/test_storage_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_storage_types.py b/tests/unit/test_storage_types.py index 09e775be..fee79019 100644 --- a/tests/unit/test_storage_types.py +++ b/tests/unit/test_storage_types.py @@ -180,7 +180,7 @@ def load_from_s3(uri, storage_options=None): make_vanilla_ncdata(test_file) active = Active(uri, "data", "s3") - active._version = 1 + active._version = 2 active._method = "max" with pytest.raises(activestorage.reductionist.ReductionistError): From d1dc658c89f6fd0097671f48dd421b17c01887ae Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 25 Mar 2024 13:42:05 +0000 Subject: [PATCH 65/85] and again v2 --- tests/unit/test_storage_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_storage_types.py b/tests/unit/test_storage_types.py index fee79019..70b23559 100644 --- a/tests/unit/test_storage_types.py +++ b/tests/unit/test_storage_types.py @@ -72,7 +72,7 @@ def reduce_chunk( make_vanilla_ncdata(test_file) active = Active(uri, "data", "s3") - active._version = 1 + active._version = 2 active._method = "max" print("This test has severe flakiness:") From b43dd213b2a0e9152253575f6b4b229d8eedb7c0 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 25 Mar 2024 13:52:37 +0000 Subject: [PATCH 66/85] another pancake --- tests/test_compression_remote_reductionist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_compression_remote_reductionist.py b/tests/test_compression_remote_reductionist.py index f7d527df..927c4c97 100644 --- a/tests/test_compression_remote_reductionist.py +++ b/tests/test_compression_remote_reductionist.py @@ -62,7 +62,7 @@ def test_compression_and_filters_cmip6_data(storage_options, active_storage_url) active = Active(test_file_uri, 'tas', utils.get_storage_type(), storage_options=storage_options, active_storage_url=active_storage_url) - active._version = 1 + active._version = 2 active._method = "min" if USE_S3: From b5d5925c3420d36f975cbb75cb97d951d3d73696 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 26 Mar 2024 16:11:31 +0000 Subject: [PATCH 67/85] zero count --- activestorage/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/activestorage/storage.py b/activestorage/storage.py index 65c648e7..80a575ba 100644 --- a/activestorage/storage.py +++ b/activestorage/storage.py @@ -49,7 +49,7 @@ def reduce_chunk(rfile, if tmp.size: return method(tmp), tmp.size else: - return tmp, None + return tmp, 0 else: return tmp, None From 8b46b4b034c2db577d46f50d5d3f0e6a9cb7b080 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Wed, 27 Mar 2024 13:38:55 +0000 Subject: [PATCH 68/85] fix test --- tests/unit/test_storage.py | 8 ++++---- 1 
file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_storage.py b/tests/unit/test_storage.py index 1f041218..8e279725 100644 --- a/tests/unit/test_storage.py +++ b/tests/unit/test_storage.py @@ -60,7 +60,7 @@ def test_reduced_chunk_fully_masked_data_fill(): order="C", chunk_selection=ch_sel, method=np.mean) assert rc[0].size == 0 - assert rc[1] is None + assert rc[1] == 0 def test_reduced_chunk_fully_masked_data_missing(): @@ -79,7 +79,7 @@ def test_reduced_chunk_fully_masked_data_missing(): order="C", chunk_selection=ch_sel, method=np.mean) assert rc[0].size == 0 - assert rc[1] is None + assert rc[1] == 0 def test_reduced_chunk_fully_masked_data_vmin(): @@ -98,7 +98,7 @@ def test_reduced_chunk_fully_masked_data_vmin(): order="C", chunk_selection=ch_sel, method=np.mean) assert rc[0].size == 0 - assert rc[1] is None + assert rc[1] == 0 def test_reduced_chunk_fully_masked_data_vmax(): @@ -117,7 +117,7 @@ def test_reduced_chunk_fully_masked_data_vmax(): order="C", chunk_selection=ch_sel, method=np.mean) assert rc[0].size == 0 - assert rc[1] is None + assert rc[1] == 0 def test_zero_data(): From 2152ac5ec53a732cf7ee044988facfa7b2ff7699 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 22 Jul 2024 13:33:50 +0100 Subject: [PATCH 69/85] add a conda list cmd --- .github/workflows/run-test-push.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/run-test-push.yml b/.github/workflows/run-test-push.yml index 718ca4ec..52e0b1e0 100644 --- a/.github/workflows/run-test-push.yml +++ b/.github/workflows/run-test-push.yml @@ -38,5 +38,6 @@ jobs: git checkout issue60 pip install -e . - run: pip install -e . + - run: conda list - run: pytest -n 2 --junitxml=report-1.xml - uses: codecov/codecov-action@v3 From 8b0874350a9d5145cdc98323ea7f40c73f78d6d2 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 20 Jan 2025 17:34:43 +0000 Subject: [PATCH 70/85] start work to port new Pyfive --- activestorage/active.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/activestorage/active.py b/activestorage/active.py index 78f56e3d..dd74bc5d 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -307,8 +307,8 @@ def _get_selection(self, *args): name = self.ds.name dtype = np.dtype(self.ds.dtype) # hopefully fix pyfive to get a dtype directly - array = pyfive.ZarrArrayStub(self.ds.shape, self.ds.chunks) - ds = self.ds._dataobjects + array = pyfive.indexing.ZarrArrayStub(self.ds.shape, self.ds.chunks) + ds = self.ds.id self.metric_data['args'] = args self.metric_data['dataset shape'] = self.ds.shape @@ -318,7 +318,7 @@ def _get_selection(self, *args): else: compressor, filters = decode_filters(ds.filter_pipeline , dtype.itemsize, name) - indexer = pyfive.OrthogonalIndexer(*args, array) + indexer = pyfive.indexing.OrthogonalIndexer(*args, array) out_shape = indexer.shape #stripped_indexer = [(a, b, c) for a,b,c in indexer] drop_axes = indexer.drop_axes and keepdims @@ -364,10 +364,10 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, compressor, f if ds.chunks is not None: t1 = time.time() - ds._get_chunk_addresses() + # ds._get_chunk_addresses() t2 = time.time() - t1 self.metric_data['indexing time (s)'] = t2 - self.metric_data['chunk number'] = len(ds._zchunk_index) + # self.metric_data['chunk number'] = len(ds._zchunk_index) chunk_count = 0 t1 = time.time() with concurrent.futures.ThreadPoolExecutor(max_workers=self._max_threads) as executor: @@ -464,8 +464,9 @@ def _process_chunk(self, session, ds, 
chunks, chunk_coords, chunk_selection, cou #FIXME: Do, we, it's not actually used? """ - - offset, size, filter_mask = ds.get_chunk_details(chunk_coords) + # This should contain all the valid chunks + print(ds._index.keys(), len(ds._index.keys())) + offset, size, filter_mask = ds._index[chunk_coords] self.data_read += size if self.storage_type == 's3' and self._version == 1: From ab648175c612b537218d36064e0918ec0780e191 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 20 Jan 2025 17:35:16 +0000 Subject: [PATCH 71/85] correct test for new Pyive API --- tests/test_reductionist_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_reductionist_json.py b/tests/test_reductionist_json.py index bcf18735..3f7afa39 100644 --- a/tests/test_reductionist_json.py +++ b/tests/test_reductionist_json.py @@ -18,7 +18,7 @@ def __init__(self, f, v): self.f = pyfive.File(f) ds = self.f[v] self.dtype = np.dtype(ds.dtype) - self.array = pyfive.ZarrArrayStub(ds.shape, ds.chunks or ds.shape) + self.array = pyfive.indexing.ZarrArrayStub(ds.shape, ds.chunks or ds.shape) self.missing = get_missing_attributes(ds) ds = ds._dataobjects self.ds = ds From 5c8095e5248d1319dbadd69f7e760b3f1d299916 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 21 Jan 2025 10:24:04 +0000 Subject: [PATCH 72/85] Conforming to current index structure a bit better --- activestorage/active.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/activestorage/active.py b/activestorage/active.py index dd74bc5d..88f27316 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -5,6 +5,8 @@ import urllib import pyfive import time +from operator import mul +from pyfive.h5d import StoreInfo import s3fs @@ -334,7 +336,7 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, compressor, f out = [] counts = [] else: - out = np.empty(out_shape, dtype=out_dtype, order=ds.order) + out = np.empty(out_shape, dtype=out_dtype, order=ds._order) counts = None # should never get touched with no method! # Create a shared session object. @@ -464,16 +466,19 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou #FIXME: Do, we, it's not actually used? """ - # This should contain all the valid chunks - print(ds._index.keys(), len(ds._index.keys())) - offset, size, filter_mask = ds._index[chunk_coords] + # map into correct coordinate space for h5py/pyfive + chunk_coords = tuple(map(mul, chunk_coords, chunks)) + # retrieve coordinates from chunk index + storeinfo = ds._index[chunk_coords] + # extract what we need here. 
+ offset, size = storeinfo.byte_offset, storeinfo.size self.data_read += size if self.storage_type == 's3' and self._version == 1: tmp, count = reduce_opens3_chunk(ds.fh, offset, size, compressor, filters, self.missing, ds.dtype, - chunks, ds.order, + chunks, ds_order, chunk_selection, method=self.method ) @@ -500,7 +505,7 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou size, compressor, filters, self.missing, np.dtype(ds.dtype), chunks, - ds.order, + ds._order, chunk_selection, operation=self._method) else: @@ -519,7 +524,7 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou size, compressor, filters, self.missing, np.dtype(ds.dtype), chunks, - ds.order, + ds._order, chunk_selection, operation=self._method) elif self.storage_type=='ActivePosix' and self.version==2: @@ -532,7 +537,7 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou # although we will version changes. tmp, count = reduce_chunk(self.filename, offset, size, compressor, filters, self.missing, ds.dtype, - chunks, ds.order, + chunks, ds._order, chunk_selection, method=self.method) if self.method is not None: From fe880266b46784a0a7ed935d9a8bb77d39d9113c Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Tue, 21 Jan 2025 12:32:03 +0000 Subject: [PATCH 73/85] new GH repo and branch for Pyfive in GAs --- .github/workflows/run-test-push.yml | 6 +++--- .github/workflows/run-tests.yml | 12 ++++++------ .github/workflows/test_s3_minio.yml | 6 +++--- .github/workflows/test_s3_remote_reductionist.yml | 6 +++--- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/workflows/run-test-push.yml b/.github/workflows/run-test-push.yml index 64f238eb..c92940c8 100644 --- a/.github/workflows/run-test-push.yml +++ b/.github/workflows/run-test-push.yml @@ -29,12 +29,12 @@ jobs: use-mamba: true - run: conda --version - run: python -V - - name: Install development version of bnlawrence/Pyfive:issue60 + - name: Install development version of NCAS-CMS/Pyfive:h5netcdf run: | cd .. - git clone https://github.com/bnlawrence/pyfive.git + git clone https://github.com/NCAS-CMS/pyfive.git cd pyfive - git checkout issue60 + git checkout h5netcdf pip install -e . - run: pip install -e . - run: conda list diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 6b8a7192..f0719b1b 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -34,12 +34,12 @@ jobs: use-mamba: true - run: conda --version - run: python -V - - name: Install development version of bnlawrence/Pyfive:issue60 + - name: Install development version of NCAS-CMS/Pyfive:h5netcdf run: | cd .. - git clone https://github.com/bnlawrence/pyfive.git + git clone https://github.com/NCAS-CMS/pyfive.git cd pyfive - git checkout issue60 + git checkout h5netcdf pip install -e . - run: conda list - run: pip install -e . @@ -66,12 +66,12 @@ jobs: use-mamba: true - run: conda --version - run: python -V - - name: Install development version of bnlawrence/Pyfive:issue60 + - name: Install development version of NCAS-CMS/Pyfive:h5netcdf run: | cd .. - git clone https://github.com/bnlawrence/pyfive.git + git clone https://github.com/NCAS-CMS/pyfive.git cd pyfive - git checkout issue60 + git checkout h5netcdf pip install -e . 
- run: conda list - run: mamba install -c conda-forge git diff --git a/.github/workflows/test_s3_minio.yml b/.github/workflows/test_s3_minio.yml index 044c5d21..6638627c 100644 --- a/.github/workflows/test_s3_minio.yml +++ b/.github/workflows/test_s3_minio.yml @@ -56,12 +56,12 @@ jobs: python-version: ${{ matrix.python-version }} miniforge-version: "latest" use-mamba: true - - name: Install development version of bnlawrence/Pyfive:issue60 + - name: Install development version of NCAS-CMS/Pyfive:h5netcdf run: | cd .. - git clone https://github.com/bnlawrence/pyfive.git + git clone https://github.com/NCAS-CMS/pyfive.git cd pyfive - git checkout issue60 + git checkout h5netcdf pip install -e . - name: Install PyActiveStorage run: | diff --git a/.github/workflows/test_s3_remote_reductionist.yml b/.github/workflows/test_s3_remote_reductionist.yml index 31c4ee55..bf1f4608 100644 --- a/.github/workflows/test_s3_remote_reductionist.yml +++ b/.github/workflows/test_s3_remote_reductionist.yml @@ -51,12 +51,12 @@ jobs: python-version: ${{ matrix.python-version }} miniforge-version: "latest" use-mamba: true - - name: Install development version of bnlawrence/Pyfive:issue60 + - name: Install development version of NCAS-CMS/Pyfive:h5netcdf run: | cd .. - git clone https://github.com/bnlawrence/pyfive.git + git clone https://github.com/NCAS-CMS/pyfive.git cd pyfive - git checkout issue60 + git checkout h5netcdf pip install -e . - name: Install PyActiveStorage run: | From e68a0fe20a0791ba1414b9c007119bcecf1ba3bc Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 21 Jan 2025 14:20:41 +0000 Subject: [PATCH 74/85] Use the new pyfive method to get chunk position in file. Includes bad hack for contiguous data. --- activestorage/active.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/activestorage/active.py b/activestorage/active.py index 88f27316..7b6fc0f4 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -466,11 +466,9 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou #FIXME: Do, we, it's not actually used? """ - # map into correct coordinate space for h5py/pyfive - chunk_coords = tuple(map(mul, chunk_coords, chunks)) + # retrieve coordinates from chunk index - storeinfo = ds._index[chunk_coords] - # extract what we need here. 
+ storeinfo = ds.get_chunk_info_from_chunk_coord(chunk_coords) offset, size = storeinfo.byte_offset, storeinfo.size self.data_read += size From a728ef08cfd47b51266b34b7cfab4928415a011a Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Tue, 21 Jan 2025 14:38:26 +0000 Subject: [PATCH 75/85] fix the mocker json test --- tests/test_reductionist_json.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_reductionist_json.py b/tests/test_reductionist_json.py index 3f7afa39..c7cc09c0 100644 --- a/tests/test_reductionist_json.py +++ b/tests/test_reductionist_json.py @@ -20,7 +20,7 @@ def __init__(self, f, v): self.dtype = np.dtype(ds.dtype) self.array = pyfive.indexing.ZarrArrayStub(ds.shape, ds.chunks or ds.shape) self.missing = get_missing_attributes(ds) - ds = ds._dataobjects + ds = ds.id self.ds = ds def __getitem__(self, args): if self.ds.filter_pipeline is None: @@ -30,12 +30,13 @@ def __getitem__(self, args): if self.ds.chunks is not None: self.ds._get_chunk_addresses() - indexer = pyfive.OrthogonalIndexer(args, self.array) + indexer = pyfive.indexing.OrthogonalIndexer(args, self.array) for chunk_coords, chunk_selection, out_selection in indexer: - offset, size, filter_mask = self.ds.get_chunk_details(chunk_coords) + storeinfo = self.ds.get_chunk_info_from_chunk_coord(chunk_coords) + offset, size = storeinfo.byte_offset, storeinfo.size jd = reductionist.build_request_data('a','b','c', offset, size, compressor, filters, self.missing, self.dtype, - self.array._chunks,self.ds.order,chunk_selection) + self.array._chunks,self.ds._order,chunk_selection) js = json.dumps(jd) return None From 3df5b36580c559c8ac0ebc18aaf533662be0e4f5 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Tue, 21 Jan 2025 14:46:02 +0000 Subject: [PATCH 76/85] new fh method --- activestorage/active.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/activestorage/active.py b/activestorage/active.py index 7b6fc0f4..29fc124e 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -474,7 +474,7 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou if self.storage_type == 's3' and self._version == 1: - tmp, count = reduce_opens3_chunk(ds.fh, offset, size, compressor, filters, + tmp, count = reduce_opens3_chunk(ds._fh, offset, size, compressor, filters, self.missing, ds.dtype, chunks, ds_order, chunk_selection, method=self.method From 762d85a79fabe28041432459cc10cdb32ba1b3c1 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Tue, 21 Jan 2025 14:54:02 +0000 Subject: [PATCH 77/85] small typo --- activestorage/active.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/activestorage/active.py b/activestorage/active.py index 29fc124e..cf397656 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -476,7 +476,7 @@ def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, cou tmp, count = reduce_opens3_chunk(ds._fh, offset, size, compressor, filters, self.missing, ds.dtype, - chunks, ds_order, + chunks, ds._order, chunk_selection, method=self.method ) From f3c35e06240f4a57bac062e2bba8a1dda49599e3 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 21 Jan 2025 15:32:20 +0000 Subject: [PATCH 78/85] Cleared up the compression problem --- activestorage/active.py | 1 - activestorage/hdf2numcodec.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/activestorage/active.py b/activestorage/active.py index 7b6fc0f4..834e2d44 100644 --- a/activestorage/active.py +++ 
b/activestorage/active.py @@ -5,7 +5,6 @@ import urllib import pyfive import time -from operator import mul from pyfive.h5d import StoreInfo import s3fs diff --git a/activestorage/hdf2numcodec.py b/activestorage/hdf2numcodec.py index 96796855..fc90cad3 100644 --- a/activestorage/hdf2numcodec.py +++ b/activestorage/hdf2numcodec.py @@ -28,7 +28,7 @@ def decode_filters(filter_pipeline, itemsize, name): for filter in filter_pipeline: filter_id=filter['filter_id'] - properties = filter['client_data_values'] + properties = filter['client_data'] # We suppor the following From 2fabbe336b54010886b88f32c836f98bfb9c3f33 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Tue, 21 Jan 2025 17:09:43 +0000 Subject: [PATCH 79/85] reinstate tests --- tests/test_compression_remote_reductionist.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_compression_remote_reductionist.py b/tests/test_compression_remote_reductionist.py index 888dbcff..d447d536 100644 --- a/tests/test_compression_remote_reductionist.py +++ b/tests/test_compression_remote_reductionist.py @@ -29,7 +29,7 @@ # CMIP6_test.nc keeps being unavailable due to BNL bucket unavailable -@pytest.mark.xfail(reason='JASMIN messing about with SOF.') +# @pytest.mark.xfail(reason='JASMIN messing about with SOF.') @pytest.mark.parametrize("storage_options, active_storage_url", storage_options_paramlist) def test_compression_and_filters_cmip6_data(storage_options, active_storage_url): """ @@ -83,7 +83,7 @@ def test_compression_and_filters_cmip6_data(storage_options, active_storage_url) # CMIP6_test.nc keeps being unavailable due to BNL bucket unavailable -@pytest.mark.xfail(reason='JASMIN messing about with SOF.') +# @pytest.mark.xfail(reason='JASMIN messing about with SOF.') @pytest.mark.parametrize("storage_options, active_storage_url", storage_options_paramlist) def test_compression_and_filters_cmip6_forced_s3_from_local(storage_options, active_storage_url): """ @@ -113,7 +113,7 @@ def test_compression_and_filters_cmip6_forced_s3_from_local(storage_options, act # CMIP6_test.nc keeps being unavailable due to BNL bucket unavailable -@pytest.mark.xfail(reason='JASMIN messing about with SOF.') +# @pytest.mark.xfail(reason='JASMIN messing about with SOF.') def test_compression_and_filters_cmip6_forced_s3_from_local_2(): """ Test use of datasets with compression and filters applied for a real @@ -151,7 +151,7 @@ def test_compression_and_filters_cmip6_forced_s3_from_local_2(): # CMIP6_test.nc keeps being unavailable due to BNL bucket unavailable -@pytest.mark.xfail(reason='JASMIN messing about with SOF.') +# @pytest.mark.xfail(reason='JASMIN messing about with SOF.') @pytest.mark.skipif(not USE_S3, reason="we need only localhost Reductionist in GA CI") @pytest.mark.skipif(REMOTE_RED, reason="we need only localhost Reductionist in GA CI") def test_compression_and_filters_cmip6_forced_s3_using_local_Reductionist(): From 68851e4c5c880e81bfbc31d4949c66c24ec3cc6d Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Wed, 22 Jan 2025 13:11:16 +0000 Subject: [PATCH 80/85] remove bnl references --- tests/test_compression_remote_reductionist.py | 192 ------------------ 1 file changed, 192 deletions(-) delete mode 100644 tests/test_compression_remote_reductionist.py diff --git a/tests/test_compression_remote_reductionist.py b/tests/test_compression_remote_reductionist.py deleted file mode 100644 index d447d536..00000000 --- a/tests/test_compression_remote_reductionist.py +++ /dev/null @@ -1,192 +0,0 @@ -import os -import numpy as np 
-import pytest - -from netCDF4 import Dataset -from pathlib import Path - -from activestorage.active import Active, load_from_s3 -from activestorage.config import * -from activestorage.dummy_data import make_compressed_ncdata -from activestorage.reductionist import ReductionistError as RedErr - -import utils - - -# Bryan's S3 machine + Bryan's reductionist -STORAGE_OPTIONS_Bryan = { - 'anon': True, - 'client_kwargs': {'endpoint_url': "https://uor-aces-o.s3-ext.jc.rl.ac.uk"}, -} -S3_ACTIVE_URL_Bryan = "https://192.171.169.248:8080" -# TODO include all supported configuration types -storage_options_paramlist = [ - (STORAGE_OPTIONS_Bryan, S3_ACTIVE_URL_Bryan) -] -# bucket needed too for this test only -# otherwise, bucket is extracted automatically from full file uri -S3_BUCKET = "bnl" - - -# CMIP6_test.nc keeps being unavailable due to BNL bucket unavailable -# @pytest.mark.xfail(reason='JASMIN messing about with SOF.') -@pytest.mark.parametrize("storage_options, active_storage_url", storage_options_paramlist) -def test_compression_and_filters_cmip6_data(storage_options, active_storage_url): - """ - Test use of datasets with compression and filters applied for a real - CMIP6 dataset (CMIP6-test.nc) - an IPSL file. - - This test will always pass when USE_S3 = False; equally, it will always - fail if USE_S3 = True until Reductionist supports anon=True S3 buckets. - See following test below with a forced storage_type="s3" that mimicks - locally the fail, and catches it. Equally, we catch the same exception when USE_S3=True - - Important info on session data: - S3 Storage options to Reductionist: {'anon': True, 'client_kwargs': {'endpoint_url': 'https://uor-aces-o.s3-ext.jc.rl.ac.uk'}} - S3 anon=True Bucket and File: bnl CMIP6-test.nc - Reductionist request data dictionary: {'source': 'https://uor-aces-o.s3-ext.jc.rl.ac.uk', 'bucket': 'bnl', 'object': 'CMIP6-test.nc', 'dtype': 'float32', 'byte_order': 'little', 'offset': 29385, 'size': 942518, 'order': 'C', 'shape': (15, 143, 144), 'selection': [[0, 2, 1], [4, 6, 1], [7, 9, 1]], 'compression': {'id': 'zlib'}} - """ - test_file = str(Path(__file__).resolve().parent / 'test_data' / 'CMIP6-test.nc') - with Dataset(test_file) as nc_data: - nc_min = np.min(nc_data["tas"][0:2,4:6,7:9]) - print(f"Numpy min from compressed file {nc_min}") - - # TODO remember that the special case for "anon=True" buckets is that - # the actual file uri = "bucket/filename" - if USE_S3: - ofile = os.path.basename(test_file) - test_file_uri = os.path.join(S3_BUCKET, ofile) - else: - test_file_uri = test_file - print("Test file and storage options", test_file, storage_options) - if not utils.get_storage_type(): - storage_options = None - active_storage_url = None - active = Active(test_file_uri, 'tas', utils.get_storage_type(), - storage_options=storage_options, - active_storage_url=active_storage_url) - active._version = 2 - active._method = "min" - - if USE_S3: - # for now anon=True S3 buckets are not supported by Reductionist - with pytest.raises(RedErr) as rederr: - result = active[0:2,4:6,7:9] - access_denied_err = 'code: \\"AccessDenied\\"' - assert access_denied_err in str(rederr.value) - # assert nc_min == result - # assert result == 239.25946044921875 - else: - result = active[0:2,4:6,7:9] - assert nc_min == result - assert result == 239.25946044921875 - - -# CMIP6_test.nc keeps being unavailable due to BNL bucket unavailable -# @pytest.mark.xfail(reason='JASMIN messing about with SOF.') -@pytest.mark.parametrize("storage_options, active_storage_url", 
storage_options_paramlist) -def test_compression_and_filters_cmip6_forced_s3_from_local(storage_options, active_storage_url): - """ - Test use of datasets with compression and filters applied for a real - CMIP6 dataset (CMIP6-test.nc) - an IPSL file. - - This is for a special anon=True bucket ONLY. - """ - test_file = str(Path(__file__).resolve().parent / 'test_data' / 'CMIP6-test.nc') - with Dataset(test_file) as nc_data: - nc_min = np.min(nc_data["tas"][0:2,4:6,7:9]) - print(f"Numpy min from compressed file {nc_min}") - - # TODO remember that the special case for "anon=True" buckets is that - # the actual file uri = "bucket/filename" - ofile = os.path.basename(test_file) - test_file_uri = os.path.join(S3_BUCKET, ofile) - active = Active(test_file_uri, 'tas', storage_type="s3", - storage_options=storage_options, - active_storage_url=active_storage_url) - - active._version = 1 - active._method = "min" - - result = active[0:2,4:6,7:9] - assert result == 239.25946044921875 - - -# CMIP6_test.nc keeps being unavailable due to BNL bucket unavailable -# @pytest.mark.xfail(reason='JASMIN messing about with SOF.') -def test_compression_and_filters_cmip6_forced_s3_from_local_2(): - """ - Test use of datasets with compression and filters applied for a real - CMIP6 dataset (CMIP6-test.nc) - an IPSL file. - - This is for a special anon=True bucket connected to via valid key.secret - """ - storage_options = { - 'key': "f2d55c6dcfc7618b2c34e00b58df3cef", - 'secret': "$/'#M{0{/4rVhp%n^(XeX$q@y#&(NM3W1->~N.Q6VP.5[@bLpi='nt]AfH)>78pT", - 'client_kwargs': {'endpoint_url': "https://uor-aces-o.s3-ext.jc.rl.ac.uk"} - } - active_storage_url = "https://192.171.169.248:8080" - test_file = str(Path(__file__).resolve().parent / 'test_data' / 'CMIP6-test.nc') - with Dataset(test_file) as nc_data: - nc_min = np.min(nc_data["tas"][0:2,4:6,7:9]) - print(f"Numpy min from compressed file {nc_min}") - - ofile = os.path.basename(test_file) - test_file_uri = os.path.join( - S3_BUCKET, - ofile - ) - print("S3 Test file path:", test_file_uri) - active = Active(test_file_uri, 'tas', storage_type="s3", - storage_options=storage_options, - active_storage_url=active_storage_url) - - active._version = 1 - active._method = "min" - - result = active[0:2,4:6,7:9] - assert nc_min == result - assert result == 239.25946044921875 - - -# CMIP6_test.nc keeps being unavailable due to BNL bucket unavailable -# @pytest.mark.xfail(reason='JASMIN messing about with SOF.') -@pytest.mark.skipif(not USE_S3, reason="we need only localhost Reductionist in GA CI") -@pytest.mark.skipif(REMOTE_RED, reason="we need only localhost Reductionist in GA CI") -def test_compression_and_filters_cmip6_forced_s3_using_local_Reductionist(): - """ - Test use of datasets with compression and filters applied for a real - CMIP6 dataset (CMIP6-test.nc) - an IPSL file. - - This is for a special anon=True bucket connected to via valid key.secret - and uses the locally deployed Reductionist via container. 
- """ - print("Reductionist URL", S3_ACTIVE_STORAGE_URL) - storage_options = { - 'key': "f2d55c6dcfc7618b2c34e00b58df3cef", - 'secret': "$/'#M{0{/4rVhp%n^(XeX$q@y#&(NM3W1->~N.Q6VP.5[@bLpi='nt]AfH)>78pT", - 'client_kwargs': {'endpoint_url': "https://uor-aces-o.s3-ext.jc.rl.ac.uk"} - } - - test_file = str(Path(__file__).resolve().parent / 'test_data' / 'CMIP6-test.nc') - with Dataset(test_file) as nc_data: - nc_min = np.min(nc_data["tas"][0:2,4:6,7:9]) - print(f"Numpy min from compressed file {nc_min}") - - ofile = os.path.basename(test_file) - test_file_uri = os.path.join( - S3_BUCKET, - ofile - ) - print("S3 Test file path:", test_file_uri) - active = Active(test_file_uri, 'tas', storage_type="s3", - storage_options=storage_options, - active_storage_url=S3_ACTIVE_STORAGE_URL) - - active._version = 1 - active._method = "min" - - result = active[0:2,4:6,7:9] - assert nc_min == result - assert result == 239.25946044921875 From 1d13c6aaa74c1a26deb0ef38ae69a3167eb6592d Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Wed, 22 Jan 2025 13:13:02 +0000 Subject: [PATCH 81/85] remove workflow that runs bnl bucket test --- .../workflows/test_s3_remote_reductionist.yml | 75 ------------------- 1 file changed, 75 deletions(-) delete mode 100644 .github/workflows/test_s3_remote_reductionist.yml diff --git a/.github/workflows/test_s3_remote_reductionist.yml b/.github/workflows/test_s3_remote_reductionist.yml deleted file mode 100644 index bf1f4608..00000000 --- a/.github/workflows/test_s3_remote_reductionist.yml +++ /dev/null @@ -1,75 +0,0 @@ -# adapted GA workflow from https://github.com/stackhpc/reductionist-rs -# This runs Active with a remote Reductionist and S3 data stored elsewhere ---- -name: S3/Remote Reductionist - -on: - push: - branches: - - main # keep this at all times - - pyfive - pull_request: - schedule: - - cron: '0 0 * * *' # nightly - -# Required shell entrypoint to have properly configured bash shell -defaults: - run: - shell: bash -l {0} - -jobs: - linux-test: - runs-on: "ubuntu-latest" - strategy: - matrix: - python-version: ["3.10", "3.11", "3.12", "3.13"] - fail-fast: false - name: Linux Python ${{ matrix.python-version }} - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - uses: conda-incubator/setup-miniconda@v3 - with: - python-version: ${{ matrix.python-version }} - miniforge-version: "latest" - use-mamba: true - - name: Get conda and Python versions - run: | - conda --version - python -V - - name: Export proxy - run: | - echo 'USE_S3 = True' >> activestorage/config.py - echo 'REMOTE_RED = True' >> activestorage/config.py - - name: Ping remote Reductionist - run: curl -k https://192.171.169.248:8080/.well-known/reductionist-schema - - uses: conda-incubator/setup-miniconda@v3 - with: - activate-environment: activestorage-minio - environment-file: environment.yml - python-version: ${{ matrix.python-version }} - miniforge-version: "latest" - use-mamba: true - - name: Install development version of NCAS-CMS/Pyfive:h5netcdf - run: | - cd .. - git clone https://github.com/NCAS-CMS/pyfive.git - cd pyfive - git checkout h5netcdf - pip install -e . - - name: Install PyActiveStorage - run: | - conda --version - python -V - which python - pip install -e . 
- - name: Run one single test - run: | - pytest tests/test_compression_remote_reductionist.py - - name: Upload HTML report artifact - uses: actions/upload-artifact@v3 - with: - name: html-report - path: test-reports/ - if: always() From 29fd64880f97c7dc42966290047524241ca463ed Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Wed, 22 Jan 2025 13:15:48 +0000 Subject: [PATCH 82/85] rplace bnl with relevant info --- tests/test_missing.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_missing.py b/tests/test_missing.py index 934d9039..66aa4830 100644 --- a/tests/test_missing.py +++ b/tests/test_missing.py @@ -212,11 +212,11 @@ def test_validmax(tmp_path): else: x = pyfive.File(testfile) - # print for Bryan - print('bnl',y['data'].getncattr('valid_max')) - print('bnl',x['data'].attrs.get('valid_max')) - print('bnl',z['data'].attrs.get('valid_max')) - print('bnl',a['data'].attrs.get('valid_max')) + # print stuff + print('y-valid-max', y['data'].getncattr('valid_max')) + print('x-valid-max', x['data'].attrs.get('valid_max')) + print('z-valid-max', z['data'].attrs.get('valid_max')) + print('a-valid-max', a['data'].attrs.get('valid_max')) From 4792342cd6005e8659daea3a0612e6d86493eabe Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Wed, 22 Jan 2025 13:19:57 +0000 Subject: [PATCH 83/85] remove commented out Bryan server --- tests/test_compression.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/test_compression.py b/tests/test_compression.py index 0db5ad55..20d41704 100644 --- a/tests/test_compression.py +++ b/tests/test_compression.py @@ -46,18 +46,14 @@ def create_compressed_dataset(tmp_path: str, compression: str, shuffle: bool): 'client_kwargs': {'endpoint_url': S3_URL}, } S3_ACTIVE_URL_MINIO = S3_ACTIVE_STORAGE_URL -S3_ACTIVE_URL_Bryan = "https://192.171.169.248:8080" # TODO include all supported configuration types # so far test three possible configurations for storage_options: # - storage_options = None, active_storage_url = None (Minio and local Reductionist, preset credentials from config.py) # - storage_options = CLASSIC, active_storage_url = CLASSIC (Minio and local Reductionist, preset credentials from config.py but folded in storage_options and active_storage_url) -# - storage_options = CLASSIC, active_storage_url = Bryan's machine (Minio BUT Reductionist moved on Bryan's machine) -# (this invariably fails due to data URL being //localhost:9000 closed to outside Reductionist storage_options_paramlist = [ (None, None), (STORAGE_OPTIONS_CLASSIC, S3_ACTIVE_URL_MINIO), -# (STORAGE_OPTIONS_CLASSIC, S3_ACTIVE_URL_Bryan) ] From 43ea85355f07812e2a24bf4def2389d0340ce6ae Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Wed, 22 Jan 2025 13:20:58 +0000 Subject: [PATCH 84/85] rm ref Bryan --- tests/s3_exploratory/test_s3_reduction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/s3_exploratory/test_s3_reduction.py b/tests/s3_exploratory/test_s3_reduction.py index 3546a528..f2e11071 100644 --- a/tests/s3_exploratory/test_s3_reduction.py +++ b/tests/s3_exploratory/test_s3_reduction.py @@ -19,13 +19,13 @@ def make_tempfile(): """Make dummy data.""" temp_folder = tempfile.mkdtemp() s3_testfile = os.path.join(temp_folder, - 's3_test_bizarre.nc') # Bryan likes this name + 's3_test_bizarre.nc') print(f"S3 Test file is {s3_testfile}") if not os.path.exists(s3_testfile): make_vanilla_ncdata(filename=s3_testfile) local_testfile = os.path.join(temp_folder, - 'local_test_bizarre.nc') # Bryan again + 
'local_test_bizarre.nc') print(f"Local Test file is {local_testfile}") if not os.path.exists(local_testfile): make_vanilla_ncdata(filename=local_testfile) From e73712aacea5ad556fe625bbf37bc02af28d18ba Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Tue, 11 Feb 2025 13:52:05 +0000 Subject: [PATCH 85/85] use wacasoft branch of Pyfive fork --- .github/workflows/run-test-push.yml | 4 ++-- .github/workflows/run-tests.yml | 8 ++++---- .github/workflows/test_s3_minio.yml | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/run-test-push.yml b/.github/workflows/run-test-push.yml index c92940c8..33471813 100644 --- a/.github/workflows/run-test-push.yml +++ b/.github/workflows/run-test-push.yml @@ -29,12 +29,12 @@ jobs: use-mamba: true - run: conda --version - run: python -V - - name: Install development version of NCAS-CMS/Pyfive:h5netcdf + - name: Install development version of NCAS-CMS/Pyfive:wacasoft run: | cd .. git clone https://github.com/NCAS-CMS/pyfive.git cd pyfive - git checkout h5netcdf + git checkout wacasoft pip install -e . - run: pip install -e . - run: conda list diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index f0719b1b..8a278038 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -34,12 +34,12 @@ jobs: use-mamba: true - run: conda --version - run: python -V - - name: Install development version of NCAS-CMS/Pyfive:h5netcdf + - name: Install development version of NCAS-CMS/Pyfive:wacasoft run: | cd .. git clone https://github.com/NCAS-CMS/pyfive.git cd pyfive - git checkout h5netcdf + git checkout wacasoft pip install -e . - run: conda list - run: pip install -e . @@ -66,12 +66,12 @@ jobs: use-mamba: true - run: conda --version - run: python -V - - name: Install development version of NCAS-CMS/Pyfive:h5netcdf + - name: Install development version of NCAS-CMS/Pyfive:wacasoft run: | cd .. git clone https://github.com/NCAS-CMS/pyfive.git cd pyfive - git checkout h5netcdf + git checkout wacasoft pip install -e . - run: conda list - run: mamba install -c conda-forge git diff --git a/.github/workflows/test_s3_minio.yml b/.github/workflows/test_s3_minio.yml index 823f1daf..cceb63d9 100644 --- a/.github/workflows/test_s3_minio.yml +++ b/.github/workflows/test_s3_minio.yml @@ -56,12 +56,12 @@ jobs: python-version: ${{ matrix.python-version }} miniforge-version: "latest" use-mamba: true - - name: Install development version of NCAS-CMS/Pyfive:h5netcdf + - name: Install development version of NCAS-CMS/Pyfive:wacasoft run: | cd .. git clone https://github.com/NCAS-CMS/pyfive.git cd pyfive - git checkout h5netcdf + git checkout wacasoft pip install -e . - name: Install PyActiveStorage run: |
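Taken together, after this series PyActiveStorage reads HDF5/netCDF chunk layout through the NCAS-CMS pyfive fork (wacasoft branch at this point) and records per-call metrics. A condensed usage sketch in the spirit of bnl/bnl_test.py — the file name and variable below are placeholders rather than files guaranteed to ship with the repository:

import numpy as np
from activestorage.active import Active

uri, var = "CMIP6-test.nc", "tas"      # placeholder local file and variable

results = []
for version in (0, 1, 2):
    active = Active(uri, var, None)    # storage_type None -> local POSIX file
    active._version = version
    if version > 0:
        active.method = "mean"         # ask the storage layer for the reduction
    data = active[4:5, 1:2]
    print(active.metric_data)          # per-call metrics gathered by Active
    if version == 0:
        data = np.mean(data)           # plain read: reduce client-side
    results.append(data)

# all three code paths should agree to within floating-point tolerance
np.testing.assert_allclose(results[0], results[1], rtol=1e-6)
np.testing.assert_allclose(results[1], results[2], rtol=1e-6)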