From 80fedcde9a6768761ee2f36bb2ae63b6310d4492 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Thu, 10 Oct 2024 13:39:25 -0400 Subject: [PATCH] reorganize --- kerchunk/hdf.py | 51 ++------------------------------------ kerchunk/tests/test_hdf.py | 14 ++++++++--- kerchunk/utils.py | 44 ++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 53 deletions(-) diff --git a/kerchunk/hdf.py b/kerchunk/hdf.py index 6bb1692..6b7b443 100644 --- a/kerchunk/hdf.py +++ b/kerchunk/hdf.py @@ -1,7 +1,7 @@ import base64 import io import logging -from typing import Union, BinaryIO, Any, cast +from typing import Union, BinaryIO from packaging.version import Version import fsspec.core @@ -11,7 +11,7 @@ import numcodecs from .codecs import FillStringsCodec -from .utils import _encode_for_JSON +from .utils import _encode_for_JSON, encode_fill_value try: import h5py @@ -22,12 +22,6 @@ "for more details." ) -# try: -# from zarr.meta import encode_fill_value -# except ModuleNotFoundError: -# # https://github.com/zarr-developers/zarr-python/issues/2021 -# from zarr.v2.meta import encode_fill_value - lggr = logging.getLogger("h5-to-zarr") _HIDDEN_ATTRS = { # from h5netcdf.attrs "REFERENCE_LIST", @@ -504,7 +498,6 @@ def _translator( lggr.debug(f"Created Zarr array: {za}") self._transfer_attrs(h5obj, za) - # za.attrs["_ARRAY_DIMENSIONS"] = adims lggr.debug(f"_ARRAY_DIMENSIONS = {adims}") if "data" in kwargs: @@ -705,43 +698,3 @@ def _is_netcdf_variable(dataset: h5py.Dataset): def has_visititems_links(): return hasattr(h5py.Group, "visititems_links") - -def encode_fill_value(v: Any, dtype: np.dtype, object_codec: Any = None) -> Any: - # early out - if v is None: - return v - if dtype.kind == "V" and dtype.hasobject: - if object_codec is None: - raise ValueError("missing object_codec for object array") - v = object_codec.encode(v) - v = str(base64.standard_b64encode(v), "ascii") - return v - if dtype.kind == "f": - if np.isnan(v): - return "NaN" - elif np.isposinf(v): - return "Infinity" - elif np.isneginf(v): - return "-Infinity" - else: - return float(v) - elif dtype.kind in "ui": - return int(v) - elif dtype.kind == "b": - return bool(v) - elif dtype.kind in "c": - c = cast(np.complex128, np.dtype(complex).type()) - v = ( - encode_fill_value(v.real, c.real.dtype, object_codec), - encode_fill_value(v.imag, c.imag.dtype, object_codec), - ) - return v - elif dtype.kind in "SV": - v = str(base64.standard_b64encode(v), "ascii") - return v - elif dtype.kind == "U": - return v - elif dtype.kind in "mM": - return int(v.view("i8")) - else: - return v diff --git a/kerchunk/tests/test_hdf.py b/kerchunk/tests/test_hdf.py index ace4547..665cd39 100644 --- a/kerchunk/tests/test_hdf.py +++ b/kerchunk/tests/test_hdf.py @@ -1,5 +1,6 @@ -import asyncio +from typing import Any import fsspec +import json import os.path as osp import fsspec.implementations @@ -23,25 +24,29 @@ async def list_dir(store, path): [x async for x in store.list_dir(path)] -def create_store(test_dict: dict): +def create_store(test_dict: dict, remote_options: Any = None): if Version(zarr.__version__) < Version("3.0.0.a0"): return fsspec.get_mapper( "reference://", fo=test_dict, remote_protocol="s3", remote_options=so ) else: - fs = fsspec.implementations.reference.ReferenceFileSystem(fo=test_dict) + fs = fsspec.implementations.reference.ReferenceFileSystem(fo=test_dict, remote_options=remote_options) return zarr.storage.RemoteStore(fs, mode="r") def test_single(): """Test creating references for a single HDF file""" - url = "s3://noaa-nwm-retro-v2.0-pds/full_physics/2017/201704010000.CHRTOUT_DOMAIN1.comp" + #url = "s3://noaa-nwm-retro-v2.0-pds/full_physics/2017/201704010000.CHRTOUT_DOMAIN1.comp" + url = "s3://noaa-nos-ofs-pds/ngofs2/netcdf/202410/ngofs2.t03z.20241001.2ds.f020.nc" so = dict(anon=True, default_fill_cache=False, default_cache_type="none") with fsspec.open(url, **so) as f: h5chunks = SingleHdf5ToZarr(f, url, storage_options=so) test_dict = h5chunks.translate() + with open("test_dict.json", "w") as f: + json.dump(test_dict, f) + store = create_store(test_dict) ds = xr.open_dataset(store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False)) @@ -97,6 +102,7 @@ def test_multizarr(generate_mzz): """Test creating a combined reference file with MultiZarrToZarr""" mzz = generate_mzz test_dict = mzz.translate() + store = create_store(test_dict) ds = xr.open_dataset(store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False)) diff --git a/kerchunk/utils.py b/kerchunk/utils.py index b52a9c0..a0f9e96 100644 --- a/kerchunk/utils.py +++ b/kerchunk/utils.py @@ -1,11 +1,13 @@ import base64 import copy import itertools +from typing import Any, cast import warnings import ujson import fsspec +import numpy as np import zarr @@ -134,6 +136,48 @@ def _encode_for_JSON(store): return store + +def encode_fill_value(v: Any, dtype: np.dtype, object_codec: Any = None) -> Any: + # early out + if v is None: + return v + if dtype.kind == "V" and dtype.hasobject: + if object_codec is None: + raise ValueError("missing object_codec for object array") + v = object_codec.encode(v) + v = str(base64.standard_b64encode(v), "ascii") + return v + if dtype.kind == "f": + if np.isnan(v): + return "NaN" + elif np.isposinf(v): + return "Infinity" + elif np.isneginf(v): + return "-Infinity" + else: + return float(v) + elif dtype.kind in "ui": + return int(v) + elif dtype.kind == "b": + return bool(v) + elif dtype.kind in "c": + c = cast(np.complex128, np.dtype(complex).type()) + v = ( + encode_fill_value(v.real, c.real.dtype, object_codec), + encode_fill_value(v.imag, c.imag.dtype, object_codec), + ) + return v + elif dtype.kind in "SV": + v = str(base64.standard_b64encode(v), "ascii") + return v + elif dtype.kind == "U": + return v + elif dtype.kind in "mM": + return int(v.view("i8")) + else: + return v + + def do_inline(store, threshold, remote_options=None, remote_protocol=None): """Replace short chunks with the value of that chunk and inline metadata