Skip to content

Commit

Permalink
reorganize
Browse files Browse the repository at this point in the history
  • Loading branch information
mpiannucci committed Oct 10, 2024
1 parent 5c8806b commit 80fedcd
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 53 deletions.
51 changes: 2 additions & 49 deletions kerchunk/hdf.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import base64
import io
import logging
from typing import Union, BinaryIO, Any, cast
from typing import Union, BinaryIO
from packaging.version import Version

import fsspec.core
Expand All @@ -11,7 +11,7 @@
import numcodecs

from .codecs import FillStringsCodec
from .utils import _encode_for_JSON
from .utils import _encode_for_JSON, encode_fill_value

try:
import h5py
Expand All @@ -22,12 +22,6 @@
"for more details."
)

# try:
# from zarr.meta import encode_fill_value
# except ModuleNotFoundError:
# # https://github.com/zarr-developers/zarr-python/issues/2021
# from zarr.v2.meta import encode_fill_value

lggr = logging.getLogger("h5-to-zarr")
_HIDDEN_ATTRS = { # from h5netcdf.attrs
"REFERENCE_LIST",
Expand Down Expand Up @@ -504,7 +498,6 @@ def _translator(
lggr.debug(f"Created Zarr array: {za}")
self._transfer_attrs(h5obj, za)

# za.attrs["_ARRAY_DIMENSIONS"] = adims
lggr.debug(f"_ARRAY_DIMENSIONS = {adims}")

if "data" in kwargs:
Expand Down Expand Up @@ -705,43 +698,3 @@ def _is_netcdf_variable(dataset: h5py.Dataset):
def has_visititems_links():
    """Return True when the installed h5py exposes ``Group.visititems_links``.

    Used as a feature probe: older h5py releases lack this method, so callers
    can fall back to a link-unaware traversal.
    """
    return getattr(h5py.Group, "visititems_links", None) is not None


def encode_fill_value(v: Any, dtype: np.dtype, object_codec: Any = None) -> Any:
    """Encode fill value ``v`` as a JSON-serializable object, keyed on ``dtype.kind``.

    Copied from zarr-python's ``encode_fill_value`` (see the commented-out
    import near the top of this module) so kerchunk does not depend on zarr
    internals. Non-finite floats map to the strings "NaN"/"Infinity"/
    "-Infinity", raw bytes are base64-encoded, complex values become a
    (real, imag) pair, and datetime/timedelta values their integer i8 view.

    Raises ValueError for object arrays when no ``object_codec`` is given.
    """
    # early out
    if v is None:
        return v
    if dtype.kind == "V" and dtype.hasobject:
        # Object fill values must be serialized to bytes by a codec first.
        if object_codec is None:
            raise ValueError("missing object_codec for object array")
        v = object_codec.encode(v)
        v = str(base64.standard_b64encode(v), "ascii")
        return v
    if dtype.kind == "f":
        # JSON has no NaN/Inf literals, so encode them as sentinel strings.
        if np.isnan(v):
            return "NaN"
        elif np.isposinf(v):
            return "Infinity"
        elif np.isneginf(v):
            return "-Infinity"
        else:
            return float(v)
    elif dtype.kind in "ui":
        return int(v)
    elif dtype.kind == "b":
        return bool(v)
    elif dtype.kind in "c":
        # Recurse on the real and imaginary parts as plain floats; the cast
        # only exists to get the component float dtype from complex128.
        c = cast(np.complex128, np.dtype(complex).type())
        v = (
            encode_fill_value(v.real, c.real.dtype, object_codec),
            encode_fill_value(v.imag, c.imag.dtype, object_codec),
        )
        return v
    elif dtype.kind in "SV":
        # Raw bytes are not JSON-safe; base64-encode them.
        v = str(base64.standard_b64encode(v), "ascii")
        return v
    elif dtype.kind == "U":
        # Unicode strings are JSON-native already.
        return v
    elif dtype.kind in "mM":
        # Datetimes/timedeltas round-trip via their 64-bit integer view.
        return int(v.view("i8"))
    else:
        return v
14 changes: 10 additions & 4 deletions kerchunk/tests/test_hdf.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import asyncio
from typing import Any
import fsspec
import json
import os.path as osp

import fsspec.implementations
Expand All @@ -23,25 +24,29 @@ async def list_dir(store, path):
[x async for x in store.list_dir(path)]


def create_store(test_dict: dict, remote_options: Any = None):
    """Build a readable store over a kerchunk reference dict for either zarr major version.

    Parameters
    ----------
    test_dict : dict
        A kerchunk reference set (e.g. the output of ``SingleHdf5ToZarr.translate``).
    remote_options : Any, optional
        Storage options forwarded to ``ReferenceFileSystem`` (zarr >= 3 branch).

    Returns
    -------
    An fsspec mapper (zarr < 3) or a ``zarr.storage.RemoteStore`` (zarr >= 3).
    """
    if Version(zarr.__version__) < Version("3.0.0.a0"):
        # NOTE(review): this branch uses the module-level ``so`` options and
        # ignores ``remote_options`` — confirm whether it should forward them.
        return fsspec.get_mapper(
            "reference://", fo=test_dict, remote_protocol="s3", remote_options=so
        )
    fs = fsspec.implementations.reference.ReferenceFileSystem(
        fo=test_dict, remote_options=remote_options
    )
    return zarr.storage.RemoteStore(fs, mode="r")


def test_single():
"""Test creating references for a single HDF file"""
url = "s3://noaa-nwm-retro-v2.0-pds/full_physics/2017/201704010000.CHRTOUT_DOMAIN1.comp"
#url = "s3://noaa-nwm-retro-v2.0-pds/full_physics/2017/201704010000.CHRTOUT_DOMAIN1.comp"
url = "s3://noaa-nos-ofs-pds/ngofs2/netcdf/202410/ngofs2.t03z.20241001.2ds.f020.nc"
so = dict(anon=True, default_fill_cache=False, default_cache_type="none")

with fsspec.open(url, **so) as f:
h5chunks = SingleHdf5ToZarr(f, url, storage_options=so)
test_dict = h5chunks.translate()

with open("test_dict.json", "w") as f:
json.dump(test_dict, f)

store = create_store(test_dict)

ds = xr.open_dataset(store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False))
Expand Down Expand Up @@ -97,6 +102,7 @@ def test_multizarr(generate_mzz):
"""Test creating a combined reference file with MultiZarrToZarr"""
mzz = generate_mzz
test_dict = mzz.translate()

store = create_store(test_dict)
ds = xr.open_dataset(store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False))

Expand Down
44 changes: 44 additions & 0 deletions kerchunk/utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import base64
import copy
import itertools
from typing import Any, cast
import warnings

import ujson

import fsspec
import numpy as np
import zarr


Expand Down Expand Up @@ -134,6 +136,48 @@ def _encode_for_JSON(store):
return store



def encode_fill_value(v: Any, dtype: np.dtype, object_codec: Any = None) -> Any:
    """Encode fill value ``v`` into a JSON-serializable form for zarr v2 metadata.

    The encoding is selected by ``dtype.kind``: non-finite floats become the
    strings "NaN"/"Infinity"/"-Infinity", raw bytes are base64-encoded,
    complex values become a (real, imag) pair, and datetime/timedelta values
    their 64-bit integer view.

    Raises ValueError for object arrays when no ``object_codec`` is supplied.
    """
    if v is None:
        return v

    kind = dtype.kind

    if kind == "V" and dtype.hasobject:
        # Object fill values must be serialized to bytes by a codec first.
        if object_codec is None:
            raise ValueError("missing object_codec for object array")
        return str(base64.standard_b64encode(object_codec.encode(v)), "ascii")

    if kind == "f":
        # JSON has no NaN/Inf literals, so encode them as sentinel strings.
        if np.isnan(v):
            return "NaN"
        if np.isposinf(v):
            return "Infinity"
        if np.isneginf(v):
            return "-Infinity"
        return float(v)

    if kind in "ui":
        return int(v)

    if kind == "b":
        return bool(v)

    if kind == "c":
        # Recurse on the real and imaginary components; ``zero`` only exists
        # to obtain the component float dtype from complex128.
        zero = cast(np.complex128, np.dtype(complex).type())
        return (
            encode_fill_value(v.real, zero.real.dtype, object_codec),
            encode_fill_value(v.imag, zero.imag.dtype, object_codec),
        )

    if kind in "SV":
        # Raw bytes are not JSON-safe; base64-encode them.
        return str(base64.standard_b64encode(v), "ascii")

    if kind == "U":
        # Unicode strings are JSON-native already.
        return v

    if kind in "mM":
        # Datetimes/timedeltas round-trip via their 64-bit integer view.
        return int(v.view("i8"))

    return v


def do_inline(store, threshold, remote_options=None, remote_protocol=None):
"""Replace short chunks with the value of that chunk and inline metadata
Expand Down

0 comments on commit 80fedcd

Please sign in to comment.