diff --git a/kerchunk/hdf.py b/kerchunk/hdf.py index 5b479d43..ed1bc4a5 100644 --- a/kerchunk/hdf.py +++ b/kerchunk/hdf.py @@ -7,6 +7,7 @@ import zarr from zarr.meta import encode_fill_value import numcodecs + from .codecs import FillStringsCodec from .utils import _encode_for_JSON @@ -358,8 +359,26 @@ def _translator(self, name: str, h5obj: Union[h5py.Dataset, h5py.Group]): ) for v in h5obj.dtype.names ] - else: + elif self.vlen == "embed": # embed fails due to https://github.com/zarr-developers/numcodecs/issues/333 + data = h5obj[:].tolist() + data2 = [] + for d in data: + data2.append( + [ + ( + _.decode(errors="ignore") + if isinstance(_, bytes) + else _ + ) + for _ in d + ] + ) + dt = "O" + kwargs["data"] = data2 + kwargs["object_codec"] = numcodecs.JSON() + fill = None + else: raise NotImplementedError # Add filter for shuffle if h5obj.shuffle and h5obj.dtype.kind != "O": diff --git a/kerchunk/tests/NEONDSTowerTemperatureData.hdf5 b/kerchunk/tests/NEONDSTowerTemperatureData.hdf5 new file mode 100644 index 00000000..de6b34c2 Binary files /dev/null and b/kerchunk/tests/NEONDSTowerTemperatureData.hdf5 differ diff --git a/kerchunk/tests/test_hdf.py b/kerchunk/tests/test_hdf.py index 50f1dba7..a9557ffe 100644 --- a/kerchunk/tests/test_hdf.py +++ b/kerchunk/tests/test_hdf.py @@ -286,3 +286,23 @@ def test_compact(): m = fsspec.get_mapper("reference://", fo=out) g = zarr.open(m) assert np.allclose(g.ancillary_data.atlas_sdp_gps_epoch[:], 1.19880002e09) + + +def test_embed(): + fn = osp.join(here, "NEONDSTowerTemperatureData.hdf5") + h = kerchunk.hdf.SingleHdf5ToZarr(fn, vlen_encode="embed") + out = h.translate() + + fs = fsspec.filesystem("reference", fo=out) + z = zarr.open(fs.get_mapper()) + data = z["Domain_10"]["STER"]["min_1"]["boom_1"]["temperature"][:] + assert data[0].tolist() == [ + "2014-04-01 00:00:00.0", + "60", + "6.72064364129017", + "6.667845743708792", + "6.774491093631761", + "0.0012746926446369846", + "0.004609216572327277", + "0.01298182345556785", + ]