Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

H5 context management #24

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 85 additions & 47 deletions src/fibsem_tools/io/h5.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,78 +7,116 @@

H5_ACCESS_MODES = ("r", "r+", "w", "w-", "x", "a")

H5_DATASET_KWDS = ("name",
"shape",
"dtype",
"data",
"chunks",
"compression",
"compression_opts",
"scaleoffset",
"shuffle",
"fletcher32",
"maxshape",
"fillvalue",
"track_times",
"track_order",
"external",
"allow_unknown_filter")

H5_GROUP_KWDS = ("name",
"track_order")

H5_FILE_KWDS = ("name",
"mode",
"driver",
"libver",
"userblock_size",
"swmr",
"rdcc_nslots",
"rdcc_nbytes",
"rdcc_w0",
"track_order",
"fs_strategy",
"fs_persist",
"fs_threshold")
H5_DATASET_KWDS = (
"name",
"shape",
"dtype",
"data",
"chunks",
"compression",
"compression_opts",
"scaleoffset",
"shuffle",
"fletcher32",
"maxshape",
"fillvalue",
"track_times",
"track_order",
"dcpl",
"external",
"allow_unknown_filter",
)

H5_GROUP_KWDS = ("name", "track_order")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since track_order is a keyword common to Datasets, Groups, and Files, perhaps this parameter should be passed to all objects when they are created.


H5_FILE_KWDS = (
"name",
"mode",
"driver",
"libver",
"userblock_size",
"swmr",
"rdcc_nslots",
"rdcc_nbytes",
"rdcc_w0",
"track_order",
"fs_strategy",
"fs_persist",
"fs_threshold",
)


# Could use multiple inheritance here
class ManagedDataset(h5py.Dataset):
    """
    An h5py.Dataset that can be used in a ``with`` statement.

    Leaving the ``with`` block closes the file that contains the dataset.
    """

    def __enter__(self):
        """Return this dataset as the value bound by ``with ... as``."""
        return self

    def __exit__(self, ex_type, ex_value, ex_traceback):
        """
        Close the file that owns this dataset.

        Nothing is returned, so an exception raised inside the ``with``
        block is not suppressed.
        """
        self.file.close()


class ManagedGroup(h5py.Group):
    """
    An h5py.Group that can be used in a ``with`` statement.

    Leaving the ``with`` block closes the file that contains the group.
    """

    def __enter__(self):
        """Return this group as the value bound by ``with ... as``."""
        return self

    def __exit__(self, ex_type, ex_value, ex_traceback):
        """
        Close the file that owns this group.

        Nothing is returned, so an exception raised inside the ``with``
        block is not suppressed.
        """
        self.file.close()


def partition_h5_kwargs(**kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
"""
partition kwargs into file-creation kwargs and dataset-creation kwargs
"""
file_kwargs = kwargs.copy()
dataset_kwargs = {}
for key in H5_DATASET_KWDS:
if key in file_kwargs:
for key in kwargs:
if key in H5_DATASET_KWDS:
dataset_kwargs[key] = file_kwargs.pop(key)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider keeping track_order as a File keyword. Also return Group keywords that may contain track_order.

return file_kwargs, dataset_kwargs


def access_h5(
store: Pathlike, path: Pathlike, mode: str, **kwargs
store: Union[h5py.File, Pathlike], path: Pathlike, **kwargs
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
store: Union[h5py.File, Pathlike], path: Pathlike, **kwargs
store: Union[h5py.File, Pathlike], path: Union[Pathlike, NoneType], **kwargs

None is a valid dataset name and is used to create anonymous datasets.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

interesting, how is that different from naming a dataset with the empty string?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For one, providing an empty string results in a TypeError in h5py:

In [22]: with h5py.File("test.hdf5", "w") as h5f:
    ...:     h5f.create_dataset(None, data = np.zeros((5,5)))
    ...:     print(list(h5f.keys()))
    ...:
    ...:
[]

In [23]: with h5py.File("test.hdf5", "w") as h5f:
    ...:     h5f.create_dataset("", data = np.zeros((5,5)))
    ...:     print(list(h5f.keys()))
    ...:
    ...:
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-23-5230edf15b52> in <module>
      1 with h5py.File("test.hdf5", "w") as h5f:
----> 2     h5f.create_dataset("", data = np.zeros((5,5)))
      3     print(list(h5f.keys()))
      4
      5

~\.julia\conda\3\lib\site-packages\h5py\_hl\group.py in create_dataset(self, name, shape, dtype, data, **kwds)
    147                     group = self.require_group(parent_path)
    148
--> 149             dsid = dataset.make_new_dset(group, shape, dtype, data, name, **kwds)
    150             dset = dataset.Dataset(dsid)
    151             return dset

~\.julia\conda\3\lib\site-packages\h5py\_hl\dataset.py in make_new_dset(parent, shape, dtype, data, name, chunks, compression, shuffle, fletcher32, maxshape, compression_opts, fillvalue, scaleoffset, track_times, external, track_order, dcpl, allow_unknown_filter)
    140
    141
--> 142     dset_id = h5d.create(parent.id, name, tid, sid, dcpl=dcpl)
    143
    144     if (data is not None) and (not isinstance(data, Empty)):

h5py\_objects.pyx in h5py._objects.with_phil.wrapper()

h5py\_objects.pyx in h5py._objects.with_phil.wrapper()

h5py\h5d.pyx in h5py.h5d.create()

TypeError: expected bytes, str found

In [24]: with h5py.File("test.hdf5", "w") as h5f:
    ...:     h5f.create_dataset(b"", data = np.zeros((5,5)))
    ...:     print(list(h5f.keys()))
    ...:
    ...:
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-24-7f1869f34ebe> in <module>
      1 with h5py.File("test.hdf5", "w") as h5f:
----> 2     h5f.create_dataset(b"", data = np.zeros((5,5)))
      3     print(list(h5f.keys()))
      4
      5

~\.julia\conda\3\lib\site-packages\h5py\_hl\group.py in create_dataset(self, name, shape, dtype, data, **kwds)
    147                     group = self.require_group(parent_path)
    148
--> 149             dsid = dataset.make_new_dset(group, shape, dtype, data, name, **kwds)
    150             dset = dataset.Dataset(dsid)
    151             return dset

~\.julia\conda\3\lib\site-packages\h5py\_hl\dataset.py in make_new_dset(parent, shape, dtype, data, name, chunks, compression, shuffle, fletcher32, maxshape, compression_opts, fillvalue, scaleoffset, track_times, external, track_order, dcpl, allow_unknown_filter)
    140
    141
--> 142     dset_id = h5d.create(parent.id, name, tid, sid, dcpl=dcpl)
    143
    144     if (data is not None) and (not isinstance(data, Empty)):

h5py\_objects.pyx in h5py._objects.with_phil.wrapper()

h5py\_objects.pyx in h5py._objects.with_phil.wrapper()

h5py\h5d.pyx in h5py.h5d.create()

ValueError: Unable to create dataset (no name given)

) -> Union[h5py.Dataset, h5py.Group]:
"""
Docstring
"""
if mode not in H5_ACCESS_MODES:
raise ValueError(f"Invalid access mode. Got {mode}, expected one of {H5_ACCESS_MODES}.")

attrs = kwargs.pop("attrs", {})
mode = kwargs.get("mode", "r")
file_kwargs, dataset_kwargs = partition_h5_kwargs(**kwargs)

h5f = h5py.File(store, mode=mode, **file_kwargs)

if mode in ("r", "r+", "a") and (result := h5f.get(path)) is not None:
return result
if isinstance(store, h5py.File):
h5f = store
else:
h5f = h5py.File(store, **file_kwargs)

if mode in ("r", "r+", "a"):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps if path is empty, just return the File, h5f.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we are in "r+" or "a" mode, perhaps the path does not exist because we are trying to create it. In those cases, perhaps you should catch the error.

# let h5py handle keyerrors
result = h5f[path]
else:
if len(dataset_kwargs) > 0:
if 'name' in dataset_kwargs:
warnings.warn('"Name" was provided to this function as a keyword argument. This value will be replaced with the second argument to this function.')
if "name" in dataset_kwargs:
warnings.warn(
'"Name" was provided to this function as a keyword argument. This value will be replaced with the second argument to this function.'
)
dataset_kwargs["name"] = path
result = h5f.create_dataset(**dataset_kwargs)
else:
result = h5f.require_group(path)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Allow track_order to be passed to the group as well.


result.attrs.update(**attrs)

return result
if isinstance(result, h5py.Group):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A File is also a Group. In the case of a File, you do not need a new context manager.

result = ManagedGroup(result.id)
else:
result = ManagedDataset(result.id)

return result
20 changes: 20 additions & 0 deletions tests/test_h5.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from h5py._hl.dataset import make_new_dset
from fibsem_tools.io.h5 import partition_h5_kwargs
from inspect import signature, Parameter


def test_kwarg_partition():
    """
    Check that partition_h5_kwargs separates dataset kwargs from the rest.

    Every parameter of make_new_dset that has a default value must end up in
    the dataset kwargs, and keys that are not dataset keywords ("foo", "bar")
    must stay in the file kwargs.
    """
    creation_params = signature(make_new_dset).parameters
    # Only parameters with a default are keyword options; the values are
    # irrelevant to the partitioning, so None is used throughout.
    expected_dataset = {
        name: None
        for name, param in creation_params.items()
        if param.default is not Parameter.empty
    }
    expected_file = {"foo": None, "bar": None}

    actual_file, actual_dataset = partition_h5_kwargs(
        **expected_dataset, **expected_file
    )

    assert expected_file == actual_file
    assert expected_dataset == actual_dataset
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Test for common keyword arguments like track_order.

35 changes: 21 additions & 14 deletions tests/test_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,29 +110,36 @@ def test_access_array_h5():
data = np.random.randint(0, 255, size=(10, 10, 10), dtype="uint8")
attrs = {"resolution": "1000"}
with tempfile.TemporaryFile(suffix=".h5") as store:
arr = access_h5(store, key, data=data, attrs=attrs, mode="w")
assert dict(arr.attrs) == attrs
assert np.array_equal(arr[:], data)
arr.file.close()
with access_h5(store, key, data=data, attrs=attrs, mode="w") as arr1:
assert dict(arr1.attrs) == attrs
assert np.array_equal(arr1[:], data)

arr2 = access_h5(store, key, mode="r")
assert dict(arr2.attrs) == attrs
assert np.array_equal(arr2[:], data)
arr2.file.close()
with access_h5(store, key, mode="r") as arr2:
assert dict(arr2.attrs) == attrs
assert np.array_equal(arr2[:], data)

with access_h5(store, key, mode="r") as arr3:
h5d = arr3.file[key]
assert h5d.shape == arr3.shape
assert h5d.attrs == arr3.attrs
assert h5d.chunks == arr3.chunks
assert h5d.compression == arr3.compression


def test_access_group_h5():
key = "s0"
attrs = {"resolution": "1000"}

with tempfile.TemporaryFile(suffix=".h5") as store:
grp = access_h5(store, key, attrs=attrs, mode="w")
assert dict(grp.attrs) == attrs
grp.file.close()
with access_h5(store, key, attrs=attrs, mode="w") as grp1:
assert dict(grp1.attrs) == attrs

with access_h5(store, key, mode="r") as grp2:
assert dict(grp2.attrs) == attrs

grp2 = access_h5(store, key, mode="r")
assert dict(grp2.attrs) == attrs
grp2.file.close()
with access_h5(store, key, mode="r") as grp3:
h5g = grp3.file[key]
assert h5g.attrs == grp3.attrs


def test_list_files():
Expand Down