Add option to disable verify_chunk_reuse (#337)
* Add option to disable verify_chunk_reuse

Currently, the verify_chunk_reuse check is relatively slow, see #336.
This commit adds an option to disable it via an additional argument:
`stage_version(..., verify_chunk_reuse=False)`.

* Enable chunk reuse validation during tests via env variable

* Use an environment variable to enable chunk reuse verification

* Add note about new env var in the docs

* Add test to ensure chunk reuse verification is disabled by default

* Move env var handling out of the loops

---------

Co-authored-by: pdmurray <[email protected]>
ArvidJB and peytondmurray authored Jun 10, 2024
1 parent 2445d7d commit 91e2598
Showing 5 changed files with 103 additions and 28 deletions.
6 changes: 6 additions & 0 deletions docs/design.md
@@ -175,6 +175,12 @@ version groups. The relevant modules are `versioned_hdf5.backend` and
it to the raw data for the given dataset. The data in each chunk of the
dataset is SHA256 hashed, and the hash is looked up in the hashtable dataset.
If it already exists in the raw data, that chunk in the raw data is reused.

To enable a check that the reused chunk matches the data that the user is
intending to write, set the following environment variable:
`ENABLE_CHUNK_REUSE_VALIDATION = 1`. This option is enabled during tests, but
is disabled by default for better performance.

The hashtable maps `SHA256 hash -> (start, stop)` where `(start, stop)` gives
a slice range for the chunk in the raw dataset (chunks in the `raw_data`
dataset are concatenated along the first axis only). All chunks that do not
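A minimal sketch of the hash-then-reuse flow described in the doc change above; the names `hashtable`, `raw_data`, and `store_chunk` are illustrative stand-ins, not the actual versioned_hdf5 internals:

```python
import hashlib
import numpy as np


def store_chunk(hashtable: dict, raw_data: list, chunk: np.ndarray) -> tuple:
    """Return a (start, stop) slice for `chunk` in the raw data, reusing an
    existing slice when an identical chunk has been written before."""
    digest = hashlib.sha256(chunk.tobytes() + str(chunk.shape).encode("ascii")).digest()
    if digest in hashtable:
        # A chunk with the same hash already exists in the raw data: reuse it.
        return hashtable[digest]
    # New chunk: append it along the first axis and record where it lives.
    start = sum(len(c) for c in raw_data)
    raw_data.append(chunk)
    stop = start + len(chunk)
    hashtable[digest] = (start, stop)
    return (start, stop)
```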
5 changes: 4 additions & 1 deletion pyproject.toml
@@ -95,10 +95,13 @@ ignore = [

[project.optional-dependencies]
dev = ["pre-commit>=3.6.0"]
test = ["pytest"]
test = ["pytest", "pytest-env"]
doc = ["sphinx", "sphinx-multiversion", "myst-parser"]

[tool.setuptools_scm]

[tool.isort]
profile = "black"

[tool.pytest_env]
ENABLE_CHUNK_REUSE_VALIDATION = 1
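The `[tool.pytest_env]` table above (via the new `pytest-env` test dependency) is what turns the check on for the test suite. Outside of pytest, the same flag can be set in the environment before any version is staged; a minimal sketch:

```python
import os

# Assumed usage: set the flag before staging a version that writes data,
# since the backend reads it at write time.
os.environ["ENABLE_CHUNK_REUSE_VALIDATION"] = "1"
```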
6 changes: 5 additions & 1 deletion versioned_hdf5/api.py
@@ -266,7 +266,11 @@ def __iter__(self):

@contextmanager
def stage_version(
self, version_name: str, prev_version=None, make_current=True, timestamp=None
self,
version_name: str,
prev_version=None,
make_current=True,
timestamp=None,
):
"""
Return a context manager to stage a new version
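For context, a minimal usage sketch of the `stage_version` context manager whose signature is reformatted above; the file name and data are placeholders:

```python
import h5py
import numpy as np
from versioned_hdf5 import VersionedHDF5File

with h5py.File("data.h5", mode="w") as f:
    vf = VersionedHDF5File(f)
    # Stage a new version; chunks identical to already-stored ones are reused.
    with vf.stage_version("r0") as group:
        group.create_dataset("values", data=np.arange(8), chunks=(4,), maxshape=(None,))
```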
66 changes: 40 additions & 26 deletions versioned_hdf5/backend.py
@@ -1,8 +1,9 @@
import logging
import os
from typing import Dict, Optional

import numpy as np
from h5py import VirtualLayout, VirtualSource, h5s, Dataset
from h5py import Dataset, VirtualLayout, VirtualSource, h5s
from h5py._hl.filters import guess_chunk
from ndindex import ChunkSize, Slice, Tuple, ndindex
from numpy.testing import assert_array_equal
@@ -62,7 +63,10 @@ def create_base_dataset(
shape = data.shape
else:
shape = (shape,) if isinstance(shape, int) else tuple(shape)
if data is not None and (np.prod(shape, dtype=np.ulonglong) != np.prod(data.shape, dtype=np.ulonglong)):
if data is not None and (
np.prod(shape, dtype=np.ulonglong)
!= np.prod(data.shape, dtype=np.ulonglong)
):
raise ValueError("Shape tuple is incompatible with data")

ndims = len(shape)
@@ -175,6 +179,10 @@ def write_dataset(
slices_to_write = {}
chunk_size = chunks[0]

validate_reused_chunks = os.environ.get(
"ENABLE_CHUNK_REUSE_VALIDATION", "false"
).lower() in ("1", "true")

with Hashtable(f, name) as hashtable:
old_chunks = hashtable.largest_index
chunks_reused = 0
@@ -188,14 +196,15 @@ def write_dataset(
hashed_slice = hashtable[data_hash]
slices[data_slice] = hashed_slice

_verify_new_chunk_reuse(
raw_dataset=ds,
new_data=data,
data_hash=data_hash,
hashed_slice=hashed_slice,
chunk_being_written=data_s,
slices_to_write=slices_to_write,
)
if validate_reused_chunks:
_verify_new_chunk_reuse(
raw_dataset=ds,
new_data=data,
data_hash=data_hash,
hashed_slice=hashed_slice,
chunk_being_written=data_s,
slices_to_write=slices_to_write,
)

chunks_reused += 1

@@ -217,10 +226,10 @@ def write_dataset(
new_chunks = hashtable.largest_index

logging.debug(
" %s: "
"New chunks written: %d; "
"Number of chunks reused: %d",
name, new_chunks - old_chunks, chunks_reused
" %s: " "New chunks written: %d; " "Number of chunks reused: %d",
name,
new_chunks - old_chunks,
chunks_reused,
)

return slices
@@ -346,6 +355,10 @@ def write_dataset_chunks(f, name, data_dict):
chunks = tuple(raw_data.attrs["chunks"])
chunk_size = chunks[0]

validate_reused_chunks = os.environ.get(
"ENABLE_CHUNK_REUSE_VALIDATION", "false"
).lower() in ("1", "true")

with Hashtable(f, name) as hashtable:
old_chunks = hashtable.largest_index
chunks_reused = 0
@@ -370,14 +383,15 @@ def write_dataset_chunks(f, name, data_dict):
hashed_slice = hashtable[data_hash]
slices[chunk] = hashed_slice

_verify_new_chunk_reuse(
raw_dataset=raw_data,
new_data=data_s,
data_hash=data_hash,
hashed_slice=hashed_slice,
chunk_being_written=data_s,
data_to_write=data_to_write,
)
if validate_reused_chunks:
_verify_new_chunk_reuse(
raw_dataset=raw_data,
new_data=data_s,
data_hash=data_hash,
hashed_slice=hashed_slice,
chunk_being_written=data_s,
data_to_write=data_to_write,
)

chunks_reused += 1

@@ -400,10 +414,10 @@ def write_dataset_chunks(f, name, data_dict):
raw_data[c] = data_s

logging.debug(
" %s: "
"New chunks written: %d; "
"Number of chunks reused: %d",
name, new_chunks - old_chunks, chunks_reused
" %s: " "New chunks written: %d; " "Number of chunks reused: %d",
name,
new_chunks - old_chunks,
chunks_reused,
)

return slices
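A hedged sketch of the check that `validate_reused_chunks` now gates: re-read the slice of raw data that the hashtable points to and confirm it matches the chunk about to be written. The real `_verify_new_chunk_reuse` takes more context (such as the chunks still queued for writing) and richer error reporting; the helper below is illustrative only:

```python
import numpy as np
from numpy.testing import assert_array_equal


def verify_reused_chunk(raw_dataset, new_chunk: np.ndarray, start: int, stop: int) -> None:
    """Illustrative check: the raw-data slice being reused must equal the new chunk."""
    reused = np.asarray(raw_dataset[start:stop])
    assert_array_equal(
        reused,
        new_chunk,
        err_msg="Hash mismatch: reused chunk does not match the data being written",
    )
```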
48 changes: 48 additions & 0 deletions versioned_hdf5/tests/test_hashtable.py
@@ -294,3 +294,51 @@ def test_verify_chunk_reuse_multidim_1(tmp_path):
values_ds = group["values"]
values_ds.resize((8, 8))
values_ds[:] = np.array([[i + (j % 3) for i in range(8)] for j in range(8)])


def test_verify_chunk_disabled_by_default(tmp_path, monkeypatch):
"""Check that we skip chunk reuse verification if the environment variable is not set."""
monkeypatch.delenv("ENABLE_CHUNK_REUSE_VALIDATION", raising=False)

# This is the same test as test_verify_chunk_reuse_data_version_2,
# but with verification turned off with the environment variable.
def data_version_2_hash(self, data: np.ndarray):
"""
Compute hash for `data` array.
(Copied from commit 1f968f4 Hashtable.hash. This version hashes the encoded
data, not the data itself.)
"""
if data.dtype == "object":
hash_value = self.hash_function()
for value in data.flat:
hash_value.update(bytes(str(value), "utf-8"))
hash_value.update(bytes(str(data.shape), "utf-8"))
return hash_value.digest()
else:
return self.hash_function(
data.data.tobytes() + bytes(str(data.shape), "ascii")
).digest()

with mock.patch.object(Hashtable, "hash", autospec=True) as mocked_hash:
mocked_hash.side_effect = data_version_2_hash

data1 = np.array(["b'hello'", "b'world'"], dtype="O")
data2 = np.array([b"hello", b"world"], dtype="O")

filename = tmp_path / "data.h5"
with h5py.File(filename, mode="w") as f:
vf = VersionedHDF5File(f)
with vf.stage_version("r0") as group:
group.create_dataset(
"values",
dtype=h5py.string_dtype(encoding="ascii"),
data=data1,
maxshape=(None,),
chunks=(2,),
)

# This should raise an error, but will not because chunk
# reuse verification is turned off.
with vf.stage_version("r1") as group:
group["values"] = np.concatenate((data2, data2))
