Add option to disable verify_chunk_reuse (#337)
* Add option to disable verify_chunk_reuse

Currently, the verify_chunk_reuse check is relatively slow, see #336.
This commit adds an option to disable it via an additional argument:
`stage_version(..., verify_chunk_reuse=False)`.

* Enable chunk reuse validation during tests via env variable

* Use an environment variable to enable chunk reuse verification

* Add note about new env var in the docs

* Add test to ensure chunk reuse verification is disabled by default

* Move env var handling out of the loops

---------

Co-authored-by: pdmurray <[email protected]>
ArvidJB and peytondmurray authored Jun 10, 2024
1 parent 2445d7d commit 91e2598
Showing 5 changed files with 103 additions and 28 deletions.
6 changes: 6 additions & 0 deletions docs/design.md
@@ -175,6 +175,12 @@ version groups. The relevant modules are `versioned_hdf5.backend` and
it to the raw data for the given dataset. The data in each chunk of the
dataset is SHA256 hashed, and the hash is looked up in the hashtable dataset.
If it already exists in the raw data, that chunk in the raw data is reused.

To enable a check that the reused chunk matches the data that the user is
intending to write, set the following environment variable:
`ENABLE_CHUNK_REUSE_VALIDATION = 1`. This option is enabled during tests, but
is disabled by default for better performance.

The hashtable maps `SHA256 hash -> (start, stop)` where `(start, stop)` gives
a slice range for the chunk in the raw dataset (chunks in the `raw_data`
dataset are concatenated along the first axis only). All chunks that do not
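A minimal sketch of the hash-then-reuse flow described in the doc change above; the names `hashtable`, `raw_data`, and `store_chunk` are illustrative stand-ins, not the actual versioned_hdf5 internals:

```python
import hashlib
import numpy as np


def store_chunk(hashtable: dict, raw_data: list, chunk: np.ndarray) -> tuple:
    """Return a (start, stop) slice for `chunk` in the raw data, reusing an
    existing slice when an identical chunk has been written before."""
    digest = hashlib.sha256(chunk.tobytes() + str(chunk.shape).encode("ascii")).digest()
    if digest in hashtable:
        # A chunk with the same hash already exists in the raw data: reuse it.
        return hashtable[digest]
    # New chunk: append it along the first axis and record where it lives.
    start = sum(len(c) for c in raw_data)
    raw_data.append(chunk)
    stop = start + len(chunk)
    hashtable[digest] = (start, stop)
    return (start, stop)
```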
5 changes: 4 additions & 1 deletion pyproject.toml
@@ -95,10 +95,13 @@ ignore = [

[project.optional-dependencies]
dev = ["pre-commit>=3.6.0"]
test = ["pytest"]
test = ["pytest", "pytest-env"]
doc = ["sphinx", "sphinx-multiversion", "myst-parser"]

[tool.setuptools_scm]

[tool.isort]
profile = "black"

[tool.pytest_env]
ENABLE_CHUNK_REUSE_VALIDATION = 1
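The `[tool.pytest_env]` table above (via the new `pytest-env` test dependency) is what turns the check on for the test suite. Outside of pytest, the same flag can be set in the environment before any version is staged; a minimal sketch:

```python
import os

# Assumed usage: set the flag before staging a version that writes data,
# since the backend reads it at write time.
os.environ["ENABLE_CHUNK_REUSE_VALIDATION"] = "1"
```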
6 changes: 5 additions & 1 deletion versioned_hdf5/api.py
@@ -266,7 +266,11 @@ def __iter__(self):

@contextmanager
def stage_version(
self, version_name: str, prev_version=None, make_current=True, timestamp=None
self,
version_name: str,
prev_version=None,
make_current=True,
timestamp=None,
):
"""
Return a context manager to stage a new version
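For context, a minimal usage sketch of the `stage_version` context manager whose signature is reformatted above; the file name and data are placeholders:

```python
import h5py
import numpy as np
from versioned_hdf5 import VersionedHDF5File

with h5py.File("data.h5", mode="w") as f:
    vf = VersionedHDF5File(f)
    # Stage a new version; chunks identical to already-stored ones are reused.
    with vf.stage_version("r0") as group:
        group.create_dataset("values", data=np.arange(8), chunks=(4,), maxshape=(None,))
```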
66 changes: 40 additions & 26 deletions versioned_hdf5/backend.py
@@ -1,8 +1,9 @@
import logging
import os
from typing import Dict, Optional

import numpy as np
from h5py import VirtualLayout, VirtualSource, h5s, Dataset
from h5py import Dataset, VirtualLayout, VirtualSource, h5s
from h5py._hl.filters import guess_chunk
from ndindex import ChunkSize, Slice, Tuple, ndindex
from numpy.testing import assert_array_equal
@@ -62,7 +63,10 @@ def create_base_dataset(
shape = data.shape
else:
shape = (shape,) if isinstance(shape, int) else tuple(shape)
if data is not None and (np.prod(shape, dtype=np.ulonglong) != np.prod(data.shape, dtype=np.ulonglong)):
if data is not None and (
np.prod(shape, dtype=np.ulonglong)
!= np.prod(data.shape, dtype=np.ulonglong)
):
raise ValueError("Shape tuple is incompatible with data")

ndims = len(shape)
@@ -175,6 +179,10 @@ def write_dataset(
slices_to_write = {}
chunk_size = chunks[0]

validate_reused_chunks = os.environ.get(
"ENABLE_CHUNK_REUSE_VALIDATION", "false"
).lower() in ("1", "true")

with Hashtable(f, name) as hashtable:
old_chunks = hashtable.largest_index
chunks_reused = 0
@@ -188,14 +196,15 @@ def write_dataset(
hashed_slice = hashtable[data_hash]
slices[data_slice] = hashed_slice

_verify_new_chunk_reuse(
raw_dataset=ds,
new_data=data,
data_hash=data_hash,
hashed_slice=hashed_slice,
chunk_being_written=data_s,
slices_to_write=slices_to_write,
)
if validate_reused_chunks:
_verify_new_chunk_reuse(
raw_dataset=ds,
new_data=data,
data_hash=data_hash,
hashed_slice=hashed_slice,
chunk_being_written=data_s,
slices_to_write=slices_to_write,
)

chunks_reused += 1

@@ -217,10 +226,10 @@ def write_dataset(
new_chunks = hashtable.largest_index

logging.debug(
" %s: "
"New chunks written: %d; "
"Number of chunks reused: %d",
name, new_chunks - old_chunks, chunks_reused
" %s: " "New chunks written: %d; " "Number of chunks reused: %d",
name,
new_chunks - old_chunks,
chunks_reused,
)

return slices
@@ -346,6 +355,10 @@ def write_dataset_chunks(f, name, data_dict):
chunks = tuple(raw_data.attrs["chunks"])
chunk_size = chunks[0]

validate_reused_chunks = os.environ.get(
"ENABLE_CHUNK_REUSE_VALIDATION", "false"
).lower() in ("1", "true")

with Hashtable(f, name) as hashtable:
old_chunks = hashtable.largest_index
chunks_reused = 0
@@ -370,14 +383,15 @@ def write_dataset_chunks(f, name, data_dict):
hashed_slice = hashtable[data_hash]
slices[chunk] = hashed_slice

_verify_new_chunk_reuse(
raw_dataset=raw_data,
new_data=data_s,
data_hash=data_hash,
hashed_slice=hashed_slice,
chunk_being_written=data_s,
data_to_write=data_to_write,
)
if validate_reused_chunks:
_verify_new_chunk_reuse(
raw_dataset=raw_data,
new_data=data_s,
data_hash=data_hash,
hashed_slice=hashed_slice,
chunk_being_written=data_s,
data_to_write=data_to_write,
)

chunks_reused += 1

@@ -400,10 +414,10 @@ def write_dataset_chunks(f, name, data_dict):
raw_data[c] = data_s

logging.debug(
" %s: "
"New chunks written: %d; "
"Number of chunks reused: %d",
name, new_chunks - old_chunks, chunks_reused
" %s: " "New chunks written: %d; " "Number of chunks reused: %d",
name,
new_chunks - old_chunks,
chunks_reused,
)

return slices
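A hedged sketch of the check that `validate_reused_chunks` now gates: re-read the slice of raw data that the hashtable points to and confirm it matches the chunk about to be written. The real `_verify_new_chunk_reuse` takes more context (such as the chunks still queued for writing) and richer error reporting; the helper below is illustrative only:

```python
import numpy as np
from numpy.testing import assert_array_equal


def verify_reused_chunk(raw_dataset, new_chunk: np.ndarray, start: int, stop: int) -> None:
    """Illustrative check: the raw-data slice being reused must equal the new chunk."""
    reused = np.asarray(raw_dataset[start:stop])
    assert_array_equal(
        reused,
        new_chunk,
        err_msg="Hash mismatch: reused chunk does not match the data being written",
    )
```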
48 changes: 48 additions & 0 deletions versioned_hdf5/tests/test_hashtable.py
@@ -294,3 +294,51 @@ def test_verify_chunk_reuse_multidim_1(tmp_path):
values_ds = group["values"]
values_ds.resize((8, 8))
values_ds[:] = np.array([[i + (j % 3) for i in range(8)] for j in range(8)])


def test_verify_chunk_disabled_by_default(tmp_path, monkeypatch):
"""Check that we skip chunk reuse verification if the environment variable is not set."""
monkeypatch.delenv("ENABLE_CHUNK_REUSE_VALIDATION", raising=False)

# This is the same test as test_verify_chunk_reuse_data_version_2,
# but with verification turned off with the environment variable.
def data_version_2_hash(self, data: np.ndarray):
"""
Compute hash for `data` array.
(Copied from commit 1f968f4 Hashtable.hash. This version hashes the encoded
data, not the data itself.)
"""
if data.dtype == "object":
hash_value = self.hash_function()
for value in data.flat:
hash_value.update(bytes(str(value), "utf-8"))
hash_value.update(bytes(str(data.shape), "utf-8"))
return hash_value.digest()
else:
return self.hash_function(
data.data.tobytes() + bytes(str(data.shape), "ascii")
).digest()

with mock.patch.object(Hashtable, "hash", autospec=True) as mocked_hash:
mocked_hash.side_effect = data_version_2_hash

data1 = np.array(["b'hello'", "b'world'"], dtype="O")
data2 = np.array([b"hello", b"world"], dtype="O")

filename = tmp_path / "data.h5"
with h5py.File(filename, mode="w") as f:
vf = VersionedHDF5File(f)
with vf.stage_version("r0") as group:
group.create_dataset(
"values",
dtype=h5py.string_dtype(encoding="ascii"),
data=data1,
maxshape=(None,),
chunks=(2,),
)

# This should raise an error, but will not because chunk
# reuse verification is turned off.
with vf.stage_version("r1") as group:
group["values"] = np.concatenate((data2, data2))
