Skip to content

Commit

Permalink
sha256 hashes for XarrayZarrRecipe & FilePattern
Browse files Browse the repository at this point in the history
  • Loading branch information
cisaacstern committed Apr 26, 2022
1 parent aae3cf5 commit 5171d4d
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 5 deletions.
6 changes: 5 additions & 1 deletion pangeo_forge_recipes/patterns.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"""
Filename / URL patterns.
"""
import hashlib
import inspect
import pickle
import warnings
from dataclasses import dataclass, field, replace
from enum import Enum, auto
Expand Down Expand Up @@ -187,7 +189,9 @@ def __hash__(self):
(op.name, op.nitems_per_file) for op in self.combine_dims if op.name in self.concat_dims
]
dict_to_hash.update({"concat_dims_nitems_per_file": tuple(concat_dims_nitems_per_file)})
return hash(tuple([(k, v) for k, v in dict_to_hash.items()]))
as_bytes = pickle.dumps(tuple([(k, v) for k, v in dict_to_hash.items()]))
hexdigest = hashlib.sha256(as_bytes).hexdigest()
return int(hexdigest, base=16)

@property
def dims(self) -> Dict[str, int]:
Expand Down
12 changes: 10 additions & 2 deletions pangeo_forge_recipes/recipes/xarray_zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
"""
from __future__ import annotations

import hashlib
import itertools
import logging
import os
import pickle
import warnings
from contextlib import ExitStack, contextmanager
from dataclasses import dataclass, field, replace
Expand Down Expand Up @@ -801,8 +803,14 @@ class XarrayZarrRecipe(BaseRecipe, StorageMixin, FilePatternMixin):
"""How many items per input along concat_dim."""

def __hash__(self):
dict_to_hash = make_dict_with_hashable_vals(self.__dict__, ignore_keys=["storage_config"])
return hash(tuple([(k, v) for k, v in dict_to_hash.items()]))
dict_to_hash = make_dict_with_hashable_vals(
self.__dict__,
ignore_keys=["storage_config"],
use_custom_hash=["file_pattern"],
)
as_bytes = pickle.dumps(tuple([(k, v) for k, v in dict_to_hash.items()]))
hexdigest = hashlib.sha256(as_bytes).hexdigest()
return int(hexdigest, base=16)

def __post_init__(self):
self._validate_file_pattern()
Expand Down
14 changes: 12 additions & 2 deletions pangeo_forge_recipes/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import json
import logging
from contextlib import contextmanager
from typing import List, Sequence, Tuple
from typing import List, Optional, Sequence, Tuple

import numpy as np
from dask.distributed import Lock, get_client
Expand Down Expand Up @@ -120,19 +120,29 @@ def lock_for_conflicts(conflicts, base_name="pangeo-forge", timeout=None):
logger.debug(f"Released lock {lock.name}")


def make_dict_with_hashable_vals(dictionary: dict, ignore_keys: List[str]) -> dict:
def make_dict_with_hashable_vals(
dictionary: dict,
ignore_keys: List[str],
use_custom_hash: Optional[List[str]] = None,
) -> dict:
"""Make a dictionary in which all values are hashable. Assumes all keys are strings, and that
the only unhashable values of the input dictionary are either ``dict``s or ``list``s. Used as
a helper in the custom ``__hash__`` methods of recipe classes and ``FilePattern``.
:param dictionary: The input dictionary.
:param ignore_keys: The keys to drop from the output dictionary.
:param use_custom_hash: Keys to include in the returned ``dict``, but whose values are replaced
by ``hash(value)``. This is for ``pangeo-forge-recipes`` objects with custom ``__hash__`` methods;
e.g. recipe classes with a ``file_pattern`` field should set ``use_custom_hash=["file_pattern"]``,
because we want the object's own ``__hash__`` method to be called for these values.
"""
dict_with_hashable_vals = {}

for k, v in dictionary.items():
if k in ignore_keys:
pass
elif use_custom_hash and k in use_custom_hash:
dict_with_hashable_vals.update({k: hash(v)})
elif not isinstance(v, (dict, list)):
dict_with_hashable_vals.update({k: v})
elif isinstance(v, dict):
Expand Down

0 comments on commit 5171d4d

Please sign in to comment.