Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Recipe hashes #349

Merged
merged 24 commits into from
Apr 30, 2022
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
694f9d4
add serialization.py
cisaacstern Apr 27, 2022
1095095
add test_serialization.py
cisaacstern Apr 27, 2022
93c8231
add .sha256 methods to FilePattern and BaseRecipe
cisaacstern Apr 27, 2022
50b522d
in pattern.sha256 root, add nitems_per_file, remove is_opendap
cisaacstern Apr 27, 2022
e34afda
shorter date range for serialization test
cisaacstern Apr 27, 2022
5ee8342
only _hash_exclude_ 'storage_config', not 'file_pattern'
cisaacstern Apr 27, 2022
ed66064
parametrize hash exclude test with both recipe classes
cisaacstern Apr 28, 2022
3c13247
drop empty fields before sha256 calc
cisaacstern Apr 28, 2022
9cb550f
parametrize additional fields test
cisaacstern Apr 28, 2022
228eead
parametrize kwargs for recipe sha256 tests
cisaacstern Apr 28, 2022
840df67
test recipes sha256 with matching and not_matching patterns
cisaacstern Apr 29, 2022
568980c
type hints and docstrings for serialization.py
cisaacstern Apr 29, 2022
23f79fa
parametrize file pattern hash test with kwargs; drop empty fields fro…
cisaacstern Apr 29, 2022
160e3e9
clarify addl kwargs test names
cisaacstern Apr 29, 2022
0ca1776
add release notes
cisaacstern Apr 29, 2022
0300877
fix typo in match_pattern_blockchain docstring
cisaacstern Apr 29, 2022
4092932
move imports to top level
cisaacstern Apr 29, 2022
8303cfc
release notes text wrapping
cisaacstern Apr 30, 2022
f4c563b
duck type json_default
cisaacstern Apr 30, 2022
838005b
Merge remote-tracking branch 'charles/recipe-hashes' into recipe-hashes
cisaacstern Apr 30, 2022
a60d3f3
fix circular dependencies
cisaacstern Apr 30, 2022
d3489f5
rename json_default -> either_encode_or_hash
cisaacstern Apr 30, 2022
b48849c
make nitems_per_file a dict comprehension inclusive of op.name
cisaacstern Apr 30, 2022
a5bcf1e
test either_encode_or_hash raises TypeError
cisaacstern Apr 30, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions pangeo_forge_recipes/patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,12 @@ def items(self):
for key in self:
yield key, self[key]

def sha256(self):
    """Return the sha256 digest that identifies this instance.

    The digest is the last element of the hash chain that
    ``pattern_blockchain`` builds for this instance.
    """
    # Imported at call time rather than at module import
    # (presumably to avoid a circular import -- confirm).
    from .serialization import pattern_blockchain

    chain = pattern_blockchain(self)
    return chain[-1]


def pattern_from_file_sequence(file_list, concat_dim, nitems_per_file=None, **kwargs):
"""Convenience function for creating a FilePattern from a list of files."""
Expand Down
6 changes: 6 additions & 0 deletions pangeo_forge_recipes/recipes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
@dataclass
class BaseRecipe(ABC):
_compiler: ClassVar[RecipeCompiler]
_hash_exclude_ = ["file_pattern", "storage_config"]

def to_function(self):
from ..executors import FunctionPipelineExecutor
Expand All @@ -38,6 +39,11 @@ def to_beam(self):

return BeamPipelineExecutor.compile(self._compiler())

def sha256(self):
    """Return a sha256 digest computed from this recipe's dataclass fields.

    Fields whose names are listed in ``_hash_exclude_`` are left out of the hash.
    """
    # Imported at call time rather than at module import
    # (presumably to avoid a circular import -- confirm).
    from ..serialization import dataclass_sha256

    excluded_fields = self._hash_exclude_
    return dataclass_sha256(self, ignore_keys=excluded_fields)


# Type of a recipe compiler: a callable that takes a recipe and returns a ``Pipeline``.
RecipeCompiler = Callable[[BaseRecipe], Pipeline]

Expand Down
68 changes: 68 additions & 0 deletions pangeo_forge_recipes/serialization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
from dataclasses import asdict
from enum import Enum
from hashlib import sha256
from json import dumps
from typing import List


def json_default(thing):
    """Fallback encoder handed to ``json.dumps`` via its ``default`` argument.

    Enum members (FileType, CombineOp, etc.) are encoded as their ``value``.

    :param thing: the object to encode.
    :returns: ``thing.value`` when ``thing`` is an ``Enum`` member.
    :raises TypeError: for any object that is not an ``Enum`` member.
    """
    if not isinstance(thing, Enum):
        raise TypeError(f"object of type {type(thing).__name__} not serializable")
    return thing.value


def dict_to_sha256(thing):
    """Return the raw sha256 digest of the canonical JSON encoding of ``thing``.

    The encoding sorts keys and uses compact separators with no indentation, so
    equal inputs produce identical bytes. Objects ``json`` cannot encode natively
    are passed to ``json_default``.

    :param thing: a JSON-serializable object (a dict, per the function name).
    :returns: the digest as ``bytes``.
    """
    # Approach follows https://death.andgravity.com/stable-hashing
    canonical_json = dumps(
        thing,
        sort_keys=True,
        separators=(",", ":"),
        indent=None,
        ensure_ascii=False,
        default=json_default,
    )
    encoded = canonical_json.encode("utf-8")
    return sha256(encoded).digest()


def dataclass_sha256(dclass: type, ignore_keys: List[str]) -> bytes:
    """Compute a sha256 digest from the fields of a dataclass.

    :param dclass: the dataclass to hash. ``dataclasses.asdict`` is applied to it,
        so in practice this must be a dataclass *instance*, the ``type``
        annotation notwithstanding.
    :param ignore_keys: names of top-level fields to leave out of the hash. Names
        that are not fields of ``dclass`` are skipped instead of raising
        ``KeyError``, so one exclusion list can be shared by dataclasses that
        define different fields.
    :returns: the digest as ``bytes``.
    """
    excluded = set(ignore_keys)
    # Filter rather than ``del`` so that an excluded name absent from this
    # particular dataclass is harmless.
    d = {k: v for k, v in asdict(dclass).items() if k not in excluded}
    return dict_to_sha256(d)


def pattern_blockchain(pattern):
    """Build a chain of sha256 digests describing ``pattern``.

    The first element is the hash of pattern-wide settings. Each later element
    hashes the previous element together with one ``(key, value)`` item, in the
    order yielded by ``pattern.items()``.

    :param pattern: object providing ``fsspec_open_kwargs``, ``query_string_secrets``,
        ``file_type``, ``combine_dims``, ``concat_dims`` and ``items()``.
    :returns: list of ``bytes`` digests, one more than the number of items.
    """
    # The root block covers general aspects of the file pattern. The file pattern
    # itself and the concat dims are left out because they are generated by
    # iterating; if they are generated in a different way, we ultimately don't care.
    chain = [
        dict_to_sha256(
            {
                "fsspec_open_kwargs": pattern.fsspec_open_kwargs,
                "query_string_secrets": pattern.query_string_secrets,
                "file_type": pattern.file_type,
                "nitems_per_file": [
                    op.nitems_per_file
                    for op in pattern.combine_dims
                    if op.name in pattern.concat_dims
                ],
            }
        )
    ]

    for index, fname in pattern.items():
        # Sorting the per-dimension digests makes the key hash independent of the
        # order of the entries within ``index``.
        dim_hashes = sorted(
            dataclass_sha256(dimindex, ignore_keys=["sequence_len"]) for dimindex in index
        )
        item_bytes = b"".join(dim_hashes) + sha256(fname.encode("utf-8")).digest()
        # Each block commits to the previous one, so it depends on every earlier item.
        chain.append(sha256(chain[-1] + item_bytes).digest())

    return chain


def match_pattern_blockchain(old_pattern_last_hash, new_pattern):
    """Find the key of ``new_pattern`` that follows a previously hashed prefix.

    :param old_pattern_last_hash: the digest to look for in the chain built from
        ``new_pattern`` (e.g. the last chain element of an earlier pattern).
    :param new_pattern: the pattern whose keys and hash chain are searched.
    :returns: the first key of ``new_pattern`` whose paired chain hash equals
        ``old_pattern_last_hash``, or ``None`` if no paired hash matches.
    """
    new_chain = pattern_blockchain(new_pattern)
    # The chain starts with a root hash, so zip pairs every key with the hash of
    # everything that comes *before* it. The chain's final hash is never paired.
    matches = (
        key for key, block in zip(new_pattern, new_chain) if block == old_pattern_last_hash
    )
    return next(matches, None)
63 changes: 63 additions & 0 deletions tests/test_serialization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from datetime import datetime, timedelta

import pandas as pd
import pytest

from pangeo_forge_recipes.patterns import ConcatDim, FilePattern
from pangeo_forge_recipes.serialization import match_pattern_blockchain

# URL template for the files used in these tests. It is filled in with
# ``URL_FORMAT.format(time=...)``; the ``{time:...}`` fields use strftime-style
# codes (year+month for the directory, full date for the file name).
URL_FORMAT = (
"https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/"
"v2.1/access/avhrr/{time:%Y%m}/oisst-avhrr-v02r01.{time:%Y%m%d}.nc"
)


def make_file_pattern(dates, nitems_per_file):
    """Create a ``FilePattern`` with a single ``"time"`` concat dimension.

    :param dates: the keys of the ``"time"`` ``ConcatDim``.
    :param nitems_per_file: forwarded to ``ConcatDim``.
    :returns: a ``FilePattern`` whose URLs are rendered from ``URL_FORMAT``.
    """

    def make_url(time):
        return URL_FORMAT.format(time=time)

    return FilePattern(
        make_url,
        ConcatDim("time", dates, nitems_per_file=nitems_per_file),
    )


@pytest.fixture
def end_date():
    """Final date (``YYYY-MM-DD``) of the date range used for the old pattern."""
    old_pattern_end = "2022-02-01"
    return old_pattern_end


@pytest.fixture
def old_pattern_sha256(end_date):
    """sha256 of the old pattern: daily dates from 1981-09-01 through ``end_date``,
    with ``nitems_per_file=1``.
    """
    old_dates = pd.date_range("1981-09-01", end_date, freq="D")
    return make_file_pattern(old_dates, nitems_per_file=1).sha256()


def get_new_pattern_with_next_url(end_date, nitems_per_file):
    """Build a pattern extending 90 days past ``end_date``, plus the next expected URL.

    :param end_date: ``YYYY-MM-DD`` string marking the end of the old date range.
    :param nitems_per_file: forwarded to ``make_file_pattern`` for the new pattern.
    :returns: ``(new_pattern, next_url)``, where ``next_url`` is ``URL_FORMAT``
        rendered for the day after ``end_date``.
    """
    date_format = "%Y-%m-%d"
    old_end = datetime.strptime(end_date, date_format)

    next_day = old_end + timedelta(days=1)
    new_end_date = (old_end + timedelta(days=90)).strftime(date_format)

    new_pattern = make_file_pattern(
        pd.date_range("1981-09-01", new_end_date, freq="D"),
        nitems_per_file=nitems_per_file,
    )
    return new_pattern, URL_FORMAT.format(time=next_day)


@pytest.mark.parametrize("new_pattern_nitems_per_file", [1, 2])
def test_match_pattern_blockchain(
    old_pattern_sha256,
    end_date,
    new_pattern_nitems_per_file,
):
    """With the same ``nitems_per_file`` as the old pattern (1), the matched key
    maps to the first URL after the old date range; with a different value (2),
    no key matches.
    """
    new_pattern, next_url = get_new_pattern_with_next_url(
        end_date,
        new_pattern_nitems_per_file,
    )
    matching_key = match_pattern_blockchain(old_pattern_sha256, new_pattern)

    if new_pattern_nitems_per_file == 2:
        # nitems_per_file is part of the root hash, so the chains never line up.
        assert matching_key is None
    elif new_pattern_nitems_per_file == 1:
        assert new_pattern[matching_key] == next_url