From 0ed0db69c29c497a7280950fb9f12da6f8174fad Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Wed, 12 Jan 2022 11:30:20 -0500 Subject: [PATCH 1/3] switch Index to be a frozenset --- .pre-commit-config.yaml | 2 +- pangeo_forge_recipes/patterns.py | 34 ++++++------------- .../recipes/reference_hdf_zarr.py | 2 ++ pangeo_forge_recipes/recipes/xarray_zarr.py | 4 +-- 4 files changed, 16 insertions(+), 26 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9d878d3d..9a633aae 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,7 +28,7 @@ repos: - id: seed-isort-config - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v0.910' + rev: 'v0.931' hooks: - id: mypy exclude: tests diff --git a/pangeo_forge_recipes/patterns.py b/pangeo_forge_recipes/patterns.py index 36460919..56cfa7a6 100644 --- a/pangeo_forge_recipes/patterns.py +++ b/pangeo_forge_recipes/patterns.py @@ -1,17 +1,19 @@ """ Filename / URL patterns. """ +from __future__ import annotations import inspect from dataclasses import dataclass, field, replace from enum import Enum from itertools import product from typing import ( + TYPE_CHECKING, Any, Callable, ClassVar, Dict, - Iterable, + FrozenSet, Iterator, List, Optional, @@ -68,7 +70,7 @@ class MergeDim: operation: ClassVar[CombineOp] = CombineOp.MERGE -@dataclass(frozen=True) +@dataclass(frozen=True, order=True) class DimIndex: """Object used to index a single dimension of a FilePattern or Recipe Chunks. @@ -92,27 +94,13 @@ def __post_init__(self): assert self.index < self.sequence_len -class Index(tuple): - """A tuple of ``DimIndex`` objects. - The order of the indexes doesn't matter for comparision.""" - - def __new__(self, args: Iterable[DimIndex]): - # This validation really slows things down because we call Index a lot! - # if not all((isinstance(a, DimIndex) for a in args)): - # raise ValueError("All arguments must be DimIndex.") - # args_set = set(args) - # if len(set(args_set)) < len(tuple(args)): - # raise ValueError("Duplicate argument detected.") - return tuple.__new__(Index, args) - - def __str__(self): - return ",".join(str(dim) for dim in self) - - def __eq__(self, other): - return (set(self) == set(other)) and (len(self) == len(other)) - - def __hash__(self): - return hash(frozenset(self)) +if TYPE_CHECKING: + # If we just do this, we can't initialize an Index by just writing + # Index(list_of_dims) + Index = FrozenSet[DimIndex] +else: + # But if we just do this, it won't pass mypy 😖 + Index = frozenset[DimIndex] CombineDim = Union[MergeDim, ConcatDim] diff --git a/pangeo_forge_recipes/recipes/reference_hdf_zarr.py b/pangeo_forge_recipes/recipes/reference_hdf_zarr.py index deda7e6c..5ce28496 100644 --- a/pangeo_forge_recipes/recipes/reference_hdf_zarr.py +++ b/pangeo_forge_recipes/recipes/reference_hdf_zarr.py @@ -29,6 +29,8 @@ def scan_file(chunk_key: ChunkKey, config: HDFReferenceRecipe): ref_fname = os.path.basename(fname + ".json") with file_opener(fname, **config.netcdf_storage_options) as fp: protocol = getattr(getattr(fp, "fs", None), "protocol", None) # make mypy happy + if protocol is None: + raise ValueError("Couldn't determine protocol") target_url = unstrip_protocol(fname, protocol) config.metadata_cache[ref_fname] = create_hdf5_reference(fp, target_url, fname) diff --git a/pangeo_forge_recipes/recipes/xarray_zarr.py b/pangeo_forge_recipes/recipes/xarray_zarr.py index 23ce9c1a..1a21203a 100644 --- a/pangeo_forge_recipes/recipes/xarray_zarr.py +++ b/pangeo_forge_recipes/recipes/xarray_zarr.py @@ -49,12 +49,12 @@ def _input_metadata_fname(input_key: InputKey) -> str: - key_str = "-".join([f"{k.name}_{k.index}" for k in input_key]) + key_str = "-".join([f"{k.name}_{k.index}" for k in sorted(input_key)]) return "input-meta-" + key_str + ".json" def _input_reference_fname(input_key: InputKey) -> str: - key_str = "-".join([f"{k.name}_{k.index}" for k in input_key]) + key_str = "-".join([f"{k.name}_{k.index}" for k in sorted(input_key)]) return "input-reference-" + key_str + ".json" From 141ef3b724446e8e9e45549214d87bee917e6fff Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Wed, 12 Jan 2022 11:51:51 -0500 Subject: [PATCH 2/3] use Index class --- pangeo_forge_recipes/patterns.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/pangeo_forge_recipes/patterns.py b/pangeo_forge_recipes/patterns.py index 56cfa7a6..872516aa 100644 --- a/pangeo_forge_recipes/patterns.py +++ b/pangeo_forge_recipes/patterns.py @@ -1,14 +1,11 @@ """ Filename / URL patterns. """ -from __future__ import annotations - import inspect from dataclasses import dataclass, field, replace from enum import Enum from itertools import product from typing import ( - TYPE_CHECKING, Any, Callable, ClassVar, @@ -94,13 +91,8 @@ def __post_init__(self): assert self.index < self.sequence_len -if TYPE_CHECKING: - # If we just do this, we can't initialize an Index by just writing - # Index(list_of_dims) - Index = FrozenSet[DimIndex] -else: - # But if we just do this, it won't pass mypy 😖 - Index = frozenset[DimIndex] +class Index(FrozenSet[DimIndex]): + pass CombineDim = Union[MergeDim, ConcatDim] From ff7cab39f8a8839440f858faef3d76f82825ad2b Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Wed, 12 Jan 2022 13:11:21 -0500 Subject: [PATCH 3/3] fix pattern tests --- tests/test_patterns.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_patterns.py b/tests/test_patterns.py index 08cebf13..d7652ed0 100644 --- a/tests/test_patterns.py +++ b/tests/test_patterns.py @@ -79,7 +79,7 @@ def test_pattern_from_file_sequence(): assert fp.nitems_per_input == {"time": None} assert fp.concat_sequence_lens == {"time": None} for key in fp: - assert fp[key] == file_sequence[key[0].index] + assert fp[key] == file_sequence[sorted(key)[0].index] @pytest.mark.parametrize("pickle", [False, True]) @@ -115,17 +115,17 @@ def test_file_pattern_concat_merge(runtime_secrets, pickle, concat_merge_pattern assert fp.concat_sequence_lens == {"time": None} assert len(list(fp)) == 6 for key in fp: - expected_fname = format_function(time=times[key[1].index], variable=varnames[key[0].index]) for k in key: if k.name == "time": assert k.operation == CombineOp.CONCAT assert k.sequence_len == 3 + time_val = times[k.index] if k.name == "variable": assert k.operation == CombineOp.MERGE assert k.sequence_len == 2 + variable_val = varnames[k.index] + expected_fname = format_function(time=time_val, variable=variable_val) assert fp[key] == expected_fname - # make sure key order doesn't matter - assert fp[key[::-1]] == expected_fname if "fsspec_open_kwargs" in kwargs.keys(): assert fp.is_opendap is False