From 962f99d4dc2e62700cdf387cc71db5cc6cf3d0c8 Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Thu, 18 Nov 2021 19:56:37 -0800 Subject: [PATCH 1/5] Simple implementation of skipping re-computing metadata cache. --- pangeo_forge_recipes/recipes/xarray_zarr.py | 13 +++++++++---- pangeo_forge_recipes/storage.py | 3 +++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/pangeo_forge_recipes/recipes/xarray_zarr.py b/pangeo_forge_recipes/recipes/xarray_zarr.py index c7bc036d..20008fa6 100644 --- a/pangeo_forge_recipes/recipes/xarray_zarr.py +++ b/pangeo_forge_recipes/recipes/xarray_zarr.py @@ -162,10 +162,15 @@ def cache_input(input_key: InputKey, *, config: XarrayZarrRecipe) -> None: if config.cache_metadata: if config.metadata_cache is None: raise ValueError("metadata_cache is not set.") - logger.info(f"Caching metadata for input '{input_key!s}'") - with open_input(input_key, config=config) as ds: - input_metadata = ds.to_dict(data=False) - config.metadata_cache[_input_metadata_fname(input_key)] = input_metadata + + if not _input_metadata_fname(input_key) in config.metadata_cache: + with open_input(input_key, config=config) as ds: + logger.info(f"Caching metadata for input '{input_key!s}'") + input_metadata = ds.to_dict(data=False) + config.metadata_cache[_input_metadata_fname(input_key)] = input_metadata + else: + logger.info(f"Metadata already ached for input '{input_key!s}'") + if config.open_input_with_fsspec_reference: if config.file_pattern.is_opendap: diff --git a/pangeo_forge_recipes/storage.py b/pangeo_forge_recipes/storage.py index 4b6e4359..a08c3f73 100644 --- a/pangeo_forge_recipes/storage.py +++ b/pangeo_forge_recipes/storage.py @@ -180,6 +180,9 @@ def __setitem__(self, key: str, value: dict) -> None: def __getitem__(self, key: str) -> dict: return json.loads(self.get_mapper()[key]) + def __contains__(self, item: str) -> bool: + return item in self.get_mapper() + def getitems(self, keys: Sequence[str]) -> dict: mapper = self.get_mapper() all_meta_raw = mapper.getitems(keys) From 4c1b82ddd3151eb6623ce0ed8085b25af3563841 Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Thu, 18 Nov 2021 20:26:26 -0800 Subject: [PATCH 2/5] Checking before re-computing for open_input_with_fsspec_reference. --- pangeo_forge_recipes/recipes/xarray_zarr.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pangeo_forge_recipes/recipes/xarray_zarr.py b/pangeo_forge_recipes/recipes/xarray_zarr.py index 20008fa6..aa99be1a 100644 --- a/pangeo_forge_recipes/recipes/xarray_zarr.py +++ b/pangeo_forge_recipes/recipes/xarray_zarr.py @@ -171,13 +171,19 @@ def cache_input(input_key: InputKey, *, config: XarrayZarrRecipe) -> None: else: logger.info(f"Metadata already ached for input '{input_key!s}'") - if config.open_input_with_fsspec_reference: if config.file_pattern.is_opendap: raise ValueError("Can't make references for opendap inputs") if config.metadata_cache is None: raise ValueError("Can't make references; no metadata_cache assigned") fname = config.file_pattern[input_key] + + ref_fname = _input_reference_fname(input_key) + + if ref_fname in config.metadata_cache: + logger.info('Metadata is already cached with fsspec_reference.') + return + if config.input_cache is None: protocol = fsspec.utils.get_protocol(fname) url = unstrip_protocol(fname, protocol) @@ -194,7 +200,6 @@ def cache_input(input_key: InputKey, *, config: XarrayZarrRecipe) -> None: **config.file_pattern.fsspec_open_kwargs, ) as fp: ref_data = create_hdf5_reference(fp, url, fname) - ref_fname = _input_reference_fname(input_key) config.metadata_cache[ref_fname] = ref_data From 2d5e8269afd8fb474905678b6fd0ceb25280d1be Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Sat, 20 Nov 2021 11:32:56 -0800 Subject: [PATCH 3/5] Passing pre-commit checks. --- pangeo_forge_recipes/recipes/xarray_zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangeo_forge_recipes/recipes/xarray_zarr.py b/pangeo_forge_recipes/recipes/xarray_zarr.py index aa99be1a..c72d0359 100644 --- a/pangeo_forge_recipes/recipes/xarray_zarr.py +++ b/pangeo_forge_recipes/recipes/xarray_zarr.py @@ -181,7 +181,7 @@ def cache_input(input_key: InputKey, *, config: XarrayZarrRecipe) -> None: ref_fname = _input_reference_fname(input_key) if ref_fname in config.metadata_cache: - logger.info('Metadata is already cached with fsspec_reference.') + logger.info("Metadata is already cached with fsspec_reference.") return if config.input_cache is None: From febee33efc01a5ebb4c38b5d9258d524b0d1a029 Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Fri, 3 Dec 2021 15:16:46 -0800 Subject: [PATCH 4/5] Added test for __contains__ in metadata target. --- tests/test_storage.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_storage.py b/tests/test_storage.py index e1548405..e89afb89 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -35,6 +35,8 @@ def test_cache(tmp_cache): def test_metadata_target(tmp_metadata_target): data = {"foo": 1, "bar": "baz"} tmp_metadata_target["key1"] = data + assert 'key1' in tmp_metadata_target + assert 'key2' not in tmp_metadata_target assert tmp_metadata_target["key1"] == data assert tmp_metadata_target.getitems(["key1"]) == {"key1": data} From 8d86aa59e4fa5c8e82c24e8acc07b69d0db3e114 Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Fri, 3 Dec 2021 15:25:15 -0800 Subject: [PATCH 5/5] Pre-commit: '' --> "" --- tests/test_storage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_storage.py b/tests/test_storage.py index e89afb89..f9ea1602 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -35,8 +35,8 @@ def test_cache(tmp_cache): def test_metadata_target(tmp_metadata_target): data = {"foo": 1, "bar": "baz"} tmp_metadata_target["key1"] = data - assert 'key1' in tmp_metadata_target - assert 'key2' not in tmp_metadata_target + assert "key1" in tmp_metadata_target + assert "key2" not in tmp_metadata_target assert tmp_metadata_target["key1"] == data assert tmp_metadata_target.getitems(["key1"]) == {"key1": data}