From ff940ee9f662ddff614250161f17b0ba92a36f84 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 18 Mar 2024 16:30:16 -0400 Subject: [PATCH 1/4] Retain original _FillValue in encoding --- pangeo_forge_recipes/aggregation.py | 6 ++++-- tests/test_combiners.py | 16 ++++++++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/pangeo_forge_recipes/aggregation.py b/pangeo_forge_recipes/aggregation.py index 08e04806..28b077f7 100644 --- a/pangeo_forge_recipes/aggregation.py +++ b/pangeo_forge_recipes/aggregation.py @@ -30,7 +30,7 @@ def dataset_to_schema(ds: xr.Dataset) -> XarraySchema: # Remove redundant encoding options for v in ds.variables: - for option in ["_FillValue", "source"]: + for option in ["source"]: # TODO: should be okay to remove _FillValue? if option in ds[v].encoding: del ds[v].encoding[option] @@ -147,7 +147,9 @@ def _combine_attrs(a1: dict, a2: dict) -> dict: common_attrs = set(a1) & set(a2) new_attrs = {} for key in common_attrs: - if a1[key] == a2[key]: + if isinstance(a1[key], np.floating) and isinstance(a2[key], np.floating) and np.isnan(a1[key]) and np.isnan(a2[key]): + new_attrs[key] = a1[key] + elif a1[key] == a2[key]: new_attrs[key] = a1[key] return new_attrs diff --git a/tests/test_combiners.py b/tests/test_combiners.py index 6586f858..b29f1a57 100644 --- a/tests/test_combiners.py +++ b/tests/test_combiners.py @@ -7,7 +7,7 @@ from apache_beam.testing.test_pipeline import TestPipeline from apache_beam.testing.util import assert_that from pytest_lazyfixture import lazy_fixture - +import numpy as np from pangeo_forge_recipes.aggregation import dataset_to_schema from pangeo_forge_recipes.combiners import CombineXarraySchemas from pangeo_forge_recipes.patterns import FilePattern @@ -101,12 +101,24 @@ def _get_concat_dim(pattern): def _strip_keys(item): return item[1] +def _assert_schema_equal(a, b): + assert set(a.keys()) == set(b.keys()) + + for key, value1 in a.items(): + value2 = b[key] + if isinstance(value1, np.floating) and isinstance(value2, np.floating) and np.isnan(value1) and np.isnan(value2): + continue + + if isinstance(value1, dict) and isinstance(value2, dict): + _assert_schema_equal(value1, value2) + else: + assert value1 == value2 def has_correct_schema(expected_schema): def _check_results(actual): assert len(actual) == 1 schema = actual[0] - assert schema == expected_schema + _assert_schema_equal(schema, expected_schema) return _check_results From 2205c605633a097362f1e40e99b6c58d9346b861 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 Mar 2024 20:38:34 +0000 Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pangeo_forge_recipes/aggregation.py | 7 ++++++- tests/test_combiners.py | 12 ++++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/pangeo_forge_recipes/aggregation.py b/pangeo_forge_recipes/aggregation.py index 28b077f7..3ee1233e 100644 --- a/pangeo_forge_recipes/aggregation.py +++ b/pangeo_forge_recipes/aggregation.py @@ -147,7 +147,12 @@ def _combine_attrs(a1: dict, a2: dict) -> dict: common_attrs = set(a1) & set(a2) new_attrs = {} for key in common_attrs: - if isinstance(a1[key], np.floating) and isinstance(a2[key], np.floating) and np.isnan(a1[key]) and np.isnan(a2[key]): + if ( + isinstance(a1[key], np.floating) + and isinstance(a2[key], np.floating) + and np.isnan(a1[key]) + and np.isnan(a2[key]) + ): new_attrs[key] = a1[key] elif a1[key] == a2[key]: new_attrs[key] = a1[key] diff --git a/tests/test_combiners.py b/tests/test_combiners.py index b29f1a57..f2e306ce 100644 --- a/tests/test_combiners.py +++ b/tests/test_combiners.py @@ -1,13 +1,14 @@ import logging import apache_beam as beam +import numpy as np import pytest import xarray as xr from apache_beam.options.pipeline_options import PipelineOptions from apache_beam.testing.test_pipeline import TestPipeline from apache_beam.testing.util import assert_that from pytest_lazyfixture import lazy_fixture -import numpy as np + from pangeo_forge_recipes.aggregation import dataset_to_schema from pangeo_forge_recipes.combiners import CombineXarraySchemas from pangeo_forge_recipes.patterns import FilePattern @@ -101,12 +102,18 @@ def _get_concat_dim(pattern): def _strip_keys(item): return item[1] + def _assert_schema_equal(a, b): assert set(a.keys()) == set(b.keys()) for key, value1 in a.items(): value2 = b[key] - if isinstance(value1, np.floating) and isinstance(value2, np.floating) and np.isnan(value1) and np.isnan(value2): + if ( + isinstance(value1, np.floating) + and isinstance(value2, np.floating) + and np.isnan(value1) + and np.isnan(value2) + ): continue if isinstance(value1, dict) and isinstance(value2, dict): @@ -114,6 +121,7 @@ def _assert_schema_equal(a, b): else: assert value1 == value2 + def has_correct_schema(expected_schema): def _check_results(actual): assert len(actual) == 1 From 4193690dc3d122fe29e88cca430faca018b9f44a Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 18 Mar 2024 18:34:56 -0400 Subject: [PATCH 3/4] Add comments suggested in code review Co-authored-by: Raphael Hagen --- pangeo_forge_recipes/aggregation.py | 1 - tests/test_combiners.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/pangeo_forge_recipes/aggregation.py b/pangeo_forge_recipes/aggregation.py index 3ee1233e..d947a9c8 100644 --- a/pangeo_forge_recipes/aggregation.py +++ b/pangeo_forge_recipes/aggregation.py @@ -31,7 +31,6 @@ def dataset_to_schema(ds: xr.Dataset) -> XarraySchema: # Remove redundant encoding options for v in ds.variables: for option in ["source"]: - # TODO: should be okay to remove _FillValue? if option in ds[v].encoding: del ds[v].encoding[option] d = ds.to_dict(data=False, encoding=True) diff --git a/tests/test_combiners.py b/tests/test_combiners.py index f2e306ce..db08e56a 100644 --- a/tests/test_combiners.py +++ b/tests/test_combiners.py @@ -104,6 +104,7 @@ def _strip_keys(item): def _assert_schema_equal(a, b): + # This is used instead of ``assert dict1 == dict2`` so that NaNs are treated as equal. assert set(a.keys()) == set(b.keys()) for key, value1 in a.items(): From dc87b8929f864882bfbf2ae4c31a50028ed8e45f Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 18 Mar 2024 18:37:48 -0400 Subject: [PATCH 4/4] Add another comment --- pangeo_forge_recipes/aggregation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pangeo_forge_recipes/aggregation.py b/pangeo_forge_recipes/aggregation.py index d947a9c8..28b015ec 100644 --- a/pangeo_forge_recipes/aggregation.py +++ b/pangeo_forge_recipes/aggregation.py @@ -146,6 +146,7 @@ def _combine_attrs(a1: dict, a2: dict) -> dict: common_attrs = set(a1) & set(a2) new_attrs = {} for key in common_attrs: + # treat NaNs as equal in the attrs if ( isinstance(a1[key], np.floating) and isinstance(a2[key], np.floating)