From 8698f3c0ae6d18515c20fb682974107c0cf3aeea Mon Sep 17 00:00:00 2001 From: Simon Kamuk Christiansen Date: Tue, 11 Feb 2025 12:26:43 +0000 Subject: [PATCH 1/8] Standardize state diff stats in mdp datastore. Change variable name. --- neural_lam/datastore/mdp.py | 16 +++++++++++++++- neural_lam/datastore/npyfilesmeps/store.py | 10 ++++++++-- neural_lam/models/ar_model.py | 6 ++++-- 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/neural_lam/datastore/mdp.py b/neural_lam/datastore/mdp.py index 01d2b12b..c975c824 100644 --- a/neural_lam/datastore/mdp.py +++ b/neural_lam/datastore/mdp.py @@ -301,7 +301,8 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset: should contain a `{category}_mean` and `{category}_std` variable for each variable in the category. For `category=="state"`, the dataarray should also contain a `state_diff_mean` and `state_diff_std` variable - for the one- step differences of the state variables. + for the one- step differences of the state variables along with their + standardized versions appended with `_standardized`. Parameters ---------- @@ -327,6 +328,19 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset: ) ds_stats = self._ds[stats_variables.keys()].rename(stats_variables) + + if category == "state": + standardized_state_diff = { + op: ds_stats[f"state_diff_{op}"] / ds_stats["state_std"] + for op in ops + } + ds_stats = ds_stats.assign( + **{ + f"state_diff_{op}_standardized": standardized_state_diff[op] + for op in ops + } + ) + return ds_stats @cached_property diff --git a/neural_lam/datastore/npyfilesmeps/store.py b/neural_lam/datastore/npyfilesmeps/store.py index 8f926f7e..607350e5 100644 --- a/neural_lam/datastore/npyfilesmeps/store.py +++ b/neural_lam/datastore/npyfilesmeps/store.py @@ -769,8 +769,14 @@ def load_pickled_tensor(fn): } if mean_diff_values is not None and std_diff_values is not None: - variables["state_diff_mean"] = (feature_dim_name, mean_diff_values) - variables["state_diff_std"] = (feature_dim_name, std_diff_values) + variables["state_diff_mean_standardized"] = ( + feature_dim_name, + mean_diff_values, + ) + variables["state_diff_std_standardized"] = ( + feature_dim_name, + std_diff_values, + ) ds_norm = xr.Dataset( variables, diff --git a/neural_lam/models/ar_model.py b/neural_lam/models/ar_model.py index 81c2f720..0aa71112 100644 --- a/neural_lam/models/ar_model.py +++ b/neural_lam/models/ar_model.py @@ -65,10 +65,12 @@ def __init__( da_state_stats.state_std.values, dtype=torch.float32 ), "diff_mean": torch.tensor( - da_state_stats.state_diff_mean.values, dtype=torch.float32 + da_state_stats.state_diff_mean_standardized.values, + dtype=torch.float32, ), "diff_std": torch.tensor( - da_state_stats.state_diff_std.values, dtype=torch.float32 + da_state_stats.state_diff_std_standardized.values, + dtype=torch.float32, ), } From 0948f9ac73e7f7bd51ca7e75705118bb2afbf31a Mon Sep 17 00:00:00 2001 From: Simon Kamuk Christiansen Date: Tue, 11 Feb 2025 12:35:58 +0000 Subject: [PATCH 2/8] clean up code --- neural_lam/datastore/mdp.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/neural_lam/datastore/mdp.py b/neural_lam/datastore/mdp.py index c975c824..39edc51d 100644 --- a/neural_lam/datastore/mdp.py +++ b/neural_lam/datastore/mdp.py @@ -322,6 +322,8 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset: stats_variables = { f"{category}__{split}__{op}": f"{category}_{op}" for op in ops } + + # Add state diff stats if category == "state": stats_variables.update( {f"state__{split}__diff_{op}": f"state_diff_{op}" for op in ops} @@ -329,14 +331,14 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset: ds_stats = self._ds[stats_variables.keys()].rename(stats_variables) + # Add standardized state diff stats if category == "state": - standardized_state_diff = { - op: ds_stats[f"state_diff_{op}"] / ds_stats["state_std"] - for op in ops - } ds_stats = ds_stats.assign( **{ - f"state_diff_{op}_standardized": standardized_state_diff[op] + f"state_diff_{op}_standardized": ds_stats[ + f"state_diff_{op}" + ] + / ds_stats["state_std"] for op in ops } ) From 4ea2bed31f2dbe29d05cf1532ad1fa4c852531b6 Mon Sep 17 00:00:00 2001 From: Simon Kamuk Christiansen Date: Tue, 11 Feb 2025 12:41:36 +0000 Subject: [PATCH 3/8] Update docstrings and changelog --- CHANGELOG.md | 2 ++ neural_lam/datastore/base.py | 9 +++++---- neural_lam/datastore/npyfilesmeps/store.py | 5 +++-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 45bb97c9..ba9fb567 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fix duplicate tensor copy to CPU [\#106](https://github.com/mllam/neural-lam/pull/106) @observingClouds +- Add standardization to state diff stats from mdp datastore [\#122](https://github.com/mllam/neural-lam/pull/122) @SimonKamuk + ### Maintenance - update ci/cd testing setup to install torch version compatible with neural-lam dependencies [\#115](https://github.com/mllam/neural-lam/pull/115), @leifdenby diff --git a/neural_lam/datastore/base.py b/neural_lam/datastore/base.py index f0291657..80ed76e0 100644 --- a/neural_lam/datastore/base.py +++ b/neural_lam/datastore/base.py @@ -166,10 +166,11 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset: a `{category}_mean` and `{category}_std` variable for each variable in the category. For `category=="state"`, the dataarray should also contain a `state_diff_mean` and `state_diff_std` variable for the one- - step differences of the state variables. The returned dataarray should - at least have dimensions of `({category}_feature)`, but can also - include for example `grid_index` (if the standardization is done per - grid point for example). + step differences of the state variables along with their + standardized versions appended with `_standardized`. + The returned dataarray should at least have dimensions of + `({category}_feature)`, but can also include for example `grid_index` + (if the standardization is done per grid point for example). Parameters ---------- diff --git a/neural_lam/datastore/npyfilesmeps/store.py b/neural_lam/datastore/npyfilesmeps/store.py index 607350e5..75f01375 100644 --- a/neural_lam/datastore/npyfilesmeps/store.py +++ b/neural_lam/datastore/npyfilesmeps/store.py @@ -707,8 +707,9 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset: """Return the standardization dataarray for the given category. This should contain a `{category}_mean` and `{category}_std` variable for each variable in the category. For `category=="state"`, the dataarray - should also contain a `state_diff_mean` and `state_diff_std` variable - for the one- step differences of the state variables. + should also contain a `state_diff_mean_standardized` and + `state_diff_std_standardized` variable for the one- step differences + of the state variables. Parameters ---------- From f191f2e3e3397abad4e3732b4fd5dca2937e2bd9 Mon Sep 17 00:00:00 2001 From: Simon Kamuk Christiansen Date: Tue, 11 Feb 2025 12:52:16 +0000 Subject: [PATCH 4/8] fix tests --- neural_lam/datastore/base.py | 6 +++--- tests/dummy_datastore.py | 2 +- tests/test_datastores.py | 9 +++++++-- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/neural_lam/datastore/base.py b/neural_lam/datastore/base.py index 80ed76e0..35cf1ffd 100644 --- a/neural_lam/datastore/base.py +++ b/neural_lam/datastore/base.py @@ -165,9 +165,9 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset: deviation of 1.0) dataarray for the given category. This should contain a `{category}_mean` and `{category}_std` variable for each variable in the category. For `category=="state"`, the dataarray should also - contain a `state_diff_mean` and `state_diff_std` variable for the one- - step differences of the state variables along with their - standardized versions appended with `_standardized`. + contain variables `state_diff_mean_standardized` and + `state_diff_std_standardized` for the one-step differences of the state + variables, optionally along with their non-standardized versions. The returned dataarray should at least have dimensions of `({category}_feature)`, but can also include for example `grid_index` (if the standardization is done per grid point for example). diff --git a/tests/dummy_datastore.py b/tests/dummy_datastore.py index 0c76bca8..2c4d9e7b 100644 --- a/tests/dummy_datastore.py +++ b/tests/dummy_datastore.py @@ -292,7 +292,7 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset: ops = ["mean", "std"] if category == "state": - ops += ["diff_mean", "diff_std"] + ops += ["diff_mean_standardized", "diff_std_standardized"] for op in ops: da_op = xr.ones_like(self.ds[f"{category}_feature"]).astype(float) diff --git a/tests/test_datastores.py b/tests/test_datastores.py index 6f5b3755..a438291c 100644 --- a/tests/test_datastores.py +++ b/tests/test_datastores.py @@ -132,7 +132,7 @@ def test_get_vars(datastore_name): @pytest.mark.parametrize("datastore_name", DATASTORES.keys()) def test_get_normalization_dataarray(datastore_name): - """Check that the `datastore.get_normalization_dataa rray` method is + """Check that the `datastore.get_normalization_dataarray` method is implemented.""" datastore = init_datastore_example(datastore_name) @@ -144,7 +144,12 @@ def test_get_normalization_dataarray(datastore_name): assert isinstance(ds_stats, xr.Dataset) if category == "state": - ops = ["mean", "std", "diff_mean", "diff_std"] + ops = [ + "mean", + "std", + "diff_mean_standardized", + "diff_std_standardized", + ] elif category == "forcing": ops = ["mean", "std"] elif category == "static": From 510567a2467bbfae65e664cbd4573574d84e15fc Mon Sep 17 00:00:00 2001 From: SimonKamuk Date: Wed, 12 Feb 2025 09:37:45 +0000 Subject: [PATCH 5/8] update cicd to check storage usage --- .github/workflows/install-and-test.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/install-and-test.yml b/.github/workflows/install-and-test.yml index a8952430..083b1d2f 100644 --- a/.github/workflows/install-and-test.yml +++ b/.github/workflows/install-and-test.yml @@ -101,6 +101,10 @@ jobs: restore-keys: | ${{ runner.os }}-meps-reduced-example-data-v0.3.0 + - name: Check disk space + run: | + df -h + - name: Run tests run: | ${{ matrix.package_manager == 'pdm' && 'pdm run' || '' }} pytest -vv -s From c172947727add68087b944aa40860ab5050056d5 Mon Sep 17 00:00:00 2001 From: SimonKamuk Date: Wed, 12 Feb 2025 09:38:31 +0000 Subject: [PATCH 6/8] update cicd to check storage usage --- .github/workflows/install-and-test.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/install-and-test.yml b/.github/workflows/install-and-test.yml index 083b1d2f..b2c226b4 100644 --- a/.github/workflows/install-and-test.yml +++ b/.github/workflows/install-and-test.yml @@ -47,6 +47,10 @@ jobs: with: python-version: 3.9 + - name: Check disk space + run: | + df -h + - name: Install PDM (if applicable) if: matrix.package_manager == 'pdm' run: | From f370455763b90f08f01b8db664fe3069d91d3c95 Mon Sep 17 00:00:00 2001 From: SimonKamuk Date: Wed, 12 Feb 2025 09:59:42 +0000 Subject: [PATCH 7/8] remove checks for disk storage --- .github/workflows/install-and-test.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/install-and-test.yml b/.github/workflows/install-and-test.yml index b2c226b4..a8952430 100644 --- a/.github/workflows/install-and-test.yml +++ b/.github/workflows/install-and-test.yml @@ -47,10 +47,6 @@ jobs: with: python-version: 3.9 - - name: Check disk space - run: | - df -h - - name: Install PDM (if applicable) if: matrix.package_manager == 'pdm' run: | @@ -105,10 +101,6 @@ jobs: restore-keys: | ${{ runner.os }}-meps-reduced-example-data-v0.3.0 - - name: Check disk space - run: | - df -h - - name: Run tests run: | ${{ matrix.package_manager == 'pdm' && 'pdm run' || '' }} pytest -vv -s From 8e362b4ec7ba0466df92a652480fcf16829d0ba2 Mon Sep 17 00:00:00 2001 From: SimonKamuk Date: Wed, 12 Feb 2025 20:52:11 +0000 Subject: [PATCH 8/8] remove non standardized diff stats from mdp. Add comment in ar_model --- neural_lam/datastore/base.py | 8 ++++---- neural_lam/datastore/mdp.py | 18 ++++++------------ neural_lam/datastore/npyfilesmeps/store.py | 8 ++++---- neural_lam/models/ar_model.py | 2 ++ tests/dummy_datastore.py | 13 +++++++------ 5 files changed, 23 insertions(+), 26 deletions(-) diff --git a/neural_lam/datastore/base.py b/neural_lam/datastore/base.py index 35cf1ffd..83e71ec9 100644 --- a/neural_lam/datastore/base.py +++ b/neural_lam/datastore/base.py @@ -164,10 +164,10 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset: Return the standardization (i.e. scaling to mean of 0.0 and standard deviation of 1.0) dataarray for the given category. This should contain a `{category}_mean` and `{category}_std` variable for each variable in - the category. For `category=="state"`, the dataarray should also - contain variables `state_diff_mean_standardized` and - `state_diff_std_standardized` for the one-step differences of the state - variables, optionally along with their non-standardized versions. + the category. + For `category=="state"`, the dataarray should also contain a + `state_diff_mean_standardized` and `state_diff_std_standardized` + variable for the one-step differences of the state variables. The returned dataarray should at least have dimensions of `({category}_feature)`, but can also include for example `grid_index` (if the standardization is done per grid point for example). diff --git a/neural_lam/datastore/mdp.py b/neural_lam/datastore/mdp.py index 39edc51d..f3a81b72 100644 --- a/neural_lam/datastore/mdp.py +++ b/neural_lam/datastore/mdp.py @@ -299,10 +299,10 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset: """ Return the standardization dataarray for the given category. This should contain a `{category}_mean` and `{category}_std` variable for - each variable in the category. For `category=="state"`, the dataarray - should also contain a `state_diff_mean` and `state_diff_std` variable - for the one- step differences of the state variables along with their - standardized versions appended with `_standardized`. + each variable in the category. + For `category=="state"`, the dataarray should also contain a + `state_diff_mean_standardized` and `state_diff_std_standardized` + variable for the one-step differences of the state variables. Parameters ---------- @@ -323,20 +323,14 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset: f"{category}__{split}__{op}": f"{category}_{op}" for op in ops } - # Add state diff stats - if category == "state": - stats_variables.update( - {f"state__{split}__diff_{op}": f"state_diff_{op}" for op in ops} - ) - ds_stats = self._ds[stats_variables.keys()].rename(stats_variables) # Add standardized state diff stats if category == "state": ds_stats = ds_stats.assign( **{ - f"state_diff_{op}_standardized": ds_stats[ - f"state_diff_{op}" + f"state_diff_{op}_standardized": self._ds[ + f"state__{split}__diff_{op}" ] / ds_stats["state_std"] for op in ops diff --git a/neural_lam/datastore/npyfilesmeps/store.py b/neural_lam/datastore/npyfilesmeps/store.py index 75f01375..d767d27e 100644 --- a/neural_lam/datastore/npyfilesmeps/store.py +++ b/neural_lam/datastore/npyfilesmeps/store.py @@ -706,10 +706,10 @@ def boundary_mask(self) -> xr.DataArray: def get_standardization_dataarray(self, category: str) -> xr.Dataset: """Return the standardization dataarray for the given category. This should contain a `{category}_mean` and `{category}_std` variable for - each variable in the category. For `category=="state"`, the dataarray - should also contain a `state_diff_mean_standardized` and - `state_diff_std_standardized` variable for the one- step differences - of the state variables. + each variable in the category. + For `category=="state"`, the dataarray should also contain a + `state_diff_mean_standardized` and `state_diff_std_standardized` + variable for the one-step differences of the state variables. Parameters ---------- diff --git a/neural_lam/models/ar_model.py b/neural_lam/models/ar_model.py index 0aa71112..f3769f19 100644 --- a/neural_lam/models/ar_model.py +++ b/neural_lam/models/ar_model.py @@ -64,6 +64,8 @@ def __init__( "state_std": torch.tensor( da_state_stats.state_std.values, dtype=torch.float32 ), + # Note that the one-step-diff stats (diff_mean and diff_std) are + # for differences computed on standardized data "diff_mean": torch.tensor( da_state_stats.state_diff_mean_standardized.values, dtype=torch.float32, diff --git a/tests/dummy_datastore.py b/tests/dummy_datastore.py index 2c4d9e7b..c60cef47 100644 --- a/tests/dummy_datastore.py +++ b/tests/dummy_datastore.py @@ -268,12 +268,13 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset: Return the standardization (i.e. scaling to mean of 0.0 and standard deviation of 1.0) dataarray for the given category. This should contain a `{category}_mean` and `{category}_std` variable for each variable in - the category. For `category=="state"`, the dataarray should also - contain a `state_diff_mean` and `state_diff_std` variable for the one- - step differences of the state variables. The returned dataarray should - at least have dimensions of `({category}_feature)`, but can also - include for example `grid_index` (if the standardization is done per - grid point for example). + the category. + For `category=="state"`, the dataarray should also contain a + `state_diff_mean_standardized` and `state_diff_std_standardized` + variable for the one-step differences of the state variables. + The returned dataarray should at least have dimensions of + `({category}_feature)`, but can also include for example `grid_index` + (if the standardization is done per grid point for example). Parameters ----------