From 8698f3c0ae6d18515c20fb682974107c0cf3aeea Mon Sep 17 00:00:00 2001
From: Simon Kamuk Christiansen <skc@ampere.dmi.dk>
Date: Tue, 11 Feb 2025 12:26:43 +0000
Subject: [PATCH 1/8] Standardize state diff stats in mdp datastore. Change
 variable name.

---
 neural_lam/datastore/mdp.py                | 16 +++++++++++++++-
 neural_lam/datastore/npyfilesmeps/store.py | 10 ++++++++--
 neural_lam/models/ar_model.py              |  6 ++++--
 3 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/neural_lam/datastore/mdp.py b/neural_lam/datastore/mdp.py
index 01d2b12b..c975c824 100644
--- a/neural_lam/datastore/mdp.py
+++ b/neural_lam/datastore/mdp.py
@@ -301,7 +301,8 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset:
         should contain a `{category}_mean` and `{category}_std` variable for
         each variable in the category. For `category=="state"`, the dataarray
         should also contain a `state_diff_mean` and `state_diff_std` variable
-        for the one- step differences of the state variables.
+        for the one- step differences of the state variables along with their
+        standardized versions appended with `_standardized`.
 
         Parameters
         ----------
@@ -327,6 +328,19 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset:
             )
 
         ds_stats = self._ds[stats_variables.keys()].rename(stats_variables)
+
+        if category == "state":
+            standardized_state_diff = {
+                op: ds_stats[f"state_diff_{op}"] / ds_stats["state_std"]
+                for op in ops
+            }
+            ds_stats = ds_stats.assign(
+                **{
+                    f"state_diff_{op}_standardized": standardized_state_diff[op]
+                    for op in ops
+                }
+            )
+
         return ds_stats
 
     @cached_property
diff --git a/neural_lam/datastore/npyfilesmeps/store.py b/neural_lam/datastore/npyfilesmeps/store.py
index 8f926f7e..607350e5 100644
--- a/neural_lam/datastore/npyfilesmeps/store.py
+++ b/neural_lam/datastore/npyfilesmeps/store.py
@@ -769,8 +769,14 @@ def load_pickled_tensor(fn):
         }
 
         if mean_diff_values is not None and std_diff_values is not None:
-            variables["state_diff_mean"] = (feature_dim_name, mean_diff_values)
-            variables["state_diff_std"] = (feature_dim_name, std_diff_values)
+            variables["state_diff_mean_standardized"] = (
+                feature_dim_name,
+                mean_diff_values,
+            )
+            variables["state_diff_std_standardized"] = (
+                feature_dim_name,
+                std_diff_values,
+            )
 
         ds_norm = xr.Dataset(
             variables,
diff --git a/neural_lam/models/ar_model.py b/neural_lam/models/ar_model.py
index 81c2f720..0aa71112 100644
--- a/neural_lam/models/ar_model.py
+++ b/neural_lam/models/ar_model.py
@@ -65,10 +65,12 @@ def __init__(
                 da_state_stats.state_std.values, dtype=torch.float32
             ),
             "diff_mean": torch.tensor(
-                da_state_stats.state_diff_mean.values, dtype=torch.float32
+                da_state_stats.state_diff_mean_standardized.values,
+                dtype=torch.float32,
             ),
             "diff_std": torch.tensor(
-                da_state_stats.state_diff_std.values, dtype=torch.float32
+                da_state_stats.state_diff_std_standardized.values,
+                dtype=torch.float32,
             ),
         }
 

From 0948f9ac73e7f7bd51ca7e75705118bb2afbf31a Mon Sep 17 00:00:00 2001
From: Simon Kamuk Christiansen <skc@ampere.dmi.dk>
Date: Tue, 11 Feb 2025 12:35:58 +0000
Subject: [PATCH 2/8] Clean up code: inline standardized state diff computation

---
 neural_lam/datastore/mdp.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/neural_lam/datastore/mdp.py b/neural_lam/datastore/mdp.py
index c975c824..39edc51d 100644
--- a/neural_lam/datastore/mdp.py
+++ b/neural_lam/datastore/mdp.py
@@ -322,6 +322,8 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset:
         stats_variables = {
             f"{category}__{split}__{op}": f"{category}_{op}" for op in ops
         }
+
+        # Add state diff stats
         if category == "state":
             stats_variables.update(
                 {f"state__{split}__diff_{op}": f"state_diff_{op}" for op in ops}
@@ -329,14 +331,14 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset:
 
         ds_stats = self._ds[stats_variables.keys()].rename(stats_variables)
 
+        # Add standardized state diff stats
         if category == "state":
-            standardized_state_diff = {
-                op: ds_stats[f"state_diff_{op}"] / ds_stats["state_std"]
-                for op in ops
-            }
             ds_stats = ds_stats.assign(
                 **{
-                    f"state_diff_{op}_standardized": standardized_state_diff[op]
+                    f"state_diff_{op}_standardized": ds_stats[
+                        f"state_diff_{op}"
+                    ]
+                    / ds_stats["state_std"]
                     for op in ops
                 }
             )

From 4ea2bed31f2dbe29d05cf1532ad1fa4c852531b6 Mon Sep 17 00:00:00 2001
From: Simon Kamuk Christiansen <skc@ampere.dmi.dk>
Date: Tue, 11 Feb 2025 12:41:36 +0000
Subject: [PATCH 3/8] Update docstrings and changelog

---
 CHANGELOG.md                               | 2 ++
 neural_lam/datastore/base.py               | 9 +++++----
 neural_lam/datastore/npyfilesmeps/store.py | 5 +++--
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 45bb97c9..ba9fb567 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -27,6 +27,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Fix duplicate tensor copy to CPU [\#106](https://github.com/mllam/neural-lam/pull/106) @observingClouds
 
+- Add standardization to state diff stats from mdp datastore [\#122](https://github.com/mllam/neural-lam/pull/122) @SimonKamuk
+
 ### Maintenance
 - update ci/cd testing setup to install torch version compatible with neural-lam
   dependencies [\#115](https://github.com/mllam/neural-lam/pull/115), @leifdenby
diff --git a/neural_lam/datastore/base.py b/neural_lam/datastore/base.py
index f0291657..80ed76e0 100644
--- a/neural_lam/datastore/base.py
+++ b/neural_lam/datastore/base.py
@@ -166,10 +166,11 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset:
         a `{category}_mean` and `{category}_std` variable for each variable in
         the category. For `category=="state"`, the dataarray should also
         contain a `state_diff_mean` and `state_diff_std` variable for the one-
-        step differences of the state variables. The returned dataarray should
-        at least have dimensions of `({category}_feature)`, but can also
-        include for example `grid_index` (if the standardization is done per
-        grid point for example).
+        step differences of the state variables along with their
+        standardized versions appended with `_standardized`.
+        The returned dataarray should at least have dimensions of
+        `({category}_feature)`, but can also include for example `grid_index`
+        (if the standardization is done per grid point for example).
 
         Parameters
         ----------
diff --git a/neural_lam/datastore/npyfilesmeps/store.py b/neural_lam/datastore/npyfilesmeps/store.py
index 607350e5..75f01375 100644
--- a/neural_lam/datastore/npyfilesmeps/store.py
+++ b/neural_lam/datastore/npyfilesmeps/store.py
@@ -707,8 +707,9 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset:
         """Return the standardization dataarray for the given category. This
         should contain a `{category}_mean` and `{category}_std` variable for
         each variable in the category. For `category=="state"`, the dataarray
-        should also contain a `state_diff_mean` and `state_diff_std` variable
-        for the one- step differences of the state variables.
+        should also contain a `state_diff_mean_standardized` and
+        `state_diff_std_standardized` variable for the one- step differences
+        of the state variables.
 
         Parameters
         ----------

From f191f2e3e3397abad4e3732b4fd5dca2937e2bd9 Mon Sep 17 00:00:00 2001
From: Simon Kamuk Christiansen <skc@ampere.dmi.dk>
Date: Tue, 11 Feb 2025 12:52:16 +0000
Subject: [PATCH 4/8] Fix tests to expect standardized state diff stats

---
 neural_lam/datastore/base.py | 6 +++---
 tests/dummy_datastore.py     | 2 +-
 tests/test_datastores.py     | 9 +++++++--
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/neural_lam/datastore/base.py b/neural_lam/datastore/base.py
index 80ed76e0..35cf1ffd 100644
--- a/neural_lam/datastore/base.py
+++ b/neural_lam/datastore/base.py
@@ -165,9 +165,9 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset:
         deviation of 1.0) dataarray for the given category. This should contain
         a `{category}_mean` and `{category}_std` variable for each variable in
         the category. For `category=="state"`, the dataarray should also
-        contain a `state_diff_mean` and `state_diff_std` variable for the one-
-        step differences of the state variables along with their
-        standardized versions appended with `_standardized`.
+        contain variables `state_diff_mean_standardized` and
+        `state_diff_std_standardized` for the one-step differences of the state
+        variables, optionally along with their non-standardized versions.
         The returned dataarray should at least have dimensions of
         `({category}_feature)`, but can also include for example `grid_index`
         (if the standardization is done per grid point for example).
diff --git a/tests/dummy_datastore.py b/tests/dummy_datastore.py
index 0c76bca8..2c4d9e7b 100644
--- a/tests/dummy_datastore.py
+++ b/tests/dummy_datastore.py
@@ -292,7 +292,7 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset:
 
         ops = ["mean", "std"]
         if category == "state":
-            ops += ["diff_mean", "diff_std"]
+            ops += ["diff_mean_standardized", "diff_std_standardized"]
 
         for op in ops:
             da_op = xr.ones_like(self.ds[f"{category}_feature"]).astype(float)
diff --git a/tests/test_datastores.py b/tests/test_datastores.py
index 6f5b3755..a438291c 100644
--- a/tests/test_datastores.py
+++ b/tests/test_datastores.py
@@ -132,7 +132,7 @@ def test_get_vars(datastore_name):
 
 @pytest.mark.parametrize("datastore_name", DATASTORES.keys())
 def test_get_normalization_dataarray(datastore_name):
-    """Check that the `datastore.get_normalization_dataa rray` method is
+    """Check that the `datastore.get_normalization_dataarray` method is
     implemented."""
     datastore = init_datastore_example(datastore_name)
 
@@ -144,7 +144,12 @@ def test_get_normalization_dataarray(datastore_name):
         assert isinstance(ds_stats, xr.Dataset)
 
         if category == "state":
-            ops = ["mean", "std", "diff_mean", "diff_std"]
+            ops = [
+                "mean",
+                "std",
+                "diff_mean_standardized",
+                "diff_std_standardized",
+            ]
         elif category == "forcing":
             ops = ["mean", "std"]
         elif category == "static":

From 510567a2467bbfae65e664cbd4573574d84e15fc Mon Sep 17 00:00:00 2001
From: SimonKamuk <skc@dmi.dk>
Date: Wed, 12 Feb 2025 09:37:45 +0000
Subject: [PATCH 5/8] update cicd to check storage usage

---
 .github/workflows/install-and-test.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/install-and-test.yml b/.github/workflows/install-and-test.yml
index a8952430..083b1d2f 100644
--- a/.github/workflows/install-and-test.yml
+++ b/.github/workflows/install-and-test.yml
@@ -101,6 +101,10 @@ jobs:
           restore-keys: |
             ${{ runner.os }}-meps-reduced-example-data-v0.3.0
 
+      - name: Check disk space
+        run: |
+          df -h
+
       - name: Run tests
         run: |
           ${{ matrix.package_manager == 'pdm' && 'pdm run' || '' }} pytest -vv -s

From c172947727add68087b944aa40860ab5050056d5 Mon Sep 17 00:00:00 2001
From: SimonKamuk <skc@dmi.dk>
Date: Wed, 12 Feb 2025 09:38:31 +0000
Subject: [PATCH 6/8] update cicd to check storage usage

---
 .github/workflows/install-and-test.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/install-and-test.yml b/.github/workflows/install-and-test.yml
index 083b1d2f..b2c226b4 100644
--- a/.github/workflows/install-and-test.yml
+++ b/.github/workflows/install-and-test.yml
@@ -47,6 +47,10 @@ jobs:
         with:
           python-version: 3.9
 
+      - name: Check disk space
+        run: |
+          df -h
+
       - name: Install PDM (if applicable)
         if: matrix.package_manager == 'pdm'
         run: |

From f370455763b90f08f01b8db664fe3069d91d3c95 Mon Sep 17 00:00:00 2001
From: SimonKamuk <skc@dmi.dk>
Date: Wed, 12 Feb 2025 09:59:42 +0000
Subject: [PATCH 7/8] remove checks for disk storage

---
 .github/workflows/install-and-test.yml | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/.github/workflows/install-and-test.yml b/.github/workflows/install-and-test.yml
index b2c226b4..a8952430 100644
--- a/.github/workflows/install-and-test.yml
+++ b/.github/workflows/install-and-test.yml
@@ -47,10 +47,6 @@ jobs:
         with:
           python-version: 3.9
 
-      - name: Check disk space
-        run: |
-          df -h
-
       - name: Install PDM (if applicable)
         if: matrix.package_manager == 'pdm'
         run: |
@@ -105,10 +101,6 @@ jobs:
           restore-keys: |
             ${{ runner.os }}-meps-reduced-example-data-v0.3.0
 
-      - name: Check disk space
-        run: |
-          df -h
-
       - name: Run tests
         run: |
           ${{ matrix.package_manager == 'pdm' && 'pdm run' || '' }} pytest -vv -s

From 8e362b4ec7ba0466df92a652480fcf16829d0ba2 Mon Sep 17 00:00:00 2001
From: SimonKamuk <skc@dmi.dk>
Date: Wed, 12 Feb 2025 20:52:11 +0000
Subject: [PATCH 8/8] Remove non-standardized diff stats from mdp. Add comment
 in ar_model

---
 neural_lam/datastore/base.py               |  8 ++++----
 neural_lam/datastore/mdp.py                | 18 ++++++------------
 neural_lam/datastore/npyfilesmeps/store.py |  8 ++++----
 neural_lam/models/ar_model.py              |  2 ++
 tests/dummy_datastore.py                   | 13 +++++++------
 5 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/neural_lam/datastore/base.py b/neural_lam/datastore/base.py
index 35cf1ffd..83e71ec9 100644
--- a/neural_lam/datastore/base.py
+++ b/neural_lam/datastore/base.py
@@ -164,10 +164,10 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset:
         Return the standardization (i.e. scaling to mean of 0.0 and standard
         deviation of 1.0) dataarray for the given category. This should contain
         a `{category}_mean` and `{category}_std` variable for each variable in
-        the category. For `category=="state"`, the dataarray should also
-        contain variables `state_diff_mean_standardized` and
-        `state_diff_std_standardized` for the one-step differences of the state
-        variables, optionally along with their non-standardized versions.
+        the category.
+        For `category=="state"`, the dataarray should also contain a
+        `state_diff_mean_standardized` and `state_diff_std_standardized`
+        variable for the one-step differences of the state variables.
         The returned dataarray should at least have dimensions of
         `({category}_feature)`, but can also include for example `grid_index`
         (if the standardization is done per grid point for example).
diff --git a/neural_lam/datastore/mdp.py b/neural_lam/datastore/mdp.py
index 39edc51d..f3a81b72 100644
--- a/neural_lam/datastore/mdp.py
+++ b/neural_lam/datastore/mdp.py
@@ -299,10 +299,10 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset:
         """
         Return the standardization dataarray for the given category. This
         should contain a `{category}_mean` and `{category}_std` variable for
-        each variable in the category. For `category=="state"`, the dataarray
-        should also contain a `state_diff_mean` and `state_diff_std` variable
-        for the one- step differences of the state variables along with their
-        standardized versions appended with `_standardized`.
+        each variable in the category.
+        For `category=="state"`, the dataarray should also contain a
+        `state_diff_mean_standardized` and `state_diff_std_standardized`
+        variable for the one-step differences of the state variables.
 
         Parameters
         ----------
@@ -323,20 +323,14 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset:
             f"{category}__{split}__{op}": f"{category}_{op}" for op in ops
         }
 
-        # Add state diff stats
-        if category == "state":
-            stats_variables.update(
-                {f"state__{split}__diff_{op}": f"state_diff_{op}" for op in ops}
-            )
-
         ds_stats = self._ds[stats_variables.keys()].rename(stats_variables)
 
         # Add standardized state diff stats
         if category == "state":
             ds_stats = ds_stats.assign(
                 **{
-                    f"state_diff_{op}_standardized": ds_stats[
-                        f"state_diff_{op}"
+                    f"state_diff_{op}_standardized": self._ds[
+                        f"state__{split}__diff_{op}"
                     ]
                     / ds_stats["state_std"]
                     for op in ops
diff --git a/neural_lam/datastore/npyfilesmeps/store.py b/neural_lam/datastore/npyfilesmeps/store.py
index 75f01375..d767d27e 100644
--- a/neural_lam/datastore/npyfilesmeps/store.py
+++ b/neural_lam/datastore/npyfilesmeps/store.py
@@ -706,10 +706,10 @@ def boundary_mask(self) -> xr.DataArray:
     def get_standardization_dataarray(self, category: str) -> xr.Dataset:
         """Return the standardization dataarray for the given category. This
         should contain a `{category}_mean` and `{category}_std` variable for
-        each variable in the category. For `category=="state"`, the dataarray
-        should also contain a `state_diff_mean_standardized` and
-        `state_diff_std_standardized` variable for the one- step differences
-        of the state variables.
+        each variable in the category.
+        For `category=="state"`, the dataarray should also contain a
+        `state_diff_mean_standardized` and `state_diff_std_standardized`
+        variable for the one-step differences of the state variables.
 
         Parameters
         ----------
diff --git a/neural_lam/models/ar_model.py b/neural_lam/models/ar_model.py
index 0aa71112..f3769f19 100644
--- a/neural_lam/models/ar_model.py
+++ b/neural_lam/models/ar_model.py
@@ -64,6 +64,8 @@ def __init__(
             "state_std": torch.tensor(
                 da_state_stats.state_std.values, dtype=torch.float32
             ),
+            # Note that the one-step-diff stats (diff_mean and diff_std) are
+            # for differences computed on standardized data
             "diff_mean": torch.tensor(
                 da_state_stats.state_diff_mean_standardized.values,
                 dtype=torch.float32,
diff --git a/tests/dummy_datastore.py b/tests/dummy_datastore.py
index 2c4d9e7b..c60cef47 100644
--- a/tests/dummy_datastore.py
+++ b/tests/dummy_datastore.py
@@ -268,12 +268,13 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset:
         Return the standardization (i.e. scaling to mean of 0.0 and standard
         deviation of 1.0) dataarray for the given category. This should contain
         a `{category}_mean` and `{category}_std` variable for each variable in
-        the category. For `category=="state"`, the dataarray should also
-        contain a `state_diff_mean` and `state_diff_std` variable for the one-
-        step differences of the state variables. The returned dataarray should
-        at least have dimensions of `({category}_feature)`, but can also
-        include for example `grid_index` (if the standardization is done per
-        grid point for example).
+        the category.
+        For `category=="state"`, the dataarray should also contain a
+        `state_diff_mean_standardized` and `state_diff_std_standardized`
+        variable for the one-step differences of the state variables.
+        The returned dataarray should at least have dimensions of
+        `({category}_feature)`, but can also include for example `grid_index`
+        (if the standardization is done per grid point for example).
 
         Parameters
         ----------