Resolve "Adjustments using adjust require the input data of the control period to have the same size for the time dimension" (#67)

btschwertfeger · web-flow · commit 15f78c1729c1 · 2024-03-10T07:06:22.000+01:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,36 @@
 # Changelog
 
-## [v2.0.0](https://github.com/btschwertfeger/python-cmethods/tree/v2.0.0) (2023-01-23)
+## [Unreleased](https://github.com/btschwertfeger/python-cmethods/tree/HEAD)
+
+[Full Changelog](https://github.com/btschwertfeger/python-cmethods/compare/v2.0.2...HEAD)
+
+**Merged pull requests:**
+
+- Fix typos and update pre-commit hooks [\#64](https://github.com/btschwertfeger/python-cmethods/pull/64) ([btschwertfeger](https://github.com/btschwertfeger))
+
+## [v2.0.2](https://github.com/btschwertfeger/python-cmethods/tree/v2.0.2) (2024-02-02)
+
+[Full Changelog](https://github.com/btschwertfeger/python-cmethods/compare/v2.0.1...v2.0.2)
+
+**Merged pull requests:**
+
+- Update documentation -- QM and QDM formulas [\#62](https://github.com/btschwertfeger/python-cmethods/pull/62) ([btschwertfeger](https://github.com/btschwertfeger))
+- Bump GitHub action versions [\#59](https://github.com/btschwertfeger/python-cmethods/pull/59) ([btschwertfeger](https://github.com/btschwertfeger))
+
+## [v2.0.1](https://github.com/btschwertfeger/python-cmethods/tree/v2.0.1) (2024-02-01)
+
+[Full Changelog](https://github.com/btschwertfeger/python-cmethods/compare/v2.0.0...v2.0.1)
+
+**Closed issues:**
+
+- The latest documentation still describes the legacy max_scaling_factor [\#60](https://github.com/btschwertfeger/python-cmethods/issues/60)
+
+**Merged pull requests:**
+
+- adjust CI workflows [\#58](https://github.com/btschwertfeger/python-cmethods/pull/58) ([btschwertfeger](https://github.com/btschwertfeger))
+- Resolve "The latest documentation still describes the legacy max scaling factor" [\#61](https://github.com/btschwertfeger/python-cmethods/pull/61) ([btschwertfeger](https://github.com/btschwertfeger))
+
+## [v2.0.0](https://github.com/btschwertfeger/python-cmethods/tree/v2.0.0) (2024-01-23)
 
 [Full Changelog](https://github.com/btschwertfeger/python-cmethods/compare/v1.0.3...v2.0.0)
 
@@ -13,6 +43,10 @@
 - Optimization for `adjust_3d` [\#47](https://github.com/btschwertfeger/python-cmethods/issues/47)
 - Find a solution to process large data sets more efficient [\#6](https://github.com/btschwertfeger/python-cmethods/issues/6)
 
+**Merged pull requests:**
+
+- Fix the CodeQL workflow execution; prepare v2.0.0 release [\#50](https://github.com/btschwertfeger/python-cmethods/pull/50) ([btschwertfeger](https://github.com/btschwertfeger))
+
 ## [v1.0.3](https://github.com/btschwertfeger/python-cmethods/tree/v1.0.3) (2023-08-09)
 
 [Full Changelog](https://github.com/btschwertfeger/python-cmethods/compare/v1.0.2...v1.0.3)
diff --git a/README.md b/README.md
@@ -7,7 +7,6 @@
 [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-orange.svg)](https://www.gnu.org/licenses/gpl-3.0)
 [![Downloads](https://pepy.tech/badge/python-cmethods)](https://pepy.tech/project/python-cmethods)
 
-![CodeQL](https://github.com/btschwertfeger/python-cmethods/actions/workflows/codeql.yaml/badge.svg)
 [![CI/CD](https://github.com/btschwertfeger/python-cmethods/actions/workflows/cicd.yaml/badge.svg?branch=master)](https://github.com/btschwertfeger/python-cmethods/actions/workflows/cicd.yaml)
 [![codecov](https://codecov.io/github/btschwertfeger/python-cmethods/branch/master/graph/badge.svg?token=OSO4PAABPD)](https://codecov.io/github/btschwertfeger/python-cmethods)
 
diff --git a/cmethods/core.py b/cmethods/core.py
@@ -50,26 +50,45 @@ def apply_ufunc(
     if method not in __METHODS_FUNC__:
         raise UnknownMethodError(method, __METHODS_FUNC__.keys())
 
+    if kwargs.get("input_core_dims"):
+        if not isinstance(kwargs["input_core_dims"], dict):
+            raise TypeError("input_core_dims must be an object of type 'dict'")
+        if not len(kwargs["input_core_dims"]) == 3 or any(
+            not isinstance(value, str) for value in kwargs["input_core_dims"].values()
+        ):
+            raise ValueError(
+                "input_core_dims must have three key-value pairs like: "
+                '{"obs": "time", "simh": "time", "simp": "time"}',
+            )
+
+        input_core_dims = kwargs["input_core_dims"]
+    else:
+        input_core_dims = {"obs": "time", "simh": "time", "simp": "time"}
+
     result: XRData = xr.apply_ufunc(
         __METHODS_FUNC__[method],
         obs,
         simh,
         # Need to spoof a fake time axis since 'time' coord on full dataset is different
         # than 'time' coord on training dataset.
-        simp.rename({"time": "t2"}),
+        simp.rename({input_core_dims["simp"]: "__t_simp__"}),
         dask="parallelized",
         vectorize=True,
         # This will vectorize over the time dimension, so will submit each grid cell
         # independently
-        input_core_dims=[["time"], ["time"], ["t2"]],
+        input_core_dims=[
+            [input_core_dims["obs"]],
+            [input_core_dims["simh"]],
+            ["__t_simp__"],
+        ],
         # Need to denote that the final output dataset will be labeled with the
         # spoofed time coordinate
-        output_core_dims=[["t2"]],
+        output_core_dims=[["__t_simp__"]],
         kwargs=dict(kwargs),
     )
 
     # Rename to proper coordinate name.
-    result = result.rename({"t2": "time"})
+    result = result.rename({"__t_simp__": input_core_dims["simp"]})
 
     # ufunc will put the core dimension to the end (time), so want to preserve original
     # order where time is commonly first.
@@ -90,6 +109,14 @@ def adjust(
 
     See https://python-cmethods.readthedocs.io/en/latest/src/methods.html
 
+
+    The time dimension of ``obs``, ``simh`` and ``simp`` must be named ``time``.
+
+    If the sizes of time dimensions of the input data sets differ, you have to
+    pass the hidden ``input_core_dims`` parameter, see
+    https://python-cmethods.readthedocs.io/en/latest/src/getting_started.html#advanced-usage
+    for more information.
+
     :param method: Technique to apply
     :type method: str
     :param obs: The reference/observational data set
@@ -127,14 +154,30 @@ def adjust(
         )
 
     # Grouped correction | scaling-based technique
-    group: str = kwargs["group"]
+    group: str | dict[str, str] = kwargs["group"]
+    if isinstance(group, str):
+        # only for same sized time dimensions
+        obs_group = group
+        simh_group = group
+        simp_group = group
+    elif isinstance(group, dict):
+        if any(key not in {"obs", "simh", "simp"} for key in group):
+            raise ValueError(
+                "group must either be a string like 'time' or a dict like "
+                '{"obs": "time.month", "simh": "t_simh.month", "simp": "time.month"}',
+            )
+        # for different sized time dimensions
+        obs_group = group["obs"]
+        simh_group = group["simh"]
+        simp_group = group["simp"]
+
     del kwargs["group"]
 
     result: Optional[XRData] = None
     for (_, obs_gds), (_, simh_gds), (_, simp_gds) in zip(
-        obs.groupby(group),
-        simh.groupby(group),
-        simp.groupby(group),
+        obs.groupby(obs_group),
+        simh.groupby(simh_group),
+        simp.groupby(simp_group),
     ):
         monthly_result = apply_ufunc(
             method,
diff --git a/cmethods/distribution.py b/cmethods/distribution.py
@@ -62,6 +62,11 @@ def quantile_mapping(
 
     cdf_obs = get_cdf(obs, xbins)
     cdf_simh = get_cdf(simh, xbins)
+    cdf_simh = np.interp(
+        cdf_simh,
+        (cdf_simh.min(), cdf_simh.max()),
+        (cdf_obs.min(), cdf_obs.max()),
+    )
 
     if kind in ADDITIVE:
         epsilon = np.interp(simp, xbins, cdf_simh)  # Eq. 1
@@ -124,6 +129,11 @@ def detrended_quantile_mapping(
 
     cdf_obs = get_cdf(obs, xbins)
     cdf_simh = get_cdf(simh, xbins)
+    cdf_simh = np.interp(
+        cdf_simh,
+        (cdf_simh.min(), cdf_simh.max()),
+        (cdf_obs.min(), cdf_obs.max()),
+    )
 
     # detrended => shift mean of $X_{sim,p}$ to range of $X_{sim,h}$ to adjust extremes
     res = np.zeros(len(simp.values))
diff --git a/doc/links.rst b/doc/links.rst
@@ -27,9 +27,6 @@
 .. |Downloads badge| image:: https://static.pepy.tech/personalized-badge/python-cmethods?period=total&units=abbreviation&left_color=grey&right_color=orange&left_text=downloads
    :target: https://pepy.tech/project/python-cmethods
 
-.. |CodeQL badge| image:: https://github.com/btschwertfeger/python-cmethods/actions/workflows/codeql.yaml/badge.svg?branch=master
-   :target: https://github.com/btschwertfeger/python-cmethods/actions/workflows/codeql.yaml
-
 .. |CI/CD badge| image:: https://github.com/btschwertfeger/python-cmethods/actions/workflows/cicd.yaml/badge.svg?branch=master
    :target: https://github.com/btschwertfeger/python-cmethods/actions/workflows/cicd.yaml
 
diff --git a/doc/src/getting_started.rst b/doc/src/getting_started.rst
@@ -56,3 +56,60 @@ method specific documentation.
         n_quaniles=1000,
         kind="+",
     )
+
+
+Advanced Usage
+--------------
+
+In some cases the time dimension of input data sets have different sizes. In
+such case, the hidden parameter ``input_core_dims`` must be passed to the
+``adjust`` call.
+
+It defines the dimension names of the input data sets, i.e. if the time
+dimensions of ``obs`` and ``simp`` have the length, but the time dimension of
+``simh`` is somewhat smaller, you have to define this as follows:
+
+
+.. code-block:: python
+    :linenos:
+    :caption: Bias Adjustments for data sets with different time dimension lengths pt. 1
+
+    from cmethods import adjust
+    import xarray as xr
+
+    obs = xr.open_dataset("examples/input_data/observations.nc")["tas"]
+    simp = xr.open_dataset("examples/input_data/control.nc")["tas"]
+    simh = simp.copy(deep=True)[3650:]
+
+    bc = adjust(
+        method="quantile_mapping",
+        obs=obs,
+        simh=simh.rename({"time": "t_simh"}),
+        simp=simh,
+        kind="+",
+        input_core_dims={"obs": "time", "simh": "t_simh", "simp": "time"}
+    )
+
+In case you are applying a scaling based technique using grouping, you have to
+adjust the group names accordingly to the time dimension names.
+
+.. code-block:: python
+    :linenos:
+    :caption: Bias Adjustments for data sets with different time dimension lengths pt. 2
+
+    from cmethods import adjust
+    import xarray as xr
+
+    obs = xr.open_dataset("examples/input_data/observations.nc")["tas"]
+    simp = xr.open_dataset("examples/input_data/control.nc")["tas"]
+    simh = simp.copy(deep=True)[3650:]
+
+    bc = adjust(
+        method="linear_scaling",
+        obs=obs,
+        simh=simh.rename({"time": "t_simh"}),
+        simp=simh,
+        kind="+",
+        group={"obs": "time.month", "simh": "t_simh.month", "simp": "time.month"},
+        input_core_dims={"obs": "time", "simh": "t_simh", "simp": "time"}
+    )
diff --git a/doc/src/introduction.rst b/doc/src/introduction.rst
@@ -4,7 +4,7 @@ Introduction
 =============
 
 |GitHub badge| |License badge| |PyVersions badge| |Downloads badge|
-|CodeQL badge| |CI/CD badge| |codecov badge|
+|CI/CD badge| |codecov badge|
 |Release date badge| |Release version badge| |DOI badge| |Docs stable|
 
 About
diff --git a/tests/test_methods.py b/tests/test_methods.py
@@ -107,6 +107,55 @@ def test_3d_scaling(
     assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp)
 
 
+@pytest.mark.parametrize(
+    ("method", "kind"),
+    [
+        ("linear_scaling", "+"),
+        ("variance_scaling", "+"),
+        ("linear_scaling", "*"),
+    ],
+)
+def test_3d_scaling_different_time_span(
+    datasets: dict,
+    method: str,
+    kind: str,
+) -> None:
+    obsh: XRData_t = datasets[kind]["obsh"]
+    obsp: XRData_t = datasets[kind]["obsp"]
+    simh: XRData_t = datasets[kind]["simh"]
+    simp: XRData_t = datasets[kind]["simp"]
+    simh = simh.sel(time=slice(simh.time[1], None)).rename({"time": "t_simh"})
+
+    time_names = {"obs": "time", "simh": "t_simh", "simp": "time"}
+
+    # not grouped
+    result: XRData_t = adjust(
+        method=method,
+        obs=obsh,
+        simh=simh,
+        simp=simp,
+        kind=kind,
+        input_core_dims=time_names,
+    )
+
+    assert isinstance(result, XRData_t)
+    assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp)
+
+    # grouped
+    result: XRData_t = adjust(
+        method=method,
+        obs=obsh,
+        simh=simh,
+        simp=simp,
+        kind=kind,
+        group={"obs": "time.month", "simh": "t_simh.month", "simp": "time.month"},
+        input_core_dims=time_names,
+    )
+
+    assert isinstance(result, XRData_t)
+    assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp)
+
+
 @pytest.mark.parametrize(
     ("method", "kind"),
     [
@@ -171,6 +220,42 @@ def test_3d_distribution(
     assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp)
 
 
+@pytest.mark.parametrize(
+    ("method", "kind"),
+    [
+        ("quantile_mapping", "+"),
+        ("quantile_delta_mapping", "+"),
+        ("quantile_mapping", "*"),
+        ("quantile_delta_mapping", "*"),
+    ],
+)
+def test_3d_distribution_different_time_span(
+    datasets: dict,
+    method: str,
+    kind: str,
+) -> None:
+    obsh: XRData_t = datasets[kind]["obsh"]
+    obsp: XRData_t = datasets[kind]["obsp"]
+    simh: XRData_t = datasets[kind]["simh"]
+    simp: XRData_t = datasets[kind]["simp"]
+
+    simh = simh.sel(time=slice(simh.time[1], None)).rename({"time": "t_simh"})
+    time_names = {"obs": "time", "simh": "t_simh", "simp": "time"}
+
+    result: XRData_t = adjust(
+        method=method,
+        obs=obsh,
+        simh=simh,
+        simp=simp,
+        kind=kind,
+        n_quantiles=N_QUANTILES,
+        input_core_dims=time_names,
+    )
+
+    assert isinstance(result, XRData_t)
+    assert is_3d_rmse_better(result=result[kind], obsp=obsp, simp=simp)
+
+
 def test_1d_detrended_quantile_mapping_add(datasets: dict) -> None:
     kind: str = "+"
     obsh: XRData_t = datasets[kind]["obsh"][:, 0, 0]
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -39,10 +39,10 @@
 def test_quantile_mapping_single_nan() -> None:
     obs, simh, simp = list(np.arange(10)), list(np.arange(10)), list(np.arange(10))
     obs[0] = np.nan
-    expected = np.array([0.0, 1.9, 2.9, 3.9, 4.9, 5.9, 6.9, 7.9, 8.9, 9.0])
+    expected = np.array([0.0, 1.8, 2.7, 3.6, 4.5, 5.4, 6.3, 7.2, 8.1, 9.0])
 
     res = quantile_mapping(obs=obs, simh=simh, simp=simp, n_quantiles=5)
-    assert np.allclose(res, expected)
+    assert np.allclose(res, expected), res
 
 
 @pytest.mark.filterwarnings("ignore:All-NaN slice encountered")