From 50b32e068aab863107b47f2c7a2bf38e02d99240 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Tue, 13 Feb 2024 18:13:02 -0500 Subject: [PATCH 1/8] Remove lmoments3 --- CHANGES.rst | 4 +- docs/notebooks/frequency_analysis.ipynb | 12 +- environment.yml | 1 - pyproject.toml | 1 - tests/test_stats.py | 52 +++++--- tox.ini | 3 +- xclim/core/utils.py | 6 +- xclim/indices/stats.py | 162 +++++++++++------------- 8 files changed, 128 insertions(+), 113 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index f86f97581..0e0346b23 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -22,6 +22,7 @@ New features and enhancements * New ``xclim.core.calendar.stack_periods`` and ``unstack_periods`` for performing ``rolling(time=...).construct(..., stride=...)`` but with non-uniform temporal periods like years or months. They replace ``xclim.sdba.processing.construct_moving_yearly_window`` and ``unpack_moving_yearly_window`` which are deprecated and will be removed in a future release. * New ``as_dataset`` options for ``xclim.set_options``. When True, indicators will output Datasets instead of DataArrays. (:issue:`1257`, :pull:`1625`). * Added new option for UTCI calculation to cap low wind velocities to a minimum of 0.5 m/s following Bröde (2012) guidelines. (:issue:`1634`, :pull:`1635`). +* Distribution instances can now be passed to the ``dist`` argument of most statistical indices. Breaking changes ^^^^^^^^^^^^^^^^ @@ -39,6 +40,7 @@ Breaking changes * `black` formatting style has been updated to the 2024 stable conventions. `isort` has been added to the `dev` installation recipe. (:pull:`1626`). * The indice and indicator for ``winter_storm`` has been removed (deprecated since `xclim` v0.46.0 in favour of ``snd_storm_days``). (:pull:`1565`). * `xclim` has dropped support for `scipy` version below v1.9.0 and `numpy` versions below v1.20.0. (:pull:`1565`). +* `lmoments3` was removed as a dependency of xclim due to its incompatible license (GPLv3 vs xclim's Apache). 
If we get the approbation from all intellectual property owners of the package, we might put it back. See `Ouranosinc/lmoments3#12 `_. See also the "frequency analysis" notebook for an example on how to continue using the probability weighted moments method for fitting distributions. Bug fixes ^^^^^^^^^ @@ -46,7 +48,7 @@ Bug fixes * Fix wrong `window` attributes in ``xclim.indices.standardized_precipitation_index``, ``xclim.indices.standardized_precipitation_evapotranspiration_index``. (:issue:`1552` :pull:`1554`). * Fix the daily case `freq='D'` of ``xclim.stats.preprocess_standardized_index`` (:issue:`1602` :pull:`1607`). * Several spelling mistakes have been corrected within the documentation and codebase. (:pull:`1576`). -* Added missing ``xclim.ensembles.robustness_fractions`` and ``xclim.ensembles.robistness_categoris`` in api doc section. (:pull:`1630`). +* Added missing ``xclim.ensembles.robustness_fractions`` and ``xclim.ensembles.robistness_categories`` in api doc section. (:pull:`1630`). Internal changes ^^^^^^^^^^^^^^^^ diff --git a/docs/notebooks/frequency_analysis.ipynb b/docs/notebooks/frequency_analysis.ipynb index 0e4c0d086..800671331 100644 --- a/docs/notebooks/frequency_analysis.ipynb +++ b/docs/notebooks/frequency_analysis.ipynb @@ -103,7 +103,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The next step is to fit the statistical distribution on these maxima. This is done by the `.fit` method, which takes as argument the sample series, the distribution's name and the parameter estimation `method`. The fit is done by default using the Maximum Likelihood algorithm (`method=\"ML\"`). For some extreme value distributions, however, the maximum likelihood is not always robust, and `xclim` offers the possibility to use Probability Weighted Moments (`method=\"PWM\"`) to estimate parameters. 
Note that the `lmoments3` package which is used by `xclim` to compute the PWM only supports `expon`, `gamma`, `genextreme`, `genpareto`, `gumbel_r`, `pearson3` and `weibull_min`. Parameters can also be estimated using the method of moments (`method=\"MM\"`)." + "The next step is to fit the statistical distribution on these maxima. This is done by the `.fit` method, which takes as argument the sample series, the distribution's name and the parameter estimation `method`. The fit is done by default using the Maximum Likelihood algorithm (`method=\"ML\"`). Parameters can also be estimated using the method of moments (`method=\"MM\"`).\n", + "\n", + "xclim can also accept a distribution instance instead of name (i.e. a subclass of `scipy.stats.rv_continuous`). For example, for some extreme value distributions, the maximum likelihood is not always robust. Using the Probability Weighted Moments (`method=\"PWM\"`) method can help in that case. This is possible by passing a distribution object from the `lmoments3` package together with `method=\"PWM\"`. That package currently only supports `expon`, `gamma`, `genextreme`, `genpareto`, `gumbel_r`, `pearson3` and `weibull_min` (with other names, see [the documentation](https://lmoments3.readthedocs.io/en/stable/distributions.html)). In the following example, we fit using the \"Generalized extreme value\" distribution from `lmoments3`." 
] }, { @@ -112,8 +114,10 @@ "metadata": {}, "outputs": [], "source": [ + "from lmoments3.distr import gev\n", + "\n", "# The fitting dimension is hard-coded as `time`.\n", - "params = fit(sub, dist=\"genextreme\")\n", + "params = fit(sub, dist=gev, method=\"PWM\")\n", "params" ] }, @@ -186,9 +190,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } diff --git a/environment.yml b/environment.yml index a2211ea6b..adb7974da 100644 --- a/environment.yml +++ b/environment.yml @@ -13,7 +13,6 @@ dependencies: - Click >=8.1 - dask >=2.6.0 - jsonpickle - - lmoments3 - numba - numpy >=1.20.0 - pandas >=2.2.0 diff --git a/pyproject.toml b/pyproject.toml index 64ee9531c..177ce4843 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,6 @@ dependencies = [ "Click>=8.1", "dask[array]>=2.6", "jsonpickle", - "lmoments3>=1.0.5", "numba", "numpy>=1.20.0", "pandas>=2.2", diff --git a/tests/test_stats.py b/tests/test_stats.py index 091c95fc7..3700a9f24 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -166,10 +166,11 @@ def test_fa(fitda): np.testing.assert_array_equal(q[0, 0, 0], q0) -def test_fa_gamma(fitda): +def test_fa_gamma_lmom(fitda): + lmom = pytest.importorskip("lmoments3.distr") T = 10 q = stats.fa(fitda, T, "lognorm", method="MM") - q1 = stats.fa(fitda, T, "gamma", method="PWM") + q1 = stats.fa(fitda, T, lmom.gam, method="PWM") np.testing.assert_allclose(q1, q, rtol=0.2) @@ -192,6 +193,21 @@ def test_dims_order(fitda): assert p.dims[-1] == "dparams" +lm3_dist_map = { + "expon": "exp", + "gamma": "gam", + "genextreme": "gev", + # "genlogistic": "glo", + # "gennorm": "gno", + "genpareto": "gpa", + "gumbel_r": "gum", + # "kappa4": "kap", + "norm": "nor", + "pearson3": "pe3", + "weibull_min": "wei", +} + + class TestPWMFit: params = { "expon": {"loc": 0.9527273, "scale": 2.2836364}, @@ -213,20 +229,23 @@ class 
TestPWMFit: } inputs_pdf = [4, 5, 6, 7] - @pytest.mark.parametrize("dist", stats._lm3_dist_map.keys()) + @pytest.mark.parametrize("dist", lm3_dist_map.keys()) def test_get_lm3_dist(self, dist): """Check that parameterization for lmoments3 and scipy is identical.""" + lmom = pytest.importorskip("lmoments3.distr") + lm3dc = getattr(lmom, lm3_dist_map[dist]) dc = stats.get_dist(dist) - lm3dc = stats.get_lm3_dist(dist) par = self.params[dist] expected = dc(**par).pdf(self.inputs_pdf) values = lm3dc(**par).pdf(self.inputs_pdf) np.testing.assert_array_almost_equal(values, expected) - @pytest.mark.parametrize("dist", stats._lm3_dist_map.keys()) + @pytest.mark.parametrize("dist", lm3_dist_map.keys()) @pytest.mark.parametrize("use_dask", [True, False]) def test_pwm_fit(self, dist, use_dask, random): """Test that the fitted parameters match parameters used to generate a random sample.""" + lmom = pytest.importorskip("lmoments3.distr") + lm3dc = getattr(lmom, lm3_dist_map[dist]) n = 500 dc = stats.get_dist(dist) par = self.params[dist] @@ -237,11 +256,10 @@ def test_pwm_fit(self, dist, use_dask, random): ) if use_dask: da = da.chunk() - out = stats.fit(da, dist=dist, method="PWM").compute() + out = stats.fit(da, dist=lm3dc, method="PWM").compute() # Check that values are identical to lmoments3's output dict - l3dc = stats.get_lm3_dist(dist) - expected = l3dc.lmom_fit(da.values) + expected = lm3dc.lmom_fit(da.values) for key, val in expected.items(): np.testing.assert_array_equal(out.sel(dparams=key), val, 1) @@ -270,15 +288,15 @@ def test_frequency_analysis(ndq_series, use_dask): q.transpose(), mode="max", t=2, dist="genextreme", window=6, freq="YS" ) - # Test with PWM fitting method - out1 = stats.frequency_analysis( - q, mode="max", t=2, dist="genextreme", window=6, freq="YS", method="PWM" - ) - np.testing.assert_allclose( - out1, - out, - rtol=0.5, - ) + # # Test with PWM fitting method + # out1 = stats.frequency_analysis( + # q, mode="max", t=2, dist="genextreme", 
window=6, freq="YS", method="PWM" + # ) + # np.testing.assert_allclose( + # out1, + # out, + # rtol=0.5, + # ) @pytest.mark.parametrize("use_dask", [True, False]) diff --git a/tox.ini b/tox.ini index afed81075..1dec72ec1 100644 --- a/tox.ini +++ b/tox.ini @@ -7,7 +7,7 @@ env_list = offline-prefetch py39-upstream-doctest py310 - py311 + py311-lmoments py312-numba labels = test = py39, py310-upstream-doctest, py311, notebooks_doctests, offline-prefetch @@ -105,6 +105,7 @@ deps = coverage: coveralls upstream: -rrequirements_upstream.txt sbck: pybind11 + lmoments: lmoments3 install_command = python -m pip install --no-user {opts} {packages} download = True commands_pre = diff --git a/xclim/core/utils.py b/xclim/core/utils.py index b994ca80f..89d440e80 100644 --- a/xclim/core/utils.py +++ b/xclim/core/utils.py @@ -139,9 +139,9 @@ def decorator(func): def wrapper(*args, **kwargs): msg = ( f"`{func.__name__}` is deprecated" - f"{' from version {}'.format(from_version) if from_version else ''} " + f"{f' from version {from_version}' if from_version else ''} " "and will be removed in a future version of xclim" - f"{'. Use `{}` instead'.format(suggested) if suggested else ''}. " + f"{f'. Use `{suggested}` instead' if suggested else ''}. " "Please update your scripts accordingly." 
) warnings.warn( @@ -689,7 +689,7 @@ def infer_kind_from_parameter(param) -> InputKind: if annot.issubset({"int", "float", "Sequence[int]", "Sequence[float]"}): return InputKind.NUMBER_SEQUENCE - if annot == {"str"}: + if annot.issuperset({"str"}): return InputKind.STRING if annot == {"DayOfYearStr"}: diff --git a/xclim/indices/stats.py b/xclim/indices/stats.py index 262a6065b..d479fff9d 100644 --- a/xclim/indices/stats.py +++ b/xclim/indices/stats.py @@ -6,8 +6,8 @@ from collections.abc import Sequence from typing import Any -import lmoments3.distr import numpy as np +import scipy.stats import xarray as xr from xclim.core.calendar import compare_offsets, resample_doy, select_time @@ -19,13 +19,11 @@ __all__ = [ "_fit_start", - "_lm3_dist_map", "dist_method", "fa", "fit", "frequency_analysis", "get_dist", - "get_lm3_dist", "parametric_cdf", "parametric_quantile", "preprocess_standardized_index", @@ -34,22 +32,6 @@ ] -# Map the scipy distribution name to the lmoments3 name. Distributions with mismatched parameters are excluded. -_lm3_dist_map = { - "expon": "exp", - "gamma": "gam", - "genextreme": "gev", - # "genlogistic": "glo", - # "gennorm": "gno", - "genpareto": "gpa", - "gumbel_r": "gum", - # "kappa4": "kap", - "norm": "nor", - "pearson3": "pe3", - "weibull_min": "wei", -} - - # Fit the parameters. # This would also be the place to impose constraints on the series minimum length if needed. def _fitfunc_1d(arr, *, dist, nparams, method, **fitkwargs): @@ -88,7 +70,7 @@ def _fitfunc_1d(arr, *, dist, nparams, method, **fitkwargs): def fit( da: xr.DataArray, - dist: str = "norm", + dist: str | scipy.stats.rv_continuous = "norm", method: str = "ML", dim: str = "time", **fitkwargs: Any, @@ -99,13 +81,12 @@ def fit( ---------- da : xr.DataArray Time series to be fitted along the time dimension. 
- dist : str + dist : str or rv_continuous distribution object Name of the univariate distribution, such as beta, expon, genextreme, gamma, gumbel_r, lognorm, norm - (see :py:mod:scipy.stats for full list). If the PWM method is used, only the following distributions are - currently supported: 'expon', 'gamma', 'genextreme', 'genpareto', 'gumbel_r', 'pearson3', 'weibull_min'. + (see :py:mod:scipy.stats for full list). Or the distribution object itself. method : {"ML" or "MLE", "MM", "PWM", "APP"} - Fitting method, either maximum likelihood (ML or MLE), method of moments (MM), - probability weighted moments (PWM), also called L-Moments, or approximate method (APP). + Fitting method, either maximum likelihood (ML or MLE), method of moments (MM) or approximate method (APP). + Can also be the probability weighted moments (PWM), also called L-Moments, if a compatible `dist` object is passed. The PWM method is usually more robust to outliers. dim : str The dimension upon which to perform the indexing (default: "time"). 
@@ -134,13 +115,9 @@ def fit( raise ValueError(f"Fitting method not recognized: {method}") # Get the distribution - dc = get_dist(dist) - if method == "PWM": - lm3dc = get_lm3_dist(dist) - else: - lm3dc = None + dist = get_dist(dist) - shape_params = [] if dc.shapes is None else dc.shapes.split(",") + shape_params = [] if dist.shapes is None else dist.shapes.split(",") dist_params = shape_params + ["loc", "scale"] data = xr.apply_ufunc( @@ -154,7 +131,7 @@ def fit( keep_attrs=True, kwargs=dict( # Don't know how APP should be included, this works for now - dist=dc if method in ["ML", "MLE", "MM", "APP"] else lm3dc, + dist=dist, nparams=len(dist_params), method=method, **fitkwargs, @@ -170,11 +147,11 @@ def fit( da.attrs, ["standard_name", "long_name", "units", "description"], "original_" ) attrs = dict( - long_name=f"{dist} parameters", - description=f"Parameters of the {dist} distribution", + long_name=f"{dist.name} parameters", + description=f"Parameters of the {dist.name} distribution", method=method, estimator=method_name[method].capitalize(), - scipy_dist=dist, + scipy_dist=dist.name, units="", history=update_history( f"Estimate distribution parameters by {method_name[method]} method along dimension {dim}.", @@ -186,7 +163,11 @@ def fit( return out -def parametric_quantile(p: xr.DataArray, q: float | Sequence[float]) -> xr.DataArray: +def parametric_quantile( + p: xr.DataArray, + q: float | Sequence[float], + dist: str | scipy.stats.rv_continuous | None = None, +) -> xr.DataArray: """Return the value corresponding to the given distribution parameters and quantile. Parameters @@ -197,6 +178,8 @@ def parametric_quantile(p: xr.DataArray, q: float | Sequence[float]) -> xr.DataA and attribute `scipy_dist`, storing the name of the distribution. q : float or Sequence of float Quantile to compute, which must be between `0` and `1`, inclusive. 
+ dist: str, rv_continuous instance, optional + The distribution name or instance is the `scipy_dist` attribute is not available on `p`. Returns ------- @@ -209,20 +192,18 @@ def parametric_quantile(p: xr.DataArray, q: float | Sequence[float]) -> xr.DataA """ q = np.atleast_1d(q) - # Get the distribution - dist = p.attrs["scipy_dist"] - dc = get_dist(dist) + dist = get_dist(dist or p.attrs["scipy_dist"]) # Create a lambda function to facilitate passing arguments to dask. There is probably a better way to do this. if np.all(q > 0.5): def func(x): - return dc.isf(1 - q, *x) + return dist.isf(1 - q, *x) else: def func(x): - return dc.ppf(q, *x) + return dist.ppf(q, *x) data = xr.apply_ufunc( func, @@ -242,8 +223,8 @@ def func(x): out.attrs = unprefix_attrs(p.attrs, ["units", "standard_name"], "original_") attrs = dict( - long_name=f"{dist} quantiles", - description=f"Quantiles estimated by the {dist} distribution", + long_name=f"{dist.name} quantiles", + description=f"Quantiles estimated by the {dist.name} distribution", cell_methods="dparams: ppf", history=update_history( "Compute parametric quantiles from distribution parameters", @@ -255,7 +236,11 @@ def func(x): return out -def parametric_cdf(p: xr.DataArray, v: float | Sequence[float]) -> xr.DataArray: +def parametric_cdf( + p: xr.DataArray, + v: float | Sequence[float], + dist: str | scipy.stats.rv_continuous | None = None, +) -> xr.DataArray: """Return the cumulative distribution function corresponding to the given distribution parameters and value. Parameters @@ -266,6 +251,8 @@ def parametric_cdf(p: xr.DataArray, v: float | Sequence[float]) -> xr.DataArray: and attribute `scipy_dist`, storing the name of the distribution. v : float or Sequence of float Value to compute the CDF. + dist: str, rv_continuous instance, optional + The distribution name or instance if the `scipy_dist` attribute is not available on `p`. 
Returns ------- @@ -274,13 +261,11 @@ def parametric_cdf(p: xr.DataArray, v: float | Sequence[float]) -> xr.DataArray: """ v = np.atleast_1d(v) - # Get the distribution - dist = p.attrs["scipy_dist"] - dc = get_dist(dist) + dist = get_dist(dist or p.attrs["scipy_dist"]) # Create a lambda function to facilitate passing arguments to dask. There is probably a better way to do this. def func(x): - return dc.cdf(v, *x) + return dist.cdf(v, *x) data = xr.apply_ufunc( func, @@ -300,8 +285,8 @@ def func(x): out.attrs = unprefix_attrs(p.attrs, ["units", "standard_name"], "original_") attrs = dict( - long_name=f"{dist} cdf", - description=f"CDF estimated by the {dist} distribution", + long_name=f"{dist.name} cdf", + description=f"CDF estimated by the {dist.name} distribution", cell_methods="dparams: cdf", history=update_history( "Compute parametric cdf from distribution parameters", @@ -316,7 +301,7 @@ def func(x): def fa( da: xr.DataArray, t: int | Sequence, - dist: str = "norm", + dist: str | scipy.stats.rv_continuous = "norm", mode: str = "max", method: str = "ML", ) -> xr.DataArray: @@ -329,14 +314,15 @@ def fa( t : int or Sequence of int Return period. The period depends on the resolution of the input data. If the input array's resolution is yearly, then the return period is in years. - dist : str + dist : str or rv_continuous instance Name of the univariate distribution, such as: `beta`, `expon`, `genextreme`, `gamma`, `gumbel_r`, `lognorm`, `norm` + Or the distribution instance itself. mode : {'min', 'max} Whether we are looking for a probability of exceedance (max) or a probability of non-exceedance (min). method : {"ML", "MLE", "MOM", "PWM", "APP"} - Fitting method, either maximum likelihood (ML or MLE), method of moments (MOM), - probability weighted moments (PWM), also called L-Moments, or approximate method (APP). + Fitting method, either maximum likelihood (ML or MLE), method of moments (MOM) or approximate method (APP). 
+ Also accpets probability weighted moments (PWM), also called L-Moments, if `dist` is an instance from the lmoments3 library. The PWM method is usually more robust to outliers. Returns @@ -363,7 +349,7 @@ def fa( # Compute the quantiles out = ( - parametric_quantile(p, q) + parametric_quantile(p, q, dist) .rename({"quantile": "return_period"}) .assign_coords(return_period=t) ) @@ -375,7 +361,7 @@ def frequency_analysis( da: xr.DataArray, mode: str, t: int | Sequence[int], - dist: str, + dist: str | scipy.stats.rv_continuous, window: int = 1, freq: str | None = None, method: str = "ML", @@ -392,16 +378,17 @@ def frequency_analysis( t : int or sequence Return period. The period depends on the resolution of the input data. If the input array's resolution is yearly, then the return period is in years. - dist : str + dist : str or rv_continuous Name of the univariate distribution, e.g. `beta`, `expon`, `genextreme`, `gamma`, `gumbel_r`, `lognorm`, `norm`. + Or an instance of the distribution. window : int Averaging window length (days). freq : str, optional Resampling frequency. If None, the frequency is assumed to be 'YS' unless the indexer is season='DJF', in which case `freq` would be set to `YS-DEC`. method : {"ML" or "MLE", "MOM", "PWM", "APP"} - Fitting method, either maximum likelihood (ML or MLE), method of moments (MOM), - probability weighted moments (PWM), also called L-Moments, or approximate method (APP). + Fitting method, either maximum likelihood (ML or MLE), method of moments (MOM) or approximate method (APP). + Also accpets probability weighted moments (PWM), also called L-Moments, if `dist` is an instance from the lmoments3 library. The PWM method is usually more robust to outliers. \*\*indexer Time attribute and values over which to subset the array. 
For example, use season='DJF' to select winter values, @@ -435,28 +422,18 @@ def frequency_analysis( return fa(sel, t, dist=dist, mode=mode, method=method) -def get_dist(dist: str): +def get_dist(dist: str | scipy.stats.rv_continuous): """Return a distribution object from `scipy.stats`.""" - from scipy import stats # pylint: disable=import-outside-toplevel + if isinstance(dist, scipy.stats.rv_continuous): + return dist - dc = getattr(stats, dist, None) + dc = getattr(scipy.stats, dist, None) if dc is None: e = f"Statistical distribution `{dist}` is not found in scipy.stats." raise ValueError(e) return dc -def get_lm3_dist(dist: str): - """Return a distribution object from `lmoments3.distr`.""" - if dist not in _lm3_dist_map: - raise ValueError( - f"The PWM fitting method cannot be used with the {dist} distribution, as it is not supported " - f"by `lmoments3`." - ) - - return getattr(lmoments3.distr, _lm3_dist_map[dist]) - - def _fit_start(x, dist: str, **fitkwargs: Any) -> tuple[tuple, dict]: r"""Return initial values for distribution parameters. @@ -532,7 +509,9 @@ def _fit_start(x, dist: str, **fitkwargs: Any) -> tuple[tuple, dict]: return (), {} -def _dist_method_1D(*args, dist: str, function: str, **kwargs: Any) -> xr.DataArray: +def _dist_method_1D( + *args, dist: str | scipy.stats.rv_continuous, function: str, **kwargs: Any +) -> xr.DataArray: r"""Statistical function for given argument on given distribution initialized with params. See :py:ref:`scipy:scipy.stats.rv_continuous` for all available functions and their arguments. @@ -561,6 +540,7 @@ def dist_method( function: str, fit_params: xr.DataArray, arg: xr.DataArray | None = None, + dist: str | scipy.stats.rv_continuous | None = None, **kwargs: Any, ) -> xr.DataArray: r"""Vectorized statistical function for given argument on given distribution initialized with params. @@ -574,9 +554,10 @@ def dist_method( The name of the function to call. 
fit_params : xr.DataArray Distribution parameters are along `dparams`, in the same order as given by :py:func:`fit`. - Must have a `scipy_dist` attribute with the name of the distribution fitted. arg : array_like, optional The first argument for the requested function if different from `fit_params`. + dist : str or rv_continuous, optional + The distribution name or instance. Defaults to the `scipy_dist` attribute of `fit_params`. \*\*kwargs Other parameters to pass to the function call. @@ -595,7 +576,11 @@ def dist_method( return xr.apply_ufunc( _dist_method_1D, *args, - kwargs={"dist": fit_params.attrs["scipy_dist"], "function": function, **kwargs}, + kwargs={ + "dist": dist or fit_params.attrs["scipy_dist"], + "function": function, + **kwargs, + }, output_dtypes=[float], dask="parallelized", ) @@ -668,7 +653,7 @@ def standardized_index_fit_params( da: xr.DataArray, freq: str | None, window: int, - dist: str, + dist: str | scipy.stats.rv_continuous, method: str, offset: Quantified | None = None, **indexer, @@ -690,7 +675,7 @@ def standardized_index_fit_params( window : int Averaging window length relative to the resampling frequency. For example, if `freq="MS"`, i.e. a monthly resampling, the window is an integer number of months. - dist : {'gamma', 'fisk'} + dist : {'gamma', 'fisk'} or rv_continuous instance Name of the univariate distribution. (see :py:mod:`scipy.stats`). method : {'ML', 'APP', 'PWM'} Name of the fitting method, such as `ML` (maximum likelihood), `APP` (approximate). 
The approximate method @@ -715,11 +700,12 @@ def standardized_index_fit_params( """ # "WPM" method doesn't seem to work for gamma or pearson3 dist_and_methods = {"gamma": ["ML", "APP", "PWM"], "fisk": ["ML", "APP"]} - if dist not in dist_and_methods: - raise NotImplementedError(f"The distribution `{dist}` is not supported.") - if method not in dist_and_methods[dist]: + dist = get_dist(dist) + if dist.name not in dist_and_methods: + raise NotImplementedError(f"The distribution `{dist.name}` is not supported.") + if method not in dist_and_methods[dist.name]: raise NotImplementedError( - f"The method `{method}` is not supported for distribution `{dist}`." + f"The method `{method}` is not supported for distribution `{dist.name}`." ) if offset is not None: @@ -736,7 +722,7 @@ def standardized_index_fit_params( "calibration_period": cal_range, "freq": freq or "", "window": window, - "scipy_dist": dist, + "scipy_dist": dist.name, "method": method, "group": group, "units": "", @@ -748,7 +734,11 @@ def standardized_index_fit_params( return params -def standardized_index(da: xr.DataArray, params: xr.DataArray): +def standardized_index( + da: xr.DataArray, + params: xr.DataArray, + dist: str | scipy.stats.rv_continuous | None = None, +): """Compute standardized index for given fit parameters. This computes standardized indices which measure the deviation of variables in the dataset compared @@ -764,6 +754,8 @@ def standardized_index(da: xr.DataArray, params: xr.DataArray): ``xclim.indices.preprocess_standardized_index``. params : xarray.DataArray Fit parameters computed using ``xclim.indices.stats.standardized_index_fit_params``. + dist : str or rv_continuous, optional + Name of distribution or instance. Defaults to the "scipy_dist" attribute of `params`. 
""" group = params.attrs["group"] @@ -779,7 +771,7 @@ def reindex_time(da, da_ref): lambda x: (x == 0).sum("time") / x.notnull().sum("time") ) params, probs_of_zero = (reindex_time(dax, da) for dax in [params, probs_of_zero]) - dist_probs = dist_method("cdf", params, da) + dist_probs = dist_method("cdf", params, da, dist=dist) probs = probs_of_zero + ((1 - probs_of_zero) * dist_probs) params_norm = xr.DataArray( From ca332ab476f645794e26a6f3bf8dae6378d7c320 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Tue, 13 Feb 2024 18:18:23 -0500 Subject: [PATCH 2/8] Better error message --- xclim/indices/stats.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/xclim/indices/stats.py b/xclim/indices/stats.py index d479fff9d..e08bb12e6 100644 --- a/xclim/indices/stats.py +++ b/xclim/indices/stats.py @@ -117,6 +117,11 @@ def fit( # Get the distribution dist = get_dist(dist) + if method == "PWM" and not hasattr(dist, "lmom_fit"): + raise ValueError( + f"The given distribution {dist} does not implement the PWM fitting method. Please pass an instance from the lmoments3 package." + ) + shape_params = [] if dist.shapes is None else dist.shapes.split(",") dist_params = shape_params + ["loc", "scale"] From 5826b0b3902713c8d4276cd0e57c10b92ec109ed Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Tue, 13 Feb 2024 18:25:59 -0500 Subject: [PATCH 3/8] Add pr numbers - fix typo --- CHANGES.rst | 4 ++-- xclim/indices/stats.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 0e0346b23..65ca43371 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -22,7 +22,7 @@ New features and enhancements * New ``xclim.core.calendar.stack_periods`` and ``unstack_periods`` for performing ``rolling(time=...).construct(..., stride=...)`` but with non-uniform temporal periods like years or months. 
They replace ``xclim.sdba.processing.construct_moving_yearly_window`` and ``unpack_moving_yearly_window`` which are deprecated and will be removed in a future release. * New ``as_dataset`` options for ``xclim.set_options``. When True, indicators will output Datasets instead of DataArrays. (:issue:`1257`, :pull:`1625`). * Added new option for UTCI calculation to cap low wind velocities to a minimum of 0.5 m/s following Bröde (2012) guidelines. (:issue:`1634`, :pull:`1635`). -* Distribution instances can now be passed to the ``dist`` argument of most statistical indices. +* Distribution instances can now be passed to the ``dist`` argument of most statistical indices. (:pull:`1644`). Breaking changes ^^^^^^^^^^^^^^^^ @@ -40,7 +40,7 @@ Breaking changes * `black` formatting style has been updated to the 2024 stable conventions. `isort` has been added to the `dev` installation recipe. (:pull:`1626`). * The indice and indicator for ``winter_storm`` has been removed (deprecated since `xclim` v0.46.0 in favour of ``snd_storm_days``). (:pull:`1565`). * `xclim` has dropped support for `scipy` version below v1.9.0 and `numpy` versions below v1.20.0. (:pull:`1565`). -* `lmoments3` was removed as a dependency of xclim due to its incompatible license (GPLv3 vs xclim's Apache). If we get the approbation from all intellectual property owners of the package, we might put it back. See `Ouranosinc/lmoments3#12 `_. See also the "frequency analysis" notebook for an example on how to continue using the probability weighted moments method for fitting distributions. +* `lmoments3` was removed as a dependency of xclim due to its incompatible license (GPLv3 vs xclim's Apache). If we get the approbation from all intellectual property owners of the package, we might put it back. See `Ouranosinc/lmoments3#12 `_. See also the "frequency analysis" notebook for an example on how to continue using the probability weighted moments method for fitting distributions. (:issue:`1620`, :pull:`1644`). 
Bug fixes ^^^^^^^^^ diff --git a/xclim/indices/stats.py b/xclim/indices/stats.py index e08bb12e6..9f73edea9 100644 --- a/xclim/indices/stats.py +++ b/xclim/indices/stats.py @@ -327,7 +327,7 @@ def fa( Whether we are looking for a probability of exceedance (max) or a probability of non-exceedance (min). method : {"ML", "MLE", "MOM", "PWM", "APP"} Fitting method, either maximum likelihood (ML or MLE), method of moments (MOM) or approximate method (APP). - Also accpets probability weighted moments (PWM), also called L-Moments, if `dist` is an instance from the lmoments3 library. + Also accepts probability weighted moments (PWM), also called L-Moments, if `dist` is an instance from the lmoments3 library. The PWM method is usually more robust to outliers. Returns @@ -393,7 +393,7 @@ def frequency_analysis( in which case `freq` would be set to `YS-DEC`. method : {"ML" or "MLE", "MOM", "PWM", "APP"} Fitting method, either maximum likelihood (ML or MLE), method of moments (MOM) or approximate method (APP). - Also accpets probability weighted moments (PWM), also called L-Moments, if `dist` is an instance from the lmoments3 library. + Also accepts probability weighted moments (PWM), also called L-Moments, if `dist` is an instance from the lmoments3 library. The PWM method is usually more robust to outliers. \*\*indexer Time attribute and values over which to subset the array. 
For example, use season='DJF' to select winter values, From 4caf72417c933461137e3ddf1b0df2206a8595b8 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Tue, 13 Feb 2024 18:40:24 -0500 Subject: [PATCH 4/8] Ensure kwargs are seen as such in parameters --- .github/workflows/main.yml | 2 +- xclim/core/utils.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4f3dac00c..b15df6f32 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -120,7 +120,7 @@ jobs: - tox-env: py39-coverage-sbck python-version: "3.9" markers: -m 'not slow' - - tox-env: py310-coverage # No markers -- includes slow tests + - tox-env: py310-coverage-lmoments # No markers -- includes slow tests python-version: "3.10" - tox-env: py311-coverage-sbck python-version: "3.11" diff --git a/xclim/core/utils.py b/xclim/core/utils.py index 89d440e80..d84d42b04 100644 --- a/xclim/core/utils.py +++ b/xclim/core/utils.py @@ -680,6 +680,9 @@ def infer_kind_from_parameter(param) -> InputKind: if param.name == "freq": return InputKind.FREQ_STR + if param.kind == param.VAR_KEYWORD: + return InputKind.KWARGS + if annot == {"Quantified"}: return InputKind.QUANTIFIED @@ -704,9 +707,6 @@ def infer_kind_from_parameter(param) -> InputKind: if annot == {"Dataset"}: return InputKind.DATASET - if param.kind == param.VAR_KEYWORD: - return InputKind.KWARGS - return InputKind.OTHER_PARAMETER From 15dcd27c8f44d01d47c41eb2be7e602147b197cb Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Wed, 14 Feb 2024 10:48:09 -0500 Subject: [PATCH 5/8] Apply suggestions from code review Co-authored-by: David Huard --- CHANGES.rst | 2 +- xclim/indices/stats.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 65ca43371..5683ed57e 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -48,7 +48,7 @@ Bug fixes * Fix wrong `window` attributes in 
``xclim.indices.standardized_precipitation_index``, ``xclim.indices.standardized_precipitation_evapotranspiration_index``. (:issue:`1552` :pull:`1554`). * Fix the daily case `freq='D'` of ``xclim.stats.preprocess_standardized_index`` (:issue:`1602` :pull:`1607`). * Several spelling mistakes have been corrected within the documentation and codebase. (:pull:`1576`). -* Added missing ``xclim.ensembles.robustness_fractions`` and ``xclim.ensembles.robistness_categories`` in api doc section. (:pull:`1630`). +* Added missing ``xclim.ensembles.robustness_fractions`` and ``xclim.ensembles.robustness_categories`` in api doc section. (:pull:`1630`). Internal changes ^^^^^^^^^^^^^^^^ diff --git a/xclim/indices/stats.py b/xclim/indices/stats.py index 9f73edea9..44bb4db4c 100644 --- a/xclim/indices/stats.py +++ b/xclim/indices/stats.py @@ -184,7 +184,7 @@ def parametric_quantile( q : float or Sequence of float Quantile to compute, which must be between `0` and `1`, inclusive. dist: str, rv_continuous instance, optional - The distribution name or instance is the `scipy_dist` attribute is not available on `p`. + The distribution name or instance if the `scipy_dist` attribute is not available on `p`. 
Returns ------- From 0214348baa9bf7c908720b7fbd72723c13784e86 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Wed, 14 Feb 2024 10:50:13 -0500 Subject: [PATCH 6/8] restore freq_anal test with PWM --- tests/test_stats.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/tests/test_stats.py b/tests/test_stats.py index 3700a9f24..adbb6b4c4 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -288,15 +288,27 @@ def test_frequency_analysis(ndq_series, use_dask): q.transpose(), mode="max", t=2, dist="genextreme", window=6, freq="YS" ) - # # Test with PWM fitting method - # out1 = stats.frequency_analysis( - # q, mode="max", t=2, dist="genextreme", window=6, freq="YS", method="PWM" - # ) - # np.testing.assert_allclose( - # out1, - # out, - # rtol=0.5, - # ) + +@pytest.mark.parametrize("use_dask", [True, False]) +@pytest.mark.filterwarnings("ignore::RuntimeWarning") +def test_frequency_analysis_lmoments(ndq_series, use_dask): + lmom = pytest.importorskip("lmoments3.distr") + q = ndq_series.copy() + q[:, 0, 0] = np.nan + if use_dask: + q = q.chunk() + + out = stats.frequency_analysis( + q, mode="max", t=2, dist="genextreme", window=6, freq="YS" + ) + out1 = stats.frequency_analysis( + q, mode="max", t=2, dist=lmom.gev, window=6, freq="YS", method="PWM" + ) + np.testing.assert_allclose( + out1, + out, + rtol=0.5, + ) @pytest.mark.parametrize("use_dask", [True, False]) From 302249ae7d223427892e6912b89473096c6bd91c Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Wed, 14 Feb 2024 11:02:02 -0500 Subject: [PATCH 7/8] add lmoments to notebook tests --- tox.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/tox.ini b/tox.ini index 1dec72ec1..34ef199ae 100644 --- a/tox.ini +++ b/tox.ini @@ -106,6 +106,7 @@ deps = upstream: -rrequirements_upstream.txt sbck: pybind11 lmoments: lmoments3 + notebooks_doctests: lmoments3 install_command = python -m pip install --no-user {opts} {packages} download = True commands_pre = 
From 0449922f0e2fe009752f49e59963f836176d802c Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Thu, 15 Feb 2024 13:59:04 -0500 Subject: [PATCH 8/8] Apply suggestions from code review Co-authored-by: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com> --- CHANGES.rst | 2 +- docs/notebooks/frequency_analysis.ipynb | 2 +- xclim/indices/stats.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 5683ed57e..1a527730a 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -40,7 +40,7 @@ Breaking changes * `black` formatting style has been updated to the 2024 stable conventions. `isort` has been added to the `dev` installation recipe. (:pull:`1626`). * The indice and indicator for ``winter_storm`` has been removed (deprecated since `xclim` v0.46.0 in favour of ``snd_storm_days``). (:pull:`1565`). * `xclim` has dropped support for `scipy` version below v1.9.0 and `numpy` versions below v1.20.0. (:pull:`1565`). -* `lmoments3` was removed as a dependency of xclim due to its incompatible license (GPLv3 vs xclim's Apache). If we get the approbation from all intellectual property owners of the package, we might put it back. See `Ouranosinc/lmoments3#12 <https://github.com/Ouranosinc/lmoments3/issues/12>`_. See also the "frequency analysis" notebook for an example on how to continue using the probability weighted moments method for fitting distributions. (:issue:`1620`, :pull:`1644`). +* `lmoments3` was removed as a dependency of `xclim` due to incompatible licensing (GPLv3 vs `xclim`'s Apache 2.0). Depending on the outcome of efforts to modify the licensing of `lmoments3`, this change may eventually be reverted. See `Ouranosinc/lmoments3#12 <https://github.com/Ouranosinc/lmoments3/issues/12>`_. See also the "frequency analysis" notebook for an example on how to continue using the probability weighted moments method for fitting distributions. (:issue:`1620`, :pull:`1644`).
Bug fixes ^^^^^^^^^ diff --git a/docs/notebooks/frequency_analysis.ipynb b/docs/notebooks/frequency_analysis.ipynb index 800671331..0e68b1a18 100644 --- a/docs/notebooks/frequency_analysis.ipynb +++ b/docs/notebooks/frequency_analysis.ipynb @@ -105,7 +105,7 @@ "source": [ "The next step is to fit the statistical distribution on these maxima. This is done by the `.fit` method, which takes as argument the sample series, the distribution's name and the parameter estimation `method`. The fit is done by default using the Maximum Likelihood algorithm (`method=\"ML\"`). Parameters can also be estimated using the method of moments (`method=\"MM\"`).\n", "\n", - "xclim can also accept a distribution instance instead of name (i.e. a subclass of `scipy.stats.rv_continuous`). For example, for some extreme value distributions, the maximum likelihood is not always robust. Using the Probability Weighted Moments (`method=\"PWM\"`) method can help in that case. This is possible by passing a distribution object from the `lmoments3` package together with `method=\"PWM\"`. That package currently only supports `expon`, `gamma`, `genextreme`, `genpareto`, `gumbel_r`, `pearson3` and `weibull_min` (with other names, see [the documentation](https://lmoments3.readthedocs.io/en/stable/distributions.html)). In the following example, we fit using the \"Generalized extreme value\" distribution from `lmoments3`." + "`xclim` can also accept a distribution instance instead of a name (i.e. an instance of a subclass of `scipy.stats.rv_continuous`). For example, for some extreme value distributions, the maximum likelihood is not always robust. Using the \"Probability Weighted Moments\" (`method=\"PWM\"`) method can help in that case. This is possible by passing a distribution object from the `lmoments3` package together with `method=\"PWM\"`.
That package currently only supports `expon`, `gamma`, `genextreme`, `genpareto`, `gumbel_r`, `pearson3`, and `weibull_min` (with other names, see [the documentation](https://lmoments3.readthedocs.io/en/stable/distributions.html)). In the following example, we fit using the \"Generalized extreme value\" distribution from `lmoments3`." ] }, { diff --git a/xclim/indices/stats.py b/xclim/indices/stats.py index 44bb4db4c..8c5a559a6 100644 --- a/xclim/indices/stats.py +++ b/xclim/indices/stats.py @@ -83,7 +83,7 @@ def fit( Time series to be fitted along the time dimension. dist : str or rv_continuous distribution object Name of the univariate distribution, such as beta, expon, genextreme, gamma, gumbel_r, lognorm, norm - (see :py:mod:scipy.stats for full list). Or the distribution object itself. + (see :py:mod:`scipy.stats` for full list) or the distribution object itself. method : {"ML" or "MLE", "MM", "PWM", "APP"} Fitting method, either maximum likelihood (ML or MLE), method of moments (MM) or approximate method (APP). Can also be the probability weighted moments (PWM), also called L-Moments, if a compatible `dist` object is passed.