From c556b46747d12f2bd9a20f0b51734e9da0e2c88c Mon Sep 17 00:00:00 2001 From: Osvaldo A Martin Date: Thu, 3 Oct 2024 16:31:56 -0300 Subject: [PATCH] Add combine_roulette function (#555) * Add combine_roulette function * check weights are positive --- preliz/internal/optimization.py | 14 ++-- preliz/tests/test_combine_roulette.py | 49 ++++++++++++ preliz/unidimensional/__init__.py | 3 +- preliz/unidimensional/combine_roulette.py | 91 +++++++++++++++++++++++ preliz/unidimensional/roulette.py | 44 +++++++---- 5 files changed, 177 insertions(+), 24 deletions(-) create mode 100644 preliz/tests/test_combine_roulette.py create mode 100644 preliz/unidimensional/combine_roulette.py diff --git a/preliz/internal/optimization.py b/preliz/internal/optimization.py index 7290c8af..2048b42d 100644 --- a/preliz/internal/optimization.py +++ b/preliz/internal/optimization.py @@ -77,18 +77,18 @@ def func(params, dist, x_vals): return opt -def optimize_cdf(dist, x_vals, ecdf, none_idx, fixed): - def func(params, dist, x_vals, ecdf): +def optimize_pdf(dist, x_vals, epdf, none_idx, fixed): + def func(params, dist, x_vals, epdf): params = get_params(dist, params, none_idx, fixed) dist._parametrization(**params) - loss = dist.cdf(x_vals) - ecdf + loss = dist.pdf(x_vals) - epdf return loss init_vals = np.array(dist.params)[none_idx] bounds = np.array(dist.params_support)[none_idx] bounds = list(zip(*bounds)) - opt = least_squares(func, x0=init_vals, args=(dist, x_vals, ecdf), bounds=bounds) + opt = least_squares(func, x0=init_vals, args=(dist, x_vals, epdf), bounds=bounds) params = get_params(dist, opt["x"], none_idx, fixed) dist._parametrization(**params) loss = opt["cost"] @@ -305,9 +305,9 @@ def get_distributions(dist_names): return dists -def fit_to_ecdf(selected_distributions, x_vals, ecdf, mean, std, x_min, x_max, extra_pros): +def fit_to_epdf(selected_distributions, x_vals, epdf, mean, std, x_min, x_max, extra_pros): """ - Minimize the difference between the cdf and the ecdf over a 
grid of values + Minimize the difference between the pdf and the epdf over a grid of values defined by x_min and x_max Note: This function is intended to be used with pz.roulette @@ -325,7 +325,7 @@ def fit_to_ecdf(selected_distributions, x_vals, ecdf, mean, std, x_min, x_max, e if dist._check_endpoints(x_min, x_max, raise_error=False): none_idx, fixed = get_fixed_params(dist) dist._fit_moments(mean, std) # pylint:disable=protected-access - loss = optimize_cdf(dist, x_vals, ecdf, none_idx, fixed) + loss = optimize_pdf(dist, x_vals, epdf, none_idx, fixed) fitted.update(loss, dist) diff --git a/preliz/tests/test_combine_roulette.py b/preliz/tests/test_combine_roulette.py new file mode 100644 index 00000000..2204df3d --- /dev/null +++ b/preliz/tests/test_combine_roulette.py @@ -0,0 +1,49 @@ +import pytest +from numpy.testing import assert_almost_equal +from preliz import combine_roulette +from preliz.distributions import BetaScaled, LogNormal, StudentT + +response0 = ( + [1.5, 2.5, 3.5], + [0.32142857142857145, 0.35714285714285715, 0.32142857142857145], + 28, + 0, + 10, + 10, + 11, +) +response1 = ( + [7.5, 8.5, 9.5], + [0.32142857142857145, 0.35714285714285715, 0.32142857142857145], + 28, + 0, + 10, + 10, + 11, +) +response2 = ([9.5], [1], 10, 0, 10, 10, 11) +response3 = ([9.5], [1], 10, 0, 10, 10, 14) + + +@pytest.mark.parametrize( + "responses, weights, dist_names, params, result", + [ + ([response0, response1], [0.5, 0.5], None, None, BetaScaled(1.2, 1, 0, 10)), + ( + [response0, response1], + [0.5, 0.5], + ["Beta", "StudentT"], + "TruncatedNormal(lower=0), StudentT(nu=1000)", + StudentT(1000, 5.5, 3.1), + ), + ([response0, response2], [1, 1], None, None, LogNormal(1.1, 0.6)), + ], +) +def test_combine_roulette(responses, weights, dist_names, params, result): + dist = combine_roulette(responses, weights, dist_names, params) + assert_almost_equal(dist.params, result.params, decimal=1) + + +def test_combine_roulette_error(): + with pytest.raises(ValueError): + 
combine_roulette([response0, response3]) diff --git a/preliz/unidimensional/__init__.py b/preliz/unidimensional/__init__.py index ac013f75..e625e7e6 100644 --- a/preliz/unidimensional/__init__.py +++ b/preliz/unidimensional/__init__.py @@ -1,8 +1,9 @@ from .beta_mode import beta_mode +from .combine_roulette import combine_roulette from .maxent import maxent from .mle import mle from .quartile import quartile from .quartile_int import quartile_int from .roulette import Roulette -__all__ = ["beta_mode", "maxent", "mle", "Roulette", "quartile", "quartile_int"] +__all__ = ["beta_mode", "combine_roulette", "maxent", "mle", "Roulette", "quartile", "quartile_int"] diff --git a/preliz/unidimensional/combine_roulette.py b/preliz/unidimensional/combine_roulette.py new file mode 100644 index 00000000..f7b8fa9e --- /dev/null +++ b/preliz/unidimensional/combine_roulette.py @@ -0,0 +1,91 @@ +import numpy as np + +from preliz.internal.distribution_helper import process_extra +from preliz.internal.optimization import fit_to_epdf, get_distributions + + +def combine_roulette(responses, weights=None, dist_names=None, params=None): + """ + Combine multiple elicited distributions into a single distribution. + + Parameters + ---------- + responses : list of tuples + Typically, each tuple comes from the ``.inputs`` attribute of a ``Roulette`` object and + represents a single elicited distribution. + weights : array-like, optional + Weights for each elicited distribution. Defaults to None, i.e. equal weights. + The weights must be positive. If they do not sum to 1, they will be normalized. + dist_names: list + List of distribution names to be used in the elicitation. + Defaults to ["Normal", "BetaScaled", "Gamma", "LogNormal", "StudentT"]. + params : str, optional + Extra parameters to be passed to the distributions. The format is a string with the + PreliZ's distribution name followed by the argument to fix. + For example: "TruncatedNormal(lower=0), StudentT(nu=8)".
+ + Returns + ------- + PreliZ distribution + """ + + if params is not None: + extra_pros = process_extra(params) + else: + extra_pros = [] + + if weights is None: + weights = np.full(len(responses), 1 / len(responses)) + else: + weights = np.array(weights, dtype=float) + + if np.any(weights <= 0): + raise ValueError("The weights must be positive.") + + weights /= weights.sum() + + if not all(records[3:] == responses[0][3:] for records in responses): + raise ValueError( + "To combine single elicitation instances, the grid should be the same for all of them." + ) + + if dist_names is None: + dist_names = ["Normal", "BetaScaled", "Gamma", "LogNormal", "StudentT"] + + new_pdf = {} + for records, weight in zip(responses, weights): + chips = records[2] + for x_i, pdf_i in zip(records[0], records[1]): + if x_i in new_pdf: + new_pdf[x_i] += pdf_i * weight * chips + else: + new_pdf[x_i] = pdf_i * weight * chips + + total = sum(new_pdf.values()) + mean = 0 + for x_i, pdf_i in new_pdf.items(): + val = pdf_i / total + mean += x_i * val + new_pdf[x_i] = val + + var = 0 + for x_i, pdf_i in new_pdf.items(): + var += pdf_i * (x_i - mean) ** 2 + std = var**0.5 + + # Assuming all the elicited distributions have the same x_min and x_max + x_min = responses[0][3] + x_max = responses[0][4] + + fitted_dist = fit_to_epdf( + get_distributions(dist_names), + list(new_pdf.keys()), + list(new_pdf.values()), + mean, + std, + x_min, + x_max, + extra_pros, + ) + + return fitted_dist diff --git a/preliz/unidimensional/roulette.py b/preliz/unidimensional/roulette.py index a7f8950d..0a2f187f 100644 --- a/preliz/unidimensional/roulette.py +++ b/preliz/unidimensional/roulette.py @@ -9,7 +9,7 @@ except ImportError: pass -from ..internal.optimization import fit_to_ecdf, get_distributions +from ..internal.optimization import fit_to_epdf, get_distributions from ..internal.plot_helper import check_inside_notebook, representations from ..internal.distribution_helper import process_extra from 
..distributions import all_discrete, all_continuous @@ -36,9 +36,10 @@ def __init__( Number of columns for the grid. Defaults to 11. dist_names: list List of distributions names to be used in the elicitation. - For example: ["Normal", "StudentT"]. - Default to None, almost all 1D distributions available in PreliZ will be used, - with some exceptions like Uniform or Cauchy. + Defaults to None, in which case the pre-selected distributions are ["Normal", "BetaScaled", + "Gamma", "LogNormal", "StudentT"], but almost all of PreliZ's 1D distributions + are available to be selected from the menu, with some exceptions like Uniform + or Cauchy. params: Optional[str]: Extra parameters to be passed to the distributions. The format is a string with the PreliZ's distribution name followed by the argument to fix. @@ -49,7 +50,11 @@ def __init__( Returns ------- - PreliZ distribution + Roulette object + The object has many attributes, but the most important are: + - dist: The fitted distribution + - inputs: A tuple with the x values, the empirical pdf, the total + chips, the x_min, the x_max, the number of rows and the number of columns.
References ---------- @@ -65,7 +70,7 @@ def __init__( self._figsize = figsize self._w_extra = params self.dist = None - self._hist = None + self.inputs = None check_inside_notebook(need_widget=True) @@ -151,7 +156,7 @@ def _create_grid(self): def _on_leave_fig(self): extra_pros = process_extra(self._widgets["w_extra"].value) - x_vals, ecdf, probs, mean, std, filled_columns = self._weights_to_ecdf() + x_vals, epdf, mean, std, filled_columns = self._weights_to_pdf() fitted_dist = None if filled_columns > 1: @@ -159,10 +164,10 @@ def _on_leave_fig(self): if selected_distributions: self._reset_dist_panel(yticks=False) - fitted_dist = fit_to_ecdf( + fitted_dist = fit_to_epdf( selected_distributions, x_vals, - ecdf, + epdf, mean, std, self._x_min, @@ -178,20 +183,27 @@ def _on_leave_fig(self): self._reset_dist_panel(yticks=True) self._fig.canvas.draw() - self.hist = (x_vals, probs) + self.inputs = ( + x_vals, + epdf, + sum(self._grid._weights.values()), + self._x_min, + self._x_max, + self._nrows, + self._ncols, + ) self.dist = fitted_dist - def _weights_to_ecdf(self): + def _weights_to_pdf(self): step = (self._x_max - self._x_min) / (self._ncols - 1) x_vals = [(k + 0.5) * step + self._x_min for k, v in self._grid._weights.items() if v != 0] total = sum(self._grid._weights.values()) - probabilities = [v / total for v in self._grid._weights.values() if v != 0] - cum_sum = np.cumsum(probabilities) + epdf = [v / total for v in self._grid._weights.values() if v != 0] - mean = sum(value * prob for value, prob in zip(x_vals, probabilities)) - std = (sum(prob * (value - mean) ** 2 for value, prob in zip(x_vals, probabilities))) ** 0.5 + mean = sum(prob * value for value, prob in zip(x_vals, epdf)) + std = (sum(prob * (value - mean) ** 2 for value, prob in zip(x_vals, epdf))) ** 0.5 - return x_vals, cum_sum, probabilities, mean, std, len(x_vals) + return x_vals, epdf, mean, std, len(x_vals) def _update_grid(self): self._ax_grid.cla()