From c556b46747d12f2bd9a20f0b51734e9da0e2c88c Mon Sep 17 00:00:00 2001
From: Osvaldo A Martin <aloctavodia@gmail.com>
Date: Thu, 3 Oct 2024 16:31:56 -0300
Subject: [PATCH] Add combine_roulette function (#555)

* Add combine_roulette function

* check weights are positive
---
 preliz/internal/optimization.py           | 14 ++--
 preliz/tests/test_combine_roulette.py     | 49 ++++++++++++
 preliz/unidimensional/__init__.py         |  3 +-
 preliz/unidimensional/combine_roulette.py | 91 +++++++++++++++++++++++
 preliz/unidimensional/roulette.py         | 44 +++++++----
 5 files changed, 177 insertions(+), 24 deletions(-)
 create mode 100644 preliz/tests/test_combine_roulette.py
 create mode 100644 preliz/unidimensional/combine_roulette.py

diff --git a/preliz/internal/optimization.py b/preliz/internal/optimization.py
index 7290c8af..2048b42d 100644
--- a/preliz/internal/optimization.py
+++ b/preliz/internal/optimization.py
@@ -77,18 +77,18 @@ def func(params, dist, x_vals):
     return opt
 
 
-def optimize_cdf(dist, x_vals, ecdf, none_idx, fixed):
-    def func(params, dist, x_vals, ecdf):
+def optimize_pdf(dist, x_vals, epdf, none_idx, fixed):
+    def func(params, dist, x_vals, epdf):
         params = get_params(dist, params, none_idx, fixed)
         dist._parametrization(**params)
-        loss = dist.cdf(x_vals) - ecdf
+        loss = dist.pdf(x_vals) - epdf
         return loss
 
     init_vals = np.array(dist.params)[none_idx]
     bounds = np.array(dist.params_support)[none_idx]
     bounds = list(zip(*bounds))
 
-    opt = least_squares(func, x0=init_vals, args=(dist, x_vals, ecdf), bounds=bounds)
+    opt = least_squares(func, x0=init_vals, args=(dist, x_vals, epdf), bounds=bounds)
     params = get_params(dist, opt["x"], none_idx, fixed)
     dist._parametrization(**params)
     loss = opt["cost"]
@@ -305,9 +305,9 @@ def get_distributions(dist_names):
     return dists
 
 
-def fit_to_ecdf(selected_distributions, x_vals, ecdf, mean, std, x_min, x_max, extra_pros):
+def fit_to_epdf(selected_distributions, x_vals, epdf, mean, std, x_min, x_max, extra_pros):
     """
-    Minimize the difference between the cdf and the ecdf over a grid of values
+    Minimize the difference between the pdf and the epdf over a grid of values
     defined by x_min and x_max
 
     Note: This function is intended to be used with pz.roulette
@@ -325,7 +325,7 @@ def fit_to_ecdf(selected_distributions, x_vals, ecdf, mean, std, x_min, x_max, e
         if dist._check_endpoints(x_min, x_max, raise_error=False):
             none_idx, fixed = get_fixed_params(dist)
             dist._fit_moments(mean, std)  # pylint:disable=protected-access
-            loss = optimize_cdf(dist, x_vals, ecdf, none_idx, fixed)
+            loss = optimize_pdf(dist, x_vals, epdf, none_idx, fixed)
 
             fitted.update(loss, dist)
 
diff --git a/preliz/tests/test_combine_roulette.py b/preliz/tests/test_combine_roulette.py
new file mode 100644
index 00000000..2204df3d
--- /dev/null
+++ b/preliz/tests/test_combine_roulette.py
@@ -0,0 +1,61 @@
+import pytest
+from numpy.testing import assert_almost_equal
+from preliz import combine_roulette
+from preliz.distributions import BetaScaled, LogNormal, StudentT
+
+# Each response mimics the ``inputs`` attribute of a ``Roulette`` object:
+# (x_vals, epdf, total chips, x_min, x_max, nrows, ncols)
+response0 = (
+    [1.5, 2.5, 3.5],
+    [0.32142857142857145, 0.35714285714285715, 0.32142857142857145],
+    28,
+    0,
+    10,
+    10,
+    11,
+)
+response1 = (
+    [7.5, 8.5, 9.5],
+    [0.32142857142857145, 0.35714285714285715, 0.32142857142857145],
+    28,
+    0,
+    10,
+    10,
+    11,
+)
+response2 = ([9.5], [1], 10, 0, 10, 10, 11)
+# Same as response2 except for the number of columns, i.e. a different grid
+response3 = ([9.5], [1], 10, 0, 10, 10, 14)
+
+
+@pytest.mark.parametrize(
+    "responses, weights, dist_names, params, result",
+    [
+        ([response0, response1], [0.5, 0.5], None, None, BetaScaled(1.2, 1, 0, 10)),
+        (
+            [response0, response1],
+            [0.5, 0.5],
+            ["Beta", "StudentT"],
+            "TruncatedNormal(lower=0), StudentT(nu=1000)",
+            StudentT(1000, 5.5, 3.1),
+        ),
+        ([response0, response2], [1, 1], None, None, LogNormal(1.1, 0.6)),
+    ],
+)
+def test_combine_roulette(responses, weights, dist_names, params, result):
+    """The combined fit matches the reference parameters to one decimal place."""
+    dist = combine_roulette(responses, weights, dist_names, params)
+    assert_almost_equal(dist.params, result.params, decimal=1)
+
+
+def test_combine_roulette_error():
+    """Responses elicited on different grids cannot be combined."""
+    with pytest.raises(ValueError, match="grid should be the same"):
+        combine_roulette([response0, response3])
+
+
+@pytest.mark.parametrize("weights", [[1, -1], [0, 1]])
+def test_combine_roulette_invalid_weights(weights):
+    """Zero or negative weights are rejected."""
+    with pytest.raises(ValueError, match="weights must be positive"):
+        combine_roulette([response0, response1], weights)
diff --git a/preliz/unidimensional/__init__.py b/preliz/unidimensional/__init__.py
index ac013f75..e625e7e6 100644
--- a/preliz/unidimensional/__init__.py
+++ b/preliz/unidimensional/__init__.py
@@ -1,8 +1,9 @@
 from .beta_mode import beta_mode
+from .combine_roulette import combine_roulette
 from .maxent import maxent
 from .mle import mle
 from .quartile import quartile
 from .quartile_int import quartile_int
 from .roulette import Roulette
 
-__all__ = ["beta_mode", "maxent", "mle", "Roulette", "quartile", "quartile_int"]
+__all__ = ["beta_mode", "combine_roulette", "maxent", "mle", "Roulette", "quartile", "quartile_int"]
diff --git a/preliz/unidimensional/combine_roulette.py b/preliz/unidimensional/combine_roulette.py
new file mode 100644
index 00000000..f7b8fa9e
--- /dev/null
+++ b/preliz/unidimensional/combine_roulette.py
@@ -0,0 +1,102 @@
+import numpy as np
+
+from preliz.internal.distribution_helper import process_extra
+from preliz.internal.optimization import fit_to_epdf, get_distributions
+
+
+def combine_roulette(responses, weights=None, dist_names=None, params=None):
+    """
+    Combine multiple elicited distributions into a single distribution.
+
+    The chips of all the responses are pooled bin by bin, each response scaled by its
+    normalized weight, and the candidate distributions are fitted to the pooled pdf.
+
+    Parameters
+    ----------
+    responses : list of tuples
+        Typically, each tuple comes from the ``.inputs`` attribute of a ``Roulette`` object and
+        represents a single elicited distribution. The expected layout is
+        ``(x_vals, epdf, chips, x_min, x_max, nrows, ncols)``.
+    weights : array-like, optional
+        One positive weight per response. Defaults to None, i.e. equal weights.
+        The sum of the weights must be equal to 1, otherwise it will be normalized.
+    dist_names : list, optional
+        List of distributions names to be used in the elicitation.
+        Defaults to ["Normal", "BetaScaled", "Gamma", "LogNormal", "StudentT"].
+    params : str, optional
+        Extra parameters to be passed to the distributions. The format is a string with the
+        PreliZ's distribution name followed by the argument to fix.
+        For example: "TruncatedNormal(lower=0), StudentT(nu=8)".
+
+    Returns
+    -------
+    PreliZ distribution
+
+    Raises
+    ------
+    ValueError
+        If ``responses`` is empty, if ``weights`` does not have one finite, positive value
+        per response, if the responses do not share the same grid, or if no chips were
+        placed in any of the responses.
+    """
+    responses = list(responses)
+    if not responses:
+        raise ValueError("At least one response is needed.")
+
+    extra_pros = process_extra(params) if params is not None else []
+
+    if weights is None:
+        weights = np.full(len(responses), 1 / len(responses))
+    else:
+        weights = np.array(weights, dtype=float)
+
+    # zip() below would silently drop the unmatched responses or weights
+    if weights.shape != (len(responses),):
+        raise ValueError("The number of weights must match the number of responses.")
+
+    if not np.all(np.isfinite(weights)):
+        raise ValueError("The weights must be finite.")
+
+    if np.any(weights <= 0):
+        raise ValueError("The weights must be positive.")
+
+    weights /= weights.sum()
+
+    # Compare as tuples so that responses stored as lists (e.g. loaded from JSON) still match
+    grid = tuple(responses[0][3:])
+    if any(tuple(records[3:]) != grid for records in responses):
+        raise ValueError(
+            "To combine single elicitation instances, the grid should be the same for all of them."
+        )
+
+    if dist_names is None:
+        dist_names = ["Normal", "BetaScaled", "Gamma", "LogNormal", "StudentT"]
+
+    # Pool the chips per bin, scaling each response by its weight
+    new_pdf = {}
+    for records, weight in zip(responses, weights):
+        x_vals, epdf, chips = records[:3]
+        for x_i, pdf_i in zip(x_vals, epdf):
+            new_pdf[x_i] = new_pdf.get(x_i, 0) + pdf_i * weight * chips
+
+    total = sum(new_pdf.values())
+    if total <= 0:
+        raise ValueError("The responses do not contain any chips.")
+
+    new_pdf = {x_i: pdf_i / total for x_i, pdf_i in new_pdf.items()}
+    mean = sum(x_i * pdf_i for x_i, pdf_i in new_pdf.items())
+    std = sum(pdf_i * (x_i - mean) ** 2 for x_i, pdf_i in new_pdf.items()) ** 0.5
+
+    # The grid check above guarantees that x_min and x_max are shared by all the responses
+    x_min, x_max = grid[:2]
+
+    return fit_to_epdf(
+        get_distributions(dist_names),
+        list(new_pdf),
+        list(new_pdf.values()),
+        mean,
+        std,
+        x_min,
+        x_max,
+        extra_pros,
+    )
diff --git a/preliz/unidimensional/roulette.py b/preliz/unidimensional/roulette.py
index a7f8950d..0a2f187f 100644
--- a/preliz/unidimensional/roulette.py
+++ b/preliz/unidimensional/roulette.py
@@ -9,7 +9,7 @@
 except ImportError:
     pass
 
-from ..internal.optimization import fit_to_ecdf, get_distributions
+from ..internal.optimization import fit_to_epdf, get_distributions
 from ..internal.plot_helper import check_inside_notebook, representations
 from ..internal.distribution_helper import process_extra
 from ..distributions import all_discrete, all_continuous
@@ -36,9 +36,10 @@ def __init__(
             Number of columns for the grid. Defaults to 11.
         dist_names: list
             List of distributions names to be used in the elicitation.
-            For example: ["Normal", "StudentT"].
-            Default to None, almost all 1D distributions available in PreliZ will be used,
-            with some exceptions like Uniform or Cauchy.
+            Defaults to None; the pre-selected distributions are ["Normal", "BetaScaled",
+            "Gamma", "LogNormal", "StudentT"] but almost all 1D PreliZ's distributions
+            are available to be selected from the menu with some exceptions like Uniform
+            or Cauchy.
         params: Optional[str]:
             Extra parameters to be passed to the distributions. The format is a string with the
             PreliZ's distribution name followed by the argument to fix.
@@ -49,7 +50,11 @@ def __init__(
 
         Returns
         -------
-        PreliZ distribution
+        Roulette object
+            The object has many attributes, but the most important are:
+            - dist: The fitted distribution
+            - inputs: A tuple with the x values, the empirical pdf, the total
+            chips, the x_min, the x_max, the number of rows and the number of columns.
 
         References
         ----------
@@ -65,7 +70,7 @@ def __init__(
         self._figsize = figsize
         self._w_extra = params
         self.dist = None
-        self._hist = None
+        self.inputs = None
 
         check_inside_notebook(need_widget=True)
 
@@ -151,7 +156,7 @@ def _create_grid(self):
     def _on_leave_fig(self):
         extra_pros = process_extra(self._widgets["w_extra"].value)
 
-        x_vals, ecdf, probs, mean, std, filled_columns = self._weights_to_ecdf()
+        x_vals, epdf, mean, std, filled_columns = self._weights_to_pdf()
 
         fitted_dist = None
         if filled_columns > 1:
@@ -159,10 +164,10 @@ def _on_leave_fig(self):
 
             if selected_distributions:
                 self._reset_dist_panel(yticks=False)
-                fitted_dist = fit_to_ecdf(
+                fitted_dist = fit_to_epdf(
                     selected_distributions,
                     x_vals,
-                    ecdf,
+                    epdf,
                     mean,
                     std,
                     self._x_min,
@@ -178,20 +183,27 @@ def _on_leave_fig(self):
             self._reset_dist_panel(yticks=True)
         self._fig.canvas.draw()
 
-        self.hist = (x_vals, probs)
+        self.inputs = (
+            x_vals,
+            epdf,
+            sum(self._grid._weights.values()),
+            self._x_min,
+            self._x_max,
+            self._nrows,
+            self._ncols,
+        )
         self.dist = fitted_dist
 
-    def _weights_to_ecdf(self):
+    def _weights_to_pdf(self):  # returns (x_vals, epdf, mean, std, number of filled columns)
         step = (self._x_max - self._x_min) / (self._ncols - 1)
         x_vals = [(k + 0.5) * step + self._x_min for k, v in self._grid._weights.items() if v != 0]
         total = sum(self._grid._weights.values())
-        probabilities = [v / total for v in self._grid._weights.values() if v != 0]
-        cum_sum = np.cumsum(probabilities)
+        epdf = [v / total for v in self._grid._weights.values() if v != 0]
 
-        mean = sum(value * prob for value, prob in zip(x_vals, probabilities))
-        std = (sum(prob * (value - mean) ** 2 for value, prob in zip(x_vals, probabilities))) ** 0.5
+        mean = sum(prob * value for value, prob in zip(x_vals, epdf))  # epdf-weighted mean
+        std = (sum(prob * (value - mean) ** 2 for value, prob in zip(x_vals, epdf))) ** 0.5
 
-        return x_vals, cum_sum, probabilities, mean, std, len(x_vals)
+        return x_vals, epdf, mean, std, len(x_vals)
 
     def _update_grid(self):
         self._ax_grid.cla()