implement all unmirrored distributions from Stan's R code, with testing for lack of support for all distributions; weibull, lognormal, and beta currently do not fully work
mbi6245 committed Jul 23, 2024
1 parent 10e1011 commit 00e6fb7
Showing 2 changed files with 118 additions and 12 deletions.
80 changes: 74 additions & 6 deletions src/ensemble/distributions.py
@@ -3,6 +3,9 @@
import numpy as np
import scipy.optimize
import scipy.stats
from scipy.special import gamma as gamma_func

# from scipy.special import gammainccinv, gammaincinv


# distribution parent class to abstract away the different scipy functions
@@ -29,19 +32,22 @@ def stats(self, moments):

class Exponential(Distribution):
def _create_scipy_dist(self) -> None:
positive_support(self.mean)
lambda_ = 1 / self.mean
self._scipy_dist = scipy.stats.expon(scale=1 / lambda_)
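# the exponential has a single parameter, so only the mean is matched; the
# implied variance is mean**2 (the tests check exp_var = MEAN**2 accordingly)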


class Gamma(Distribution):
def _create_scipy_dist(self) -> None:
strict_positive_support(self.mean)
alpha = self.mean**2 / self.variance
beta = self.mean / self.variance
self._scipy_dist = scipy.stats.gamma(a=alpha, scale=1 / beta)
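

# worked example: mean=2, variance=8 gives alpha = 0.5, beta = 0.25, and
# scipy.stats.gamma(a=0.5, scale=4.0).stats(moments="mv") returns (2.0, 8.0)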


class InvGamma(Distribution):
def _create_scipy_dist(self) -> None:
strict_positive_support(self.mean)
optim_params = scipy.optimize.minimize(
fun=self._shape_scale,
# a *good* friend told me that this is a good initial guess and it works so far???
@@ -63,16 +69,17 @@ def _shape_scale(self, x, samp_mean, samp_var) -> None:

class Fisk(Distribution):
def _create_scipy_dist(self):
positive_support(self.mean)
optim_params = scipy.optimize.minimize(
fun=self._shape_scale,
# start beta at 1.1 and solve for alpha
x0=[self.mean * 1.1 * np.sin(np.pi / 1.1) / np.pi, 1.1],
args=(self.mean, self.variance),
# options={"disp": True},
)
alpha, beta = np.abs(optim_params.x)
# parameterization notes: scipy's c is wikipedia's beta, scipy's scale is wikipedia's alpha
self._scipy_dist = scipy.stats.fisk(c=beta, scale=alpha)
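# x0 derivation: the closed-form mean of the log-logistic is
# mean = alpha * (pi / beta) / sin(pi / beta) for beta > 1; fixing beta = 1.1
# and solving for alpha gives alpha = mean * 1.1 * sin(pi / 1.1) / pi, i.e. x0[0]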

def _shape_scale(self, x, samp_mean, samp_var) -> None:
alpha = x[0]
@@ -94,12 +101,36 @@ def _create_scipy_dist(self) -> None:

class Weibull(Distribution):
def _create_scipy_dist(self) -> None:
positive_support(self.mean)
optim_params = scipy.optimize.minimize(
fun=self._shape_scale,
# ideally can invert gamma function for k, then use mean / sd as a guess for lambda (see the sketch after this class)
x0=[self.mean / gamma_func(1 + 1 / 1.5), 1.5],
args=(self.mean, self.variance),
options={"disp": True},
)
lambda_, k = np.abs(optim_params.x)
print("params from optim: ", lambda_, k)
self._scipy_dist = scipy.stats.weibull_min(c=k, scale=lambda_)

def _shape_scale(self, x, samp_mean, samp_var) -> None:
lambda_ = x[0]
k = x[1]
mean_guess = lambda_ * gamma_func(1 + (1 / k))
# Var(X) = lambda**2 * (Gamma(1 + 2/k) - Gamma(1 + 1/k)**2)
variance_guess = lambda_**2 * (
    gamma_func(1 + (2 / k)) - gamma_func(1 + (1 / k)) ** 2
)
return (mean_guess - samp_mean) ** 2 + (variance_guess - samp_var) ** 2
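

# a sketch of the inversion idea mentioned in Weibull above (the helper name
# and the bracket [0.1, 50] are assumptions, not part of this commit): solve
# var / mean**2 == Gamma(1 + 2/k) / Gamma(1 + 1/k)**2 - 1 for k with a
# bracketing root finder, then back out lambda from the mean
def _weibull_moments_guess(samp_mean, samp_var):
    def cv_gap(k):
        # implied minus observed squared coefficient of variation
        return (
            gamma_func(1 + 2 / k) / gamma_func(1 + 1 / k) ** 2
            - 1
            - samp_var / samp_mean**2
        )

    # assumed bracket; cv_gap decreases in k, so one sign change is enough
    k = scipy.optimize.brentq(cv_gap, 0.1, 50)
    lambda_ = samp_mean / gamma_func(1 + 1 / k)
    return lambda_, k
# usage sketch: x0=list(_weibull_moments_guess(self.mean, self.variance))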


class LogNormal(Distribution):
def _create_scipy_dist(self) -> None:
# method of moments in log space: mu and sigma are the mean and SD of log(X)
mu = np.log(self.mean / np.sqrt(1 + (self.variance / self.mean**2)))
sigma = np.sqrt(np.log(1 + (self.variance / self.mean**2)))
# scipy's lognorm takes s=sigma and scale=exp(mu); passing mu as loc= shifts
# the support rather than setting the log-mean
self._scipy_dist = scipy.stats.lognorm(s=sigma, scale=np.exp(mu))
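# worked check: mean=2, variance=8 gives mu = log(2 / sqrt(3)) and
# sigma = sqrt(log(3)), and lognorm(s=sigma, scale=np.exp(mu)).stats(moments="mv")
# returns (2.0, 8.0) exactly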


class Normal(Distribution):
@@ -111,10 +142,47 @@ def _create_scipy_dist(self) -> None:

class Beta(Distribution):
def _create_scipy_dist(self) -> None:
beta_bounds(self.mean)
optim_params = scipy.optimize.minimize(
fun=self._shape_scale,
# trying something similar to invgamma, unsuccessful for variance (see the closed-form sketch after this class)
x0=[2, self.mean * 2 - 2],
args=(self.mean, self.variance),
options={"disp": True},
)
alpha, beta = np.abs(optim_params.x)
print("params from optim: ", alpha, beta)
self._scipy_dist = scipy.stats.beta(a=alpha, b=beta)

def _shape_scale(self, x, samp_mean, samp_var):
alpha = x[0]
beta = x[1]
mean_guess = alpha / (alpha + beta)
variance_guess = (
alpha * beta / ((alpha + beta) ** 2 * (alpha + beta + 1))
)
return (mean_guess - samp_mean) ** 2 + (variance_guess - samp_var) ** 2
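

# a sketch (the helper name is an assumption, not part of this commit): the
# beta has a closed-form method-of-moments solution that could seed or replace
# the optimizer above whenever the variance is feasible, i.e.
# samp_var < samp_mean * (1 - samp_mean)
def _beta_moments_guess(samp_mean, samp_var):
    # nu = alpha + beta, the implied concentration of the beta
    nu = samp_mean * (1 - samp_mean) / samp_var - 1
    return samp_mean * nu, (1 - samp_mean) * nu  # (alpha, beta)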


# exp, gamma, invgamma, llogis, gumbel, weibull, lognormal, normal, mgamma, mgumbel, beta


# distribution_dict = {"exponential": Exponential()}


### HELPER FUNCTIONS
# the following checks are a crude guard: a negative mean certainly implies
# some negative data, but data can contain negative values and still have a
# positive mean, so these checks are necessary rather than sufficient
def positive_support(mean):
if mean < 0:
raise ValueError("This distribution is only supported on [0, np.inf)")


def strict_positive_support(mean):
if mean <= 0:
raise ValueError("This distribution is only supported on (0, np.inf)")


def beta_bounds(mean):
if (mean < 0) or (mean > 1):
raise ValueError("This distribution is only supported on [0, 1]")
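

# sketch of a variance guard (hypothetical helper, not wired into Beta): the
# beta variance is bounded above by mean * (1 - mean), so an infeasible
# variance could be rejected up front alongside beta_bounds
def beta_variance_feasible(mean, variance):
    return variance < mean * (1 - mean)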
50 changes: 44 additions & 6 deletions tests/test_distributions.py
@@ -19,12 +19,14 @@
# @pytest.mark.parametrize("a, b, expected", [(1, 2, 3), (2, 3, 5)])
# def test_add(a, b, expected):
# assert add(a, b) == expected
NEG_MEAN = -2
BETA_MEAN = 0.5
BETA_VARIANCE = 0.2
MEAN = 2
VARIANCE = 8


def test_exp():
exp = Exponential(MEAN, VARIANCE)
res = exp.stats(moments="mv")
exp_var = MEAN**2
@@ -49,8 +51,6 @@ def test_invgamma():
def test_fisk():
fisk = Fisk(MEAN, VARIANCE)
res = fisk.stats(moments="mv")
print("resulting mean and var: ", res)
# assert False
assert np.isclose(res[0], MEAN)
assert np.isclose(res[1], VARIANCE)

@@ -61,13 +61,26 @@ def test_gumbel():
assert np.isclose(res[0], MEAN)
assert np.isclose(res[1], VARIANCE)

gumbel = GumbelR(NEG_MEAN, VARIANCE)
res = gumbel.stats(moments="mv")
assert np.isclose(res[0], NEG_MEAN)
assert np.isclose(res[1], VARIANCE)


def test_weibull():
weibull = Weibull(MEAN, VARIANCE)
res = weibull.stats(moments="mv")
print("resulting mean and var: ", res)
assert np.isclose(res[0], MEAN)
assert np.isclose(res[1], VARIANCE)


def test_lognormal():
lognormal = LogNormal(MEAN, VARIANCE)
res = lognormal.stats(moments="mv")
print("resulting mean and var: ", res)
assert np.isclose(res[0], MEAN)
assert np.isclose(res[1], VARIANCE)


def test_normal():
@@ -76,6 +89,31 @@ def test_normal():
assert np.isclose(res[0], MEAN)
assert np.isclose(res[1], VARIANCE)

norm = Normal(NEG_MEAN, VARIANCE)
res = norm.stats(moments="mv")
assert np.isclose(res[0], NEG_MEAN)
assert np.isclose(res[1], VARIANCE)


def test_beta():
# a beta with mean 0.5 can have variance at most 0.25, so use the feasible
# BETA_VARIANCE rather than VARIANCE
beta = Beta(BETA_MEAN, BETA_VARIANCE)
res = beta.stats(moments="mv")
print("resulting mean and var: ", res)
assert np.isclose(res[0], BETA_MEAN)
assert np.isclose(res[1], BETA_VARIANCE)


def test_diff_supports():
# negative means for only positive RVs
with pytest.raises(ValueError):
Exponential(NEG_MEAN, VARIANCE)
with pytest.raises(ValueError):
Gamma(NEG_MEAN, VARIANCE)
with pytest.raises(ValueError):
InvGamma(NEG_MEAN, VARIANCE)
with pytest.raises(ValueError):
Fisk(NEG_MEAN, VARIANCE)

# mean outside of 0 and 1 for Beta
with pytest.raises(ValueError):
Beta(NEG_MEAN, VARIANCE)
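

# sketch (hypothetical, depends on Beta enforcing the variance bound
# variance < mean * (1 - mean), which is 0.25 for BETA_MEAN = 0.5):
# def test_beta_variance_bound():
#     with pytest.raises(ValueError):
#         Beta(BETA_MEAN, 0.3)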
