Skip to content

Commit

Permalink
implement serialization and deserialization in model.py along with te…
Browse files Browse the repository at this point in the history
…sting
  • Loading branch information
mbi6245 committed Oct 9, 2024
1 parent f8ece98 commit ef6e027
Show file tree
Hide file tree
Showing 5 changed files with 121 additions and 36 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -131,4 +131,6 @@ dmypy.json
# Misc.
.DS_Store
*.csv
*.parquet
*.parquet
diabetes_demo.ipynb
ensemble_fpg_plots.png
20 changes: 10 additions & 10 deletions plots.ipynb

Large diffs are not rendered by default.

90 changes: 77 additions & 13 deletions src/ensemble/model.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
from typing import List, Tuple, Union

import cvxpy as cp
Expand Down Expand Up @@ -32,19 +33,24 @@ class EnsembleDistribution:

def __init__(
self,
distributions: List[str],
weights: List[float],
# distributions: List[str],
# weights: List[float],
named_weights: dict,
mean: float,
variance: float,
):
_check_valid_ensemble(distributions, weights)
self.support = _check_supports_match(distributions)
self._distributions = list(named_weights.keys())
self._weights = list(named_weights.values())

self.distributions = distributions
self.my_objs = []
for distribution in distributions:
self.my_objs.append(distribution_dict[distribution](mean, variance))
self.weights = weights
_check_valid_ensemble(self._distributions, self._weights)
self.support = _check_supports_match(self._distributions)

self.named_weights = named_weights
self.fitted_distributions = []
for distribution in self.named_weights.keys():
self.fitted_distributions.append(
distribution_dict[distribution](mean, variance)
)
self.mean = mean
self.variance = variance

Expand Down Expand Up @@ -179,13 +185,13 @@ def rvs(self, size: int = 1) -> np.ndarray:
# ensemble_cdf(x) - p, where p is aforementioned Unif(0, 1) sample
# return quantiles which minimize the objective function (i.e. which
# values of x minimize ensemble_cdf(x) - q)
dist_counts = np.random.multinomial(size, self.weights)
dist_counts = np.random.multinomial(size, self._weights)
samples = np.hstack(
[
distribution_dict[dist](self.mean, self.variance).rvs(
size=counts
)
for dist, counts in zip(self.distributions, dist_counts)
for dist, counts in zip(self._distributions, dist_counts)
]
)
np.random.shuffle(samples)
Expand Down Expand Up @@ -214,7 +220,7 @@ def stats_temp(
if "v" in moments:
res_list.append(self.variance)

res_list = [res[()] for res in res_list]
# res_list = [res[()] for res in res_list]
if len(res_list) == 1:
return res_list[0]
else:
Expand Down Expand Up @@ -242,6 +248,62 @@ def plot(self):
ax[1].set_ylabel("density")
ax[1].set_title("ensemble CDF")

def to_json(self, file_path: str, appending: bool = False) -> None:
    """Serialize this EnsembleDistribution object to a JSON file.

    The file always holds a JSON list of summaries, one per ensemble, each
    with the keys ``"named_weights"``, ``"mean"`` and ``"variance"``.

    Parameters
    ----------
    file_path : str
        path to file to write in
    appending : bool, optional
        if True, keep the summaries already stored in ``file_path`` and add
        this one at the end; if the file does not exist yet it is created
        with this summary as its only entry. If False (the default), the
        file is overwritten.
    """
    distribution_summary = {
        "named_weights": self.named_weights,
        "mean": self.mean,
        "variance": self.variance,
    }

    summaries = []
    if appending:
        try:
            with open(file_path, "r") as infile:
                summaries = json.load(infile)
        except FileNotFoundError:
            # nothing to append to yet: fall back to writing a new file
            summaries = []

    summaries.append(distribution_summary)
    # the file is rewritten in full so that it stays a single valid JSON list
    with open(file_path, "w") as outfile:
        json.dump(summaries, outfile)


def from_json(file_path: str) -> List[EnsembleDistribution]:
    """Deserialize a JSON file into a list of EnsembleDistribution objects.

    The file is expected to contain a JSON list in which every entry has the
    keys ``"named_weights"``, ``"mean"`` and ``"variance"``.

    Parameters
    ----------
    file_path : str
        path to file that JSON object is stored in

    Returns
    -------
    List[EnsembleDistribution]
        one EnsembleDistribution per stored entry, in file order
    """
    with open(file_path, "r") as infile:
        distribution_summaries = json.load(infile)

    return [
        EnsembleDistribution(
            summary["named_weights"], summary["mean"], summary["variance"]
        )
        for summary in distribution_summaries
    ]


class EnsembleResult:
"""Result from ensemble distribution fitting
Expand Down Expand Up @@ -405,7 +467,9 @@ def fit(self, data: npt.ArrayLike) -> EnsembleResult:
res = EnsembleResult(
weights=fitted_weights,
ensemble_distribution=EnsembleDistribution(
self.distributions, fitted_weights, sample_mean, sample_variance
dict(zip(self.distributions, fitted_weights)),
sample_mean,
sample_variance,
),
)

Expand Down
1 change: 1 addition & 0 deletions test_read.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[{"named_weights": {"normal": 0.5, "gumbel": 0.5}, "mean": 1, "variance": 1}, {"named_weights": {"gamma": 0.2, "invgamma": 0.8}, "mean": 1, "variance": 1}]
42 changes: 30 additions & 12 deletions tests/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,46 +2,48 @@
import pytest
import scipy.stats as stats

from ensemble.model import EnsembleDistribution, EnsembleFitter
from ensemble.model import EnsembleDistribution, EnsembleFitter, from_json

STD_NORMAL_DRAWS = stats.norm(loc=0, scale=1).rvs(100)

ENSEMBLE_RL_DRAWS = EnsembleDistribution(
distributions=["normal", "gumbel"], weights=[0.7, 0.3], mean=0, variance=1
named_weights={"normal": 0.7, "gumbel": 0.3}, mean=0, variance=1
).rvs(size=100)

ENSEMBLE_POS_DRAWS = EnsembleDistribution(
distributions=["exponential", "lognormal"],
weights=[0.5, 0.5],
named_weights={"exponential": 0.5, "lognormal": 0.5},
mean=5,
variance=1,
).rvs(size=100)

ENSEMBLE_POS_DRAWS2 = EnsembleDistribution(
distributions=["exponential", "lognormal", "fisk"],
weights=[0.3, 0.5, 0.2],
named_weights={"exponential": 0.3, "lognormal": 0.5, "fisk": 0.2},
mean=40,
variance=5,
)


DEFAULT_SETTINGS = ([0.5, 0.5], 1, 1)
DEFAULT_SETTINGS = (1, 1)


def test_bad_weights():
with pytest.raises(ValueError):
EnsembleDistribution(["normal", "gumbel"], [1, 0.1], 1, 1)
EnsembleDistribution({"normal": 1, "gumbel": 0.1}, *DEFAULT_SETTINGS)
with pytest.raises(ValueError):
EnsembleDistribution(["normal", "gumbel"], [0.3, 0.69], 1, 1)
EnsembleDistribution({"normal": 0.3, "gumbel": 0.69}, *DEFAULT_SETTINGS)


def test_incompatible_dists():
with pytest.raises(ValueError):
EnsembleDistribution(["normal", "exponential"], *DEFAULT_SETTINGS)
EnsembleDistribution(
{"normal": 0.5, "exponential": 0.5}, *DEFAULT_SETTINGS
)
with pytest.raises(ValueError):
EnsembleDistribution(["beta", "normal"], *DEFAULT_SETTINGS)
EnsembleDistribution({"beta": 0.5, "normal": 0.5}, *DEFAULT_SETTINGS)
with pytest.raises(ValueError):
EnsembleDistribution(["beta", "exponential"], *DEFAULT_SETTINGS)
EnsembleDistribution(
{"beta": 0.5, "exponential": 0.5}, *DEFAULT_SETTINGS
)


def test_incompatible_data():
Expand All @@ -64,3 +66,19 @@ def test_resulting_weights():
model2 = EnsembleFitter(["exponential", "lognormal", "fisk"], "KS")
res2 = model2.fit(ENSEMBLE_POS_DRAWS)
assert np.isclose(np.sum(res2.weights), 1)


def test_json():
    """Write two ensembles to test_read.json and check the second one
    read back by from_json."""
    json_path = "test_read.json"

    # the first call overwrites the file, the second appends to it
    normal_gumbel = EnsembleDistribution(
        {"normal": 0.5, "gumbel": 0.5}, *DEFAULT_SETTINGS
    )
    normal_gumbel.to_json(json_path)
    gamma_invgamma = EnsembleDistribution(
        {"gamma": 0.2, "invgamma": 0.8}, *DEFAULT_SETTINGS
    )
    gamma_invgamma.to_json(json_path, appending=True)

    restored = from_json(json_path)[1]
    assert restored.stats_temp("mv") == DEFAULT_SETTINGS
    assert restored._distributions == ["gamma", "invgamma"]
    assert restored._weights == [0.2, 0.8]

0 comments on commit ef6e027

Please sign in to comment.