Skip to content

Commit 0734c8b

Browse files
committed
Switch to formulaic
1 parent b06ce8e commit 0734c8b

9 files changed

+89
-99
lines changed

causalpy/experiments/diff_in_diff.py

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
import numpy as np
2020
import pandas as pd
2121
import seaborn as sns
22-
from formulae import design_matrices
22+
from formulaic import model_matrix
2323
from matplotlib import pyplot as plt
2424
from sklearn.base import RegressorMixin
2525

@@ -91,18 +91,15 @@ def __init__(
9191
self.data = data
9292
self.expt_type = "Difference in Differences"
9393
self.formula = formula
94-
self.rhs_formula = formula.split("~", 1)[1].strip()
9594
self.time_variable_name = time_variable_name
9695
self.group_variable_name = group_variable_name
9796
self.input_validation()
9897

99-
dm = design_matrices(self.formula, self.data)
100-
self.labels = list(dm.common.terms.keys())
101-
self.y, self.X = (
102-
np.asarray(dm.response.design_matrix).reshape(-1, 1),
103-
np.asarray(dm.common.design_matrix),
104-
)
105-
self.outcome_variable_name = dm.response.name
98+
dm = model_matrix(self.formula, self.data)
99+
self.labels = list(dm.rhs.columns)
100+
self.y, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
101+
self.rhs_matrix_spec = dm.rhs.model_spec
102+
self.outcome_variable_name = dm.lhs.columns[0]
106103

107104
# fit model
108105
if isinstance(self.model, PyMCModel):
@@ -127,7 +124,9 @@ def __init__(
127124
)
128125
if self.x_pred_control.empty:
129126
raise ValueError("x_pred_control is empty")
130-
new_x = np.array(design_matrices(self.rhs_formula, self.x_pred_control).common)
127+
new_x = model_matrix(
128+
spec=self.rhs_matrix_spec, data=self.x_pred_control
129+
).to_numpy()
131130
self.y_pred_control = self.model.predict(new_x)
132131

133132
# predicted outcome for treatment group
@@ -144,9 +143,9 @@ def __init__(
144143
)
145144
if self.x_pred_treatment.empty:
146145
raise ValueError("x_pred_treatment is empty")
147-
new_x = np.array(
148-
design_matrices(self.rhs_formula, self.x_pred_treatment).common
149-
)
146+
new_x = model_matrix(
147+
spec=self.rhs_matrix_spec, data=self.x_pred_treatment
148+
).to_numpy()
150149
self.y_pred_treatment = self.model.predict(new_x)
151150

152151
# predicted outcome for counterfactual. This is given by removing the influence
@@ -166,9 +165,9 @@ def __init__(
166165
)
167166
if self.x_pred_counterfactual.empty:
168167
raise ValueError("x_pred_counterfactual is empty")
169-
new_x = np.array(
170-
design_matrices(self.rhs_formula, self.x_pred_counterfactual).common
171-
)
168+
new_x = model_matrix(
169+
spec=self.rhs_matrix_spec, data=self.x_pred_counterfactual
170+
).to_numpy()
172171
# INTERVENTION: set the interaction term between the group and the
173172
# post_treatment variable to zero. This is the counterfactual.
174173
for i, label in enumerate(self.labels):

causalpy/experiments/instrumental_variable.py

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919

2020
import numpy as np
2121
import pandas as pd
22-
from patsy import dmatrices
22+
from formulaic import model_matrix
2323
from sklearn.linear_model import LinearRegression as sk_lin_reg
2424

2525
from causalpy.custom_exceptions import DataException
@@ -110,19 +110,17 @@ def __init__(
110110
self.model = model
111111
self.input_validation()
112112

113-
y, X = dmatrices(formula, self.data)
114-
self._y_design_info = y.design_info
115-
self._x_design_info = X.design_info
116-
self.labels = X.design_info.column_names
117-
self.y, self.X = np.asarray(y), np.asarray(X)
118-
self.outcome_variable_name = y.design_info.column_names[0]
113+
dm = model_matrix(self.formula, self.data)
114+
self.labels = list(dm.rhs.columns)
115+
self.y, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
116+
self.rhs_matrix_spec = dm.rhs.model_spec
117+
self.outcome_variable_name = dm.lhs.columns[0]
119118

120-
t, Z = dmatrices(instruments_formula, self.instruments_data)
121-
self._t_design_info = t.design_info
122-
self._z_design_info = Z.design_info
123-
self.labels_instruments = Z.design_info.column_names
124-
self.t, self.Z = np.asarray(t), np.asarray(Z)
125-
self.instrument_variable_name = t.design_info.column_names[0]
119+
dm = model_matrix(self.instruments_formula, self.instruments_data)
120+
self.labels_instruments = list(dm.rhs.columns)
121+
self.t, self.Z = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
122+
self.instrument_rhs_matrix_spec = dm.rhs.model_spec
123+
self.instrument_variable_name = dm.lhs.columns[0]
126124

127125
self.get_naive_OLS_fit()
128126
self.get_2SLS_fit()
@@ -176,7 +174,7 @@ def get_2SLS_fit(self):
176174
fitted_Z_values = first_stage_reg.predict(self.Z)
177175
X2 = self.data.copy(deep=True)
178176
X2[self.instrument_variable_name] = fitted_Z_values
179-
_, X2 = dmatrices(self.formula, X2)
177+
X2 = model_matrix(self.formula, X2).rhs.to_numpy()
180178
second_stage_reg = sk_lin_reg().fit(X=X2, y=self.y)
181179
betas_first = list(first_stage_reg.coef_[0][1:])
182180
betas_first.insert(0, first_stage_reg.intercept_[0])
@@ -196,7 +194,7 @@ def get_naive_OLS_fit(self):
196194
ols_reg = sk_lin_reg().fit(self.X, self.y)
197195
beta_params = list(ols_reg.coef_[0][1:])
198196
beta_params.insert(0, ols_reg.intercept_[0])
199-
self.ols_beta_params = dict(zip(self._x_design_info.column_names, beta_params))
197+
self.ols_beta_params = dict(zip(self.labels, beta_params))
200198
self.ols_reg = ols_reg
201199

202200
def plot(self, round_to=None):

causalpy/experiments/interrupted_time_series.py

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@
2020
import arviz as az
2121
import numpy as np
2222
import pandas as pd
23+
from formulaic import model_matrix
2324
from matplotlib import pyplot as plt
24-
from patsy import build_design_matrices, dmatrices
2525
from sklearn.base import RegressorMixin
2626

2727
from causalpy.custom_exceptions import BadIndexException
@@ -95,18 +95,15 @@ def __init__(
9595
self.formula = formula
9696

9797
# set things up with pre-intervention data
98-
y, X = dmatrices(formula, self.datapre)
99-
self.outcome_variable_name = y.design_info.column_names[0]
100-
self._y_design_info = y.design_info
101-
self._x_design_info = X.design_info
102-
self.labels = X.design_info.column_names
103-
self.pre_y, self.pre_X = np.asarray(y), np.asarray(X)
98+
dm = model_matrix(self.formula, self.datapre)
99+
self.labels = list(dm.rhs.columns)
100+
self.matrix_spec = dm.model_spec
101+
self.outcome_variable_name = dm.lhs.columns[0]
102+
self.pre_y, self.pre_X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
104103
# process post-intervention data
105-
(new_y, new_x) = build_design_matrices(
106-
[self._y_design_info, self._x_design_info], self.datapost
107-
)
108-
self.post_X = np.asarray(new_x)
109-
self.post_y = np.asarray(new_y)
104+
new_dm = model_matrix(spec=self.matrix_spec, data=self.datapost)
105+
self.post_X = new_dm.rhs.to_numpy()
106+
self.post_y = new_dm.lhs.to_numpy()
110107

111108
# fit the model to the observed (pre-intervention) data
112109
if isinstance(self.model, PyMCModel):

causalpy/experiments/inverse_propensity_weighting.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@
2121
import matplotlib.pyplot as plt
2222
import numpy as np
2323
import pandas as pd
24+
from formulaic import model_matrix
2425
from matplotlib.lines import Line2D
25-
from patsy import dmatrices
2626
from sklearn.linear_model import LinearRegression as sk_lin_reg
2727

2828
from causalpy.custom_exceptions import DataException
@@ -89,11 +89,11 @@ def __init__(
8989
self.weighting_scheme = weighting_scheme
9090
self.input_validation()
9191

92-
t, X = dmatrices(formula, self.data)
93-
self._t_design_info = t.design_info
94-
self._t_design_info = X.design_info
95-
self.labels = X.design_info.column_names
96-
self.t, self.X = np.asarray(t), np.asarray(X)
92+
dm = model_matrix(self.formula, self.data)
93+
self.labels = list(dm.rhs.columns)
94+
self.t, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
95+
self.rhs_matrix_spec = dm.rhs.model_spec
96+
self.outcome_variable_name = dm.lhs.columns[0]
9797
self.y = self.data[self.outcome_variable]
9898

9999
COORDS = {"obs_ind": list(range(self.X.shape[0])), "coeffs": self.labels}

causalpy/experiments/prepostnegd.py

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@
2121
import numpy as np
2222
import pandas as pd
2323
import seaborn as sns
24+
from formulaic import model_matrix
2425
from matplotlib import pyplot as plt
25-
from patsy import build_design_matrices, dmatrices
2626
from sklearn.base import RegressorMixin
2727

2828
from causalpy.custom_exceptions import (
@@ -104,12 +104,11 @@ def __init__(
104104
self.pretreatment_variable_name = pretreatment_variable_name
105105
self.input_validation()
106106

107-
y, X = dmatrices(formula, self.data)
108-
self._y_design_info = y.design_info
109-
self._x_design_info = X.design_info
110-
self.labels = X.design_info.column_names
111-
self.y, self.X = np.asarray(y), np.asarray(X)
112-
self.outcome_variable_name = y.design_info.column_names[0]
107+
dm = model_matrix(self.formula, self.data)
108+
self.labels = list(dm.rhs.columns)
109+
self.y, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
110+
self.rhs_matrix_spec = dm.rhs.model_spec
111+
self.outcome_variable_name = dm.lhs.columns[0]
113112

114113
# fit the model to the observed (pre-intervention) data
115114
if isinstance(self.model, PyMCModel):
@@ -135,19 +134,21 @@ def __init__(
135134
self.group_variable_name: np.zeros(self.pred_xi.shape),
136135
}
137136
)
138-
(new_x_untreated,) = build_design_matrices(
139-
[self._x_design_info], x_pred_untreated
140-
)
141-
self.pred_untreated = self.model.predict(X=np.asarray(new_x_untreated))
137+
new_x_untreated = model_matrix(
138+
spec=self.rhs_matrix_spec, data=x_pred_untreated
139+
).to_numpy()
140+
self.pred_untreated = self.model.predict(X=new_x_untreated)
142141
# treated
143142
x_pred_treated = pd.DataFrame(
144143
{
145144
self.pretreatment_variable_name: self.pred_xi,
146145
self.group_variable_name: np.ones(self.pred_xi.shape),
147146
}
148147
)
149-
(new_x_treated,) = build_design_matrices([self._x_design_info], x_pred_treated)
150-
self.pred_treated = self.model.predict(X=np.asarray(new_x_treated))
148+
new_x_treated = model_matrix(
149+
spec=self.rhs_matrix_spec, data=x_pred_treated
150+
).to_numpy()
151+
self.pred_treated = self.model.predict(X=new_x_treated)
151152

152153
# Evaluate causal impact as equal to the treatment effect
153154
self.causal_impact = self.model.idata.posterior["beta"].sel(

causalpy/experiments/regression_discontinuity.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
import pandas as pd
2222
import seaborn as sns
2323
from matplotlib import pyplot as plt
24-
from patsy import build_design_matrices, dmatrices
24+
from formulaic import model_matrix
2525
from sklearn.base import RegressorMixin
2626

2727
from causalpy.custom_exceptions import (
@@ -111,15 +111,14 @@ def __init__(
111111
f"Choice of bandwidth parameter has lead to only {len(filtered_data)} remaining datapoints. Consider increasing the bandwidth parameter.", # noqa: E501
112112
UserWarning,
113113
)
114-
y, X = dmatrices(formula, filtered_data)
114+
dm = model_matrix(formula, filtered_data)
115115
else:
116-
y, X = dmatrices(formula, self.data)
116+
dm = model_matrix(formula, self.data)
117117

118-
self._y_design_info = y.design_info
119-
self._x_design_info = X.design_info
120-
self.labels = X.design_info.column_names
121-
self.y, self.X = np.asarray(y), np.asarray(X)
122-
self.outcome_variable_name = y.design_info.column_names[0]
118+
self.labels = list(dm.rhs.columns)
119+
self.y, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
120+
self.rhs_matrix_spec = dm.rhs.model_spec
121+
self.outcome_variable_name = dm.lhs.columns[0]
123122

124123
# fit model
125124
if isinstance(self.model, PyMCModel):
@@ -146,8 +145,8 @@ def __init__(
146145
self.x_pred = pd.DataFrame(
147146
{self.running_variable_name: xi, "treated": self._is_treated(xi)}
148147
)
149-
(new_x,) = build_design_matrices([self._x_design_info], self.x_pred)
150-
self.pred = self.model.predict(X=np.asarray(new_x))
148+
new_x = model_matrix(spec=self.rhs_matrix_spec, data=self.x_pred).to_numpy()
149+
self.pred = self.model.predict(X=new_x)
151150

152151
# calculate discontinuity by evaluating the difference in model expectation on
153152
# either side of the discontinuity
@@ -164,8 +163,8 @@ def __init__(
164163
"treated": np.array([0, 1]),
165164
}
166165
)
167-
(new_x,) = build_design_matrices([self._x_design_info], self.x_discon)
168-
self.pred_discon = self.model.predict(X=np.asarray(new_x))
166+
new_x = model_matrix(spec=self.rhs_matrix_spec, data=self.x_discon).to_numpy()
167+
self.pred_discon = self.model.predict(X=new_x)
169168

170169
# ******** THIS IS SUBOPTIMAL AT THE MOMENT ************************************
171170
if isinstance(self.model, PyMCModel):

causalpy/experiments/regression_kink.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
import numpy as np
2323
import pandas as pd
2424
import seaborn as sns
25-
from patsy import build_design_matrices, dmatrices
25+
from formulaic import model_matrix
2626

2727
from causalpy.plot_utils import plot_xY
2828

@@ -74,15 +74,14 @@ def __init__(
7474
f"Choice of bandwidth parameter has lead to only {len(filtered_data)} remaining datapoints. Consider increasing the bandwidth parameter.", # noqa: E501
7575
UserWarning,
7676
)
77-
y, X = dmatrices(formula, filtered_data)
77+
dm = model_matrix(formula, filtered_data)
7878
else:
79-
y, X = dmatrices(formula, self.data)
79+
dm = model_matrix(formula, self.data)
8080

81-
self._y_design_info = y.design_info
82-
self._x_design_info = X.design_info
83-
self.labels = X.design_info.column_names
84-
self.y, self.X = np.asarray(y), np.asarray(X)
85-
self.outcome_variable_name = y.design_info.column_names[0]
81+
self.labels = list(dm.rhs.columns)
82+
self.y, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
83+
self.rhs_matrix_spec = dm.rhs.model_spec
84+
self.outcome_variable_name = dm.lhs.columns[0]
8685

8786
COORDS = {"coeffs": self.labels, "obs_indx": np.arange(self.X.shape[0])}
8887
self.model.fit(X=self.X, y=self.y, coords=COORDS)
@@ -102,8 +101,8 @@ def __init__(
102101
self.x_pred = pd.DataFrame(
103102
{self.running_variable_name: xi, "treated": self._is_treated(xi)}
104103
)
105-
(new_x,) = build_design_matrices([self._x_design_info], self.x_pred)
106-
self.pred = self.model.predict(X=np.asarray(new_x))
104+
new_x = model_matrix(spec=self.rhs_matrix_spec, data=self.x_pred).to_numpy()
105+
self.pred = self.model.predict(X=new_x)
107106

108107
# evaluate gradient change around kink point
109108
mu_kink_left, mu_kink, mu_kink_right = self._probe_kink_point()
@@ -158,8 +157,8 @@ def _probe_kink_point(self):
158157
"treated": np.array([0, 1, 1]),
159158
}
160159
)
161-
(new_x,) = build_design_matrices([self._x_design_info], x_predict)
162-
predicted = self.model.predict(X=np.asarray(new_x))
160+
new_x = model_matrix(spec=self.rhs_matrix_spec, data=x_predict).to_numpy()
161+
predicted = self.model.predict(X=new_x)
163162
# extract predicted mu values
164163
mu_kink_left = predicted["posterior_predictive"].sel(obs_ind=0)["mu"]
165164
mu_kink = predicted["posterior_predictive"].sel(obs_ind=1)["mu"]

causalpy/experiments/synthetic_control.py

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@
2020
import arviz as az
2121
import numpy as np
2222
import pandas as pd
23+
from formulaic import model_matrix
2324
from matplotlib import pyplot as plt
24-
from patsy import build_design_matrices, dmatrices
2525
from sklearn.base import RegressorMixin
2626

2727
from causalpy.custom_exceptions import BadIndexException
@@ -90,18 +90,15 @@ def __init__(
9090
self.formula = formula
9191

9292
# set things up with pre-intervention data
93-
y, X = dmatrices(formula, self.datapre)
94-
self.outcome_variable_name = y.design_info.column_names[0]
95-
self._y_design_info = y.design_info
96-
self._x_design_info = X.design_info
97-
self.labels = X.design_info.column_names
98-
self.pre_y, self.pre_X = np.asarray(y), np.asarray(X)
93+
dm = model_matrix(self.formula, self.datapre)
94+
self.labels = list(dm.rhs.columns)
95+
self.pre_y, self.pre_X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
96+
self.matrix_spec = dm.model_spec
97+
self.outcome_variable_name = dm.lhs.columns[0]
9998
# process post-intervention data
100-
(new_y, new_x) = build_design_matrices(
101-
[self._y_design_info, self._x_design_info], self.datapost
102-
)
103-
self.post_X = np.asarray(new_x)
104-
self.post_y = np.asarray(new_y)
99+
new_dm = model_matrix(spec=self.matrix_spec, data=self.datapost)
100+
self.post_X = new_dm.rhs.to_numpy()
101+
self.post_y = new_dm.lhs.to_numpy()
105102

106103
# fit the model to the observed (pre-intervention) data
107104
if isinstance(self.model, PyMCModel):

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ dependencies = [
3434
"numpy",
3535
"pandas",
3636
"patsy",
37-
"formulae",
37+
"formulaic",
3838
"pymc>=5.15.1",
3939
"scikit-learn>=1",
4040
"scipy",

0 commit comments

Comments
 (0)