Switch to formulaic

ksolarski · ksolarski · commit 0734c8b7a25b · 2025-05-28T22:17:29.000+03:00
diff --git a/causalpy/experiments/diff_in_diff.py b/causalpy/experiments/diff_in_diff.py
@@ -19,7 +19,7 @@
 import numpy as np
 import pandas as pd
 import seaborn as sns
-from formulae import design_matrices
+from formulaic import model_matrix
 from matplotlib import pyplot as plt
 from sklearn.base import RegressorMixin
 
@@ -91,18 +91,15 @@ def __init__(
         self.data = data
         self.expt_type = "Difference in Differences"
         self.formula = formula
-        self.rhs_formula = formula.split("~", 1)[1].strip()
         self.time_variable_name = time_variable_name
         self.group_variable_name = group_variable_name
         self.input_validation()
 
-        dm = design_matrices(self.formula, self.data)
-        self.labels = list(dm.common.terms.keys())
-        self.y, self.X = (
-            np.asarray(dm.response.design_matrix).reshape(-1, 1),
-            np.asarray(dm.common.design_matrix),
-        )
-        self.outcome_variable_name = dm.response.name
+        dm = model_matrix(self.formula, self.data)
+        self.labels = list(dm.rhs.columns)
+        self.y, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
+        self.rhs_matrix_spec = dm.rhs.model_spec
+        self.outcome_variable_name = dm.lhs.columns[0]
 
         # fit model
         if isinstance(self.model, PyMCModel):
@@ -127,7 +124,9 @@ def __init__(
         )
         if self.x_pred_control.empty:
             raise ValueError("x_pred_control is empty")
-        new_x = np.array(design_matrices(self.rhs_formula, self.x_pred_control).common)
+        new_x = model_matrix(
+            spec=self.rhs_matrix_spec, data=self.x_pred_control
+        ).to_numpy()
         self.y_pred_control = self.model.predict(new_x)
 
         # predicted outcome for treatment group
@@ -144,9 +143,9 @@ def __init__(
         )
         if self.x_pred_treatment.empty:
             raise ValueError("x_pred_treatment is empty")
-        new_x = np.array(
-            design_matrices(self.rhs_formula, self.x_pred_treatment).common
-        )
+        new_x = model_matrix(
+            spec=self.rhs_matrix_spec, data=self.x_pred_treatment
+        ).to_numpy()
         self.y_pred_treatment = self.model.predict(new_x)
 
         # predicted outcome for counterfactual. This is given by removing the influence
@@ -166,9 +165,9 @@ def __init__(
         )
         if self.x_pred_counterfactual.empty:
             raise ValueError("x_pred_counterfactual is empty")
-        new_x = np.array(
-            design_matrices(self.rhs_formula, self.x_pred_counterfactual).common
-        )
+        new_x = model_matrix(
+            spec=self.rhs_matrix_spec, data=self.x_pred_counterfactual
+        ).to_numpy()
         # INTERVENTION: set the interaction term between the group and the
         # post_treatment variable to zero. This is the counterfactual.
         for i, label in enumerate(self.labels):
diff --git a/causalpy/experiments/instrumental_variable.py b/causalpy/experiments/instrumental_variable.py
@@ -19,7 +19,7 @@
 
 import numpy as np
 import pandas as pd
-from patsy import dmatrices
+from formulaic import model_matrix
 from sklearn.linear_model import LinearRegression as sk_lin_reg
 
 from causalpy.custom_exceptions import DataException
@@ -110,19 +110,17 @@ def __init__(
         self.model = model
         self.input_validation()
 
-        y, X = dmatrices(formula, self.data)
-        self._y_design_info = y.design_info
-        self._x_design_info = X.design_info
-        self.labels = X.design_info.column_names
-        self.y, self.X = np.asarray(y), np.asarray(X)
-        self.outcome_variable_name = y.design_info.column_names[0]
+        dm = model_matrix(self.formula, self.data)
+        self.labels = list(dm.rhs.columns)
+        self.y, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
+        self.rhs_matrix_spec = dm.rhs.model_spec
+        self.outcome_variable_name = dm.lhs.columns[0]
 
-        t, Z = dmatrices(instruments_formula, self.instruments_data)
-        self._t_design_info = t.design_info
-        self._z_design_info = Z.design_info
-        self.labels_instruments = Z.design_info.column_names
-        self.t, self.Z = np.asarray(t), np.asarray(Z)
-        self.instrument_variable_name = t.design_info.column_names[0]
+        dm = model_matrix(self.instruments_formula, self.instruments_data)
+        self.labels_instruments = list(dm.rhs.columns)
+        self.t, self.Z = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
+        self.instrument_rhs_matrix_spec = dm.rhs.model_spec
+        self.instrument_variable_name = dm.lhs.columns[0]
 
         self.get_naive_OLS_fit()
         self.get_2SLS_fit()
@@ -176,7 +174,7 @@ def get_2SLS_fit(self):
         fitted_Z_values = first_stage_reg.predict(self.Z)
         X2 = self.data.copy(deep=True)
         X2[self.instrument_variable_name] = fitted_Z_values
-        _, X2 = dmatrices(self.formula, X2)
+        X2 = model_matrix(self.formula, X2).rhs.to_numpy()
         second_stage_reg = sk_lin_reg().fit(X=X2, y=self.y)
         betas_first = list(first_stage_reg.coef_[0][1:])
         betas_first.insert(0, first_stage_reg.intercept_[0])
@@ -196,7 +194,7 @@ def get_naive_OLS_fit(self):
         ols_reg = sk_lin_reg().fit(self.X, self.y)
         beta_params = list(ols_reg.coef_[0][1:])
         beta_params.insert(0, ols_reg.intercept_[0])
-        self.ols_beta_params = dict(zip(self._x_design_info.column_names, beta_params))
+        self.ols_beta_params = dict(zip(self.labels, beta_params))
         self.ols_reg = ols_reg
 
     def plot(self, round_to=None):
diff --git a/causalpy/experiments/interrupted_time_series.py b/causalpy/experiments/interrupted_time_series.py
@@ -20,8 +20,8 @@
 import arviz as az
 import numpy as np
 import pandas as pd
+from formulaic import model_matrix
 from matplotlib import pyplot as plt
-from patsy import build_design_matrices, dmatrices
 from sklearn.base import RegressorMixin
 
 from causalpy.custom_exceptions import BadIndexException
@@ -95,18 +95,15 @@ def __init__(
         self.formula = formula
 
         # set things up with pre-intervention data
-        y, X = dmatrices(formula, self.datapre)
-        self.outcome_variable_name = y.design_info.column_names[0]
-        self._y_design_info = y.design_info
-        self._x_design_info = X.design_info
-        self.labels = X.design_info.column_names
-        self.pre_y, self.pre_X = np.asarray(y), np.asarray(X)
+        dm = model_matrix(self.formula, self.datapre)
+        self.labels = list(dm.rhs.columns)
+        self.matrix_spec = dm.model_spec
+        self.outcome_variable_name = dm.lhs.columns[0]
+        self.pre_y, self.pre_X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
         # process post-intervention data
-        (new_y, new_x) = build_design_matrices(
-            [self._y_design_info, self._x_design_info], self.datapost
-        )
-        self.post_X = np.asarray(new_x)
-        self.post_y = np.asarray(new_y)
+        new_dm = model_matrix(spec=self.matrix_spec, data=self.datapost)
+        self.post_X = new_dm.rhs.to_numpy()
+        self.post_y = new_dm.lhs.to_numpy()
 
         # fit the model to the observed (pre-intervention) data
         if isinstance(self.model, PyMCModel):
diff --git a/causalpy/experiments/inverse_propensity_weighting.py b/causalpy/experiments/inverse_propensity_weighting.py
@@ -21,8 +21,8 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+from formulaic import model_matrix
 from matplotlib.lines import Line2D
-from patsy import dmatrices
 from sklearn.linear_model import LinearRegression as sk_lin_reg
 
 from causalpy.custom_exceptions import DataException
@@ -89,11 +89,11 @@ def __init__(
         self.weighting_scheme = weighting_scheme
         self.input_validation()
 
-        t, X = dmatrices(formula, self.data)
-        self._t_design_info = t.design_info
-        self._t_design_info = X.design_info
-        self.labels = X.design_info.column_names
-        self.t, self.X = np.asarray(t), np.asarray(X)
+        dm = model_matrix(self.formula, self.data)
+        self.labels = list(dm.rhs.columns)
+        self.t, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
+        self.rhs_matrix_spec = dm.rhs.model_spec
+        self.outcome_variable_name = dm.lhs.columns[0]
         self.y = self.data[self.outcome_variable]
 
         COORDS = {"obs_ind": list(range(self.X.shape[0])), "coeffs": self.labels}
diff --git a/causalpy/experiments/prepostnegd.py b/causalpy/experiments/prepostnegd.py
@@ -21,8 +21,8 @@
 import numpy as np
 import pandas as pd
 import seaborn as sns
+from formulaic import model_matrix
 from matplotlib import pyplot as plt
-from patsy import build_design_matrices, dmatrices
 from sklearn.base import RegressorMixin
 
 from causalpy.custom_exceptions import (
@@ -104,12 +104,11 @@ def __init__(
         self.pretreatment_variable_name = pretreatment_variable_name
         self.input_validation()
 
-        y, X = dmatrices(formula, self.data)
-        self._y_design_info = y.design_info
-        self._x_design_info = X.design_info
-        self.labels = X.design_info.column_names
-        self.y, self.X = np.asarray(y), np.asarray(X)
-        self.outcome_variable_name = y.design_info.column_names[0]
+        dm = model_matrix(self.formula, self.data)
+        self.labels = list(dm.rhs.columns)
+        self.y, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
+        self.rhs_matrix_spec = dm.rhs.model_spec
+        self.outcome_variable_name = dm.lhs.columns[0]
 
         # fit the model to the observed (pre-intervention) data
         if isinstance(self.model, PyMCModel):
@@ -135,19 +134,21 @@ def __init__(
                 self.group_variable_name: np.zeros(self.pred_xi.shape),
             }
         )
-        (new_x_untreated,) = build_design_matrices(
-            [self._x_design_info], x_pred_untreated
-        )
-        self.pred_untreated = self.model.predict(X=np.asarray(new_x_untreated))
+        new_x_untreated = model_matrix(
+            spec=self.rhs_matrix_spec, data=x_pred_untreated
+        ).to_numpy()
+        self.pred_untreated = self.model.predict(X=new_x_untreated)
         # treated
         x_pred_treated = pd.DataFrame(
             {
                 self.pretreatment_variable_name: self.pred_xi,
                 self.group_variable_name: np.ones(self.pred_xi.shape),
             }
         )
-        (new_x_treated,) = build_design_matrices([self._x_design_info], x_pred_treated)
-        self.pred_treated = self.model.predict(X=np.asarray(new_x_treated))
+        new_x_treated = model_matrix(
+            spec=self.rhs_matrix_spec, data=x_pred_treated
+        ).to_numpy()
+        self.pred_treated = self.model.predict(X=new_x_treated)
 
         # Evaluate causal impact as equal to the trestment effect
         self.causal_impact = self.model.idata.posterior["beta"].sel(
diff --git a/causalpy/experiments/regression_discontinuity.py b/causalpy/experiments/regression_discontinuity.py
@@ -21,7 +21,7 @@
 import pandas as pd
 import seaborn as sns
 from matplotlib import pyplot as plt
-from patsy import build_design_matrices, dmatrices
+from formulaic import model_matrix
 from sklearn.base import RegressorMixin
 
 from causalpy.custom_exceptions import (
@@ -111,15 +111,14 @@ def __init__(
                     f"Choice of bandwidth parameter has lead to only {len(filtered_data)} remaining datapoints. Consider increasing the bandwidth parameter.",  # noqa: E501
                     UserWarning,
                 )
-            y, X = dmatrices(formula, filtered_data)
+            dm = model_matrix(formula, filtered_data)
         else:
-            y, X = dmatrices(formula, self.data)
+            dm = model_matrix(formula, self.data)
 
-        self._y_design_info = y.design_info
-        self._x_design_info = X.design_info
-        self.labels = X.design_info.column_names
-        self.y, self.X = np.asarray(y), np.asarray(X)
-        self.outcome_variable_name = y.design_info.column_names[0]
+        self.labels = list(dm.rhs.columns)
+        self.y, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
+        self.rhs_matrix_spec = dm.rhs.model_spec
+        self.outcome_variable_name = dm.lhs.columns[0]
 
         # fit model
         if isinstance(self.model, PyMCModel):
@@ -146,8 +145,8 @@ def __init__(
         self.x_pred = pd.DataFrame(
             {self.running_variable_name: xi, "treated": self._is_treated(xi)}
         )
-        (new_x,) = build_design_matrices([self._x_design_info], self.x_pred)
-        self.pred = self.model.predict(X=np.asarray(new_x))
+        new_x = model_matrix(spec=self.rhs_matrix_spec, data=self.x_pred).to_numpy()
+        self.pred = self.model.predict(X=new_x)
 
         # calculate discontinuity by evaluating the difference in model expectation on
         # either side of the discontinuity
@@ -164,8 +163,8 @@ def __init__(
                 "treated": np.array([0, 1]),
             }
         )
-        (new_x,) = build_design_matrices([self._x_design_info], self.x_discon)
-        self.pred_discon = self.model.predict(X=np.asarray(new_x))
+        new_x = model_matrix(spec=self.rhs_matrix_spec, data=self.x_discon).to_numpy()
+        self.pred_discon = self.model.predict(X=new_x)
 
         # ******** THIS IS SUBOPTIMAL AT THE MOMENT ************************************
         if isinstance(self.model, PyMCModel):
diff --git a/causalpy/experiments/regression_kink.py b/causalpy/experiments/regression_kink.py
@@ -22,7 +22,7 @@
 import numpy as np
 import pandas as pd
 import seaborn as sns
-from patsy import build_design_matrices, dmatrices
+from formulaic import model_matrix
 
 from causalpy.plot_utils import plot_xY
 
@@ -74,15 +74,14 @@ def __init__(
                     f"Choice of bandwidth parameter has lead to only {len(filtered_data)} remaining datapoints. Consider increasing the bandwidth parameter.",  # noqa: E501
                     UserWarning,
                 )
-            y, X = dmatrices(formula, filtered_data)
+            dm = model_matrix(formula, filtered_data)
         else:
-            y, X = dmatrices(formula, self.data)
+            dm = model_matrix(formula, self.data)
 
-        self._y_design_info = y.design_info
-        self._x_design_info = X.design_info
-        self.labels = X.design_info.column_names
-        self.y, self.X = np.asarray(y), np.asarray(X)
-        self.outcome_variable_name = y.design_info.column_names[0]
+        self.labels = list(dm.rhs.columns)
+        self.y, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
+        self.rhs_matrix_spec = dm.rhs.model_spec
+        self.outcome_variable_name = dm.lhs.columns[0]
 
         COORDS = {"coeffs": self.labels, "obs_indx": np.arange(self.X.shape[0])}
         self.model.fit(X=self.X, y=self.y, coords=COORDS)
@@ -102,8 +101,8 @@ def __init__(
         self.x_pred = pd.DataFrame(
             {self.running_variable_name: xi, "treated": self._is_treated(xi)}
         )
-        (new_x,) = build_design_matrices([self._x_design_info], self.x_pred)
-        self.pred = self.model.predict(X=np.asarray(new_x))
+        new_x = model_matrix(spec=self.rhs_matrix_spec, data=self.x_pred).to_numpy()
+        self.pred = self.model.predict(X=new_x)
 
         # evaluate gradient change around kink point
         mu_kink_left, mu_kink, mu_kink_right = self._probe_kink_point()
@@ -158,8 +157,8 @@ def _probe_kink_point(self):
                 "treated": np.array([0, 1, 1]),
             }
         )
-        (new_x,) = build_design_matrices([self._x_design_info], x_predict)
-        predicted = self.model.predict(X=np.asarray(new_x))
+        new_x = model_matrix(spec=self.rhs_matrix_spec, data=x_predict).to_numpy()
+        predicted = self.model.predict(X=new_x)
         # extract predicted mu values
         mu_kink_left = predicted["posterior_predictive"].sel(obs_ind=0)["mu"]
         mu_kink = predicted["posterior_predictive"].sel(obs_ind=1)["mu"]
diff --git a/causalpy/experiments/synthetic_control.py b/causalpy/experiments/synthetic_control.py
@@ -20,8 +20,8 @@
 import arviz as az
 import numpy as np
 import pandas as pd
+from formulaic import model_matrix
 from matplotlib import pyplot as plt
-from patsy import build_design_matrices, dmatrices
 from sklearn.base import RegressorMixin
 
 from causalpy.custom_exceptions import BadIndexException
@@ -90,18 +90,15 @@ def __init__(
         self.formula = formula
 
         # set things up with pre-intervention data
-        y, X = dmatrices(formula, self.datapre)
-        self.outcome_variable_name = y.design_info.column_names[0]
-        self._y_design_info = y.design_info
-        self._x_design_info = X.design_info
-        self.labels = X.design_info.column_names
-        self.pre_y, self.pre_X = np.asarray(y), np.asarray(X)
+        dm = model_matrix(self.formula, self.datapre)
+        self.labels = list(dm.rhs.columns)
+        self.pre_y, self.pre_X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
+        self.matrix_spec = dm.model_spec
+        self.outcome_variable_name = dm.lhs.columns[0]
         # process post-intervention data
-        (new_y, new_x) = build_design_matrices(
-            [self._y_design_info, self._x_design_info], self.datapost
-        )
-        self.post_X = np.asarray(new_x)
-        self.post_y = np.asarray(new_y)
+        new_dm = model_matrix(spec=self.matrix_spec, data=self.datapost)
+        self.post_X = new_dm.rhs.to_numpy()
+        self.post_y = new_dm.lhs.to_numpy()
 
         # fit the model to the observed (pre-intervention) data
         if isinstance(self.model, PyMCModel):
diff --git a/pyproject.toml b/pyproject.toml
@@ -34,7 +34,7 @@ dependencies = [
     "numpy",
     "pandas",
     "patsy",
-    "formulae",
+    "formulaic",
     "pymc>=5.15.1",
     "scikit-learn>=1",
     "scipy",