Merge pull request #321 from py-why/speedup_info

add new methods to the notebooks and readme
py-why · Sep 12, 2024 · d2c65f8 · d2c65f8
2 parents 98d92ca + f533461
commit d2c65f8
Show file tree

Hide file tree

Showing 7 changed files with 1,662 additions and 486 deletions.
diff --git a/README.md b/README.md
@@ -190,6 +190,10 @@ print(f"Best estimator: {ct.best_estimator}")
 
 ```
 
+If ***outcome_model="auto"*** is passed to the CausalTune constructor, we search over a single combined search space for the EconML estimators and the FLAML wrappers for common regressors. The old behavior is now achieved by ***outcome_model="nested"*** (refitting AutoML for each estimator).
+
+You can also preprocess the data in the CausalityDataset using one of the popular category encoders: ***OneHot, WoE, Label, Target***.
+
 ## Supported Models
 The package supports the following causal estimators:
 * Meta Learners:

diff --git a/causaltune/dataset_processor.py b/causaltune/dataset_processor.py
@@ -8,7 +8,18 @@
 
 
 class CausalityDatasetProcessor(BaseEstimator, TransformerMixin):
+    """
+    A processor for CausalityDataset, designed to preprocess data for causal inference tasks by encoding, normalizing,
+    and handling missing values.
+    Attributes:
+        encoder_type (str): Type of encoder used for categorical feature encoding ('onehot', 'label', 'target', 'woe').
+        outcome (str): The target variable used for encoding.
+        encoder: Encoder object used during feature transformations.
+    """
     def __init__(self):
+        """
+        Initializes CausalityDatasetProcessor with default attributes for encoder_type, outcome, and encoder.
+        """
         self.encoder_type = None
         self.outcome = None
         self.encoder = None
@@ -19,13 +30,31 @@ def fit(
         encoder_type: Optional[str] = "onehot",
         outcome: str = None,
     ):
+        """
+        Fits the processor by preprocessing the input CausalityDataset.
+        Args:
+            cd (CausalityDataset): The dataset for causal analysis.
+            encoder_type (str, optional): Encoder to use for categorical features. Default is 'onehot'.
+            outcome (str, optional): The target variable for encoding (needed for 'target' or 'woe'). Default is None.
+        Returns:
+            CausalityDatasetProcessor: The fitted processor instance.
+        """
         cd = copy.deepcopy(cd)
         self.preprocess_dataset(
             cd, encoder_type=encoder_type, outcome=outcome, fit_phase=True
         )
         return self
 
     def transform(self, cd: CausalityDataset):
+        """
+        Transforms the CausalityDataset using the fitted encoder.
+        Args:
+            cd (CausalityDataset): Dataset to transform.
+        Returns:
+            CausalityDataset: Transformed dataset.
+        Raises:
+            ValueError: If the processor has not been fitted yet.
+        """
         if self.encoder:
             cd = self.preprocess_dataset(
                 cd,

diff --git a/notebooks/AB testing.ipynb b/notebooks/AB testing.ipynb
@@ -1,7 +1,6 @@
 {
  "cells": [
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -49,7 +48,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -78,7 +76,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -97,23 +94,20 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "#### Data Generating Process"
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "We first create synthetic data from a DGP with perfect randomisation of the treatment as we are replicating an AB test environment"
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -154,7 +148,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -179,7 +172,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Now if outcome_model=\"auto\" in the CausalTune constructor, we search over a simultaneous search space for the EconML estimators and for FLAML wrappers for common regressors. The old behavior is now achieved by outcome_model=\"nested\" (the default for now)"
+    "If `outcome_model=\"auto\"` is passed to the CausalTune constructor, we search over a single combined search space for the EconML estimators and the FLAML wrappers for common regressors. The old behavior is now achieved by `outcome_model=\"nested\"` (refitting AutoML for each estimator).\n",
+    "\n",
+    "You can also preprocess the data in the CausalityDataset using one of the popular category encoders: OneHot, WoE, Label, Target."
    ]
   },
   {
@@ -201,7 +196,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -230,7 +224,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -274,7 +267,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -356,21 +348,18 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": []
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "### 2. Segmentation with Wise Pizza"
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -431,7 +420,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": []

diff --git a/notebooks/CausalityDataset setup.ipynb b/notebooks/CausalityDataset setup.ipynb