
Commit

Merge branch 'master' into dependabot/pip/jinja2-3.1.3
Signed-off-by: Samuel Hoffman <[email protected]>
hoffmansc authored Feb 21, 2024
2 parents c45f6fe + d2ba8c4 commit 251d112
Showing 23 changed files with 295 additions and 484 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
@@ -43,6 +43,7 @@ jobs:

       - name: Install dependencies
         run: |
+          sudo apt install libglpk-dev libxml2-dev
           python -m pip install --upgrade pip setuptools wheel
           pip install -e '.[all]'
           pip install flake8
32 changes: 10 additions & 22 deletions aif360/datasets/law_school_gpa_dataset.py
@@ -1,36 +1,24 @@
 import os
 import pandas as pd
+from sklearn.model_selection import train_test_split
 from aif360.datasets import RegressionDataset
-try:
-    import tempeh.configurations as tc
-except ImportError as error:
-    from logging import warning
-    warning("{}: LawSchoolGPADataset will be unavailable. To install, run:\n"
-            "pip install 'aif360[LawSchoolGPA]'".format(error))
+from aif360.sklearn.datasets.lawschool_dataset import LSAC_URL

 class LawSchoolGPADataset(RegressionDataset):
-    """Law School GPA dataset.
-    See https://github.com/microsoft/tempeh for details.
-    """
+    """Law School GPA dataset."""

     def __init__(self, dep_var_name='zfygpa',
-                 protected_attribute_names=['race'],
-                 privileged_classes=[['white']],
+                 protected_attribute_names=['race', 'gender'],
+                 privileged_classes=[['white'], ['male']],
                  instance_weights_name=None,
                  categorical_features=[],
                  na_values=[], custom_preprocessing=None,
                  metadata=None):
         """See :obj:`RegressionDataset` for a description of the arguments."""
-        dataset = tc.datasets["lawschool_gpa"]()
-        X_train,X_test = dataset.get_X(format=pd.DataFrame)
-        y_train, y_test = dataset.get_y(format=pd.Series)
-        A_train, A_test = dataset.get_sensitive_features(name='race',
-                                                         format=pd.Series)
-        all_train = pd.concat([X_train, y_train, A_train], axis=1)
-        all_test = pd.concat([X_test, y_test, A_test], axis=1)
-
-        df = pd.concat([all_train, all_test], axis=0)
+        df = pd.read_sas(LSAC_URL, encoding="utf-8")
+        df.race = df.race1.where(df.race1.isin(['black', 'white']))
+        df.gender = df.gender.fillna("female")
+        df = df[["race", "gender", "lsat", "ugpa", "zfygpa"]].dropna()
+        df = pd.concat(train_test_split(df, test_size=0.33, random_state=123))

         super(LawSchoolGPADataset, self).__init__(df=df,
             dep_var_name=dep_var_name,
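
For context, a minimal usage sketch of the updated class, not part of the diff (the attribute names are the standard aif360 StructuredDataset API; the first call downloads the LSAC file from LSAC_URL):

    from aif360.datasets import LawSchoolGPADataset

    # Loads LSAC data directly (no tempeh) and now exposes two protected
    # attributes: race (privileged: 'white') and gender (privileged: 'male').
    dataset = LawSchoolGPADataset()
    print(dataset.protected_attribute_names)  # expected: ['race', 'gender']
    print(dataset.label_names)                # expected: ['zfygpa']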
25 changes: 15 additions & 10 deletions aif360/metrics/ot_metric.py
@@ -1,7 +1,12 @@
 from typing import Union
 import pandas as pd
 import numpy as np
-import ot
+try:
+    import ot
+except ImportError as error:
+    from logging import warning
+    warning("{}: ot_distance will be unavailable. To install, run:\n"
+            "pip install 'aif360[OptimalTransport]'".format(error))
 from sklearn.preprocessing import LabelEncoder

 def _normalize(distribution1, distribution2):
@@ -17,7 +22,7 @@ def _normalize(distribution1, distribution2):
     extra = -np.minimum(np.min(distribution1), np.min(distribution2))
     distribution1 += extra
     distribution2 += extra
-
+
     total_of_distribution1 = np.sum(distribution1)
     if total_of_distribution1 != 0:
         distribution1 /= total_of_distribution1
@@ -75,10 +80,10 @@ def _evaluate(
     if prot_attr is None:
         initial_distribution, required_distribution, matrix_distance = _transform(ground_truth, classifier, cost_matrix)
         return ot.emd2(a=initial_distribution, b=required_distribution, M=matrix_distance, numItermax=num_iters)
-
+
     if not ground_truth.nunique() == 2:
         raise ValueError(f"Expected to have exactly 2 target values, got {ground_truth.nunique()}.")
-
+
     # Calculate EMD between ground truth distribution and distribution of each group
     emds = {}
     for sa_val in sorted(prot_attr.unique()):
@@ -137,7 +142,7 @@ def ot_distance(
    # Assert correct mode passed
    if mode not in ['binary', 'continuous', 'nominal', 'ordinal']:
        raise ValueError(f"Expected one of {['binary', 'continuous', 'nominal', 'ordinal']}, got {mode}.")
-
+
    # Assert correct types passed to ground_truth, classifier and prot_attr
    if not isinstance(ground_truth, (pd.Series, str)):
        raise TypeError(f"ground_truth: expected pd.Series or str, got {type(ground_truth)}")
@@ -148,17 +153,17 @@
        raise TypeError(f"classifier: expected pd.DataFrame for {mode} mode, got {type(classifier)}")
    if prot_attr is not None and not isinstance(prot_attr, (pd.Series, str)):
        raise TypeError(f"prot_attr: expected pd.Series or str, got {type(prot_attr)}")
-
+
    # Assert correct type passed to cost_matrix
    if cost_matrix is not None and not isinstance(cost_matrix, np.ndarray):
        raise TypeError(f"cost_matrix: expected numpy.ndarray, got {type(cost_matrix)}")
-
+
    # Assert scoring is "Wasserstein1"
    if not scoring == "Wasserstein1":
        raise ValueError(f"Scoring mode can only be \"Wasserstein1\", got {scoring}")
-
+
    grt = ground_truth.copy()
-
+
    if classifier is not None:
        cls = classifier.copy()
    if prot_attr is not None:
@@ -171,7 +176,7 @@
            sat.index = grt.index
    else:
        sat = None
-
+
    uniques = list(grt.unique())
    if mode == "binary":
        if len(uniques) > 2:
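
A usage sketch for ot_distance, not part of the diff; it requires POT (pip install 'aif360[OptimalTransport]'), and the keyword names follow the checks visible above, so treat the exact signature as an assumption:

    import pandas as pd
    from aif360.sklearn.metrics import ot_distance

    y_true = pd.Series([1, 0, 1, 1, 0, 0])
    y_pred = pd.Series([0.9, 0.2, 0.7, 0.8, 0.4, 0.1])
    sex = pd.Series(['m', 'f', 'm', 'f', 'm', 'f'])

    # With prot_attr given, _evaluate computes one earth mover's distance
    # (ot.emd2) per protected-attribute value and collects them per group.
    per_group = ot_distance(y_true, y_pred, prot_attr=sex, mode='binary')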
2 changes: 1 addition & 1 deletion aif360/sklearn/datasets/__init__.py
@@ -12,4 +12,4 @@
 from aif360.sklearn.datasets.openml_datasets import fetch_adult, fetch_german, fetch_bank
 from aif360.sklearn.datasets.compas_dataset import fetch_compas
 from aif360.sklearn.datasets.meps_datasets import fetch_meps
-from aif360.sklearn.datasets.tempeh_datasets import fetch_lawschool_gpa
+from aif360.sklearn.datasets.lawschool_dataset import fetch_lawschool_gpa
89 changes: 89 additions & 0 deletions aif360/sklearn/datasets/lawschool_dataset.py
@@ -0,0 +1,89 @@
+from io import BytesIO
+import os
+import urllib
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+from aif360.sklearn.datasets.utils import standardize_dataset, Dataset
+
+
+# cache location
+DATA_HOME_DEFAULT = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                                 '..', 'data', 'raw')
+LSAC_URL = "https://github.com/jkomiyama/fairregresion/raw/master/dataset/law/lsac.sas7bdat"
+
+def fetch_lawschool_gpa(subset="all", *, data_home=None, cache=True,
+                        binary_race=True, fillna_gender="female",
+                        usecols=["race", "gender", "lsat", "ugpa"],
+                        dropcols=None, numeric_only=False, dropna=True):
+    """Load the Law School GPA dataset.
+
+    Optionally binarizes 'race' to 'white' (privileged) or 'black' (unprivileged).
+    The other protected attribute is gender ('male' is privileged and 'female'
+    is unprivileged). The outcome variable is standardized first year GPA
+    ('zfygpa'). Note: this is a continuous variable, i.e., a regression task.
+
+    Args:
+        subset ({'train', 'test', or 'all'}, optional): Select the dataset to
+            load: 'train' for the training set, 'test' for the test set, 'all'
+            for both.
+        data_home (string, optional): Specify another download and cache folder
+            for the datasets. By default all AIF360 datasets are stored in
+            'aif360/sklearn/data/raw' subfolders.
+        cache (bool): Whether to cache downloaded datasets.
+        binary_race (bool, optional): Filter only white and black students.
+        fillna_gender (str or None, optional): Fill NA values for gender with
+            this value. If `None`, leave as NA. Note: this is used for backward-
+            compatibility with tempeh and may be dropped in later versions.
+        usecols (single label or list-like, optional): Feature column(s) to
+            keep. All others are dropped.
+        dropcols (single label or list-like, optional): Feature column(s) to
+            drop.
+        numeric_only (bool): Drop all non-numeric feature columns.
+        dropna (bool): Drop rows with NAs.
+
+    Returns:
+        namedtuple: Tuple containing X, y, and sample_weights for the Law School
+        GPA dataset accessible by index or name.
+    """
+    if subset not in {'train', 'test', 'all'}:
+        raise ValueError("subset must be either 'train', 'test', or 'all'; "
+                         "cannot be {}".format(subset))
+
+    cache_path = os.path.join(data_home or DATA_HOME_DEFAULT,
+                              os.path.basename(LSAC_URL))
+    if cache and os.path.isfile(cache_path):
+        df = pd.read_sas(cache_path, encoding="utf-8")
+    else:
+        data = urllib.request.urlopen(LSAC_URL).read()
+        if cache:
+            os.makedirs(os.path.dirname(cache_path), exist_ok=True)
+            with open(cache_path, 'wb') as f:
+                f.write(data)
+        df = pd.read_sas(BytesIO(data), format="sas7bdat", encoding="utf-8")
+
+    df.race = df.race1.astype('category')
+    if binary_race:
+        df.race = df.race.cat.set_categories(['black', 'white'], ordered=True)
+
+    # for backwards-compatibility with tempeh
+    if fillna_gender is not None:
+        df.gender = df.gender.fillna(fillna_gender)
+    df.gender = df.gender.astype('category').cat.set_categories(
+            ['female', 'male'], ordered=True)
+
+    ds = standardize_dataset(df, prot_attr=['race', 'gender'], target='zfygpa',
+                             usecols=usecols, dropcols=dropcols,
+                             numeric_only=numeric_only, dropna=dropna)
+
+    # for backwards-compatibility with tempeh
+    train_X, test_X, train_y, test_y = train_test_split(*ds, test_size=0.33, random_state=123)
+    if subset == "train":
+        return Dataset(train_X, train_y)
+    elif subset == "test":
+        return Dataset(test_X, test_y)
+    else:
+        X = pd.concat([train_X, test_X], axis=0)
+        y = pd.concat([train_y, test_y], axis=0)
+        return Dataset(X, y)
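
A usage sketch for the new loader, not part of the diff (assumes network access to LSAC_URL on first call; the .sas7bdat file is then cached under data_home):

    from aif360.sklearn.datasets import fetch_lawschool_gpa

    # The 67/33 split is fixed by random_state=123 for parity with tempeh.
    X_train, y_train = fetch_lawschool_gpa(subset='train')
    X_test, y_test = fetch_lawschool_gpa(subset='test')
    print(X_train.columns.tolist())  # expected: ['race', 'gender', 'lsat', 'ugpa']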
25 changes: 13 additions & 12 deletions aif360/sklearn/datasets/openml_datasets.py
@@ -168,19 +168,20 @@ def fetch_german(*, data_home=None, cache=True, binary_age=True, usecols=None,
                                dropcols=dropcols, numeric_only=numeric_only,
                                dropna=dropna)

-def fetch_bank(*, data_home=None, cache=True, binary_age=True, percent10=False, 
+def fetch_bank(*, data_home=None, cache=True, binary_age=True, percent10=False,
                usecols=None, dropcols=['duration'], numeric_only=False, dropna=False):
     """Load the Bank Marketing Dataset.
-    The protected attribute is 'age' (binarized by default as suggested by [#lequy22]:
-    age >= 25 and age <60 is considered privileged and age< 25 or age >= 60 unprivileged;
-    see the binary_age flag to keep this continuous). The outcome variable is 'deposit':
+    The protected attribute is 'age' (binarized by default as suggested by [#lequy22]_:
+    age >= 25 and age <60 is considered privileged and age< 25 or age >= 60 unprivileged;
+    see the binary_age flag to keep this continuous). The outcome variable is 'deposit':
     'yes' or 'no'.
-    References:
-        .. [#lequy22] Le Quy, Tai, et al. "A survey on datasets for fairness‐aware machine
-            learning." Wiley Interdisciplinary Reviews: Data Mining and Knowledge
-            Discovery 12.3 (2022): e1452.
+    References:
+        .. [#lequy22] `Le Quy, Tai, et al. "A survey on datasets for fairness-
+            aware machine learning." Wiley Interdisciplinary Reviews: Data Mining
+            and Knowledge Discovery 12.3 (2022): e1452.
+            <https://wires.onlinelibrary.wiley.com/doi/pdf/10.1002/widm.1452>`_

     Note:
         By default, the data is downloaded from OpenML. See the `bank-marketing
@@ -235,15 +236,15 @@ def fetch_bank(*, data_home=None, cache=True, binary_age=True, percent10=False,
         df[col] = df[col].cat.remove_categories('unknown')
     df.education = df.education.astype('category').cat.reorder_categories(
         ['primary', 'secondary', 'tertiary'], ordered=True)
-
+
     # binarize protected attribute (but not corresponding feature)
     age = (pd.cut(df.age, [0, 24, 60, 100], ordered=False,
-                  labels=[0, 1, 0] if numeric_only 
+                  labels=[0, 1, 0] if numeric_only
                   else ['<25 or >=60', '25-60', '<25 or >=60'])
            if binary_age else 'age')
-    age = age.cat.reorder_categories([0, 1] if numeric_only 
+    age = age.cat.reorder_categories([0, 1] if numeric_only
                                      else ['<25 or >=60', '25-60'])
-
+
     return standardize_dataset(df, prot_attr=[age], target='deposit',
                                usecols=usecols, dropcols=dropcols,
                                numeric_only=numeric_only, dropna=dropna)
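
A usage sketch of the binarization the docstring describes, not part of the diff (assumes the OpenML download succeeds; in aif360.sklearn, protected attributes become index levels of the returned X):

    from aif360.sklearn.datasets import fetch_bank

    X, y = fetch_bank()  # binary_age=True by default
    # expected categories: '25-60' (privileged) vs. '<25 or >=60' (unprivileged)
    print(X.index.get_level_values('age').unique())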
59 changes: 0 additions & 59 deletions aif360/sklearn/datasets/tempeh_datasets.py

This file was deleted.

14 changes: 10 additions & 4 deletions aif360/sklearn/inprocessing/infairness.py
@@ -1,10 +1,16 @@
-from inFairness import fairalgo
+try:
+    from inFairness import fairalgo
+    from skorch import NeuralNet
+    from skorch.dataset import unpack_data, Dataset as Dataset_
+    from skorch.utils import is_pandas_ndframe
+except ImportError as error:
+    from logging import warning
+    warning("{}: SenSeI and SenSR will be unavailable. To install, run:\n"
+            "pip install 'aif360[inFairness]'".format(error))
+    Dataset_ = NeuralNet = object
 from sklearn.preprocessing import LabelBinarizer
 from sklearn.utils.multiclass import type_of_target
 from sklearn.exceptions import NotFittedError
-from skorch import NeuralNet
-from skorch.dataset import unpack_data, Dataset as Dataset_
-from skorch.utils import is_pandas_ndframe


 class Dataset(Dataset_):
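
The fallback Dataset_ = NeuralNet = object is what keeps this module importable when the extras are missing: the classes below subclass those names, so they must be bound to something. A minimal sketch of the same pattern, with hypothetical names (heavy_lib, OptionalBase, FancyModel):

    try:
        from heavy_lib import OptionalBase
    except ImportError as error:
        from logging import warning
        warning('{}: FancyModel will be unavailable.'.format(error))
        OptionalBase = object  # placeholder so the class statement still works

    class FancyModel(OptionalBase):
        """Importable always; usable only if heavy_lib is installed."""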
24 changes: 0 additions & 24 deletions aif360/sklearn/metrics/metrics.py
@@ -17,30 +17,6 @@
 from aif360.detectors.mdss.MDSS import MDSS


-__all__ = [
-    # meta-metrics
-    'difference', 'ratio', 'intersection', 'one_vs_rest',
-    # scorer factory
-    'make_scorer',
-    # helpers
-    'num_samples', 'num_pos_neg',
-    'specificity_score', 'base_rate', 'selection_rate', 'smoothed_base_rate',
-    'smoothed_selection_rate', 'generalized_fpr', 'generalized_fnr',
-    # group fairness
-    'ot_distance', 'statistical_parity_difference', 'disparate_impact_ratio',
-    'equal_opportunity_difference', 'average_odds_difference', 'average_predictive_value_difference',
-    'average_odds_error', 'class_imbalance', 'kl_divergence',
-    'conditional_demographic_disparity', 'smoothed_edf',
-    'df_bias_amplification', 'mdss_bias_score',
-    # individual fairness
-    'generalized_entropy_index', 'generalized_entropy_error',
-    'between_group_generalized_entropy_error', 'theil_index',
-    'coefficient_of_variation', 'consistency_score',
-    # aliases
-    'sensitivity_score', 'mean_difference', 'false_negative_rate_error',
-    'false_positive_rate_error'
-]
-
 # ============================= META-METRICS ===================================
 def difference(func, y_true, y_pred=None, prot_attr=None, priv_group=1,
                sample_weight=None, **kwargs):
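
Note that with __all__ removed, from aif360.sklearn.metrics import * falls back to Python's default export rule: every module-level name without a leading underscore, including imported modules. A minimal illustration with a hypothetical module mymetrics.py:

    # mymetrics.py
    import numpy as np      # re-exported by * (no leading underscore)

    def difference():       # exported
        pass

    def _helper():          # not exported: leading underscore
        pass

    # elsewhere: from mymetrics import *  ->  binds 'np' and 'difference',
    # but not '_helper'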