
Commit

Merge branch 'master' into dependabot/pip/jinja2-3.1.3
Signed-off-by: Samuel Hoffman <[email protected]>
hoffmansc authored Feb 21, 2024
2 parents c45f6fe + d2ba8c4 commit 251d112
Showing 23 changed files with 295 additions and 484 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
@@ -43,6 +43,7 @@ jobs:

       - name: Install dependencies
         run: |
+          sudo apt install libglpk-dev libxml2-dev
           python -m pip install --upgrade pip setuptools wheel
           pip install -e '.[all]'
           pip install flake8
32 changes: 10 additions & 22 deletions aif360/datasets/law_school_gpa_dataset.py
@@ -1,36 +1,24 @@
 import os
 import pandas as pd
+from sklearn.model_selection import train_test_split
 from aif360.datasets import RegressionDataset
-try:
-    import tempeh.configurations as tc
-except ImportError as error:
-    from logging import warning
-    warning("{}: LawSchoolGPADataset will be unavailable. To install, run:\n"
-            "pip install 'aif360[LawSchoolGPA]'".format(error))
+from aif360.sklearn.datasets.lawschool_dataset import LSAC_URL

 class LawSchoolGPADataset(RegressionDataset):
-    """Law School GPA dataset.
-    See https://github.com/microsoft/tempeh for details.
-    """
+    """Law School GPA dataset."""

     def __init__(self, dep_var_name='zfygpa',
-                 protected_attribute_names=['race'],
-                 privileged_classes=[['white']],
+                 protected_attribute_names=['race', 'gender'],
+                 privileged_classes=[['white'], ['male']],
                  instance_weights_name=None,
                  categorical_features=[],
                  na_values=[], custom_preprocessing=None,
                  metadata=None):
         """See :obj:`RegressionDataset` for a description of the arguments."""
-        dataset = tc.datasets["lawschool_gpa"]()
-        X_train,X_test = dataset.get_X(format=pd.DataFrame)
-        y_train, y_test = dataset.get_y(format=pd.Series)
-        A_train, A_test = dataset.get_sensitive_features(name='race',
-                                                         format=pd.Series)
-        all_train = pd.concat([X_train, y_train, A_train], axis=1)
-        all_test = pd.concat([X_test, y_test, A_test], axis=1)
-
-        df = pd.concat([all_train, all_test], axis=0)
+        df = pd.read_sas(LSAC_URL, encoding="utf-8")
+        df.race = df.race1.where(df.race1.isin(['black', 'white']))
+        df.gender = df.gender.fillna("female")
+        df = df[["race", "gender", "lsat", "ugpa", "zfygpa"]].dropna()
+        df = pd.concat(train_test_split(df, test_size=0.33, random_state=123))

         super(LawSchoolGPADataset, self).__init__(df=df,
             dep_var_name=dep_var_name,
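
For context, a minimal usage sketch of the updated class, not part of the diff (the attribute names are the standard aif360 StructuredDataset API; the first call downloads the LSAC file from LSAC_URL):

    from aif360.datasets import LawSchoolGPADataset

    # Loads LSAC data directly (no tempeh) and now exposes two protected
    # attributes: race (privileged: 'white') and gender (privileged: 'male').
    dataset = LawSchoolGPADataset()
    print(dataset.protected_attribute_names)  # expected: ['race', 'gender']
    print(dataset.label_names)                # expected: ['zfygpa']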
25 changes: 15 additions & 10 deletions aif360/metrics/ot_metric.py
@@ -1,7 +1,12 @@
 from typing import Union
 import pandas as pd
 import numpy as np
-import ot
+try:
+    import ot
+except ImportError as error:
+    from logging import warning
+    warning("{}: ot_distance will be unavailable. To install, run:\n"
+            "pip install 'aif360[OptimalTransport]'".format(error))
 from sklearn.preprocessing import LabelEncoder

 def _normalize(distribution1, distribution2):
@@ -17,7 +22,7 @@ def _normalize(distribution1, distribution2):
     extra = -np.minimum(np.min(distribution1), np.min(distribution2))
     distribution1 += extra
     distribution2 += extra
-
+
     total_of_distribution1 = np.sum(distribution1)
     if total_of_distribution1 != 0:
         distribution1 /= total_of_distribution1
@@ -75,10 +80,10 @@ def _evaluate(
     if prot_attr is None:
         initial_distribution, required_distribution, matrix_distance = _transform(ground_truth, classifier, cost_matrix)
         return ot.emd2(a=initial_distribution, b=required_distribution, M=matrix_distance, numItermax=num_iters)
-
+
     if not ground_truth.nunique() == 2:
         raise ValueError(f"Expected to have exactly 2 target values, got {ground_truth.nunique()}.")
-
+
     # Calculate EMD between ground truth distribution and distribution of each group
     emds = {}
     for sa_val in sorted(prot_attr.unique()):
@@ -137,7 +142,7 @@ def ot_distance(
    # Assert correct mode passed
    if mode not in ['binary', 'continuous', 'nominal', 'ordinal']:
        raise ValueError(f"Expected one of {['binary', 'continuous', 'nominal', 'ordinal']}, got {mode}.")
-
+
    # Assert correct types passed to ground_truth, classifier and prot_attr
    if not isinstance(ground_truth, (pd.Series, str)):
        raise TypeError(f"ground_truth: expected pd.Series or str, got {type(ground_truth)}")
@@ -148,17 +153,17 @@
        raise TypeError(f"classifier: expected pd.DataFrame for {mode} mode, got {type(classifier)}")
    if prot_attr is not None and not isinstance(prot_attr, (pd.Series, str)):
        raise TypeError(f"prot_attr: expected pd.Series or str, got {type(prot_attr)}")
-
+
    # Assert correct type passed to cost_matrix
    if cost_matrix is not None and not isinstance(cost_matrix, np.ndarray):
        raise TypeError(f"cost_matrix: expected numpy.ndarray, got {type(cost_matrix)}")
-
+
    # Assert scoring is "Wasserstein1"
    if not scoring == "Wasserstein1":
        raise ValueError(f"Scoring mode can only be \"Wasserstein1\", got {scoring}")
-
+
    grt = ground_truth.copy()
-
+
    if classifier is not None:
        cls = classifier.copy()
    if prot_attr is not None:
@@ -171,7 +176,7 @@
            sat.index = grt.index
    else:
        sat = None
-
+
    uniques = list(grt.unique())
    if mode == "binary":
        if len(uniques) > 2:
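
A usage sketch for ot_distance, not part of the diff; it requires POT (pip install 'aif360[OptimalTransport]'), and the keyword names follow the checks visible above, so treat the exact signature as an assumption:

    import pandas as pd
    from aif360.sklearn.metrics import ot_distance

    y_true = pd.Series([1, 0, 1, 1, 0, 0])
    y_pred = pd.Series([0.9, 0.2, 0.7, 0.8, 0.4, 0.1])
    sex = pd.Series(['m', 'f', 'm', 'f', 'm', 'f'])

    # With prot_attr given, _evaluate computes one earth mover's distance
    # (ot.emd2) per protected-attribute value and collects them per group.
    per_group = ot_distance(y_true, y_pred, prot_attr=sex, mode='binary')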
2 changes: 1 addition & 1 deletion aif360/sklearn/datasets/__init__.py
@@ -12,4 +12,4 @@
 from aif360.sklearn.datasets.openml_datasets import fetch_adult, fetch_german, fetch_bank
 from aif360.sklearn.datasets.compas_dataset import fetch_compas
 from aif360.sklearn.datasets.meps_datasets import fetch_meps
-from aif360.sklearn.datasets.tempeh_datasets import fetch_lawschool_gpa
+from aif360.sklearn.datasets.lawschool_dataset import fetch_lawschool_gpa
89 changes: 89 additions & 0 deletions aif360/sklearn/datasets/lawschool_dataset.py
@@ -0,0 +1,89 @@
+from io import BytesIO
+import os
+import urllib
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+from aif360.sklearn.datasets.utils import standardize_dataset, Dataset
+
+
+# cache location
+DATA_HOME_DEFAULT = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                                 '..', 'data', 'raw')
+LSAC_URL = "https://github.com/jkomiyama/fairregresion/raw/master/dataset/law/lsac.sas7bdat"
+
+def fetch_lawschool_gpa(subset="all", *, data_home=None, cache=True,
+                        binary_race=True, fillna_gender="female",
+                        usecols=["race", "gender", "lsat", "ugpa"],
+                        dropcols=None, numeric_only=False, dropna=True):
+    """Load the Law School GPA dataset.
+
+    Optionally binarizes 'race' to 'white' (privileged) or 'black' (unprivileged).
+    The other protected attribute is gender ('male' is privileged and 'female'
+    is unprivileged). The outcome variable is standardized first year GPA
+    ('zfygpa'). Note: this is a continuous variable, i.e., a regression task.
+
+    Args:
+        subset ({'train', 'test', or 'all'}, optional): Select the dataset to
+            load: 'train' for the training set, 'test' for the test set, 'all'
+            for both.
+        data_home (string, optional): Specify another download and cache folder
+            for the datasets. By default all AIF360 datasets are stored in
+            'aif360/sklearn/data/raw' subfolders.
+        cache (bool): Whether to cache downloaded datasets.
+        binary_race (bool, optional): Filter only white and black students.
+        fillna_gender (str or None, optional): Fill NA values for gender with
+            this value. If `None`, leave as NA. Note: this is used for backward-
+            compatibility with tempeh and may be dropped in later versions.
+        usecols (single label or list-like, optional): Feature column(s) to
+            keep. All others are dropped.
+        dropcols (single label or list-like, optional): Feature column(s) to
+            drop.
+        numeric_only (bool): Drop all non-numeric feature columns.
+        dropna (bool): Drop rows with NAs.
+
+    Returns:
+        namedtuple: Tuple containing X, y, and sample_weights for the Law School
+        GPA dataset accessible by index or name.
+    """
+    if subset not in {'train', 'test', 'all'}:
+        raise ValueError("subset must be either 'train', 'test', or 'all'; "
+                         "cannot be {}".format(subset))
+
+    cache_path = os.path.join(data_home or DATA_HOME_DEFAULT,
+                              os.path.basename(LSAC_URL))
+    if cache and os.path.isfile(cache_path):
+        df = pd.read_sas(cache_path, encoding="utf-8")
+    else:
+        data = urllib.request.urlopen(LSAC_URL).read()
+        if cache:
+            os.makedirs(os.path.dirname(cache_path), exist_ok=True)
+            with open(cache_path, 'wb') as f:
+                f.write(data)
+        df = pd.read_sas(BytesIO(data), format="sas7bdat", encoding="utf-8")
+
+    df.race = df.race1.astype('category')
+    if binary_race:
+        df.race = df.race.cat.set_categories(['black', 'white'], ordered=True)
+
+    # for backwards-compatibility with tempeh
+    if fillna_gender is not None:
+        df.gender = df.gender.fillna(fillna_gender)
+    df.gender = df.gender.astype('category').cat.set_categories(
+            ['female', 'male'], ordered=True)
+
+    ds = standardize_dataset(df, prot_attr=['race', 'gender'], target='zfygpa',
+                             usecols=usecols, dropcols=dropcols,
+                             numeric_only=numeric_only, dropna=dropna)
+
+    # for backwards-compatibility with tempeh
+    train_X, test_X, train_y, test_y = train_test_split(*ds, test_size=0.33, random_state=123)
+    if subset == "train":
+        return Dataset(train_X, train_y)
+    elif subset == "test":
+        return Dataset(test_X, test_y)
+    else:
+        X = pd.concat([train_X, test_X], axis=0)
+        y = pd.concat([train_y, test_y], axis=0)
+        return Dataset(X, y)
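
A usage sketch for the new loader, not part of the diff (assumes network access to LSAC_URL on first call; the .sas7bdat file is then cached under data_home):

    from aif360.sklearn.datasets import fetch_lawschool_gpa

    # The 67/33 split is fixed by random_state=123 for parity with tempeh.
    X_train, y_train = fetch_lawschool_gpa(subset='train')
    X_test, y_test = fetch_lawschool_gpa(subset='test')
    print(X_train.columns.tolist())  # expected: ['race', 'gender', 'lsat', 'ugpa']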
25 changes: 13 additions & 12 deletions aif360/sklearn/datasets/openml_datasets.py
@@ -168,19 +168,20 @@ def fetch_german(*, data_home=None, cache=True, binary_age=True, usecols=None,
                                dropcols=dropcols, numeric_only=numeric_only,
                                dropna=dropna)

-def fetch_bank(*, data_home=None, cache=True, binary_age=True, percent10=False, 
+def fetch_bank(*, data_home=None, cache=True, binary_age=True, percent10=False,
                usecols=None, dropcols=['duration'], numeric_only=False, dropna=False):
     """Load the Bank Marketing Dataset.
-    The protected attribute is 'age' (binarized by default as suggested by [#lequy22]:
-    age >= 25 and age <60 is considered privileged and age< 25 or age >= 60 unprivileged;
-    see the binary_age flag to keep this continuous). The outcome variable is 'deposit':
+    The protected attribute is 'age' (binarized by default as suggested by [#lequy22]_:
+    age >= 25 and age <60 is considered privileged and age< 25 or age >= 60 unprivileged;
+    see the binary_age flag to keep this continuous). The outcome variable is 'deposit':
     'yes' or 'no'.
-    References:
-        .. [#lequy22] Le Quy, Tai, et al. "A survey on datasets for fairness‐aware machine
-            learning." Wiley Interdisciplinary Reviews: Data Mining and Knowledge
-            Discovery 12.3 (2022): e1452.
+    References:
+        .. [#lequy22] `Le Quy, Tai, et al. "A survey on datasets for fairness-
+            aware machine learning." Wiley Interdisciplinary Reviews: Data Mining
+            and Knowledge Discovery 12.3 (2022): e1452.
+            <https://wires.onlinelibrary.wiley.com/doi/pdf/10.1002/widm.1452>`_

     Note:
         By default, the data is downloaded from OpenML. See the `bank-marketing
@@ -235,15 +236,15 @@ def fetch_bank(*, data_home=None, cache=True, binary_age=True, percent10=False,
         df[col] = df[col].cat.remove_categories('unknown')
     df.education = df.education.astype('category').cat.reorder_categories(
         ['primary', 'secondary', 'tertiary'], ordered=True)
-
+
     # binarize protected attribute (but not corresponding feature)
     age = (pd.cut(df.age, [0, 24, 60, 100], ordered=False,
-                  labels=[0, 1, 0] if numeric_only 
+                  labels=[0, 1, 0] if numeric_only
                   else ['<25 or >=60', '25-60', '<25 or >=60'])
            if binary_age else 'age')
-    age = age.cat.reorder_categories([0, 1] if numeric_only 
+    age = age.cat.reorder_categories([0, 1] if numeric_only
                                      else ['<25 or >=60', '25-60'])
-
+
     return standardize_dataset(df, prot_attr=[age], target='deposit',
                                usecols=usecols, dropcols=dropcols,
                                numeric_only=numeric_only, dropna=dropna)
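
A usage sketch of the binarization the docstring describes, not part of the diff (assumes the OpenML download succeeds; in aif360.sklearn, protected attributes become index levels of the returned X):

    from aif360.sklearn.datasets import fetch_bank

    X, y = fetch_bank()  # binary_age=True by default
    # expected categories: '25-60' (privileged) vs. '<25 or >=60' (unprivileged)
    print(X.index.get_level_values('age').unique())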
59 changes: 0 additions & 59 deletions aif360/sklearn/datasets/tempeh_datasets.py

This file was deleted.

14 changes: 10 additions & 4 deletions aif360/sklearn/inprocessing/infairness.py
@@ -1,10 +1,16 @@
-from inFairness import fairalgo
+try:
+    from inFairness import fairalgo
+    from skorch import NeuralNet
+    from skorch.dataset import unpack_data, Dataset as Dataset_
+    from skorch.utils import is_pandas_ndframe
+except ImportError as error:
+    from logging import warning
+    warning("{}: SenSeI and SenSR will be unavailable. To install, run:\n"
+            "pip install 'aif360[inFairness]'".format(error))
+    Dataset_ = NeuralNet = object
 from sklearn.preprocessing import LabelBinarizer
 from sklearn.utils.multiclass import type_of_target
 from sklearn.exceptions import NotFittedError
-from skorch import NeuralNet
-from skorch.dataset import unpack_data, Dataset as Dataset_
-from skorch.utils import is_pandas_ndframe


 class Dataset(Dataset_):
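
The fallback Dataset_ = NeuralNet = object is what keeps this module importable when the extras are missing: the classes below subclass those names, so they must be bound to something. A minimal sketch of the same pattern, with hypothetical names (heavy_lib, OptionalBase, FancyModel):

    try:
        from heavy_lib import OptionalBase
    except ImportError as error:
        from logging import warning
        warning('{}: FancyModel will be unavailable.'.format(error))
        OptionalBase = object  # placeholder so the class statement still works

    class FancyModel(OptionalBase):
        """Importable always; usable only if heavy_lib is installed."""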
24 changes: 0 additions & 24 deletions aif360/sklearn/metrics/metrics.py
@@ -17,30 +17,6 @@
 from aif360.detectors.mdss.MDSS import MDSS


-__all__ = [
-    # meta-metrics
-    'difference', 'ratio', 'intersection', 'one_vs_rest',
-    # scorer factory
-    'make_scorer',
-    # helpers
-    'num_samples', 'num_pos_neg',
-    'specificity_score', 'base_rate', 'selection_rate', 'smoothed_base_rate',
-    'smoothed_selection_rate', 'generalized_fpr', 'generalized_fnr',
-    # group fairness
-    'ot_distance', 'statistical_parity_difference', 'disparate_impact_ratio',
-    'equal_opportunity_difference', 'average_odds_difference', 'average_predictive_value_difference',
-    'average_odds_error', 'class_imbalance', 'kl_divergence',
-    'conditional_demographic_disparity', 'smoothed_edf',
-    'df_bias_amplification', 'mdss_bias_score',
-    # individual fairness
-    'generalized_entropy_index', 'generalized_entropy_error',
-    'between_group_generalized_entropy_error', 'theil_index',
-    'coefficient_of_variation', 'consistency_score',
-    # aliases
-    'sensitivity_score', 'mean_difference', 'false_negative_rate_error',
-    'false_positive_rate_error'
-]
-
 # ============================= META-METRICS ===================================
 def difference(func, y_true, y_pred=None, prot_attr=None, priv_group=1,
                sample_weight=None, **kwargs):
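
Note that with __all__ removed, from aif360.sklearn.metrics import * falls back to Python's default export rule: every module-level name without a leading underscore, including imported modules. A minimal illustration with a hypothetical module mymetrics.py:

    # mymetrics.py
    import numpy as np      # re-exported by * (no leading underscore)

    def difference():       # exported
        pass

    def _helper():          # not exported: leading underscore
        pass

    # elsewhere: from mymetrics import *  ->  binds 'np' and 'difference',
    # but not '_helper'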