Skip to content

Commit

Permalink
change source for lawschool gpa dataset
Browse files Browse the repository at this point in the history
* remove tempeh requirement

Signed-off-by: Samuel Hoffman <[email protected]>
  • Loading branch information
hoffmansc committed Feb 20, 2024
1 parent ab7e52a commit 6acbe7d
Show file tree
Hide file tree
Showing 8 changed files with 215 additions and 398 deletions.
32 changes: 10 additions & 22 deletions aif360/datasets/law_school_gpa_dataset.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,24 @@
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from aif360.datasets import RegressionDataset
try:
import tempeh.configurations as tc
except ImportError as error:
from logging import warning
warning("{}: LawSchoolGPADataset will be unavailable. To install, run:\n"
"pip install 'aif360[LawSchoolGPA]'".format(error))
from aif360.sklearn.datasets.lawschool_dataset import LSAC_URL

class LawSchoolGPADataset(RegressionDataset):
"""Law School GPA dataset.
See https://github.com/microsoft/tempeh for details.
"""
"""Law School GPA dataset."""

def __init__(self, dep_var_name='zfygpa',
protected_attribute_names=['race'],
privileged_classes=[['white']],
protected_attribute_names=['race', 'gender'],
privileged_classes=[['white'], ['male']],
instance_weights_name=None,
categorical_features=[],
na_values=[], custom_preprocessing=None,
metadata=None):
"""See :obj:`RegressionDataset` for a description of the arguments."""
dataset = tc.datasets["lawschool_gpa"]()
X_train,X_test = dataset.get_X(format=pd.DataFrame)
y_train, y_test = dataset.get_y(format=pd.Series)
A_train, A_test = dataset.get_sensitive_features(name='race',
format=pd.Series)
all_train = pd.concat([X_train, y_train, A_train], axis=1)
all_test = pd.concat([X_test, y_test, A_test], axis=1)

df = pd.concat([all_train, all_test], axis=0)
df = pd.read_sas(LSAC_URL, encoding="utf-8")
df.race = df.race1.where(df.race1.isin(['black', 'white']))
df.gender = df.gender.fillna("female")
df = df[["race", "gender", "lsat", "ugpa", "zfygpa"]].dropna()
df = pd.concat(train_test_split(df, test_size=0.33, random_state=123))

super(LawSchoolGPADataset, self).__init__(df=df,
dep_var_name=dep_var_name,
Expand Down
2 changes: 1 addition & 1 deletion aif360/sklearn/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@
from aif360.sklearn.datasets.openml_datasets import fetch_adult, fetch_german, fetch_bank
from aif360.sklearn.datasets.compas_dataset import fetch_compas
from aif360.sklearn.datasets.meps_datasets import fetch_meps
from aif360.sklearn.datasets.tempeh_datasets import fetch_lawschool_gpa
from aif360.sklearn.datasets.lawschool_dataset import fetch_lawschool_gpa
89 changes: 89 additions & 0 deletions aif360/sklearn/datasets/lawschool_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
from io import BytesIO
import os
import urllib

import pandas as pd
from sklearn.model_selection import train_test_split

from aif360.sklearn.datasets.utils import standardize_dataset, Dataset


# cache location
DATA_HOME_DEFAULT = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'..', 'data', 'raw')
LSAC_URL = "https://github.com/jkomiyama/fairregresion/raw/master/dataset/law/lsac.sas7bdat"

def fetch_lawschool_gpa(subset="all", *, data_home=None, cache=True,
binary_race=True, fillna_gender="female",
usecols=["race", "gender", "lsat", "ugpa"],
dropcols=None, numeric_only=False, dropna=True):
"""Load the Law School GPA dataset.
Optionally binarizes 'race' to 'white' (privileged) or 'black' (unprivileged).
The other protected attribute is gender ('male' is privileged and 'female'
is unprivileged). The outcome variable is standardized first year GPA
('zfygpa'). Note: this is a continuous variable, i.e., a regression task.
Args:
subset ({'train', 'test', or 'all'}, optional): Select the dataset to
load: 'train' for the training set, 'test' for the test set, 'all'
for both.
data_home (string, optional): Specify another download and cache folder
for the datasets. By default all AIF360 datasets are stored in
'aif360/sklearn/data/raw' subfolders.
cache (bool): Whether to cache downloaded datasets.
binary_race (bool, optional): Filter only white and black students.
fillna_gender (str or None, optional): Fill NA values for gender with
this value. If `None`, leave as NA. Note: this is used for backward-
compatibility with tempeh and may be dropped in later versions.
usecols (single label or list-like, optional): Feature column(s) to
keep. All others are dropped.
dropcols (single label or list-like, optional): Feature column(s) to
drop.
numeric_only (bool): Drop all non-numeric feature columns.
dropna (bool): Drop rows with NAs.
Returns:
namedtuple: Tuple containing X, y, and sample_weights for the Law School
GPA dataset accessible by index or name.
"""
if subset not in {'train', 'test', 'all'}:
raise ValueError("subset must be either 'train', 'test', or 'all'; "
"cannot be {}".format(subset))

cache_path = os.path.join(data_home or DATA_HOME_DEFAULT,
os.path.basename(LSAC_URL))
if cache and os.path.isfile(cache_path):
df = pd.read_sas(cache_path, encoding="utf-8")
else:
data = urllib.request.urlopen(LSAC_URL).read()
if cache:
os.makedirs(os.path.dirname (cache_path), exist_ok=True)
with open(cache_path, 'wb') as f:
f.write(data)
df = pd.read_sas(BytesIO(data), format="sas7bdat", encoding="utf-8")

df.race = df.race1.astype('category')
if binary_race:
df.race = df.race.cat.set_categories(['black', 'white'], ordered=True)

# for backwards-compatibility with tempeh
if fillna_gender is not None:
df.gender = df.gender.fillna(fillna_gender)
df.gender = df.gender.astype('category').cat.set_categories(
['female', 'male'], ordered=True)

ds = standardize_dataset(df, prot_attr=['race', 'gender'], target='zfygpa',
usecols=usecols, dropcols=dropcols,
numeric_only=numeric_only, dropna=dropna)

# for backwards-compatibility with tempeh
train_X, test_X, train_y, test_y = train_test_split(*ds, test_size=0.33, random_state=123)
if subset == "train":
return Dataset(train_X, train_y)
elif subset == "test":
return Dataset(test_X, test_y)
else:
X = pd.concat([train_X, test_X], axis=0)
y = pd.concat([train_y, test_y], axis=0)
return Dataset(X, y)
59 changes: 0 additions & 59 deletions aif360/sklearn/datasets/tempeh_datasets.py

This file was deleted.

Loading

0 comments on commit 6acbe7d

Please sign in to comment.