Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove requests dependency #519

Merged
merged 2 commits (source and target branch names missing from this capture)
Feb 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 11 additions & 16 deletions aif360/sklearn/datasets/meps_datasets.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
from io import BytesIO
import os
from zipfile import ZipFile
import urllib

import pandas as pd
import requests

from aif360.sklearn.datasets.utils import standardize_dataset

@@ -59,25 +58,21 @@ def fetch_meps(panel, *, accept_terms=None, data_home=None, cache=True,
if panel not in {19, 20, 21}:
raise ValueError("only panels 19, 20, and 21 are currently supported.")

fname = 'h192' if panel == 21 else 'h181'
cache_path = os.path.join(data_home or DATA_HOME_DEFAULT, fname + '.csv')
fname = 'h192ssp.zip' if panel == 21 else 'h181ssp.zip'
cache_path = os.path.join(data_home or DATA_HOME_DEFAULT, fname)
if cache and os.path.isfile(cache_path):
df = pd.read_csv(cache_path)
df = pd.read_sas(cache_path, format="xport", encoding="utf-8")
else:
# skip prompt if user chooses
accept = accept_terms or input(PROMPT)
if accept != 'y' and accept != True:
if accept != 'y' and accept is not True:
raise PermissionError("Terms not agreed.")
rawz = requests.get(os.path.join(MEPS_URL, fname + 'ssp.zip')).content
with ZipFile(BytesIO(rawz)) as zf:
with zf.open(fname + '.ssp') as ssp:
df = pd.read_sas(ssp, format='xport')
# TODO: does this cause any differences?
# reduce storage size
df = df.apply(pd.to_numeric, errors='ignore', downcast='integer')
if cache:
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
df.to_csv(cache_path, index=None)
rawz = urllib.request.urlopen(os.path.join(MEPS_URL, fname)).read()
df = pd.read_sas(BytesIO(rawz), format='xport', encoding="utf-8", compression="zip")
if cache:
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
with open(cache_path, "wb") as f:
f.write(rawz)
# restrict to correct panel
df = df[df['PANEL'] == panel]
# change all 15s to 16s if panel == 21
Expand Down
7 changes: 4 additions & 3 deletions tests/sklearn/test_datasets.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from functools import partial

import numpy as np
from numpy.testing import assert_array_equal
from numpy.testing import assert_array_equal, assert_allclose
import pandas as pd
from pandas.api.types import is_numeric_dtype
from pandas.testing import assert_frame_equal
@@ -233,7 +233,7 @@ def test_cache_meps(panel):
meps_raw = fetch_meps(panel, cache=False, accept_terms=True)[0]
fetch_meps(panel, cache=True, accept_terms=True)
meps_cached = fetch_meps(panel, cache=True)[0]
assert_frame_equal(meps_raw, meps_cached, check_dtype=False, check_categorical=False)
assert_frame_equal(meps_raw, meps_cached)
assert_array_equal(meps_raw.to_numpy(), meps_cached.to_numpy())

@pytest.mark.parametrize(
@@ -254,7 +254,8 @@ def test_meps_matches_old(panel, cls):
assert len(meps) == 3
meps.X.RACE = meps.X.RACE.factorize(sort=True)[0]
MEPS = cls()
assert_array_equal(pd.get_dummies(meps.X.drop(columns=educols)), MEPS.features)
assert_allclose(pd.get_dummies(meps.X.drop(columns=educols)).astype(float),
MEPS.features, atol=1e-16)
assert_array_equal(meps.y.factorize(sort=True)[0], MEPS.labels.ravel())

@pytest.mark.parametrize("panel", [19, 20, 21])
Expand Down
Loading