diff --git a/aif360/sklearn/datasets/meps_datasets.py b/aif360/sklearn/datasets/meps_datasets.py
index 1f148bb7..92b1c8c3 100644
--- a/aif360/sklearn/datasets/meps_datasets.py
+++ b/aif360/sklearn/datasets/meps_datasets.py
@@ -1,9 +1,8 @@
 from io import BytesIO
 import os
-from zipfile import ZipFile
+import urllib
 
 import pandas as pd
-import requests
 
 from aif360.sklearn.datasets.utils import standardize_dataset
 
@@ -59,25 +58,21 @@ def fetch_meps(panel, *, accept_terms=None, data_home=None, cache=True,
     if panel not in {19, 20, 21}:
         raise ValueError("only panels 19, 20, and 21 are currently supported.")
 
-    fname = 'h192' if panel == 21 else 'h181'
-    cache_path = os.path.join(data_home or DATA_HOME_DEFAULT, fname + '.csv')
+    fname = 'h192ssp.zip' if panel == 21 else 'h181ssp.zip'
+    cache_path = os.path.join(data_home or DATA_HOME_DEFAULT, fname)
     if cache and os.path.isfile(cache_path):
-        df = pd.read_csv(cache_path)
+        df = pd.read_sas(cache_path, format="xport", encoding="utf-8")
     else:
         # skip prompt if user chooses
         accept = accept_terms or input(PROMPT)
-        if accept != 'y' and accept != True:
+        if accept != 'y' and accept is not True:
             raise PermissionError("Terms not agreed.")
-        rawz = requests.get(os.path.join(MEPS_URL, fname + 'ssp.zip')).content
-        with ZipFile(BytesIO(rawz)) as zf:
-            with zf.open(fname + '.ssp') as ssp:
-                df = pd.read_sas(ssp, format='xport')
-        # TODO: does this cause any differences?
-        # reduce storage size
-        df = df.apply(pd.to_numeric, errors='ignore', downcast='integer')
-        if cache:
-            os.makedirs(os.path.dirname(cache_path), exist_ok=True)
-            df.to_csv(cache_path, index=None)
+        rawz = urllib.request.urlopen(os.path.join(MEPS_URL, fname)).read()
+        df = pd.read_sas(BytesIO(rawz), format='xport', encoding="utf-8", compression="zip")
+        if cache:
+            os.makedirs(os.path.dirname(cache_path), exist_ok=True)
+            with open(cache_path, "wb") as f:
+                f.write(rawz)
     # restrict to correct panel
     df = df[df['PANEL'] == panel]
     # change all 15s to 16s if panel == 21
diff --git a/tests/sklearn/test_datasets.py b/tests/sklearn/test_datasets.py
index 97c08279..3ef86172 100644
--- a/tests/sklearn/test_datasets.py
+++ b/tests/sklearn/test_datasets.py
@@ -1,7 +1,7 @@
 from functools import partial
 
 import numpy as np
-from numpy.testing import assert_array_equal
+from numpy.testing import assert_array_equal, assert_allclose
 import pandas as pd
 from pandas.api.types import is_numeric_dtype
 from pandas.testing import assert_frame_equal
@@ -233,7 +233,7 @@ def test_cache_meps(panel):
     meps_raw = fetch_meps(panel, cache=False, accept_terms=True)[0]
     fetch_meps(panel, cache=True, accept_terms=True)
     meps_cached = fetch_meps(panel, cache=True)[0]
-    assert_frame_equal(meps_raw, meps_cached, check_dtype=False, check_categorical=False)
+    assert_frame_equal(meps_raw, meps_cached)
     assert_array_equal(meps_raw.to_numpy(), meps_cached.to_numpy())
 
 @pytest.mark.parametrize(
@@ -254,7 +254,8 @@ def test_meps_matches_old(panel, cls):
     assert len(meps) == 3
     meps.X.RACE = meps.X.RACE.factorize(sort=True)[0]
     MEPS = cls()
-    assert_array_equal(pd.get_dummies(meps.X.drop(columns=educols)), MEPS.features)
+    assert_allclose(pd.get_dummies(meps.X.drop(columns=educols)).astype(float),
+                    MEPS.features, atol=1e-16)
     assert_array_equal(meps.y.factorize(sort=True)[0], MEPS.labels.ravel())
 
 @pytest.mark.parametrize("panel", [19, 20, 21])