"""Prepare datasets
Script to download and save datasets from PMLB, so they can be used in the experimental pipeline.
Usage: python -m prepare_datasets --help
"""
import argparse
import pathlib

import pandas as pd
import pmlb
import tqdm

import data_handling

# Manually defined by looking for similar dataset names and dataset properties on the website
# https://epistasislab.github.io/pmlb/index.html
DUPLICATE_DATASETS = ['agaricus_lepiota', 'breast_cancer_wisconsin', 'buggyCrx', 'colic', 'crx',
                      'german', 'Hill_Valley_without_noise', 'kr_vs_kp', 'vote']


# Main routine: download, pre-process, and save (to "data_dir") datasets from PMLB.
def prepare_datasets(data_dir: pathlib.Path) -> None:
    if not data_dir.is_dir():
        print('Dataset directory does not exist. Creating it.')
        data_dir.mkdir(parents=True)
    if any(data_dir.iterdir()):
        print('Dataset directory is not empty. Files might be overwritten, but not deleted.')
    # Get an overview of datasets and filter it:
    dataset_overview = pmlb.dataset_lists.df_summary
    dataset_overview = dataset_overview[
        (dataset_overview['task'] == 'classification') &
        (dataset_overview['n_classes'] == 2) &
        (dataset_overview['n_instances'] >= 100) &
        (dataset_overview['n_features'] >= 15) &
        (dataset_overview['n_features'] <= 200)
    ]  # filtering steps described in the paper
    assert pd.Series(DUPLICATE_DATASETS).isin(dataset_overview['dataset']).all()  # check for typos
    dataset_overview = dataset_overview[~dataset_overview['dataset'].isin(DUPLICATE_DATASETS)]
    assert len(dataset_overview) == 30  # if this changes, the paper would need to be adapted as well
    data_handling.save_dataset_overview(dataset_overview=dataset_overview, directory=data_dir)
    # Save the individual datasets:
    print('Downloading and saving datasets ...')
    for dataset_name in tqdm.tqdm(dataset_overview['dataset']):
        dataset = pmlb.fetch_data(dataset_name=dataset_name, dropna=False)
        assert dataset.notna().all().all()  # the chosen datasets should not contain missing values
        data_handling.save_dataset(X=dataset.drop(columns='target'), y=dataset['target'],
                                   dataset_name=dataset_name, directory=data_dir)
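
# For illustration, the PMLB access pattern used above, applied to a single
# dataset (a minimal sketch, not part of the pipeline; "mushroom" is just an
# example PMLB dataset name, and PMLB stores the labels in a "target" column):
#     dataset = pmlb.fetch_data(dataset_name='mushroom', dropna=False)
#     X, y = dataset.drop(columns='target'), dataset['target']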


# Parse some command-line arguments and run the main routine.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Retrieves datasets from PMLB, prepares them for the experiment pipeline, and '
                    'stores them in the specified directory.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-d', '--directory', type=pathlib.Path, default='data/datasets/',
                        dest='data_dir', help='Directory to store prediction datasets.')
    print('Dataset preparation started.')
    prepare_datasets(**vars(parser.parse_args()))
    print('Datasets prepared and saved.')