-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathdataset_mapping.py
121 lines (120 loc) · 5.05 KB
/
dataset_mapping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
def get_dataset_features(dataset):
"""
Takes in the name of a dataset and returns the relevant parameters
"""
groups_to_drop = []
if dataset == 'COMPAS':
path = 'datasets/compas-scores-two-years.csv'
label = 'two_year_recid' # binary
group = 'race' # 'sex' also an option
usable_features = ['race', 'age', 'sex', 'priors_count', 'c_charge_degree', 'juv_fel_count']
categorical_columns = []
groups_to_drop = ['race@Asian', 'race@Native American']
is_categorical = True
elif dataset == 'COMPAS_full':
path = 'datasets/compas-scores-two-years.csv'
label = 'two_year_recid' # binary
group = 'race' # 'sex' also an option
usable_features = ['race', 'age', 'sex', 'priors_count', 'c_charge_degree', 'juv_fel_count']
categorical_columns = []
is_categorical = True
elif dataset == 'COMPAS_race_and_gender':
path = 'datasets/compas-scores-two-years.csv'
label = 'two_year_recid' # binary
group = ['race', 'sex']
usable_features = ['race', 'age', 'sex', 'priors_count', 'c_charge_degree', 'juv_fel_count']
categorical_columns = []
groups_to_drop = ['race@Asian', 'race@Native American']
is_categorical = True
elif dataset == 'Default':
path = 'datasets/default.csv'
label = 'default payment next month' # binary
group = 'SEX'
usable_features = None # all
categorical_columns = ['SEX', 'EDUCATION', 'MARRIAGE']
is_categorical = True
elif dataset == 'Communities':
path = 'datasets/communities_cleaned.csv'
label = 'ViolentCrimesPerPop'
group = 'pluralityRace'
usable_features = None # all
categorical_columns = []
is_categorical = False
elif dataset == 'Adult':
path = 'datasets/adult_cleaned.csv'
label = 'income' # binary <50k or not
group = 'race'
usable_features = None # all
categorical_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship',
'race', 'sex', 'native-country']
is_categorical = True
elif dataset == 'Student':
path = 'datasets/student-mat.csv'
label = 'G3' # binary
group = 'sex' # TBD
usable_features = None # all
categorical_columns = ['Medu', 'Fedu', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absenses']
is_categorical = False
elif dataset == 'Bike':
path = 'datasets/SeoulBikeData.csv'
label = 'Rented Bike Count'
group = 'Seasons'
usable_features = ['Seasons', 'Hour', 'Temperature(C)', 'Humidity(%)', 'Wind speed (m/s)', 'Visibility (10m)',
'Dew point temperature(C)', 'Solar Radiation (MJ/m2)',
'Rainfall(mm)', 'Snowfall (cm)', 'Seasons', 'Holiday', 'Functioning Day']
categorical_columns = ['Seasons', 'Holiday', 'Functioning']
is_categorical = False
elif dataset == 'Credit':
path = 'datasets/german_cleaned.csv'
label = 'Creditability'
group = 'Sex & Marital Status'
usable_features = None # all
categorical_columns = []
is_categorical = True
elif dataset == 'Fires':
path = 'datasets/forestfires.csv'
label = 'area'
group = 'month'
usable_features = ['X', 'Y', 'month', 'day', 'FFMC', 'DMC', 'DC', 'ISI', 'temp',
'RH', 'wind', 'rain', 'area']
categorical_columns = ['day', 'month'] # non-linear (but monotone) numerical columns on scale such as 1-4
is_categorical = False
elif dataset == 'Heart':
path = 'datasets/heart_failure_clinical_records_dataset.csv'
label = 'DEATH_EVENT'
group = 'sex'
usable_features = None
categorical_columns = []
is_categorical = False
elif dataset == 'Marketing(Full)':
path = 'datasets/bank-full.csv'
label = 'y'
group = 'job'
usable_features = None
categorical_columns = ['job', 'marital', 'education', 'contact', 'month', 'poutcome']
is_categorical = True
elif dataset == 'Marketing(Small)':
path = 'datasets/bank.csv'
label = 'y'
group = 'job'
usable_features = None
categorical_columns = ['job', 'marital', 'education', 'contact', 'month', 'poutcome']
is_categorical = True
elif dataset == 'Wine':
path = 'datasets/winequality-full.csv'
label = 'quality'
group = 'color'
usable_features = None
categorical_columns = ['color']
is_categorical = False
# Use synthetic data if any other string is passed in
else:
path = ''
label = None
group = None
usable_features = None
categorical_columns = []
is_categorical = True
return path, label, group, usable_features, categorical_columns, groups_to_drop, is_categorical