comparison_experiment.py
from sklearn.datasets import fetch_openml
from sklearn.model_selection import KFold
from tqdm import tqdm
import multiprocessing as mp
import numpy as np
import pandas as pd
import time
from comparison_algorithms import rfe, sfs, sfm_svc, sfm_logistic_regression, sfm_random_forest
from utils import get_score, add_testing_score
import approaches
import config
def init_progress_bar():
""" Initialize progress bar (calculate number of steps).
Return: progressbar object
"""
# calculate number of datasets
number_datasets_classification = 0
    for task, datasets in config.data_ids.items():
        if task == "classification":
            for _, flag in datasets.items():
                if flag:
                    number_datasets_classification += 1
        else:
            raise ValueError(
                "Only classification datasets are supported currently!")
# calculate number of estimators/metrics
number_classification_estimators = 0
    for _, metrics in approaches.classification_estimators.items():
        for _, _ in metrics.items():
            number_classification_estimators += 1
# calculate number of dataset/estimator combinations
number_datasets_and_estimators = (number_datasets_classification * number_classification_estimators) * config.n_splits
# calculate number of bayesian approaches
#number_of_bayesian = (len(approaches.discretization_methods) * len(approaches.acquisition_functions)) * (
# (len(approaches.learning_methods)-1) + len(approaches.kernels)) # only gaussian processes use kernels
# 1. integer, n_highest, probabilistic_round and round together with GP matern and RBF kernels
# 2. categorical with GP Hamming kernel and random forest
number_of_bayesian = len(approaches.acquisition_functions) * ((4 * 2) + (2))
# calculate number of comparison approaches
number_of_comparison = 0
for _, approach in approaches.comparison_approaches.items():
for _, _ in approach.items():
number_of_comparison += 1
# calculate number of runs with different number of features
    feature_runs = (config.max_nr_features - config.min_nr_features) // \
        config.iter_step_nr_features + 1
# calculate total progress bar steps
progress_total = feature_runs * number_datasets_and_estimators * \
(number_of_bayesian + number_of_comparison + 1) # also consider training without feature selection (+1)
pbar = tqdm(total=progress_total)
pbar.set_description("Processed")
return pbar
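# Illustrative example of the progress-bar arithmetic above (hypothetical
# values, not the ones shipped in config.py / approaches.py): with 3 active
# classification datasets, 2 estimators x 2 metrics, n_splits = 5,
# 3 acquisition functions, 5 comparison algorithm combinations,
# min_nr_features = 2, max_nr_features = 10, iter_step_nr_features = 2:
#   number_datasets_and_estimators = (3 * 4) * 5 = 60
#   number_of_bayesian = 3 * ((4 * 2) + 2) = 30
#   feature_runs = (10 - 2) // 2 + 1 = 5
#   progress_total = 5 * 60 * (30 + 5 + 1) = 10800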
def __run_all_bayesian(data_training, data_test, target_training, target_test, estimator, metric, n_calls, queue):
""" Run all bayesian optimization approaches with all possible parameters.
Keyword arguments:
data_training -- feature matrix of training data
data_test -- feature matrix of test data
target_training -- target vector of training data
target_test -- target vector of test data
estimator -- estimator used to predict target-values
metric -- metric used to calculate score
n_calls -- number of iterations in bayesian optimization
queue -- queue to synchronize progress bar
Return: Dataframe of all possible bayesian results (including testing scores)
"""
    # Define result dataframe
df_results = pd.DataFrame(
columns=approaches.bay_opt_parameters+["Duration Black Box", "Duration Overhead", "Number of Iterations", "Vector", "Training Score"])
for algo, algo_descr in approaches.bayesian_approaches.items():
for learn, learn_descr in approaches.learning_methods.items():
for discr, discr_descr in approaches.discretization_methods.items():
if discr == "binary" or discr == "categorical":
                    # always sample points with exactly n_features selected features
acq_optimizer = "n_sampling"
else:
acq_optimizer = "sampling"
for acq, _ in approaches.acquisition_functions.items():
if learn == "GP":
for kernel, kernel_descr in approaches.kernels.items():
# kernels only apply for gaussian processes
if kernel == "HAMMING":
                                # Hamming kernel is meant for categorical/binary search spaces; only categorical is used here
if discr == "categorical":
for n_features in range(config.min_nr_features, config.max_nr_features+1, config.iter_step_nr_features):
vector, score, nr_iters, black_box_duration, overhead_duration = algo(data=data_training, target=target_training, learning_method=learn,
kernel=kernel, discretization_method=discr, n_features=n_features, estimator=estimator, acq_func=acq, metric=metric, n_calls=n_calls, cross_validation=config.n_splits_bay_opt, acq_optimizer=acq_optimizer, n_convergence=config.n_convergence, n_acq_points=config.n_acq_points)
df_results.loc[len(df_results)] = [
algo_descr, learn_descr, kernel_descr, discr_descr, acq, n_features, black_box_duration, overhead_duration, nr_iters, vector, score]
queue.put(1) # increase progress bar
else:
# Matern and RBF kernels for all discretization methods except categorical
if discr != "categorical":
for n_features in range(config.min_nr_features, config.max_nr_features+1, config.iter_step_nr_features):
vector, score, nr_iters, black_box_duration, overhead_duration = algo(data=data_training, target=target_training, learning_method=learn,
kernel=kernel, discretization_method=discr, n_features=n_features, estimator=estimator, acq_func=acq, metric=metric, n_calls=n_calls, cross_validation=config.n_splits_bay_opt, acq_optimizer=acq_optimizer, n_convergence=config.n_convergence, n_acq_points=config.n_acq_points)
df_results.loc[len(df_results)] = [
algo_descr, learn_descr, kernel_descr, discr_descr, acq, n_features, black_box_duration, overhead_duration, nr_iters, vector, score]
queue.put(1) # increase progress bar
else:
# random forests only in categorical search space
if discr == "categorical":
for n_features in range(config.min_nr_features, config.max_nr_features+1, config.iter_step_nr_features):
vector, score, nr_iters, black_box_duration, overhead_duration = algo(data=data_training, target=target_training, learning_method=learn,
discretization_method=discr, estimator=estimator, acq_func=acq, metric=metric, n_features=n_features, n_calls=n_calls, cross_validation=config.n_splits_bay_opt, acq_optimizer=acq_optimizer, n_convergence=config.n_convergence, n_acq_points=config.n_acq_points)
df_results.loc[len(df_results)] = [
algo_descr, learn_descr, "-", discr_descr, acq, n_features, black_box_duration, overhead_duration, nr_iters, vector, score]
queue.put(1) # increase progress bar
# generate test scores
df_results_with_test_scores = add_testing_score(
data_training, data_test, target_training, target_test, df_results, estimator, metric)
# add column with number of selected features
df_results_with_test_scores["Actual Features"] = df_results_with_test_scores.apply(
lambda row: sum(row["Vector"]), axis=1)
# add estimator and metric
df_results_with_test_scores["Estimator"] = estimator
df_results_with_test_scores["Metric"] = metric
    # convert int to np.int64 to be able to aggregate
df_results_with_test_scores["Number of Iterations"] = df_results_with_test_scores.apply(lambda row: np.int64(row["Number of Iterations"]), axis=1)
return df_results_with_test_scores
def __run_all_comparison(data_training, data_test, target_training, target_test, estimator, metric):
""" Run all comparison approaches with all possible algorithms and parameters.
Keyword arguments:
data_training -- feature matrix of training data
data_test -- feature matrix of test data
target_training -- target vector of training data
target_test -- target vector of test data
estimator -- estimator used to predict target-values
metric -- metric used to calculate score
Return: Dataframe of all possible comparison results (including testing scores)
"""
# Define result dataframe
df_results = pd.DataFrame(
columns=approaches.comparison_parameters+["Duration", "Vector", "Training Score"])
for approach, approach_descr in approaches.comparison_approaches.items():
for algo, algo_descr in approach_descr.items():
for n_features in range(config.min_nr_features, config.max_nr_features+1, config.iter_step_nr_features):
start_time = time.time()
vector = algo(data=data_training, target=target_training,
n_features=n_features, estimator=estimator)
duration = time.time() - start_time
score = get_score(data_training, data_training, target_training, target_training, vector,
estimator, metric)
df_results.loc[len(df_results)] = [
approach, algo_descr, n_features, duration, vector, score]
# generate test scores
df_results_with_test_scores = add_testing_score(
data_training, data_test, target_training, target_test, df_results, estimator, metric)
# add estimator and metric
df_results_with_test_scores["Estimator"] = estimator
df_results_with_test_scores["Metric"] = metric
return df_results_with_test_scores
def __run_without_fs(data_training, data_test, target_training, target_test, estimator, metric):
""" Train model withoud feature selection.
Keyword arguments:
data_training -- feature matrix of training data
data_test -- feature matrix of test data
target_training -- target vector of training data
target_test -- target vector of test data
estimator -- estimator used to predict target-values
metric -- metric used to calculate score
Return: Dataframe of the result scores without any feature selection
"""
df_results = pd.DataFrame(columns=["Training Score", "Testing Score"])
    vector = [1] * len(data_training.columns)
training_score = get_score(data_training, data_training, target_training, target_training, vector,
estimator, metric)
testing_score = get_score(data_training, data_test, target_training, target_test, vector,
estimator, metric)
df_results.loc[len(df_results)] = [training_score, testing_score]
df_results["Estimator"] = estimator
df_results["Metric"] = metric
return df_results
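# For reference, get_score (imported from utils) is called throughout this file
# as get_score(data_train, data_eval, target_train, target_eval, vector,
# estimator, metric). A minimal sketch of the assumed semantics (the actual
# implementation lives in utils.py and may differ):
#
#     def get_score(data_train, data_eval, target_train, target_eval,
#                   vector, estimator, metric):
#         # keep only the columns whose entry in the 0/1 vector is set
#         selected = [col for col, keep in zip(data_train.columns, vector) if keep]
#         model = build_estimator(estimator)                # hypothetical factory
#         model.fit(data_train[selected], target_train)
#         predictions = model.predict(data_eval[selected])
#         return compute_metric(metric, target_eval, predictions)   # hypothetical scorer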
def __run_bay_and_add_test_score(algo, algo_descr, learn_descr, kernel_descr, discr_descr, acq, data_training, data_test, target_training, target_test, learning_method, kernel, discretization_method, n_features, estimator, acq_func, metric, n_calls, cross_validation, acq_optimizer, n_convergence, n_acq_points):
""" Actual run of bayesian optimization algorithm. Adds test scores and combines all into a dataframe.
Keyword arguments:
algo -- bayesian optimization algorithm
algo_descr, learn_descr, kernel_descr, discr_descr, acq -- descriptions of algorithm
data_training -- feature matrix of training data
data_test -- feature matrix of test data
target_training -- target vector of training data
target_test -- target vector of test data
learning_method -- model used for Bayesian optimization (GP or RF)
kernel -- kernel type for Gaussian processes
discretization_method -- define method on how to work with search space
n_features -- number of features to be selected; if 'None' then best result is selected
estimator -- estimator used to determine score
    acq_func -- acquisition function to be used
    metric -- metric used to calculate the score
    n_calls -- maximum number of iterations
    cross_validation -- number of folds for cross-validation inside the bayesian optimization
    acq_optimizer -- strategy to sample points of the acquisition function
n_convergence -- stop optimization if for n_convergence iterations the optimum did not change
n_acq_points -- number of points of the acquisition function to evaluate in each iteration
Return: dataframe with results
"""
# run algorithm
vector, score, nr_iters, black_box_duration, overhead_duration = algo(data=data_training, target=target_training, learning_method=learning_method, kernel=kernel, discretization_method=discretization_method, n_features=n_features, estimator=estimator, acq_func=acq_func, metric=metric, n_calls=n_calls, cross_validation=cross_validation, acq_optimizer=acq_optimizer, n_convergence=n_convergence, n_acq_points=n_acq_points)
# create dataframe
df_results = pd.DataFrame(
columns=approaches.bay_opt_parameters+["Duration Black Box", "Duration Overhead", "Number of Iterations", "Vector", "Training Score"])
df_results.loc[len(df_results)] = [algo_descr, learn_descr, kernel_descr, discr_descr, acq, n_features, black_box_duration, overhead_duration, nr_iters, vector, score]
df_results_with_test_scores = add_testing_score(data_training, data_test, target_training, target_test, df_results, estimator, metric)
# add estimator and metric
df_results_with_test_scores["Estimator"] = estimator
df_results_with_test_scores["Metric"] = metric
    # convert int to np.int64 to be able to aggregate
df_results_with_test_scores["Number of Iterations"] = df_results_with_test_scores.apply(lambda row: np.int64(row["Number of Iterations"]), axis=1)
return df_results_with_test_scores
def __run_all_bayesian_mp(data_training, data_test, target_training, target_test, estimator, metric, n_calls, mp_results, pool, dataset_id):
""" Run all Bayesian optimization approaches and add tasks to multiprocessing pool.
Keyword arguments:
data_training -- feature matrix of training data
data_test -- feature matrix of test data
target_training -- target vector of training data
target_test -- target vector of test data
estimator -- estimator used to predict target-values
metric -- metric used to calculate score
n_calls -- maximum number of iterations of bayesian optimization
mp_results -- result array
pool -- multiprocessing pool
dataset_id -- id of dataset (to identify results)
"""
for algo, algo_descr in approaches.bayesian_approaches.items():
for learn, learn_descr in approaches.learning_methods.items():
for discr, discr_descr in approaches.discretization_methods.items():
if discr == "binary" or discr == "categorical":
                    # always sample points with exactly n_features selected features
acq_optimizer = "n_sampling"
else:
acq_optimizer = "sampling"
for acq, _ in approaches.acquisition_functions.items():
if learn == "GP":
for kernel, kernel_descr in approaches.kernels.items():
# kernels only apply for gaussian processes
if kernel == "HAMMING":
                                # Hamming kernel is meant for categorical/binary search spaces; only categorical is used here
if discr == "categorical":
for n_features in range(config.min_nr_features, config.max_nr_features+1, config.iter_step_nr_features):
mp_results.append(("bayopt", dataset_id, (pool.apply_async(__run_bay_and_add_test_score, [], {"algo": algo, "algo_descr":algo_descr, "learn_descr":learn_descr, "kernel_descr":kernel_descr, "discr_descr":discr_descr, "acq":acq, "data_training":data_training, "data_test": data_test, "target_training":target_training, "target_test":target_test, "learning_method":learn, "kernel":kernel, "discretization_method":discr, "n_features":n_features, "estimator":estimator, "acq_func":acq, "metric":metric, "n_calls":n_calls, "cross_validation":config.n_splits_bay_opt, "acq_optimizer":acq_optimizer, "n_convergence":config.n_convergence, "n_acq_points":config.n_acq_points}))))
else:
# Matern and RBF kernels for all discretization methods except categorical
if discr != "categorical":
for n_features in range(config.min_nr_features, config.max_nr_features+1, config.iter_step_nr_features):
mp_results.append(("bayopt", dataset_id, (pool.apply_async(__run_bay_and_add_test_score, [], {"algo": algo, "algo_descr":algo_descr, "learn_descr":learn_descr, "kernel_descr":kernel_descr, "discr_descr":discr_descr, "acq":acq, "data_training":data_training, "data_test": data_test, "target_training":target_training, "target_test":target_test, "learning_method":learn, "kernel":kernel, "discretization_method":discr, "n_features":n_features, "estimator":estimator, "acq_func":acq, "metric":metric, "n_calls":n_calls, "cross_validation":config.n_splits_bay_opt, "acq_optimizer":acq_optimizer, "n_convergence":config.n_convergence, "n_acq_points":config.n_acq_points}))))
else:
# random forests only in categorical search space
if discr == "categorical":
for n_features in range(config.min_nr_features, config.max_nr_features+1, config.iter_step_nr_features):
mp_results.append(("bayopt", dataset_id, (pool.apply_async(__run_bay_and_add_test_score, [], {"algo": algo, "algo_descr":algo_descr, "learn_descr":learn_descr, "kernel_descr":"-", "discr_descr":discr_descr, "acq":acq, "data_training":data_training, "data_test": data_test, "target_training":target_training, "target_test":target_test, "learning_method":learn, "kernel":None, "discretization_method":discr, "n_features":n_features, "estimator":estimator, "acq_func":acq, "metric":metric, "n_calls":n_calls, "cross_validation":config.n_splits_bay_opt, "acq_optimizer":acq_optimizer, "n_convergence":config.n_convergence, "n_acq_points":config.n_acq_points}))))
def experiment_all_datasets_and_estimators():
""" Runs an experiment involving all bayesian/comparison approaches with all possible parameters, datasets, estimators and metrics.
"""
pool = mp.Pool(processes=config.n_processes)
mp_results = []
# run all datasets
for task, dataset in config.data_ids.items():
for dataset_id, flag in dataset.items():
            if flag:
# Import dataset
data, target = fetch_openml(
data_id=dataset_id, return_X_y=True, as_frame=True)
for element in config.drop_list:
if element in data.columns.values:
data = data.drop(element, axis=1)
# run cross-validation
kf = KFold(n_splits=config.n_splits, shuffle=True).split(data)
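                # KFold.split yields positional indices; indexing with .loc below
                # assumes the OpenML frame keeps its default RangeIndex, so label-
                # and position-based lookups coincide.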
for train_index, test_index in kf:
for estimator, metrics in approaches.classification_estimators.items():
for metric, _ in metrics.items():
# run approaches
__run_all_bayesian_mp(data_training=data.loc[train_index], data_test=data.loc[test_index], target_training=target.loc[train_index], target_test=target.loc[test_index], estimator=estimator, metric=metric, n_calls=config.n_calls, mp_results=mp_results, pool=pool, dataset_id=dataset_id)
#mp_results.append(("bayopt", dataset_id, pool.apply_async(__run_all_bayesian, [], {"data_training":data.loc[train_index], "data_test":data.loc[test_index], "target_training":target.loc[train_index], "target_test":target.loc[test_index], "estimator":estimator, "metric":metric, "n_calls":config.n_calls, "queue":queue})))
mp_results.append(("comparison", dataset_id, pool.apply_async(__run_all_comparison, [], {"data_training":data.loc[train_index], "data_test":data.loc[test_index], "target_training":target.loc[train_index], "target_test":target.loc[test_index], "estimator":estimator, "metric":metric})))
mp_results.append(("withoutfs", dataset_id, pool.apply_async(__run_without_fs, [], {"data_training":data.loc[train_index], "data_test":data.loc[test_index], "target_training":target.loc[train_index], "target_test":target.loc[test_index], "estimator":estimator, "metric":metric})))
# get results
    results = [(kind, did, async_res.get()) for kind, did, async_res in tqdm(mp_results)]
# separate bay opt, comparison and without fs results
res_bay_opt = []
res_comparison = []
res_without_fs = []
for approach, dataset_id, res in results:
res["did"] = dataset_id # add dataset id to dataframe
if approach == "bayopt":
res_bay_opt.append(res)
elif approach == "comparison":
res_comparison.append(res)
elif approach == "withoutfs":
res_without_fs.append(res)
# concat to single dataframes
df_bay_opt = pd.concat(res_bay_opt)
df_comparison = pd.concat(res_comparison)
df_without_fs = pd.concat(res_without_fs)
# add column with number of selected features
df_bay_opt["Actual Features"] = df_bay_opt.apply(
lambda row: sum(row["Vector"]), axis=1)
df_comparison["Actual Features"] = df_comparison.apply(
lambda row: sum(row["Vector"]), axis=1)
df_bay_opt_grouped = df_bay_opt.groupby(["did", "Estimator", "Metric"] + approaches.bay_opt_parameters, as_index=False).agg(
{"Duration Black Box": ["mean"], "Duration Overhead": ["mean"], "Number of Iterations": ["mean"], "Actual Features": ["mean"], "Training Score": ["mean"], "Testing Score": ["mean"]})
df_comparison_grouped = df_comparison.groupby(["did", "Estimator", "Metric"] + approaches.comparison_parameters, as_index=False).agg(
{"Duration": ["mean"], "Actual Features": ["mean"], "Training Score": ["mean"], "Testing Score": ["mean"]})
df_without_fs_grouped = df_without_fs.groupby(["did", "Estimator", "Metric"], as_index=False).agg(
{"Training Score": ["mean"], "Testing Score": ["mean"]})
for name, group in df_bay_opt_grouped.groupby(["did", "Estimator", "Metric"], as_index=False):
group.iloc[:, 3:].to_csv("results/comparison_bayesian_experiment/classification" + "/bayopt_" +
str(name[0])+"_"+str(name[1])+"_"+str(name[2])+".csv", index=False)
for name, group in df_comparison_grouped.groupby(["did", "Estimator", "Metric"], as_index=False):
group.iloc[:, 3:].to_csv("results/comparison_bayesian_experiment/classification" + "/comparison_" +
str(name[0])+"_"+str(name[1])+"_"+str(name[2])+".csv", index=False)
for name, group in df_without_fs_grouped.groupby(["did", "Estimator", "Metric"], as_index=False):
group.iloc[:, 3:].to_csv("results/comparison_bayesian_experiment/classification" + "/withoutfs_" +
str(name[0])+"_"+str(name[1])+"_"+str(name[2])+".csv", index=False)
pool.close()
pool.join()
if __name__ == "__main__":
experiment_all_datasets_and_estimators()
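# The experiment relies on a separate config module. An illustrative sketch of
# the attributes referenced in this file (the concrete values below are
# assumptions, not the ones from the repository's config.py):
#
#     n_processes = 4                      # size of the multiprocessing pool
#     n_splits = 5                         # outer cross-validation folds
#     n_splits_bay_opt = 3                 # folds used inside bayesian optimization
#     n_calls = 50                         # max iterations of bayesian optimization
#     n_convergence = 10                   # stop after this many iterations without improvement
#     n_acq_points = 500                   # acquisition-function samples per iteration
#     min_nr_features = 2                  # smallest number of features to select
#     max_nr_features = 10                 # largest number of features to select
#     iter_step_nr_features = 2            # step size between feature counts
#     data_ids = {"classification": {1510: True, 31: False}}   # OpenML ids -> active flag
#     drop_list = []                       # columns dropped before the experiment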