From 3533ca414572a39665d244fb3dd4b977b954530e Mon Sep 17 00:00:00 2001 From: Serdar Kadioglu Date: Mon, 31 May 2021 07:01:22 -0400 Subject: [PATCH 01/13] added parallel bench --- feature/selector.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/feature/selector.py b/feature/selector.py index 2edf22b..8b21168 100644 --- a/feature/selector.py +++ b/feature/selector.py @@ -638,6 +638,39 @@ def _bench(selectors: Dict[str, Union[SelectionMethod.Correlation, return score_df, selected_df, runtime_df +def _parallel_bench(): + method_to_runtime = {} + score_df = pd.DataFrame(index=data.columns) + selected_df = pd.DataFrame(index=data.columns) + for method_name, method in selectors.items(): + selector = Selective(method) + t0 = time() + if verbose: + print("\n>>> Running", method_name) + scores = None + selected = [] + try: + subset = selector.fit_transform(data, labels) + scores = selector.get_absolute_scores() + selected = [1 if c in subset.columns else 0 for c in data.columns] + method_to_runtime[method_name] = round((time() - t0) / 60, 2) + except Exception as exp: + print("Exception", exp) + scores = np.repeat(0, len(data.columns)) + selected = np.repeat(0, len(data.columns)) + method_to_runtime[method_name] = str(round((time() - t0) / 60, 2)) + " (exception)" + finally: + score_df[method_name] = scores + selected_df[method_name] = selected + if output_filename is not None: + output_file.write(method_name + " " + str(method_to_runtime[method_name]) + "\n") + output_file.write(str(selected) + "\n") + output_file.write(str(scores) + "\n") + if verbose: + print(f"<<< Done! Time taken: {(time() - t0) / 60:.2f} minutes") + + + def calculate_statistics(scores: pd.DataFrame, selected: pd.DataFrame, columns: Optional[list] = None, From c3bce540666537cb3972320aa5214951b65b42ba Mon Sep 17 00:00:00 2001 From: irmakbky Date: Mon, 31 May 2021 17:47:51 +0300 Subject: [PATCH 02/13] edited parallel bench --- feature/selector.py | 120 ++++++++++++++++++++++---------------------- 1 file changed, 61 insertions(+), 59 deletions(-) diff --git a/feature/selector.py b/feature/selector.py index 8b21168..b2ad689 100644 --- a/feature/selector.py +++ b/feature/selector.py @@ -24,6 +24,7 @@ from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor from sklearn.model_selection import KFold from xgboost import XGBClassifier, XGBRegressor +from joblib import Parallel, delayed from feature.base import _BaseDispatcher, _BaseSupervisedSelector, _BaseUnsupervisedSelector from feature.correlation import _Correlation @@ -480,6 +481,7 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation, output_filename: Optional[str] = None, drop_zero_variance_features: Optional[bool] = True, verbose: bool = False, + n_jobs: int = 1, seed: int = Constants.default_seed) \ -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: """ @@ -525,7 +527,8 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation, labels=labels, output_filename=output_filename, drop_zero_variance_features=drop_zero_variance_features, - verbose=verbose) + verbose=verbose, + n_jobs=n_jobs) else: # Create K-Fold object @@ -555,7 +558,8 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation, labels=train_labels, output_filename=output_filename, drop_zero_variance_features=drop_zero_variance_features, - verbose=False) + verbose=False, + n_jobs=n_jobs) # Concatenate data frames score_df = pd.concat((score_df, score_cv_df)) @@ -574,6 +578,7 @@ def _bench(selectors: 
Dict[str, Union[SelectionMethod.Correlation, SelectionMethod.Statistical, SelectionMethod.Variance]], data: pd.DataFrame, + n_jobs: int, labels: Optional[pd.Series] = None, output_filename: Optional[str] = None, drop_zero_variance_features: Optional[bool] = True, @@ -605,71 +610,68 @@ def _bench(selectors: Dict[str, Union[SelectionMethod.Correlation, method_to_runtime = {} score_df = pd.DataFrame(index=data.columns) selected_df = pd.DataFrame(index=data.columns) - for method_name, method in selectors.items(): - selector = Selective(method) - t0 = time() - if verbose: - print("\n>>> Running", method_name) - scores = None - selected = [] - try: - subset = selector.fit_transform(data, labels) - scores = selector.get_absolute_scores() - selected = [1 if c in subset.columns else 0 for c in data.columns] - method_to_runtime[method_name] = round((time() - t0) / 60, 2) - except Exception as exp: - print("Exception", exp) - scores = np.repeat(0, len(data.columns)) - selected = np.repeat(0, len(data.columns)) - method_to_runtime[method_name] = str(round((time() - t0) / 60, 2)) + " (exception)" - finally: - score_df[method_name] = scores - selected_df[method_name] = selected - if output_filename is not None: - output_file.write(method_name + " " + str(method_to_runtime[method_name]) + "\n") - output_file.write(str(selected) + "\n") - output_file.write(str(scores) + "\n") - if verbose: - print(f"<<< Done! Time taken: {(time() - t0) / 60:.2f} minutes") + + # Parallel + size = len(selectors.items()) + if n_jobs < 0: + n_jobs = max(mp.cpu_count() + 1 + n_jobs, 1) + n_jobs = min(n_jobs, size) + + r = Parallel(n_jobs=n_jobs, verbose=10, require='sharedmem')( + delayed(_parallel_bench)( + data, labels, method_to_runtime, score_df, + selected_df, method_name, method, verbose, + output_filename) + for method_name, method in selectors.items()) + score_dfs, selected_dfs, method_to_runtimes = zip(*r) + score_df = score_dfs[0] + selected_df = selected_dfs[0] # Format runtime_df = pd.Series(method_to_runtime).to_frame("runtime").rename_axis("method").reset_index() return score_df, selected_df, runtime_df - -def _parallel_bench(): - method_to_runtime = {} - score_df = pd.DataFrame(index=data.columns) - selected_df = pd.DataFrame(index=data.columns) - for method_name, method in selectors.items(): - selector = Selective(method) - t0 = time() +def _parallel_bench(data: pd.DataFrame, + labels: Optional[pd.Series], + method_to_runtime: dict, + score_df: pd.DataFrame, + selected_df: pd.DataFrame, + method_name: str, + method: Union[SelectionMethod.Correlation, + SelectionMethod.Linear, + SelectionMethod.TreeBased, + SelectionMethod.Statistical, + SelectionMethod.Variance], + verbose: bool, + output_filename: Optional[str]): + selector = Selective(method) + t0 = time() + if verbose: + print("\n>>> Running", method_name) + scores = None + selected = [] + try: + subset = selector.fit_transform(data, labels) + scores = selector.get_absolute_scores() + selected = [1 if c in subset.columns else 0 for c in data.columns] + method_to_runtime[method_name] = round((time() - t0) / 60, 2) + except Exception as exp: + print("Exception", exp) + scores = np.repeat(0, len(data.columns)) + selected = np.repeat(0, len(data.columns)) + method_to_runtime[method_name] = str(round((time() - t0) / 60, 2)) + " (exception)" + finally: + score_df[method_name] = scores + selected_df[method_name] = selected + if output_filename is not None: + output_file.write(method_name + " " + str(method_to_runtime[method_name]) + "\n") + 
output_file.write(str(selected) + "\n") + output_file.write(str(scores) + "\n") if verbose: - print("\n>>> Running", method_name) - scores = None - selected = [] - try: - subset = selector.fit_transform(data, labels) - scores = selector.get_absolute_scores() - selected = [1 if c in subset.columns else 0 for c in data.columns] - method_to_runtime[method_name] = round((time() - t0) / 60, 2) - except Exception as exp: - print("Exception", exp) - scores = np.repeat(0, len(data.columns)) - selected = np.repeat(0, len(data.columns)) - method_to_runtime[method_name] = str(round((time() - t0) / 60, 2)) + " (exception)" - finally: - score_df[method_name] = scores - selected_df[method_name] = selected - if output_filename is not None: - output_file.write(method_name + " " + str(method_to_runtime[method_name]) + "\n") - output_file.write(str(selected) + "\n") - output_file.write(str(scores) + "\n") - if verbose: - print(f"<<< Done! Time taken: {(time() - t0) / 60:.2f} minutes") - + print(f"<<< Done! Time taken: {(time() - t0) / 60:.2f} minutes") + return score_df, selected_df, method_to_runtime def calculate_statistics(scores: pd.DataFrame, selected: pd.DataFrame, From e9eba84d1e9597a9d019edd62027c98f4c2d6185 Mon Sep 17 00:00:00 2001 From: Serdar Kadioglu Date: Mon, 31 May 2021 16:34:12 -0400 Subject: [PATCH 03/13] add parallel test --- feature/selector.py | 70 ++++++++++++++++---------------- tests/test_parallel.py | 90 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 33 deletions(-) create mode 100644 tests/test_parallel.py diff --git a/feature/selector.py b/feature/selector.py index b2ad689..3b8aa05 100644 --- a/feature/selector.py +++ b/feature/selector.py @@ -11,7 +11,7 @@ """ from time import time -from typing import Dict, Union, NamedTuple, NoReturn, Tuple, Optional +from typing import Dict, Union, NamedTuple, NoReturn, Tuple, Optional, IO, Any import numpy as np import pandas as pd @@ -25,6 +25,7 @@ from sklearn.model_selection import KFold from xgboost import XGBClassifier, XGBRegressor from joblib import Parallel, delayed +import multiprocessing as mp from feature.base import _BaseDispatcher, _BaseSupervisedSelector, _BaseUnsupervisedSelector from feature.correlation import _Correlation @@ -440,7 +441,7 @@ def _validate_args(seed, selection_method) -> NoReturn: SelectionMethod.TreeBased, SelectionMethod.Statistical, SelectionMethod.Variance)), - TypeError("Unknown selection type: " + str(selection_method))) + TypeError("Unknown selection type: " + str(selection_method) + " " + str(type(selection_method)))) # Selection method value selection_method._validate() @@ -596,7 +597,7 @@ def _bench(selectors: Dict[str, Union[SelectionMethod.Correlation, check_true(selectors is not None, ValueError("Benchmark selectors cannot be none.")) check_true(data is not None, ValueError("Benchmark data cannot be none.")) - # Output files + # Output file if output_filename is not None: output_file = open(output_filename, "a") else: @@ -611,21 +612,29 @@ def _bench(selectors: Dict[str, Union[SelectionMethod.Correlation, score_df = pd.DataFrame(index=data.columns) selected_df = pd.DataFrame(index=data.columns) - # Parallel + # Find the effective number of jobs size = len(selectors.items()) if n_jobs < 0: n_jobs = max(mp.cpu_count() + 1 + n_jobs, 1) n_jobs = min(n_jobs, size) - r = Parallel(n_jobs=n_jobs, verbose=10, require='sharedmem')( + # Parallel benchmarks for each method + output_list = Parallel(n_jobs=n_jobs, verbose=10, require="sharedmem")( delayed(_parallel_bench)( - data, 
labels, method_to_runtime, score_df, - selected_df, method_name, method, verbose, - output_filename) + data, labels, method_name, method, verbose) for method_name, method in selectors.items()) - score_dfs, selected_dfs, method_to_runtimes = zip(*r) - score_df = score_dfs[0] - selected_df = selected_dfs[0] + + # Collect the output from each method + for output in output_list: + for method_name, results_dict in output.items(): + score_df[method_name] = results_dict["scores"] + selected_df[method_name] = results_dict["selected"] + method_to_runtime[method_name] = results_dict["runtime"] + + if output_filename is not None: + output_file.write(method_name + " " + str(method_to_runtime[method_name]) + "\n") + output_file.write(str(results_dict["selected"]) + "\n") + output_file.write(str(results_dict["scores"]) + "\n") # Format runtime_df = pd.Series(method_to_runtime).to_frame("runtime").rename_axis("method").reset_index() @@ -634,44 +643,39 @@ def _bench(selectors: Dict[str, Union[SelectionMethod.Correlation, def _parallel_bench(data: pd.DataFrame, labels: Optional[pd.Series], - method_to_runtime: dict, - score_df: pd.DataFrame, - selected_df: pd.DataFrame, method_name: str, method: Union[SelectionMethod.Correlation, - SelectionMethod.Linear, - SelectionMethod.TreeBased, - SelectionMethod.Statistical, - SelectionMethod.Variance], - verbose: bool, - output_filename: Optional[str]): + SelectionMethod.Linear, + SelectionMethod.TreeBased, + SelectionMethod.Statistical, + SelectionMethod.Variance], + verbose: bool): + selector = Selective(method) t0 = time() if verbose: - print("\n>>> Running", method_name) - scores = None - selected = [] + run_str = "\n>>> Running " + method_name + print(run_str, flush=True) + try: subset = selector.fit_transform(data, labels) scores = selector.get_absolute_scores() selected = [1 if c in subset.columns else 0 for c in data.columns] - method_to_runtime[method_name] = round((time() - t0) / 60, 2) + runtime = round((time() - t0) / 60, 2) except Exception as exp: print("Exception", exp) scores = np.repeat(0, len(data.columns)) selected = np.repeat(0, len(data.columns)) - method_to_runtime[method_name] = str(round((time() - t0) / 60, 2)) + " (exception)" + runtime = str(round((time() - t0) / 60, 2)) + " (exception)" finally: - score_df[method_name] = scores - selected_df[method_name] = selected - if output_filename is not None: - output_file.write(method_name + " " + str(method_to_runtime[method_name]) + "\n") - output_file.write(str(selected) + "\n") - output_file.write(str(scores) + "\n") if verbose: - print(f"<<< Done! Time taken: {(time() - t0) / 60:.2f} minutes") + done_str = f"<<< Done! 
{method_name} Time taken: {(time() - t0) / 60:.2f} minutes" + print(done_str, flush=True) + + results_dict = {"scores": scores, "selected": selected, "runtime": runtime} + + return {method_name: results_dict} - return score_df, selected_df, method_to_runtime def calculate_statistics(scores: pd.DataFrame, selected: pd.DataFrame, diff --git a/tests/test_parallel.py b/tests/test_parallel.py new file mode 100644 index 0000000..6bebb78 --- /dev/null +++ b/tests/test_parallel.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- +# Copyright FMR LLC +# SPDX-License-Identifier: GNU GPLv3 + +from catboost import CatBoostClassifier, CatBoostRegressor +from lightgbm import LGBMClassifier, LGBMRegressor +from sklearn.datasets import load_boston, load_iris +from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor +from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor +from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor +from xgboost import XGBClassifier, XGBRegressor + +from feature.utils import get_data_label +from feature.selector import SelectionMethod, benchmark, calculate_statistics +from tests.test_base import BaseTest + + +class TestParallel(BaseTest): + + num_features = 3 + corr_threshold = 0.5 + alpha = 1000 + tree_params = {"random_state": 123, "n_estimators": 100} + + selectors = { + # "corr_pearson": SelectionMethod.Correlation(corr_threshold, method="pearson"), + # "corr_kendall": SelectionMethod.Correlation(corr_threshold, method="kendall"), + # "corr_spearman": SelectionMethod.Correlation(corr_threshold, method="spearman"), + # "univ_anova": SelectionMethod.Statistical(num_features, method="anova"), + # "univ_chi_square": SelectionMethod.Statistical(num_features, method="chi_square"), + # "univ_mutual_info": SelectionMethod.Statistical(num_features, method="mutual_info"), + "linear": SelectionMethod.Linear(num_features, regularization="none"), + "lasso": SelectionMethod.Linear(num_features, regularization="lasso", alpha=alpha), + # "ridge": SelectionMethod.Linear(num_features, regularization="ridge", alpha=alpha), + # "random_forest": SelectionMethod.TreeBased(num_features), + # "xgboost_clf": SelectionMethod.TreeBased(num_features, estimator=XGBClassifier(**tree_params)), + # "xgboost_reg": SelectionMethod.TreeBased(num_features, estimator=XGBRegressor(**tree_params)), + # "extra_clf": SelectionMethod.TreeBased(num_features, estimator=ExtraTreesClassifier(**tree_params)), + # "extra_reg": SelectionMethod.TreeBased(num_features, estimator=ExtraTreesRegressor(**tree_params)), + # "lgbm_clf": SelectionMethod.TreeBased(num_features, estimator=LGBMClassifier(**tree_params)), + # "lgbm_reg": SelectionMethod.TreeBased(num_features, estimator=LGBMRegressor(**tree_params)), + # "gradient_clf": SelectionMethod.TreeBased(num_features, estimator=GradientBoostingClassifier(**tree_params)), + # "gradient_reg": SelectionMethod.TreeBased(num_features, estimator=GradientBoostingRegressor(**tree_params)), + # "adaboost_clf": SelectionMethod.TreeBased(num_features, estimator=AdaBoostClassifier(**tree_params)), + # "adaboost_reg": SelectionMethod.TreeBased(num_features, estimator=AdaBoostRegressor(**tree_params)), + # "catboost_clf": SelectionMethod.TreeBased(num_features, estimator=CatBoostClassifier(**tree_params, silent=True)), + # "catboost_reg": SelectionMethod.TreeBased(num_features, estimator=CatBoostRegressor(**tree_params, silent=True)) + } + + def test_benchmark_regression(self): + data, label = get_data_label(load_boston()) + data = 
data.drop(columns=["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"]) + + # Benchmark + score_df, selected_df, runtime_df = benchmark(self.selectors, data, label, verbose=True, n_jobs=2) + stats_df = calculate_statistics(score_df, selected_df) + + print(score_df) + print(selected_df) + print(runtime_df) + print(stats_df) + + # def test_benchmark_regression_cv(self): + # data, label = get_data_label(load_boston()) + # data = data.drop(columns=["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"]) + # + # # Benchmark + # score_df, selected_df, runtime_df = benchmark(self.selectors, data, label, cv=5, output_filename=None) + # _ = calculate_statistics(score_df, selected_df) + # + # # Aggregate scores from different cv-folds + # score_df = score_df.groupby(score_df.index).mean() + # + # def test_benchmark_classification(self): + # data, label = get_data_label(load_iris()) + # + # # Benchmark + # score_df, selected_df, runtime_df = benchmark(self.selectors, data, label, output_filename=None) + # _ = calculate_statistics(score_df, selected_df) + # + # + # def test_benchmark_classification_cv(self): + # data, label = get_data_label(load_iris()) + # + # # Benchmark + # score_df, selected_df, runtime_df = benchmark(self.selectors, data, label, cv=5, output_filename=None) + # _ = calculate_statistics(score_df, selected_df) + # + # # Aggregate scores from different cv-folds + # score_df = score_df.groupby(score_df.index).mean() From 80e0ef48a9fb55ddce47f46e7abcf1450922dba4 Mon Sep 17 00:00:00 2001 From: Serdar Kadioglu Date: Tue, 1 Jun 2021 07:58:33 -0400 Subject: [PATCH 04/13] todo --- feature/selector.py | 7 +++++-- tests/test_parallel.py | 32 +++++++++++++++++++------------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/feature/selector.py b/feature/selector.py index 3b8aa05..4512f34 100644 --- a/feature/selector.py +++ b/feature/selector.py @@ -510,6 +510,8 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation, Whether to drop features with zero variance before running feature selector methods or not. verbose: bool, optional (default=False) Whether to print progress messages or not. + n_jobs: int, TODO + seed: int, optional (default=Constants.default_seed) The random seed to initialize the random number generator. @@ -579,11 +581,11 @@ def _bench(selectors: Dict[str, Union[SelectionMethod.Correlation, SelectionMethod.Statistical, SelectionMethod.Variance]], data: pd.DataFrame, - n_jobs: int, labels: Optional[pd.Series] = None, output_filename: Optional[str] = None, drop_zero_variance_features: Optional[bool] = True, - verbose: bool = False) \ + verbose: bool = False, + n_jobs: int = 1) \ -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: """ Benchmark with a given set of feature selectors. 
@@ -641,6 +643,7 @@ def _bench(selectors: Dict[str, Union[SelectionMethod.Correlation, return score_df, selected_df, runtime_df + def _parallel_bench(data: pd.DataFrame, labels: Optional[pd.Series], method_name: str, diff --git a/tests/test_parallel.py b/tests/test_parallel.py index 6bebb78..7d94069 100644 --- a/tests/test_parallel.py +++ b/tests/test_parallel.py @@ -52,13 +52,24 @@ def test_benchmark_regression(self): data = data.drop(columns=["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"]) # Benchmark - score_df, selected_df, runtime_df = benchmark(self.selectors, data, label, verbose=True, n_jobs=2) - stats_df = calculate_statistics(score_df, selected_df) + score_df_sequential, selected_df_sequential, runtime_df_sequential = benchmark(self.selectors, data, label) + + score_df_p1, selected_df_p1, runtime_df_p1 = benchmark(self.selectors, data, label, verbose=True, n_jobs=1) + + score_df_p2, selected_df_p2, runtime_df_p2 = benchmark(self.selectors, data, label, verbose=True, n_jobs=2) + + # TODO assert test results + # self.assertListAlmostEqual(list(score_df["linear"]), [0.069011, 0.054086, 0.061452, 0.006510, 0.954662]) + + def test_benchmark_classification(self): + data, label = get_data_label(load_iris()) + + # Benchmark + score_df, selected_df, runtime_df = benchmark(self.selectors, data, label, output_filename=None, n_jobs=2) + _ = calculate_statistics(score_df, selected_df) print(score_df) print(selected_df) - print(runtime_df) - print(stats_df) # def test_benchmark_regression_cv(self): # data, label = get_data_label(load_boston()) @@ -70,15 +81,10 @@ def test_benchmark_regression(self): # # # Aggregate scores from different cv-folds # score_df = score_df.groupby(score_df.index).mean() - # - # def test_benchmark_classification(self): - # data, label = get_data_label(load_iris()) - # - # # Benchmark - # score_df, selected_df, runtime_df = benchmark(self.selectors, data, label, output_filename=None) - # _ = calculate_statistics(score_df, selected_df) - # - # + + + + # def test_benchmark_classification_cv(self): # data, label = get_data_label(load_iris()) # From 6ff5b2a2351e30494424dada4584a8fe991f0d46 Mon Sep 17 00:00:00 2001 From: irmakbky Date: Thu, 3 Jun 2021 12:40:36 +0300 Subject: [PATCH 05/13] fixed style --- feature/selector.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/feature/selector.py b/feature/selector.py index 4512f34..529cbbe 100644 --- a/feature/selector.py +++ b/feature/selector.py @@ -510,8 +510,10 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation, Whether to drop features with zero variance before running feature selector methods or not. verbose: bool, optional (default=False) Whether to print progress messages or not. - n_jobs: int, TODO - + n_jobs: int, optional (default=1) + Number of concurrent processes/threads to use in parallelized routines. + If set to -1, all CPUs are used. + If set to -2, all CPUs but one are used, and so on. seed: int, optional (default=Constants.default_seed) The random seed to initialize the random number generator. @@ -652,7 +654,18 @@ def _parallel_bench(data: pd.DataFrame, SelectionMethod.TreeBased, SelectionMethod.Statistical, SelectionMethod.Variance], - verbose: bool): + verbose: bool) \ + -> Dict[str, Dict[str, Union[pd.DataFrame, list, float]]]: + """ + Benchmark with a given set of feature selectors. + Return a dictionary of feature selection method names with their corresponding scores, + selected features and runtime. 
+ + Returns + ------- + Dictionary of feature selection method names with their corresponding scores, selected features + and runtime. + """ selector = Selective(method) t0 = time() From 6e9bb49f639ebf4d4222bea518b59c73cb453350 Mon Sep 17 00:00:00 2001 From: irmakbky Date: Thu, 3 Jun 2021 15:43:18 +0300 Subject: [PATCH 06/13] test cases added --- tests/test_parallel.py | 52 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/tests/test_parallel.py b/tests/test_parallel.py index 7d94069..8e678c8 100644 --- a/tests/test_parallel.py +++ b/tests/test_parallel.py @@ -53,23 +53,57 @@ def test_benchmark_regression(self): # Benchmark score_df_sequential, selected_df_sequential, runtime_df_sequential = benchmark(self.selectors, data, label) - score_df_p1, selected_df_p1, runtime_df_p1 = benchmark(self.selectors, data, label, verbose=True, n_jobs=1) - score_df_p2, selected_df_p2, runtime_df_p2 = benchmark(self.selectors, data, label, verbose=True, n_jobs=2) - # TODO assert test results - # self.assertListAlmostEqual(list(score_df["linear"]), [0.069011, 0.054086, 0.061452, 0.006510, 0.954662]) + # Scores + self.assertListAlmostEqual([0.069011, 0.054086, 0.061452, 0.006510, 0.954662], + score_df_sequential["linear"].to_list()) + self.assertListAlmostEqual([0.056827, 0.051008, 0.053192, 0.007176, 0.923121], + score_df_sequential["lasso"].to_list()) + + self.assertListAlmostEqual(score_df_sequential["linear"].to_list(), score_df_p1["linear"].to_list()) + self.assertListAlmostEqual(score_df_sequential["linear"].to_list(), score_df_p2["linear"].to_list()) + self.assertListAlmostEqual(score_df_sequential["lasso"].to_list(), score_df_p1["lasso"].to_list()) + self.assertListAlmostEqual(score_df_sequential["lasso"].to_list(), score_df_p2["lasso"].to_list()) + + # Selected + self.assertListEqual([1, 0, 1, 0, 1], selected_df_sequential["linear"].to_list()) + self.assertListEqual([1, 0, 1, 0, 1], selected_df_sequential["lasso"].to_list()) + + self.assertListEqual(selected_df_sequential["linear"].to_list(), selected_df_p1["linear"].to_list()) + self.assertListEqual(selected_df_sequential["linear"].to_list(), selected_df_p2["linear"].to_list()) + self.assertListEqual(selected_df_sequential["lasso"].to_list(), selected_df_p1["lasso"].to_list()) + self.assertListEqual(selected_df_sequential["lasso"].to_list(), selected_df_p2["lasso"].to_list()) def test_benchmark_classification(self): data, label = get_data_label(load_iris()) # Benchmark - score_df, selected_df, runtime_df = benchmark(self.selectors, data, label, output_filename=None, n_jobs=2) - _ = calculate_statistics(score_df, selected_df) - - print(score_df) - print(selected_df) + score_df_sequential, selected_df_sequential, runtime_df_sequential = benchmark(self.selectors, data, label) + score_df_p1, selected_df_p1, runtime_df_p1 = benchmark(self.selectors, data, label, n_jobs=1) + score_df_p2, selected_df_p2, runtime_df_p2 = benchmark(self.selectors, data, label, n_jobs=2) + #_ = calculate_statistics(score_df, selected_df) + + # Scores + self.assertListAlmostEqual([0.289930, 0.560744, 0.262251, 0.042721], + list(score_df_sequential["linear"])) + self.assertListAlmostEqual([0.764816, 0.593482, 0.365352, 1.015095], + list(score_df_sequential["lasso"])) + + self.assertListAlmostEqual(score_df_sequential["linear"].to_list(), score_df_p1["linear"].to_list()) + self.assertListAlmostEqual(score_df_sequential["linear"].to_list(), score_df_p2["linear"].to_list()) + 
self.assertListAlmostEqual(score_df_sequential["lasso"].to_list(), score_df_p1["lasso"].to_list()) + self.assertListAlmostEqual(score_df_sequential["lasso"].to_list(), score_df_p2["lasso"].to_list()) + + # Selected + self.assertListEqual([1, 1, 1, 0], selected_df_sequential["linear"].to_list()) + self.assertListEqual([1, 1, 0, 1], selected_df_sequential["lasso"].to_list()) + + self.assertListEqual(selected_df_sequential["linear"].to_list(), selected_df_p1["linear"].to_list()) + self.assertListEqual(selected_df_sequential["linear"].to_list(), selected_df_p2["linear"].to_list()) + self.assertListEqual(selected_df_sequential["lasso"].to_list(), selected_df_p1["lasso"].to_list()) + self.assertListEqual(selected_df_sequential["lasso"].to_list(), selected_df_p2["lasso"].to_list()) # def test_benchmark_regression_cv(self): # data, label = get_data_label(load_boston()) From 954e0140307a5e3272fc45fedce6636e8b7d81cf Mon Sep 17 00:00:00 2001 From: irmakbky Date: Fri, 11 Jun 2021 11:32:11 +0300 Subject: [PATCH 07/13] added backend parameter --- feature/selector.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/feature/selector.py b/feature/selector.py index 529cbbe..496da55 100644 --- a/feature/selector.py +++ b/feature/selector.py @@ -483,6 +483,7 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation, drop_zero_variance_features: Optional[bool] = True, verbose: bool = False, n_jobs: int = 1, + backend: Optional[str] = None, seed: int = Constants.default_seed) \ -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: """ @@ -514,6 +515,10 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation, Number of concurrent processes/threads to use in parallelized routines. If set to -1, all CPUs are used. If set to -2, all CPUs but one are used, and so on. + backend: str, optional (default=None) + A parallelization backend implementation supported in the joblib library. + Supported options are “loky” (used by default), “multiprocessing”, and “threading”. + Default value is None. In this case the default backend selected by joblib will be used. seed: int, optional (default=Constants.default_seed) The random seed to initialize the random number generator. @@ -533,7 +538,8 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation, output_filename=output_filename, drop_zero_variance_features=drop_zero_variance_features, verbose=verbose, - n_jobs=n_jobs) + n_jobs=n_jobs, + backend=backend) else: # Create K-Fold object @@ -564,7 +570,8 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation, output_filename=output_filename, drop_zero_variance_features=drop_zero_variance_features, verbose=False, - n_jobs=n_jobs) + n_jobs=n_jobs, + backend=backend) # Concatenate data frames score_df = pd.concat((score_df, score_cv_df)) @@ -587,7 +594,8 @@ def _bench(selectors: Dict[str, Union[SelectionMethod.Correlation, output_filename: Optional[str] = None, drop_zero_variance_features: Optional[bool] = True, verbose: bool = False, - n_jobs: int = 1) \ + n_jobs: int = 1, + backend: Optional[str] = None) \ -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: """ Benchmark with a given set of feature selectors. 
@@ -623,7 +631,7 @@ def _bench(selectors: Dict[str, Union[SelectionMethod.Correlation, n_jobs = min(n_jobs, size) # Parallel benchmarks for each method - output_list = Parallel(n_jobs=n_jobs, verbose=10, require="sharedmem")( + output_list = Parallel(n_jobs=n_jobs, backend=backend, verbose=10, require="sharedmem")( delayed(_parallel_bench)( data, labels, method_name, method, verbose) for method_name, method in selectors.items()) From 9ae3c47dad14d829117da028e1abda615ac17df2 Mon Sep 17 00:00:00 2001 From: irmakbky Date: Fri, 11 Jun 2021 11:32:56 +0300 Subject: [PATCH 08/13] more tests --- tests/test_parallel.py | 124 +++++++++++++++++++++++++---------------- 1 file changed, 77 insertions(+), 47 deletions(-) diff --git a/tests/test_parallel.py b/tests/test_parallel.py index 8e678c8..c8af9f1 100644 --- a/tests/test_parallel.py +++ b/tests/test_parallel.py @@ -11,7 +11,7 @@ from xgboost import XGBClassifier, XGBRegressor from feature.utils import get_data_label -from feature.selector import SelectionMethod, benchmark, calculate_statistics +from feature.selector import SelectionMethod, benchmark from tests.test_base import BaseTest @@ -23,28 +23,28 @@ class TestParallel(BaseTest): tree_params = {"random_state": 123, "n_estimators": 100} selectors = { - # "corr_pearson": SelectionMethod.Correlation(corr_threshold, method="pearson"), - # "corr_kendall": SelectionMethod.Correlation(corr_threshold, method="kendall"), - # "corr_spearman": SelectionMethod.Correlation(corr_threshold, method="spearman"), - # "univ_anova": SelectionMethod.Statistical(num_features, method="anova"), - # "univ_chi_square": SelectionMethod.Statistical(num_features, method="chi_square"), - # "univ_mutual_info": SelectionMethod.Statistical(num_features, method="mutual_info"), + "corr_pearson": SelectionMethod.Correlation(corr_threshold, method="pearson"), + "corr_kendall": SelectionMethod.Correlation(corr_threshold, method="kendall"), + "corr_spearman": SelectionMethod.Correlation(corr_threshold, method="spearman"), + "univ_anova": SelectionMethod.Statistical(num_features, method="anova"), + "univ_chi_square": SelectionMethod.Statistical(num_features, method="chi_square"), + "univ_mutual_info": SelectionMethod.Statistical(num_features, method="mutual_info"), "linear": SelectionMethod.Linear(num_features, regularization="none"), "lasso": SelectionMethod.Linear(num_features, regularization="lasso", alpha=alpha), - # "ridge": SelectionMethod.Linear(num_features, regularization="ridge", alpha=alpha), - # "random_forest": SelectionMethod.TreeBased(num_features), - # "xgboost_clf": SelectionMethod.TreeBased(num_features, estimator=XGBClassifier(**tree_params)), - # "xgboost_reg": SelectionMethod.TreeBased(num_features, estimator=XGBRegressor(**tree_params)), - # "extra_clf": SelectionMethod.TreeBased(num_features, estimator=ExtraTreesClassifier(**tree_params)), - # "extra_reg": SelectionMethod.TreeBased(num_features, estimator=ExtraTreesRegressor(**tree_params)), - # "lgbm_clf": SelectionMethod.TreeBased(num_features, estimator=LGBMClassifier(**tree_params)), - # "lgbm_reg": SelectionMethod.TreeBased(num_features, estimator=LGBMRegressor(**tree_params)), - # "gradient_clf": SelectionMethod.TreeBased(num_features, estimator=GradientBoostingClassifier(**tree_params)), - # "gradient_reg": SelectionMethod.TreeBased(num_features, estimator=GradientBoostingRegressor(**tree_params)), - # "adaboost_clf": SelectionMethod.TreeBased(num_features, estimator=AdaBoostClassifier(**tree_params)), - # "adaboost_reg": 
SelectionMethod.TreeBased(num_features, estimator=AdaBoostRegressor(**tree_params)), - # "catboost_clf": SelectionMethod.TreeBased(num_features, estimator=CatBoostClassifier(**tree_params, silent=True)), - # "catboost_reg": SelectionMethod.TreeBased(num_features, estimator=CatBoostRegressor(**tree_params, silent=True)) + "ridge": SelectionMethod.Linear(num_features, regularization="ridge", alpha=alpha), + "random_forest": SelectionMethod.TreeBased(num_features), + "xgboost_clf": SelectionMethod.TreeBased(num_features, estimator=XGBClassifier(**tree_params)), + "xgboost_reg": SelectionMethod.TreeBased(num_features, estimator=XGBRegressor(**tree_params)), + "extra_clf": SelectionMethod.TreeBased(num_features, estimator=ExtraTreesClassifier(**tree_params)), + "extra_reg": SelectionMethod.TreeBased(num_features, estimator=ExtraTreesRegressor(**tree_params)), + "lgbm_clf": SelectionMethod.TreeBased(num_features, estimator=LGBMClassifier(**tree_params)), + "lgbm_reg": SelectionMethod.TreeBased(num_features, estimator=LGBMRegressor(**tree_params)), + "gradient_clf": SelectionMethod.TreeBased(num_features, estimator=GradientBoostingClassifier(**tree_params)), + "gradient_reg": SelectionMethod.TreeBased(num_features, estimator=GradientBoostingRegressor(**tree_params)), + "adaboost_clf": SelectionMethod.TreeBased(num_features, estimator=AdaBoostClassifier(**tree_params)), + "adaboost_reg": SelectionMethod.TreeBased(num_features, estimator=AdaBoostRegressor(**tree_params)), + "catboost_clf": SelectionMethod.TreeBased(num_features, estimator=CatBoostClassifier(**tree_params, silent=True)), + "catboost_reg": SelectionMethod.TreeBased(num_features, estimator=CatBoostRegressor(**tree_params, silent=True)) } def test_benchmark_regression(self): @@ -83,13 +83,12 @@ def test_benchmark_classification(self): score_df_sequential, selected_df_sequential, runtime_df_sequential = benchmark(self.selectors, data, label) score_df_p1, selected_df_p1, runtime_df_p1 = benchmark(self.selectors, data, label, n_jobs=1) score_df_p2, selected_df_p2, runtime_df_p2 = benchmark(self.selectors, data, label, n_jobs=2) - #_ = calculate_statistics(score_df, selected_df) # Scores self.assertListAlmostEqual([0.289930, 0.560744, 0.262251, 0.042721], - list(score_df_sequential["linear"])) + score_df_sequential["linear"].to_list()) self.assertListAlmostEqual([0.764816, 0.593482, 0.365352, 1.015095], - list(score_df_sequential["lasso"])) + score_df_sequential["lasso"].to_list()) self.assertListAlmostEqual(score_df_sequential["linear"].to_list(), score_df_p1["linear"].to_list()) self.assertListAlmostEqual(score_df_sequential["linear"].to_list(), score_df_p2["linear"].to_list()) @@ -105,26 +104,57 @@ def test_benchmark_classification(self): self.assertListEqual(selected_df_sequential["lasso"].to_list(), selected_df_p1["lasso"].to_list()) self.assertListEqual(selected_df_sequential["lasso"].to_list(), selected_df_p2["lasso"].to_list()) - # def test_benchmark_regression_cv(self): - # data, label = get_data_label(load_boston()) - # data = data.drop(columns=["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"]) - # - # # Benchmark - # score_df, selected_df, runtime_df = benchmark(self.selectors, data, label, cv=5, output_filename=None) - # _ = calculate_statistics(score_df, selected_df) - # - # # Aggregate scores from different cv-folds - # score_df = score_df.groupby(score_df.index).mean() - - - - - # def test_benchmark_classification_cv(self): - # data, label = get_data_label(load_iris()) - # - # # Benchmark - # score_df, 
selected_df, runtime_df = benchmark(self.selectors, data, label, cv=5, output_filename=None) - # _ = calculate_statistics(score_df, selected_df) - # - # # Aggregate scores from different cv-folds - # score_df = score_df.groupby(score_df.index).mean() + def test_benchmark_regression_cv(self): + data, label = get_data_label(load_boston()) + data = data.drop(columns=["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"]) + + # Benchmark + score_df_sequential, selected_df_sequential, runtime_df_sequential = benchmark(self.selectors, data, label, + cv=5, output_filename=None) + score_df_p1, selected_df_p1, runtime_df_p1 = benchmark(self.selectors, data, label, cv=5, + output_filename=None, n_jobs=1) + score_df_p2, selected_df_p2, runtime_df_p2 = benchmark(self.selectors, data, label, cv=5, + output_filename=None, n_jobs=2) + + # Aggregate scores from different cv-folds + score_df_sequential = score_df_sequential.groupby(score_df_sequential.index).mean() + score_df_p1 = score_df_p1.groupby(score_df_p1.index).mean() + score_df_p2 = score_df_p2.groupby(score_df_p2.index).mean() + + # Scores + self.assertListAlmostEqual([0.061577, 0.006446, 0.066933, 0.957603, 0.053797], + score_df_sequential["linear"].to_list()) + self.assertListAlmostEqual([0.053294, 0.007117, 0.054563, 0.926039, 0.050716], + score_df_sequential["lasso"].to_list()) + + self.assertListAlmostEqual(score_df_sequential["linear"].to_list(), score_df_p1["linear"].to_list()) + self.assertListAlmostEqual(score_df_sequential["linear"].to_list(), score_df_p2["linear"].to_list()) + self.assertListAlmostEqual(score_df_sequential["lasso"].to_list(), score_df_p1["lasso"].to_list()) + self.assertListAlmostEqual(score_df_sequential["lasso"].to_list(), score_df_p2["lasso"].to_list()) + + def test_benchmark_classification_cv(self): + data, label = get_data_label(load_iris()) + + # Benchmark + score_df_sequential, selected_df_sequential, runtime_df_sequential = benchmark(self.selectors, data, label, + cv=5, output_filename=None) + score_df_p1, selected_df_p1, runtime_df_p1 = benchmark(self.selectors, data, label, cv=5, + output_filename=None, n_jobs=1) + score_df_p2, selected_df_p2, runtime_df_p2 = benchmark(self.selectors, data, label, cv=5, + output_filename=None, n_jobs=2) + + # Aggregate scores from different cv-folds + score_df_sequential = score_df_sequential.groupby(score_df_sequential.index).mean() + score_df_p1 = score_df_p1.groupby(score_df_p1.index).mean() + score_df_p2 = score_df_p2.groupby(score_df_p2.index).mean() + + # Scores + self.assertListAlmostEqual([0.223276, 0.035431, 0.262547, 0.506591], + score_df_sequential["linear"].to_list()) + self.assertListAlmostEqual([0.280393, 0.948935, 0.662777, 0.476188], + score_df_sequential["lasso"].to_list()) + + self.assertListAlmostEqual(score_df_sequential["linear"].to_list(), score_df_p1["linear"].to_list()) + self.assertListAlmostEqual(score_df_sequential["linear"].to_list(), score_df_p2["linear"].to_list()) + self.assertListAlmostEqual(score_df_sequential["lasso"].to_list(), score_df_p1["lasso"].to_list()) + self.assertListAlmostEqual(score_df_sequential["lasso"].to_list(), score_df_p2["lasso"].to_list()) From 9adafea05f5d524b27c1f894285316b8551d70c6 Mon Sep 17 00:00:00 2001 From: irmakbky Date: Tue, 15 Jun 2021 19:40:39 +0300 Subject: [PATCH 09/13] edits --- feature/selector.py | 6 +++--- requirements.txt | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/feature/selector.py b/feature/selector.py index 496da55..23cb717 100644 --- a/feature/selector.py +++ 
b/feature/selector.py @@ -10,13 +10,15 @@ This module defines the public interface of the **Selective Library** for feature selection. """ +import multiprocessing as mp from time import time -from typing import Dict, Union, NamedTuple, NoReturn, Tuple, Optional, IO, Any +from typing import Dict, Union, NamedTuple, NoReturn, Tuple, Optional import numpy as np import pandas as pd import seaborn as sns from catboost import CatBoostClassifier, CatBoostRegressor +from joblib import Parallel, delayed from lightgbm import LGBMClassifier, LGBMRegressor from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor @@ -24,8 +26,6 @@ from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor from sklearn.model_selection import KFold from xgboost import XGBClassifier, XGBRegressor -from joblib import Parallel, delayed -import multiprocessing as mp from feature.base import _BaseDispatcher, _BaseSupervisedSelector, _BaseUnsupervisedSelector from feature.correlation import _Correlation diff --git a/requirements.txt b/requirements.txt index e9e29db..a9e32fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ catboost +joblib lightgbm minepy numpy From 616a934ae2023d42149deaeb50c61a909fb1ebb7 Mon Sep 17 00:00:00 2001 From: irmakbky Date: Tue, 15 Jun 2021 20:00:31 +0300 Subject: [PATCH 10/13] edits --- feature/selector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/feature/selector.py b/feature/selector.py index 23cb717..5540c41 100644 --- a/feature/selector.py +++ b/feature/selector.py @@ -5,7 +5,7 @@ """ :Author: FMR LLC -:Version: 1.0.0 of August 10, 2020 +:Version: 1.1.0 of August 10, 2020 This module defines the public interface of the **Selective Library** for feature selection. """ @@ -631,7 +631,7 @@ def _bench(selectors: Dict[str, Union[SelectionMethod.Correlation, n_jobs = min(n_jobs, size) # Parallel benchmarks for each method - output_list = Parallel(n_jobs=n_jobs, backend=backend, verbose=10, require="sharedmem")( + output_list = Parallel(n_jobs=n_jobs, backend=backend, require="sharedmem")( delayed(_parallel_bench)( data, labels, method_name, method, verbose) for method_name, method in selectors.items()) From b3d189c5e581049fabdc339082b4d1939d81c420 Mon Sep 17 00:00:00 2001 From: irmakbky Date: Wed, 16 Jun 2021 15:14:28 +0300 Subject: [PATCH 11/13] edit --- feature/selector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feature/selector.py b/feature/selector.py index 5540c41..bb958ce 100644 --- a/feature/selector.py +++ b/feature/selector.py @@ -5,7 +5,7 @@ """ :Author: FMR LLC -:Version: 1.1.0 of August 10, 2020 +:Version: 1.1.0 of June 16, 2021 This module defines the public interface of the **Selective Library** for feature selection. """ From 61aa4e1dfb5cda483be89216c172a8f6e280f8da Mon Sep 17 00:00:00 2001 From: irmakbky Date: Wed, 16 Jun 2021 15:19:24 +0300 Subject: [PATCH 12/13] update --- CHANGELOG.txt | 6 ++++++ feature/_version.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 92b3bab..ba33a53 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -2,6 +2,12 @@ CHANGELOG ========= +------------------------------------------------------------------------------- +June, 16, 2021 1.1.0 +------------------------------------------------------------------------------- + +- Parallelize benchmark function. 
+ ------------------------------------------------------------------------------- March, 23, 2021 1.0.1 ------------------------------------------------------------------------------- diff --git a/feature/_version.py b/feature/_version.py index 6670214..7e4d9f6 100644 --- a/feature/_version.py +++ b/feature/_version.py @@ -2,4 +2,4 @@ # Copyright FMR LLC # SPDX-License-Identifier: GNU GPLv3 -__version__ = "1.0.1" +__version__ = "1.1.0" From 6a39badbd7a8de015195b847a7289ca4966408f7 Mon Sep 17 00:00:00 2001 From: irmakbky Date: Wed, 16 Jun 2021 15:49:44 +0300 Subject: [PATCH 13/13] removed backend --- feature/selector.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/feature/selector.py b/feature/selector.py index bb958ce..eb03e29 100644 --- a/feature/selector.py +++ b/feature/selector.py @@ -483,7 +483,6 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation, drop_zero_variance_features: Optional[bool] = True, verbose: bool = False, n_jobs: int = 1, - backend: Optional[str] = None, seed: int = Constants.default_seed) \ -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: """ @@ -515,10 +514,6 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation, Number of concurrent processes/threads to use in parallelized routines. If set to -1, all CPUs are used. If set to -2, all CPUs but one are used, and so on. - backend: str, optional (default=None) - A parallelization backend implementation supported in the joblib library. - Supported options are “loky” (used by default), “multiprocessing”, and “threading”. - Default value is None. In this case the default backend selected by joblib will be used. seed: int, optional (default=Constants.default_seed) The random seed to initialize the random number generator. @@ -538,8 +533,7 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation, output_filename=output_filename, drop_zero_variance_features=drop_zero_variance_features, verbose=verbose, - n_jobs=n_jobs, - backend=backend) + n_jobs=n_jobs) else: # Create K-Fold object @@ -570,8 +564,7 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation, output_filename=output_filename, drop_zero_variance_features=drop_zero_variance_features, verbose=False, - n_jobs=n_jobs, - backend=backend) + n_jobs=n_jobs) # Concatenate data frames score_df = pd.concat((score_df, score_cv_df)) @@ -594,8 +587,7 @@ def _bench(selectors: Dict[str, Union[SelectionMethod.Correlation, output_filename: Optional[str] = None, drop_zero_variance_features: Optional[bool] = True, verbose: bool = False, - n_jobs: int = 1, - backend: Optional[str] = None) \ + n_jobs: int = 1) \ -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: """ Benchmark with a given set of feature selectors. @@ -631,7 +623,7 @@ def _bench(selectors: Dict[str, Union[SelectionMethod.Correlation, n_jobs = min(n_jobs, size) # Parallel benchmarks for each method - output_list = Parallel(n_jobs=n_jobs, backend=backend, require="sharedmem")( + output_list = Parallel(n_jobs=n_jobs, require="sharedmem")( delayed(_parallel_bench)( data, labels, method_name, method, verbose) for method_name, method in selectors.items())
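
For reference, a minimal usage sketch of the parallelized benchmark introduced in this series. It is an illustration only: the dataset, the two selectors, and their parameters are borrowed loosely from tests/test_parallel.py and are not prescribed by the patches themselves.

    from sklearn.datasets import load_boston

    from feature.utils import get_data_label
    from feature.selector import SelectionMethod, benchmark, calculate_statistics

    # Load a small regression dataset (the same one the parallel tests use).
    data, label = get_data_label(load_boston())

    # Two illustrative selectors; any SelectionMethod variant shown in the tests works here.
    selectors = {
        "linear": SelectionMethod.Linear(3, regularization="none"),
        "lasso": SelectionMethod.Linear(3, regularization="lasso", alpha=1000),
    }

    # n_jobs=2 runs the selectors concurrently via joblib with shared memory.
    # n_jobs=-1 would use all CPUs; the effective job count is capped at the
    # number of selectors being benchmarked.
    score_df, selected_df, runtime_df = benchmark(selectors, data, label, verbose=True, n_jobs=2)
    stats_df = calculate_statistics(score_df, selected_df)
    print(runtime_df)

With cv set (e.g., cv=5), the same n_jobs argument is forwarded to each fold's run, so the per-fold benchmarks are parallelized in the same way.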