Merge pull request #3 from fidelity/feature/cv
feature/cv
bkleyn authored Mar 31, 2021
2 parents 24300df + e523001 commit 98179a9
Showing 7 changed files with 249 additions and 85 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.txt
@@ -2,6 +2,12 @@
CHANGELOG
=========

-------------------------------------------------------------------------------
March, 23, 2021 1.0.1
-------------------------------------------------------------------------------

- Add cross-validation (cv) capability to the benchmark function.

-------------------------------------------------------------------------------
February, 1, 2021 1.0.0
-------------------------------------------------------------------------------
16 changes: 14 additions & 2 deletions README.md
@@ -9,7 +9,7 @@ The library provides:

* Automated task detection. No need to know what feature selection method works with what machine learning task

* Benchmarking with multiple selectors
* Benchmarking multiple selectors using cross-validation

* Inspection of results and feature importance

@@ -91,7 +91,7 @@ selectors = {
}

# Benchmark
score_df, selected_df, runtime_df = benchmark(selectors, data, label)
score_df, selected_df, runtime_df = benchmark(selectors, data, label, cv=5)
print(score_df, "\n\n", selected_df, "\n\n", runtime_df)

# Get benchmark statistics by feature
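As an aside on the `cv=5` call shown above, here is a minimal end-to-end sketch of the cross-validated benchmark. The synthetic data, the selector names, and the constructor arguments (`threshold`, `num_features`, `method`) are illustrative assumptions rather than values taken from this commit; only `benchmark(..., cv=...)` and `calculate_statistics` come from the diff itself.

```python
import numpy as np
import pandas as pd

from feature.selector import SelectionMethod, benchmark, calculate_statistics

# Toy regression data: 100 samples, 5 features; the label depends on f0 and f2
rng = np.random.RandomState(42)
data = pd.DataFrame(rng.rand(100, 5), columns=[f"f{i}" for i in range(5)])
label = 3 * data["f0"] - 2 * data["f2"] + 0.1 * pd.Series(rng.rand(100))

# A few selectors to compare (constructor arguments are illustrative)
selectors = {
    "variance": SelectionMethod.Variance(threshold=0.0),
    "corr_pearson": SelectionMethod.Correlation(threshold=0.5, method="pearson"),
    "tree": SelectionMethod.TreeBased(num_features=3),
}

# cv=5 runs every selector on the training rows of 5 shuffled folds and
# concatenates the per-fold results (one row per feature per fold)
score_df, selected_df, runtime_df = benchmark(selectors, data, label, cv=5)

# Aggregate scores/selection flags per feature across folds and methods
stats_df = calculate_statistics(score_df, selected_df)
print(stats_df)
```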
@@ -125,6 +125,18 @@ plot_importance(df)

Selective is available to install as `pip install selective`.

## Source

Alternatively, you can build a wheel package on your platform from scratch using the source code:

```bash
git clone https://github.com/fidelity/selective.git
cd selective
pip install setuptools wheel # if wheel is not installed
python setup.py sdist bdist_wheel
pip install dist/selective-X.X.X-py3-none-any.whl
```

## Support

Please submit bug reports and feature requests as [Issues](https://github.com/fidelity/selective/issues).
Binary file removed dist/selective-1.0.0-py3-none-any.whl
2 changes: 1 addition & 1 deletion feature/_version.py
@@ -2,4 +2,4 @@
# Copyright FMR LLC <[email protected]>
# SPDX-License-Identifier: GNU GPLv3

__version__ = "1.0.0"
__version__ = "1.0.1"
120 changes: 104 additions & 16 deletions feature/selector.py
@@ -22,6 +22,7 @@
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import KFold
from xgboost import XGBClassifier, XGBRegressor

from feature.base import _BaseDispatcher, _BaseSupervisedSelector, _BaseUnsupervisedSelector
@@ -475,9 +476,11 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation,
SelectionMethod.Variance]],
data: pd.DataFrame,
labels: Optional[pd.Series] = None,
cv: Optional[int] = None,
output_filename: Optional[str] = None,
drop_zero_variance_features: Optional[bool] = True,
verbose: bool = False) \
verbose: bool = False,
seed: int = Constants.default_seed) \
-> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""
Benchmark with a given set of feature selectors.
@@ -495,13 +498,90 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation,
Data of shape (n_samples, n_features) used for feature selection.
labels: pd.Series, optional (default=None)
The target values (class labels in classification, real numbers in regression).
cv: int, optional (default=None)
Number of folds to use for cross-validation.
output_filename: str, optional (default=None)
If not None, benchmarking output is saved.
If file exists, results are appended, otherwise file is created.
drop_zero_variance_features: bool, optional (default=True)
Whether to drop features with zero variance before running feature selector methods or not.
verbose: bool, optional (default=False)
Whether to print progress messages or not.
seed: int, optional (default=Constants.default_seed)
The random seed to initialize the random number generator.
Returns
-------
Tuple of data frames with scores, selected features and runtime for each method.
If cv is not None, the data frames will contain the concatenated results from each fold.
"""

check_true(selectors is not None, ValueError("Benchmark selectors cannot be none."))
check_true(data is not None, ValueError("Benchmark data cannot be none."))

if cv is None:
return _bench(selectors=selectors,
data=data,
labels=labels,
output_filename=output_filename,
drop_zero_variance_features=drop_zero_variance_features,
verbose=verbose)
else:

# Create K-Fold object
kf = KFold(n_splits=cv, shuffle=True, random_state=seed)

# Initialize variables
t0 = time()
train_labels, test_labels = None, None
score_df, selected_df, runtime_df = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

# Split data into cv-folds and run _bench for each fold
if verbose:
print("\n>>> Running")
for fold, (train_index, _) in enumerate(kf.split(data)):

if verbose:
print("\tFold", fold, "...")

# Split data, labels into folds
train_data = data.iloc[train_index]
if labels is not None:
train_labels = labels.iloc[train_index]

# Run benchmark
score_cv_df, selected_cv_df, runtime_cv_df = _bench(selectors=selectors,
data=train_data,
labels=train_labels,
output_filename=output_filename,
drop_zero_variance_features=drop_zero_variance_features,
verbose=False)

# Concatenate data frames
score_df = pd.concat((score_df, score_cv_df))
selected_df = pd.concat((selected_df, selected_cv_df))
runtime_df = pd.concat((runtime_df, runtime_cv_df))

if verbose:
print(f"<<< Done! Time taken: {(time() - t0) / 60:.2f} minutes")

return score_df, selected_df, runtime_df


def _bench(selectors: Dict[str, Union[SelectionMethod.Correlation,
SelectionMethod.Linear,
SelectionMethod.TreeBased,
SelectionMethod.Statistical,
SelectionMethod.Variance]],
data: pd.DataFrame,
labels: Optional[pd.Series] = None,
output_filename: Optional[str] = None,
drop_zero_variance_features: Optional[bool] = True,
verbose: bool = False) \
-> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""
Benchmark with a given set of feature selectors.
Return a tuple of data frames with scores, runtime and selected features for each method.
Returns
-------
@@ -552,7 +632,7 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation,
if verbose:
print(f"<<< Done! Time taken: {(time() - t0) / 60:.2f} minutes")

# Convert to series
# Format
runtime_df = pd.Series(method_to_runtime).to_frame("runtime").rename_axis("method").reset_index()

return score_df, selected_df, runtime_df
@@ -561,15 +641,19 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation,
def calculate_statistics(scores: pd.DataFrame,
selected: pd.DataFrame,
columns: Optional[list] = None,
ignore_constant: Optional[bool] = True):
"""Calculate statistics for each feature using scores/selections from list of methods.
ignore_constant: Optional[bool] = True) -> pd.DataFrame:
"""
Calculate statistics for each feature using scores/selections from list of methods.
Returns data frame with calculated statistics for each feature.
Parameters
----------
scores: pd.DataFrame
Data frame with scores for each feature (index) and selector (columns).
Each feature could have multiple rows from different cross-validation folds.
selected: pd.DataFrame
Data frame with selection flag for each feature (index) and selector (columns).
Each feature could have multiple rows from different cross-validation folds.
columns: list (default=None)
List of methods (columns) to include in statistics.
If None, all methods (columns) will be used.
@@ -584,9 +668,9 @@ def calculate_statistics(scores: pd.DataFrame,
check_true(isinstance(scores, pd.DataFrame), ValueError("scores must be a data frame."))
check_true(isinstance(selected, pd.DataFrame), ValueError("selection must be a data frame."))
check_true(scores.shape == selected.shape, ValueError("Shapes of scores and selected data frames must match."))
check_true(len(scores.index.intersection(selected.index)) == selected.shape[0],
check_true(np.all(scores.index == selected.index),
ValueError("Index of score and selection data frames must match."))
check_true(len(scores.columns.intersection(selected.columns)) == selected.shape[1],
check_true(np.all(scores.columns == selected.columns),
ValueError("Columns of score and selection data frames must match."))

# Get columns to use
@@ -597,25 +681,25 @@ def calculate_statistics(scores: pd.DataFrame,
scores_df = scores[columns].copy()
selected_df = selected[columns].copy()

# Group by feature for CV results
scores_df = scores_df.groupby(scores_df.index).mean()
selected_df = selected_df.groupby(selected_df.index).mean()

# Drop methods with constant scores
if ignore_constant:
mask = ~np.isclose(np.var(scores_df, axis=0), 0)
scores_df = scores_df.loc[:, mask]
selected_df = selected_df.loc[:, mask]

# Sort by index
scores_df.sort_index(inplace=True)
selected_df.sort_index(inplace=True)

# Calculate statistics
stats_df = pd.DataFrame(index=scores.index)
stats_df["_score_mean"] = scores_df.mean(axis=1)
stats_df["_score_mean_norm"] = normalize_columns(scores_df).mean(axis=1)
stats_df["_selection_freq"] = selected_df.sum(axis=1)
stats_df["_selection_freq_norm"] = normalize_columns(selected_df).sum(axis=1)
stats_df = pd.DataFrame(index=scores_df.index)
stats_df["score_mean"] = scores_df.mean(axis=1)
stats_df["score_mean_norm"] = normalize_columns(scores_df).mean(axis=1)
stats_df["selection_freq"] = selected_df.sum(axis=1)
stats_df["selection_freq_norm"] = normalize_columns(selected_df).sum(axis=1)

# Sort
stats_df.sort_values(by="_score_mean_norm", ascending=False, inplace=True)
stats_df.sort_values(by="score_mean_norm", ascending=False, inplace=True)

return stats_df
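The groupby-mean lines added above are what make `calculate_statistics` cross-validation aware: with `cv` enabled, `benchmark` concatenates one row per feature per fold, so the index repeats until those rows are averaged. A standalone pandas sketch of that aggregation (feature and method names are illustrative):

```python
import pandas as pd

# Scores as returned by benchmark(..., cv=2): each feature (index) appears
# once per fold, with one column per selection method.
scores = pd.DataFrame(
    {"lasso": [0.9, 0.2, 0.7, 0.4], "random_forest": [0.8, 0.1, 0.6, 0.3]},
    index=["f0", "f1", "f0", "f1"],  # fold-1 rows followed by fold-2 rows
)

# The CV-aware step added to calculate_statistics and plot_importance:
# average the per-fold rows so each feature contributes a single row.
mean_scores = scores.groupby(scores.index).mean()
print(mean_scores)  # f0: lasso 0.8, random_forest 0.7 / f1: lasso 0.3, random_forest 0.2
```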

@@ -632,6 +716,7 @@ def plot_importance(scores: pd.DataFrame,
----------
scores: pd.DataFrame
Data frame with scores for each feature (index) and method (columns).
Each feature could have multiple rows from different cross-validation folds.
columns: list (default=None)
List of methods (columns) to include in statistics.
If None, all methods (columns) will be used.
@@ -663,6 +748,9 @@ def plot_importance(scores: pd.DataFrame,
df = scores[columns].copy()
df.fillna(0, inplace=True)

# Group by feature for CV results
df = df.groupby(df.index).mean()

# Get normalized scores such that scores for each method sums to 1
if normalize:
df = normalize_columns(df)
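For reference, the cv branch of `benchmark` above keeps only the training indices of each `KFold` split (the held-out indices are discarded) and re-runs every selector on that subset. A minimal sketch of the same splitting pattern outside the library; the toy frame, fold count, and seed are illustrative:

```python
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

data = pd.DataFrame(np.arange(20).reshape(10, 2), columns=["a", "b"])

# Same construction as in benchmark: shuffled folds with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_index, _) in enumerate(kf.split(data)):
    # Only the training rows of each fold are handed to the selectors
    train_data = data.iloc[train_index]
    print(f"fold {fold}: selecting features on {len(train_data)} of {len(data)} rows")
```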
4 changes: 2 additions & 2 deletions feature/tree_based.py
@@ -50,14 +50,14 @@ def dispatch_model(self, labels: pd.Series, *args):
# Custom estimator should be compatible with the task
if "classification_" in task_str:
if isinstance(self.estimator, CatBoost):
if self.estimator._estimator_type is not 'classifier':
if self.estimator._estimator_type != 'classifier':
raise TypeError(str(self.estimator) + " cannot be used for task: " + task_str)
else:
if not isinstance(self.estimator, ClassifierMixin):
raise TypeError(str(self.estimator) + " cannot be used for task: " + task_str)
else:
if isinstance(self.estimator, CatBoost):
if self.estimator._estimator_type is not 'regressor':
if self.estimator._estimator_type != 'regressor':
raise TypeError(str(self.estimator) + " cannot be used for task: " + task_str)
else:
if not isinstance(self.estimator, RegressorMixin):
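The tree_based.py change above swaps `is not` for `!=` when checking CatBoost's `_estimator_type` string. `is` tests object identity rather than value equality, so the old check could mis-handle equal strings that are distinct objects (and newer CPython versions emit a SyntaxWarning for `is` against a literal). A small self-contained illustration:

```python
# `==`/`!=` compare string values; `is`/`is not` compare object identity.
expected = "classifier"
estimator_type = "".join(["class", "ifier"])  # same value, built at runtime as a new object

print(estimator_type == expected)  # True  -> value comparison, the correct check
print(estimator_type is expected)  # False -> identity comparison can fail even when values match
```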