diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index 4f4dc0d..df3a2d6 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -2,6 +2,14 @@
 CHANGELOG
 =========
 
+-------------------------------------------------------------------------------
+Feb, 09, 2022 1.1.1
+-------------------------------------------------------------------------------
+
+- Dropped [Maximal Information (MIC)](https://github.com/minepy/minepy) due to the inactive backend library
+- The minepy installation is not compatible with setuptools>=58, as noted in [this issue](https://github.com/minepy/minepy/issues/32)
+- In addition, MIC is rather slow on large datasets
+
 -------------------------------------------------------------------------------
 June, 16, 2021 1.1.0
 -------------------------------------------------------------------------------
diff --git a/README.md b/README.md
index 577f14d..043c9b1 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ print("Scores:", list(selector.get_absolute_scores()))
 | :---------------: | :-----: |
 | [Variance per Feature](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html) | `threshold` |
 | [Correlation pairwise Features](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.corr.html) | [Pearson Correlation Coefficient](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient)<br>[Kendall Rank Correlation Coefficient](https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient)<br>[Spearman's Rank Correlation Coefficient](https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient)<br> |
-| [Statistical Analysis](https://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection) | [ANOVA F-test Classification](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_classif.html)<br>[F-value Regression](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html)<br>[Chi-Square](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html)<br>[Mutual Information Classification](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html)<br>[Maximal Information (MIC)](https://github.com/minepy/minepy)<br>[Variance Inflation Factor](https://www.statsmodels.org/stable/generated/statsmodels.stats.outliers_influence.variance_inflation_factor.html) |
+| [Statistical Analysis](https://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection) | [ANOVA F-test Classification](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_classif.html)<br>[F-value Regression](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html)<br>[Chi-Square](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html)<br>[Mutual Information Classification](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html)<br>[Variance Inflation Factor](https://www.statsmodels.org/stable/generated/statsmodels.stats.outliers_influence.variance_inflation_factor.html) |
 | [Linear Methods](https://en.wikipedia.org/wiki/Linear_regression) | [Linear Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html?highlight=linear%20regression#sklearn.linear_model.LinearRegression)<br>[Logistic Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html?highlight=logistic%20regression#sklearn.linear_model.LogisticRegression)<br>[Lasso Regularization](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html#sklearn.linear_model.Lasso)<br>[Ridge Regularization](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge)<br> |
 | [Tree-based Methods](https://scikit-learn.org/stable/modules/tree.html) | [Decision Tree](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier)<br>[Random Forest](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html?highlight=random%20forest#sklearn.ensemble.RandomForestClassifier)<br>[Extra Trees Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html)<br>[XGBoost](https://xgboost.readthedocs.io/en/latest/)<br>[LightGBM](https://lightgbm.readthedocs.io/en/latest/)<br>[AdaBoost](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html)<br>[CatBoost](https://github.com/catboost)<br>[Gradient Boosting Tree](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html)<br> |
diff --git a/feature/_version.py b/feature/_version.py
index 7e4d9f6..86b0625 100644
--- a/feature/_version.py
+++ b/feature/_version.py
@@ -2,4 +2,4 @@
 # Copyright FMR LLC
 # SPDX-License-Identifier: GNU GPLv3
 
-__version__ = "1.1.0"
+__version__ = "1.1.1"
diff --git a/feature/selector.py b/feature/selector.py
index eb03e29..029faa3 100644
--- a/feature/selector.py
+++ b/feature/selector.py
@@ -202,7 +202,8 @@ class Statistical(NamedTuple):
         the results might be sensitive to exact bin selection.
 
         Maximal information score (MIC) tries to address these gaps by
-        searching for the optimal binnign strategy.
+        searching for the optimal binning strategy.
+        Note: MIC is dropped from Selective due to the inactive MINE library.
 
         Notes on Randomness:
         - Mutual Info is non-deterministic, depends on the seed value.
@@ -218,7 +219,6 @@
             * anova: Anova and Anova F-test (default)
             * chi_square: Chi-Square
             * mutual_info: Mutual Information score
-            * maximal_info: Maximal Information score (MIC)
             * variance_inflation: Variance Inflation factor (VIF)
         """
         num_features: Num = 0.0
@@ -229,8 +229,8 @@ def _validate(self):
             check_true(self.num_features > 0, ValueError("Num features must be greater than zero."))
             if isinstance(self.num_features, float):
                 check_true(self.num_features <= 1, ValueError("Num features ratio must be between [0..1]."))
-            check_true(self.method in ["anova", "chi_square", "mutual_info", "maximal_info", "variance_inflation"],
-                       ValueError("Statistical method can only be anova, chi_square, mutual_info, or maximal_info."))
+            check_true(self.method in ["anova", "chi_square", "mutual_info", "variance_inflation"],  # "maximal_info" dropped
+                       ValueError("Statistical method can only be anova, chi_square, mutual_info, or variance_inflation."))
 
     class TreeBased(NamedTuple):
         """
diff --git a/feature/statistical.py b/feature/statistical.py
index 3c4c482..16a7808 100644
--- a/feature/statistical.py
+++ b/feature/statistical.py
@@ -5,7 +5,7 @@
 from functools import partial
 from typing import NoReturn, Tuple
 
-from minepy import MINE
+# from minepy import MINE (dropped)
 import numpy as np
 import pandas as pd
 from sklearn.feature_selection import chi2, f_classif, f_regression, mutual_info_classif, mutual_info_regression
@@ -36,11 +36,11 @@ def __init__(self, seed: int, num_features: Num, method: str):
         self.factory = {"regression_anova": f_regression,
                         "regression_chi_square": None,
                         "regression_mutual_info": partial(mutual_info_regression, random_state=self.seed),
-                        "regression_maximal_info": MINE(),
+                        # "regression_maximal_info": MINE(),  # dropped
                         "classification_anova": f_classif,
                         "classification_chi_square": chi2,
                         "classification_mutual_info": partial(mutual_info_classif, random_state=self.seed),
-                        "classification_maximal_info": MINE(),
+                        # "classification_maximal_info": MINE(),  # dropped
                         "unsupervised_variance_inflation": variance_inflation_factor}
 
     def get_model_args(self, selection_method) -> Tuple:
@@ -62,7 +62,7 @@ def dispatch_model(self, labels: pd.Series, *args):
         # Check scoring compatibility with task
         if score_func is None:
             raise TypeError(method + " cannot be used for task: " + get_task_string(labels))
-        elif isinstance(score_func, MINE) or method == "variance_inflation":
+        elif method == "variance_inflation":  # or isinstance(score_func, MINE) (dropped)
             self.imp = score_func
         else:
             # Set sklearn model selector based on scoring function
@@ -71,13 +71,15 @@ def fit(self, data: pd.DataFrame, labels: pd.Series) -> NoReturn:
         # Calculate absolute scores depending on the method
-        if isinstance(self.imp, MINE):
-            self.abs_scores = []
-            for col in data.columns:
-                self.imp.compute_score(data[col], labels)
-                score = self.imp.mic()
-                self.abs_scores.append(score)
-        elif self.method == "variance_inflation":
+
+        # NOTE: MINE is dropped
+        # if isinstance(self.imp, MINE):
+        #     self.abs_scores = []
+        #     for col in data.columns:
+        #         self.imp.compute_score(data[col], labels)
+        #         score = self.imp.mic()
+        #         self.abs_scores.append(score)
+        if self.method == "variance_inflation":
             # VIF is unsupervised, regression between data and each feature
             self.abs_scores = np.array([variance_inflation_factor(data.values, i) for i in range(data.shape[1])])
         else:
diff --git a/requirements.txt b/requirements.txt
index a9e32fe..b9376a5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,6 @@
 catboost
 joblib
 lightgbm
-minepy
 numpy
 pandas
 scikit-learn
diff --git a/tests/test_stat_maximal.py b/tests/test_stat_maximal.py
index 0ae8809..48bb8e6 100644
--- a/tests/test_stat_maximal.py
+++ b/tests/test_stat_maximal.py
@@ -2,112 +2,115 @@
 # Copyright FMR LLC
 # SPDX-License-Identifier: GNU GPLv3
 
-from sklearn.datasets import load_boston, load_iris
-from feature.utils import get_data_label
-from feature.selector import Selective, SelectionMethod
+# from sklearn.datasets import load_boston, load_iris
+# from feature.utils import get_data_label
+# from feature.selector import Selective, SelectionMethod
 from tests.test_base import BaseTest
 
 
 class TestMaximalInfo(BaseTest):
 
-    def test_maximal_regress_top_k(self):
-        data, label = get_data_label(load_boston())
-        data = data.drop(columns=["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"])
-
-        method = SelectionMethod.Statistical(num_features=3, method="maximal_info")
-        selector = Selective(method)
-        selector.fit(data, label)
-        subset = selector.transform(data)
-
-        # Reduced columns
-        self.assertEqual(subset.shape[1], 3)
-        self.assertListEqual(list(subset.columns), ['CRIM', 'AGE', 'LSTAT'])
-
-    def test_maximal_regress_top_percentile(self):
-        data, label = get_data_label(load_boston())
-        data = data.drop(columns=["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"])
-
-        method = SelectionMethod.Statistical(num_features=0.6, method="maximal_info")
-        selector = Selective(method)
-        selector.fit(data, label)
-        subset = selector.transform(data)
-
-        # Reduced columns
-        self.assertEqual(subset.shape[1], 3)
-        self.assertListEqual(list(subset.columns), ['CRIM', 'AGE', 'LSTAT'])
-
-    def test_maximal_regress_top_k_all(self):
-        data, label = get_data_label(load_boston())
-        data = data.drop(columns=["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"])
-
-        method = SelectionMethod.Statistical(num_features=5, method="maximal_info")
-        selector = Selective(method)
-        selector.fit(data, label)
-        subset = selector.transform(data)
-
-        # Reduced columns
-        self.assertEqual(data.shape[1], subset.shape[1])
-        self.assertListEqual(list(data.columns), list(subset.columns))
-
-    def test_maximal_regress_top_percentile_all(self):
-        data, label = get_data_label(load_boston())
-        data = data.drop(columns=["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"])
-
-        method = SelectionMethod.Statistical(num_features=1.0, method="maximal_info")
-        selector = Selective(method)
-        selector.fit(data, label)
-        subset = selector.transform(data)
-
-        # Reduced columns
-        self.assertEqual(data.shape[1], subset.shape[1])
-        self.assertListEqual(list(data.columns), list(subset.columns))
-
-    def test_maximal_classif_top_k(self):
-        data, label = get_data_label(load_iris())
-
-        method = SelectionMethod.Statistical(num_features=2, method="maximal_info")
-        selector = Selective(method)
-        selector.fit(data, label)
-        subset = selector.transform(data)
-
-        # Reduced columns
-        self.assertEqual(subset.shape[1], 2)
-        self.assertListEqual(list(subset.columns), ['petal length (cm)', 'petal width (cm)'])
-
-    def test_maximal_classif_top_percentile(self):
-        data, label = get_data_label(load_iris())
-
-        method = SelectionMethod.Statistical(num_features=0.5, method="maximal_info")
-        selector = Selective(method)
-        selector.fit(data, label)
-        subset = selector.transform(data)
-
-        # Reduced columns
-        self.assertEqual(subset.shape[1], 2)
-        self.assertListEqual(list(subset.columns), ['petal length (cm)', 'petal width (cm)'])
-
-    def test_maximal_classif_top_percentile_all(self):
-        data, label = get_data_label(load_iris())
-
-        method = SelectionMethod.Statistical(num_features=1.0, method="maximal_info")
-        selector = Selective(method)
-        selector.fit(data, label)
-        subset = selector.transform(data)
-
-        # Reduced columns
-        self.assertEqual(subset.shape[1], 4)
-        self.assertListEqual(list(subset.columns),
-                             ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'])
-
-    def test_maximal_classif_top_k_all(self):
-        data, label = get_data_label(load_iris())
-
-        method = SelectionMethod.Statistical(num_features=4, method="maximal_info")
-        selector = Selective(method)
-        selector.fit(data, label)
-        subset = selector.transform(data)
-
-        # Reduced columns
-        self.assertEqual(subset.shape[1], 4)
-        self.assertListEqual(list(subset.columns),
-                             ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'])
+    def test_maximal(self):
+        pass
+
+    # def test_maximal_regress_top_k(self):
+    #     data, label = get_data_label(load_boston())
+    #     data = data.drop(columns=["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"])
+    #
+    #     method = SelectionMethod.Statistical(num_features=3, method="maximal_info")
+    #     selector = Selective(method)
+    #     selector.fit(data, label)
+    #     subset = selector.transform(data)
+    #
+    #     # Reduced columns
+    #     self.assertEqual(subset.shape[1], 3)
+    #     self.assertListEqual(list(subset.columns), ['CRIM', 'AGE', 'LSTAT'])
+    #
+    # def test_maximal_regress_top_percentile(self):
+    #     data, label = get_data_label(load_boston())
+    #     data = data.drop(columns=["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"])
+    #
+    #     method = SelectionMethod.Statistical(num_features=0.6, method="maximal_info")
+    #     selector = Selective(method)
+    #     selector.fit(data, label)
+    #     subset = selector.transform(data)
+    #
+    #     # Reduced columns
+    #     self.assertEqual(subset.shape[1], 3)
+    #     self.assertListEqual(list(subset.columns), ['CRIM', 'AGE', 'LSTAT'])
+    #
+    # def test_maximal_regress_top_k_all(self):
+    #     data, label = get_data_label(load_boston())
+    #     data = data.drop(columns=["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"])
+    #
+    #     method = SelectionMethod.Statistical(num_features=5, method="maximal_info")
+    #     selector = Selective(method)
+    #     selector.fit(data, label)
+    #     subset = selector.transform(data)
+    #
+    #     # Reduced columns
+    #     self.assertEqual(data.shape[1], subset.shape[1])
+    #     self.assertListEqual(list(data.columns), list(subset.columns))
+    #
+    # def test_maximal_regress_top_percentile_all(self):
+    #     data, label = get_data_label(load_boston())
+    #     data = data.drop(columns=["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"])
+    #
+    #     method = SelectionMethod.Statistical(num_features=1.0, method="maximal_info")
+    #     selector = Selective(method)
+    #     selector.fit(data, label)
+    #     subset = selector.transform(data)
+    #
+    #     # Reduced columns
+    #     self.assertEqual(data.shape[1], subset.shape[1])
+    #     self.assertListEqual(list(data.columns), list(subset.columns))
+    #
+    # def test_maximal_classif_top_k(self):
+    #     data, label = get_data_label(load_iris())
+    #
+    #     method = SelectionMethod.Statistical(num_features=2, method="maximal_info")
+    #     selector = Selective(method)
+    #     selector.fit(data, label)
+    #     subset = selector.transform(data)
+    #
+    #     # Reduced columns
+    #     self.assertEqual(subset.shape[1], 2)
+    #     self.assertListEqual(list(subset.columns), ['petal length (cm)', 'petal width (cm)'])
+    #
+    # def test_maximal_classif_top_percentile(self):
+    #     data, label = get_data_label(load_iris())
+    #
+    #     method = SelectionMethod.Statistical(num_features=0.5, method="maximal_info")
+    #     selector = Selective(method)
+    #     selector.fit(data, label)
+    #     subset = selector.transform(data)
+    #
+    #     # Reduced columns
+    #     self.assertEqual(subset.shape[1], 2)
+    #     self.assertListEqual(list(subset.columns), ['petal length (cm)', 'petal width (cm)'])
+    #
+    # def test_maximal_classif_top_percentile_all(self):
+    #     data, label = get_data_label(load_iris())
+    #
+    #     method = SelectionMethod.Statistical(num_features=1.0, method="maximal_info")
+    #     selector = Selective(method)
+    #     selector.fit(data, label)
+    #     subset = selector.transform(data)
+    #
+    #     # Reduced columns
+    #     self.assertEqual(subset.shape[1], 4)
+    #     self.assertListEqual(list(subset.columns),
+    #                          ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'])
+    #
+    # def test_maximal_classif_top_k_all(self):
+    #     data, label = get_data_label(load_iris())
+    #
+    #     method = SelectionMethod.Statistical(num_features=4, method="maximal_info")
+    #     selector = Selective(method)
+    #     selector.fit(data, label)
+    #     subset = selector.transform(data)
+    #
+    #     # Reduced columns
+    #     self.assertEqual(subset.shape[1], 4)
+    #     self.assertListEqual(list(subset.columns),
+    #                          ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'])