diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index 4f4dc0d..df3a2d6 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -2,6 +2,16 @@
CHANGELOG
=========
+-------------------------------------------------------------------------------
+February, 9, 2022 1.1.1
+-------------------------------------------------------------------------------
+
+- Dropped [Maximal Information (MIC)](https://github.com/minepy/minepy) due to its inactive backend library, minepy
+- The minepy installation is not compatible with setuptools>=58, as noted in [this issue](https://github.com/minepy/minepy/issues/32)
+- In addition, MIC is rather slow on large datasets
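+- Users who still need MIC may be able to install minepy separately by pinning
+  setuptools (e.g., pip install "setuptools<58" before installing minepy); this workaround is untested here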
+
-------------------------------------------------------------------------------
June, 16, 2021 1.1.0
-------------------------------------------------------------------------------
diff --git a/README.md b/README.md
index 577f14d..043c9b1 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ print("Scores:", list(selector.get_absolute_scores()))
| :---------------: | :-----: |
| [Variance per Feature](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html) | `threshold` |
| [Correlation pairwise Features](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.corr.html) | [Pearson Correlation Coefficient](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient)<br>[Kendall Rank Correlation Coefficient](https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient)<br>[Spearman's Rank Correlation Coefficient](https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient) |
-| [Statistical Analysis](https://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection) | [ANOVA F-test Classification](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_classif.html)<br>[F-value Regression](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html)<br>[Chi-Square](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html)<br>[Mutual Information Classification](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html)<br>[Maximal Information (MIC)](https://github.com/minepy/minepy)<br>[Variance Inflation Factor](https://www.statsmodels.org/stable/generated/statsmodels.stats.outliers_influence.variance_inflation_factor.html) |
+| [Statistical Analysis](https://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection) | [ANOVA F-test Classification](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_classif.html)<br>[F-value Regression](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html)<br>[Chi-Square](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html)<br>[Mutual Information Classification](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html)<br>[Variance Inflation Factor](https://www.statsmodels.org/stable/generated/statsmodels.stats.outliers_influence.variance_inflation_factor.html) |
| [Linear Methods](https://en.wikipedia.org/wiki/Linear_regression) | [Linear Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html?highlight=linear%20regression#sklearn.linear_model.LinearRegression)<br>[Logistic Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html?highlight=logistic%20regression#sklearn.linear_model.LogisticRegression)<br>[Lasso Regularization](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html#sklearn.linear_model.Lasso)<br>[Ridge Regularization](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge) |
| [Tree-based Methods](https://scikit-learn.org/stable/modules/tree.html) | [Decision Tree](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier)<br>[Random Forest](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html?highlight=random%20forest#sklearn.ensemble.RandomForestClassifier)<br>[Extra Trees Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html)<br>[XGBoost](https://xgboost.readthedocs.io/en/latest/)<br>[LightGBM](https://lightgbm.readthedocs.io/en/latest/)<br>[AdaBoost](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html)<br>[CatBoost](https://github.com/catboost)<br>[Gradient Boosting Tree](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html) |
diff --git a/feature/_version.py b/feature/_version.py
index 7e4d9f6..86b0625 100644
--- a/feature/_version.py
+++ b/feature/_version.py
@@ -2,4 +2,4 @@
# Copyright FMR LLC
# SPDX-License-Identifier: GNU GPLv3
-__version__ = "1.1.0"
+__version__ = "1.1.1"
diff --git a/feature/selector.py b/feature/selector.py
index eb03e29..029faa3 100644
--- a/feature/selector.py
+++ b/feature/selector.py
@@ -202,7 +202,8 @@ class Statistical(NamedTuple):
the results might be sensitive to exact bin selection.
Maximal information score (MIC) tries to address these gaps by
- searching for the optimal binnign strategy.
+ searching for the optimal binning strategy.
+ Note: MIC is dropped from Selective due to the inactive minepy backend library.
Notes on Randomness:
- Mutual Info is non-deterministic, depends on the seed value.
@@ -218,7 +219,6 @@ class Statistical(NamedTuple):
* anova: Anova and Anova F-test (default)
* chi_square: Chi-Square
* mutual_info: Mutual Information score
- * maximal_info: Maximal Information score (MIC)
* variance_inflation: Variance Inflation factor (VIF)
"""
num_features: Num = 0.0
@@ -229,8 +229,10 @@ def _validate(self):
check_true(self.num_features > 0, ValueError("Num features must be greater than zero."))
if isinstance(self.num_features, float):
check_true(self.num_features <= 1, ValueError("Num features ratio must be between [0..1]."))
- check_true(self.method in ["anova", "chi_square", "mutual_info", "maximal_info", "variance_inflation"],
- ValueError("Statistical method can only be anova, chi_square, mutual_info, or maximal_info."))
+ check_true(self.method in ["anova", "chi_square", "mutual_info", "variance_inflation"], # "maximal_info" dropped
+ ValueError("Statistical method can only be anova, chi_square, or mutual_info."))
class TreeBased(NamedTuple):
"""
diff --git a/feature/statistical.py b/feature/statistical.py
index 3c4c482..16a7808 100644
--- a/feature/statistical.py
+++ b/feature/statistical.py
@@ -5,7 +5,7 @@
from functools import partial
from typing import NoReturn, Tuple
-from minepy import MINE
+# from minepy import MINE  # dropped
import numpy as np
import pandas as pd
from sklearn.feature_selection import chi2, f_classif, f_regression, mutual_info_classif, mutual_info_regression
@@ -36,11 +36,13 @@ def __init__(self, seed: int, num_features: Num, method: str):
self.factory = {"regression_anova": f_regression,
"regression_chi_square": None,
"regression_mutual_info": partial(mutual_info_regression, random_state=self.seed),
- "regression_maximal_info": MINE(),
+ # "regression_maximal_info": MINE(), # dropped
"classification_anova": f_classif,
"classification_chi_square": chi2,
"classification_mutual_info": partial(mutual_info_classif, random_state=self.seed),
- "classification_maximal_info": MINE(),
+ # "classification_maximal_info": MINE(), # dropped
"unsupervised_variance_inflation": variance_inflation_factor}
def get_model_args(self, selection_method) -> Tuple:
@@ -62,7 +62,7 @@ def dispatch_model(self, labels: pd.Series, *args):
# Check scoring compatibility with task
if score_func is None:
raise TypeError(method + " cannot be used for task: " + get_task_string(labels))
- elif isinstance(score_func, MINE) or method == "variance_inflation":
+ elif method == "variance_inflation": # or isinstance(score_func, MINE) (dropped)
self.imp = score_func
else:
# Set sklearn model selector based on scoring function
@@ -71,13 +71,15 @@ def dispatch_model(self, labels: pd.Series, *args):
def fit(self, data: pd.DataFrame, labels: pd.Series) -> NoReturn:
# Calculate absolute scores depending on the method
- if isinstance(self.imp, MINE):
- self.abs_scores = []
- for col in data.columns:
- self.imp.compute_score(data[col], labels)
- score = self.imp.mic()
- self.abs_scores.append(score)
- elif self.method == "variance_inflation":
+
+ # NOTE: MINE (maximal_info) is dropped
+ # if isinstance(self.imp, MINE):
+ # self.abs_scores = []
+ # for col in data.columns:
+ # self.imp.compute_score(data[col], labels)
+ # score = self.imp.mic()
+ # self.abs_scores.append(score)
+ if self.method == "variance_inflation":
# VIF is unsupervised, regression between data and each feature
self.abs_scores = np.array([variance_inflation_factor(data.values, i) for i in range(data.shape[1])])
else:
diff --git a/requirements.txt b/requirements.txt
index a9e32fe..b9376a5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,6 @@
catboost
joblib
lightgbm
-minepy
numpy
pandas
scikit-learn
diff --git a/tests/test_stat_maximal.py b/tests/test_stat_maximal.py
index 0ae8809..48bb8e6 100644
--- a/tests/test_stat_maximal.py
+++ b/tests/test_stat_maximal.py
@@ -2,112 +2,127 @@
# Copyright FMR LLC
# SPDX-License-Identifier: GNU GPLv3
-from sklearn.datasets import load_boston, load_iris
-from feature.utils import get_data_label
-from feature.selector import Selective, SelectionMethod
+# from sklearn.datasets import load_boston, load_iris
+# from feature.utils import get_data_label
+# from feature.selector import Selective, SelectionMethod
from tests.test_base import BaseTest
class TestMaximalInfo(BaseTest):
- def test_maximal_regress_top_k(self):
- data, label = get_data_label(load_boston())
- data = data.drop(columns=["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"])
-
- method = SelectionMethod.Statistical(num_features=3, method="maximal_info")
- selector = Selective(method)
- selector.fit(data, label)
- subset = selector.transform(data)
-
- # Reduced columns
- self.assertEqual(subset.shape[1], 3)
- self.assertListEqual(list(subset.columns), ['CRIM', 'AGE', 'LSTAT'])
-
- def test_maximal_regress_top_percentile(self):
- data, label = get_data_label(load_boston())
- data = data.drop(columns=["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"])
-
- method = SelectionMethod.Statistical(num_features=0.6, method="maximal_info")
- selector = Selective(method)
- selector.fit(data, label)
- subset = selector.transform(data)
-
- # Reduced columns
- self.assertEqual(subset.shape[1], 3)
- self.assertListEqual(list(subset.columns), ['CRIM', 'AGE', 'LSTAT'])
-
- def test_maximal_regress_top_k_all(self):
- data, label = get_data_label(load_boston())
- data = data.drop(columns=["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"])
-
- method = SelectionMethod.Statistical(num_features=5, method="maximal_info")
- selector = Selective(method)
- selector.fit(data, label)
- subset = selector.transform(data)
-
- # Reduced columns
- self.assertEqual(data.shape[1], subset.shape[1])
- self.assertListEqual(list(data.columns), list(subset.columns))
-
- def test_maximal_regress_top_percentile_all(self):
- data, label = get_data_label(load_boston())
- data = data.drop(columns=["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"])
-
- method = SelectionMethod.Statistical(num_features=1.0, method="maximal_info")
- selector = Selective(method)
- selector.fit(data, label)
- subset = selector.transform(data)
-
- # Reduced columns
- self.assertEqual(data.shape[1], subset.shape[1])
- self.assertListEqual(list(data.columns), list(subset.columns))
-
- def test_maximal_classif_top_k(self):
- data, label = get_data_label(load_iris())
-
- method = SelectionMethod.Statistical(num_features=2, method="maximal_info")
- selector = Selective(method)
- selector.fit(data, label)
- subset = selector.transform(data)
-
- # Reduced columns
- self.assertEqual(subset.shape[1], 2)
- self.assertListEqual(list(subset.columns), ['petal length (cm)', 'petal width (cm)'])
-
- def test_maximal_classif_top_percentile(self):
- data, label = get_data_label(load_iris())
-
- method = SelectionMethod.Statistical(num_features=0.5, method="maximal_info")
- selector = Selective(method)
- selector.fit(data, label)
- subset = selector.transform(data)
-
- # Reduced columns
- self.assertEqual(subset.shape[1], 2)
- self.assertListEqual(list(subset.columns), ['petal length (cm)', 'petal width (cm)'])
-
- def test_maximal_classif_top_percentile_all(self):
- data, label = get_data_label(load_iris())
-
- method = SelectionMethod.Statistical(num_features=1.0, method="maximal_info")
- selector = Selective(method)
- selector.fit(data, label)
- subset = selector.transform(data)
-
- # Reduced columns
- self.assertEqual(subset.shape[1], 4)
- self.assertListEqual(list(subset.columns),
- ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'])
-
- def test_maximal_classif_top_k_all(self):
- data, label = get_data_label(load_iris())
-
- method = SelectionMethod.Statistical(num_features=4, method="maximal_info")
- selector = Selective(method)
- selector.fit(data, label)
- subset = selector.transform(data)
-
- # Reduced columns
- self.assertEqual(subset.shape[1], 4)
- self.assertListEqual(list(subset.columns),
- ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'])
+ def test_maximal(self):
+ pass
+
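+    # test_maximal above is a no-op placeholder that keeps the class body non-empty.
+    # A rough replacement sketch using the still-supported mutual_info method
+    # (illustrative only; requires the imports commented out above):
+    #
+    # def test_mutual_info_classif_top_k(self):
+    #     data, label = get_data_label(load_iris())
+    #     method = SelectionMethod.Statistical(num_features=2, method="mutual_info")
+    #     selector = Selective(method)
+    #     selector.fit(data, label)
+    #     subset = selector.transform(data)
+    #     self.assertEqual(subset.shape[1], 2)
+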
+ # def test_maximal_regress_top_k(self):
+ # data, label = get_data_label(load_boston())
+ # data = data.drop(columns=["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"])
+ #
+ # method = SelectionMethod.Statistical(num_features=3, method="maximal_info")
+ # selector = Selective(method)
+ # selector.fit(data, label)
+ # subset = selector.transform(data)
+ #
+ # # Reduced columns
+ # self.assertEqual(subset.shape[1], 3)
+ # self.assertListEqual(list(subset.columns), ['CRIM', 'AGE', 'LSTAT'])
+ #
+ # def test_maximal_regress_top_percentile(self):
+ # data, label = get_data_label(load_boston())
+ # data = data.drop(columns=["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"])
+ #
+ # method = SelectionMethod.Statistical(num_features=0.6, method="maximal_info")
+ # selector = Selective(method)
+ # selector.fit(data, label)
+ # subset = selector.transform(data)
+ #
+ # # Reduced columns
+ # self.assertEqual(subset.shape[1], 3)
+ # self.assertListEqual(list(subset.columns), ['CRIM', 'AGE', 'LSTAT'])
+ #
+ # def test_maximal_regress_top_k_all(self):
+ # data, label = get_data_label(load_boston())
+ # data = data.drop(columns=["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"])
+ #
+ # method = SelectionMethod.Statistical(num_features=5, method="maximal_info")
+ # selector = Selective(method)
+ # selector.fit(data, label)
+ # subset = selector.transform(data)
+ #
+ # # Reduced columns
+ # self.assertEqual(data.shape[1], subset.shape[1])
+ # self.assertListEqual(list(data.columns), list(subset.columns))
+ #
+ # def test_maximal_regress_top_percentile_all(self):
+ # data, label = get_data_label(load_boston())
+ # data = data.drop(columns=["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"])
+ #
+ # method = SelectionMethod.Statistical(num_features=1.0, method="maximal_info")
+ # selector = Selective(method)
+ # selector.fit(data, label)
+ # subset = selector.transform(data)
+ #
+ # # Reduced columns
+ # self.assertEqual(data.shape[1], subset.shape[1])
+ # self.assertListEqual(list(data.columns), list(subset.columns))
+ #
+ # def test_maximal_classif_top_k(self):
+ # data, label = get_data_label(load_iris())
+ #
+ # method = SelectionMethod.Statistical(num_features=2, method="maximal_info")
+ # selector = Selective(method)
+ # selector.fit(data, label)
+ # subset = selector.transform(data)
+ #
+ # # Reduced columns
+ # self.assertEqual(subset.shape[1], 2)
+ # self.assertListEqual(list(subset.columns), ['petal length (cm)', 'petal width (cm)'])
+ #
+ # def test_maximal_classif_top_percentile(self):
+ # data, label = get_data_label(load_iris())
+ #
+ # method = SelectionMethod.Statistical(num_features=0.5, method="maximal_info")
+ # selector = Selective(method)
+ # selector.fit(data, label)
+ # subset = selector.transform(data)
+ #
+ # # Reduced columns
+ # self.assertEqual(subset.shape[1], 2)
+ # self.assertListEqual(list(subset.columns), ['petal length (cm)', 'petal width (cm)'])
+ #
+ # def test_maximal_classif_top_percentile_all(self):
+ # data, label = get_data_label(load_iris())
+ #
+ # method = SelectionMethod.Statistical(num_features=1.0, method="maximal_info")
+ # selector = Selective(method)
+ # selector.fit(data, label)
+ # subset = selector.transform(data)
+ #
+ # # Reduced columns
+ # self.assertEqual(subset.shape[1], 4)
+ # self.assertListEqual(list(subset.columns),
+ # ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'])
+ #
+ # def test_maximal_classif_top_k_all(self):
+ # data, label = get_data_label(load_iris())
+ #
+ # method = SelectionMethod.Statistical(num_features=4, method="maximal_info")
+ # selector = Selective(method)
+ # selector.fit(data, label)
+ # subset = selector.transform(data)
+ #
+ # # Reduced columns
+ # self.assertEqual(subset.shape[1], 4)
+ # self.assertListEqual(list(subset.columns),
+ # ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'])