From 83cd57444302686eb0fc627c724f54cb88d554e3 Mon Sep 17 00:00:00 2001 From: SIKAI ZHANG <34108862+MatthewSZhang@users.noreply.github.com> Date: Tue, 19 Nov 2024 16:48:47 +0800 Subject: [PATCH 1/2] FEAT add extend by mini-batch --- doc/index.rst | 1 + fastcan/__init__.py | 2 + fastcan/_cancorr_fast.pyx | 4 ++ fastcan/_extend.py | 120 ++++++++++++++++++++++++++++++++++++++ fastcan/_fastcan.py | 6 ++ fastcan/_refine.py | 2 +- tests/test_extend.py | 86 +++++++++++++++++++++++++++ tests/test_refine.py | 2 +- 8 files changed, 221 insertions(+), 2 deletions(-) create mode 100644 fastcan/_extend.py create mode 100644 tests/test_extend.py diff --git a/doc/index.rst b/doc/index.rst index 20db2ae..090401b 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -19,6 +19,7 @@ API Reference FastCan refine + extend ssc ols diff --git a/fastcan/__init__.py b/fastcan/__init__.py index 8bcbd35..dee6cfc 100644 --- a/fastcan/__init__.py +++ b/fastcan/__init__.py @@ -2,6 +2,7 @@ The :mod:`fastcan` module implements algorithms, including """ +from ._extend import extend from ._fastcan import FastCan from ._refine import refine from ._utils import ols, ssc @@ -11,4 +12,5 @@ "ssc", "ols", "refine", + "extend", ] diff --git a/fastcan/_cancorr_fast.pyx b/fastcan/_cancorr_fast.pyx index ae4c30c..b0afe30 100644 --- a/fastcan/_cancorr_fast.pyx +++ b/fastcan/_cancorr_fast.pyx @@ -194,6 +194,10 @@ cpdef int _forward_search( # Find max scores and update indices, X, mask, and scores index = _iamax(n_features, &r2[0], 1) + if r2[index] == 0: + raise RuntimeError( + f"No improvement can be found when selecting the {i}th feature." 
+ ) indices[i] = index scores[i] = r2[index] diff --git a/fastcan/_extend.py b/fastcan/_extend.py new file mode 100644 index 0000000..ec1465f --- /dev/null +++ b/fastcan/_extend.py @@ -0,0 +1,120 @@ +""" +Extend feature selection +""" + +import math +from copy import deepcopy +from numbers import Integral + +import numpy as np +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +from sklearn.utils._param_validation import Interval, validate_params +from sklearn.utils.validation import check_is_fitted + +from ._cancorr_fast import _forward_search # type: ignore +from ._fastcan import FastCan, _prepare_search + + +@validate_params( + { + "selector": [FastCan], + "n_features_to_select": [ + Interval(Integral, 1, None, closed="left"), + ], + "batch_size": [ + Interval(Integral, 1, None, closed="left"), + ], + }, + prefer_skip_nested_validation=False, +) +def extend(selector, n_features_to_select=1, batch_size=1): + """Extend FastCan with mini batches. + + It is suitable for selecting a very large number of features + even larger than the number of samples. + + Similar to the correlation filter which selects each feature without considering + the redundancy, the function selects features in mini-batch and the + redundancy between the two mini-batches will be ignored. + + Parameters + ---------- + selector : FastCan + FastCan selector. + + n_features_to_select : int, default=1 + The parameter is the absolute number of features to select. + + batch_size : int, default=1 + The number of features in a mini-batch. + + Returns + ------- + indices : ndarray of shape (n_features_to_select,), dtype=int + The indices of the selected features. 
+ + Examples + -------- + >>> from fastcan import FastCan, extend + >>> X = [[1, 1, 0], [0.01, 0, 0], [-1, 0, 1], [0, 0, 0]] + >>> y = [1, 0, -1, 0] + >>> selector = FastCan(1, verbose=0).fit(X, y) + >>> print(f"Indices: {selector.indices_}") + Indices: [0] + >>> indices = extend(selector, 3, batch_size=2) + >>> print(f"Indices: {indices}") + Indices: [0 2 1] + """ + check_is_fitted(selector) + n_inclusions = selector.indices_include_.size + n_features = selector.n_features_in_ + n_to_select = n_features_to_select - selector.n_features_to_select + batch_size_to_select = batch_size - n_inclusions + + if n_features_to_select > n_features: + raise ValueError( + f"n_features_to_select {n_features_to_select} " + f"must be <= n_features {n_features}." + ) + if n_to_select <= 0: + raise ValueError( + f"The number of features to select ({n_to_select}) " "is less than 0." + ) + if batch_size_to_select <= 0: + raise ValueError( + "The size of mini batch without included indices " + f"({batch_size_to_select}) is less than 0.", + ) + + X_transformed_ = deepcopy(selector.X_transformed_) + + indices_include = selector.indices_include_ + indices_exclude = selector.indices_exclude_ + indices_select = selector.indices_[n_inclusions:] + + n_threads = _openmp_effective_n_threads() + + for i in range(math.ceil(n_to_select / batch_size_to_select)): + if i == 0: + batch_size_i = (n_to_select - 1) % batch_size_to_select + 1 + n_inclusions + else: + batch_size_i = batch_size + indices, scores, mask = _prepare_search( + n_features, + batch_size_i, + indices_include, + np.r_[indices_exclude, indices_select], + ) + _forward_search( + X=X_transformed_, + V=selector.y_transformed_, + t=batch_size_i, + tol=selector.tol, + num_threads=n_threads, + verbose=0, + mask=mask, + indices=indices, + scores=scores, + ) + indices_select = np.r_[indices_select, indices[n_inclusions:]] + return np.r_[indices_include, indices_select] diff --git a/fastcan/_fastcan.py b/fastcan/_fastcan.py index 
bad3db6..5211ec0 100644 --- a/fastcan/_fastcan.py +++ b/fastcan/_fastcan.py @@ -77,6 +77,12 @@ class FastCan(SelectorMixin, BaseEstimator): When h-correlation method is used, `n_samples_` = n_samples. When eta-cosine method is used, `n_samples_` = n_features+n_outputs. + indices_include_ : ndarray of shape (n_inclusions,), dtype=int + The indices of the prerequisite features. + + indices_exclude_ : array-like of shape (n_exclusions,), dtype=int + The indices of the excluded features. + References ---------- * Zhang, S., & Lang, Z. Q. (2022). diff --git a/fastcan/_refine.py b/fastcan/_refine.py index bbab1c3..0fb2bfd 100644 --- a/fastcan/_refine.py +++ b/fastcan/_refine.py @@ -93,6 +93,7 @@ def refine(selector, drop=1, max_iter=None, verbose=1): n_inclusions = indices_include.size n_selections = n_features_to_select - n_inclusions + n_threads = _openmp_effective_n_threads() if drop == "all": drop = np.arange(1, n_selections) @@ -126,7 +127,6 @@ def refine(selector, drop=1, max_iter=None, verbose=1): rolled_indices[:-drop_n], indices_exclude, ) - n_threads = _openmp_effective_n_threads() _forward_search( X=X_transformed_, V=selector.y_transformed_, diff --git a/tests/test_extend.py b/tests/test_extend.py new file mode 100644 index 0000000..5a3a5e9 --- /dev/null +++ b/tests/test_extend.py @@ -0,0 +1,86 @@ +"""Test feature selection extend""" +import numpy as np +import pytest +from numpy.testing import ( + assert_array_equal, +) +from sklearn.datasets import make_classification + +from fastcan import FastCan, extend + + +def test_select_extend_cls(): + # Test whether refine work correctly with random samples. 
+ n_samples = 200 + n_features = 20 + n_informative = 10 + n_classes = 8 + n_repeated = 5 + n_to_select = 18 + + X, y = make_classification( + n_samples=n_samples, + n_features=n_features, + n_informative=n_informative, + n_repeated=n_repeated, + n_classes=n_classes, + n_clusters_per_class=1, + flip_y=0.0, + class_sep=10, + shuffle=False, + random_state=0, + ) + + n_features_to_select = 2 + selector = FastCan(n_features_to_select).fit(X, y) + indices = extend(selector, n_to_select, batch_size=3) + selector_inc = FastCan(n_features_to_select, indices_include=[10]).fit(X, y) + indices_inc = extend(selector_inc, n_to_select, batch_size=3) + selector_exc = FastCan( + n_features_to_select, indices_include=[10], indices_exclude=[0] + ).fit(X, y) + indices_exc = extend(selector_exc, n_to_select, batch_size=3) + + + assert np.unique(indices).size == n_to_select + assert_array_equal(indices[:n_features_to_select], selector.indices_) + assert np.unique(indices_inc).size == n_to_select + assert_array_equal(indices_inc[:n_features_to_select], selector_inc.indices_) + assert np.unique(indices_exc).size == n_to_select + assert_array_equal(indices_exc[:n_features_to_select], selector_exc.indices_) + assert ~np.isin(0, indices_exc) + + +def test_extend_error(): + # Test that extend raises errors on invalid arguments. 
+ n_samples = 200 + n_features = 20 + n_informative = 10 + n_classes = 8 + n_repeated = 5 + + X, y = make_classification( + n_samples=n_samples, + n_features=n_features, + n_informative=n_informative, + n_repeated=n_repeated, + n_classes=n_classes, + n_clusters_per_class=1, + flip_y=0.0, + class_sep=10, + shuffle=False, + random_state=0, + ) + + n_features_to_select = 2 + + selector = FastCan(n_features_to_select, indices_include=[0]).fit(X, y) + + with pytest.raises(ValueError, match=r"n_features_to_select .*"): + _ = extend(selector, n_features+1, batch_size=3) + + with pytest.raises(ValueError, match=r"The number of features to select .*"): + _ = extend(selector, n_features_to_select, batch_size=3) + + with pytest.raises(ValueError, match=r"The size of mini batch without .*"): + _ = extend(selector, n_features, batch_size=1) diff --git a/tests/test_refine.py b/tests/test_refine.py index a7f059b..37ee701 100644 --- a/tests/test_refine.py +++ b/tests/test_refine.py @@ -5,7 +5,7 @@ from fastcan import FastCan, refine -def test_select_refine_random_cls(): +def test_select_refine_cls(): # Test whether refine work correctly with random samples. n_samples = 200 n_features = 20 From 57d5c6bfdec0a58734ff2827f51c03aae578534d Mon Sep 17 00:00:00 2001 From: SIKAI ZHANG <34108862+MatthewSZhang@users.noreply.github.com> Date: Tue, 19 Nov 2024 16:58:32 +0800 Subject: [PATCH 2/2] MNT n_informative should higher than n_to_select --- tests/test_extend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_extend.py b/tests/test_extend.py index 5a3a5e9..49d24f9 100644 --- a/tests/test_extend.py +++ b/tests/test_extend.py @@ -12,8 +12,8 @@ def test_select_extend_cls(): # Test whether refine work correctly with random samples. n_samples = 200 - n_features = 20 - n_informative = 10 + n_features = 30 + n_informative = 20 n_classes = 8 n_repeated = 5 n_to_select = 18