MNT change extend to minibatch to split columns of both X and y into batches
scikit-learn-contrib · Nov 27, 2024 · 4e42fa3 · 4e42fa3
1 parent b53076a
commit 4e42fa3
Show file tree

Hide file tree

Showing 9 changed files with 200 additions and 213 deletions.
diff --git a/.readthedocs.yml b/.readthedocs.yml
@@ -2,7 +2,7 @@ version: 2
 build:
   os: ubuntu-22.04
   tools:
-    python: "3.12"
+    python: "3.13"
 
 sphinx:
    configuration: doc/conf.py

diff --git a/doc/index.rst b/doc/index.rst
@@ -19,7 +19,7 @@ API Reference
 
    FastCan
    refine
-   extend
+   minibatch
    ssc
    ols
    make_poly_ids

diff --git a/fastcan/__init__.py b/fastcan/__init__.py
@@ -2,8 +2,8 @@
 The :mod:`fastcan` module implements algorithms, including
 """
 
-from ._extend import extend
 from ._fastcan import FastCan
+from ._minibatch import minibatch
 from ._narx import (
     Narx,
     make_narx,
@@ -21,7 +21,7 @@
     "ssc",
     "ols",
     "refine",
-    "extend",
+    "minibatch",
     "make_narx",
     "print_narx",
     "Narx",

diff --git a/fastcan/_extend.py b/fastcan/_extend.py
diff --git a/fastcan/_fastcan.py b/fastcan/_fastcan.py
@@ -287,7 +287,6 @@ def _get_support_mask(self):
 
 
 def _prepare_search(n_features, n_features_to_select, indices_include, indices_exclude):
-    """ """
     # initiated with -1
     indices = np.full(n_features_to_select, -1, dtype=np.intc, order="F")
     indices[: indices_include.size] = indices_include

diff --git a/fastcan/_minibatch.py b/fastcan/_minibatch.py
@@ -0,0 +1,124 @@
+"""
+Feature selection with mini-batch
+"""
+
+from copy import deepcopy
+from numbers import Integral
+
+import numpy as np
+from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
+from sklearn.utils._param_validation import Interval, validate_params
+from sklearn.utils.validation import check_X_y
+
+from ._cancorr_fast import _forward_search  # type: ignore
+from ._fastcan import FastCan, _prepare_search
+
+
+@validate_params(
+    {
+        "X": ["array-like"],
+        "y": ["array-like"],
+        "n_features_to_select": [
+            Interval(Integral, 1, None, closed="left"),
+        ],
+        "batch_size": [
+            Interval(Integral, 1, None, closed="left"),
+        ],
+    },
+    prefer_skip_nested_validation=False,
+)
+def minibatch(X, y, n_features_to_select=1, batch_size=1):
+    """FastCan selection with mini batches.
+
+    It is suitable for selecting a very large number of features,
+    even more than the number of samples.
+
+    Similar to a correlation filter, which selects each feature without considering
+    redundancy, this function selects features in mini-batches, and the
+    redundancy between any two mini-batches is ignored.
+
+    Parameters
+    ----------
+    X : array-like of shape (n_samples, n_features)
+        Feature matrix.
+
+    y : array-like of shape (n_samples, n_outputs)
+        Target matrix. A 1-D target is reshaped to a single column.
+
+    n_features_to_select : int, default=1
+        The absolute number of features to select. Must be <= n_features.
+
+    batch_size : int, default=1
+        The maximum size of a mini-batch, for both y columns and features selected.
+
+    Returns
+    -------
+    indices : ndarray of shape (n_features_to_select,), dtype=int
+        The indices of the selected features.
+
+    Examples
+    --------
+    >>> from fastcan import minibatch
+    >>> X = [[1, 1, 0], [0.01, 0, 0], [-1, 0, 1], [0, 0, 0]]
+    >>> y = [1, 0, -1, 0]
+    >>> indices = minibatch(X, y, 3, batch_size=2)
+    >>> print(f"Indices: {indices}")
+    Indices: [0 1 2]
+    """
+    X, y = check_X_y(X, y, ensure_2d=True, multi_output=True)
+    if y.ndim == 1:  # treat a 1-D target as a single output column
+        y = y.reshape(-1, 1)
+
+    n_features = X.shape[1]
+    n_outputs = y.shape[1]
+
+    if n_features_to_select > n_features:
+        raise ValueError(
+            f"n_features_to_select {n_features_to_select} "
+            f"must be <= n_features {n_features}."
+        )
+
+    n_threads = _openmp_effective_n_threads()  # passed to _forward_search below
+    # Boundaries of the y-column batches: 0, batch_size, ..., n_outputs.
+    output_arange = np.r_[np.arange(n_outputs, step=batch_size, dtype=int), n_outputs]
+    n_to_select_split = np.diff(  # number of features to select per y batch
+        np.linspace(
+            0, n_features_to_select, num=output_arange.size, endpoint=True, dtype=int
+        )
+    )
+    indices_select = np.zeros(0, dtype=int)  # all indices selected so far
+    for i in range(n_to_select_split.size):  # one pass per batch of y columns
+        y_i = y[:, output_arange[i] : output_arange[i + 1]]
+        batch_split_i = np.diff(  # sizes of the feature chunks for this y batch
+            np.r_[
+                np.arange(n_to_select_split[i], step=batch_size, dtype=int),
+                n_to_select_split[i],
+            ]
+        )
+        for j, batch_size_j in enumerate(batch_split_i):
+            if j == 0:  # first chunk: fit FastCan, excluding earlier picks
+                selector_j = FastCan(
+                    batch_size_j, indices_exclude=indices_select, verbose=0
+                ).fit(X, y_i)
+                X_transformed_ = deepcopy(selector_j.X_transformed_)
+                indices = selector_j.indices_
+            else:  # later chunks: search again on the copied X_transformed_
+                indices, scores, mask = _prepare_search(
+                    n_features,
+                    batch_size_j,
+                    selector_j.indices_include_,
+                    np.r_[selector_j.indices_exclude_, indices_select],
+                )
+                _forward_search(
+                    X=X_transformed_,
+                    V=selector_j.y_transformed_,
+                    t=batch_size_j,
+                    tol=selector_j.tol,
+                    num_threads=n_threads,
+                    verbose=0,
+                    mask=mask,
+                    indices=indices,
+                    scores=scores,
+                )
+            indices_select = np.r_[indices_select, indices]  # append this chunk
+    return indices_select
diff --git a/fastcan/_narx.py b/fastcan/_narx.py
@@ -453,8 +453,8 @@ def fit(self, X, y, coef_init=None, **params):
             # fit a one-step-ahead Narx model
             xy_hstack = np.c_[X, y]
             osa_narx = LinearRegression()
-            time_shift_terms = make_time_shift_features(xy_hstack, self.time_shift_ids_)
-            poly_terms = make_poly_features(time_shift_terms, self.poly_ids_)
+            time_shift_vars = make_time_shift_features(xy_hstack, self.time_shift_ids_)
+            poly_terms = make_poly_features(time_shift_vars, self.poly_ids_)
 
             osa_narx.fit(poly_terms, y)
             if coef_init is None:
@@ -644,6 +644,7 @@ def print_narx(
     | X[k-0,9] | 68  |
     """
     check_is_fitted(narx)
+
     def _get_variable_str(time_shift_id):
         if time_shift_id[0] < narx.n_features_in_:
             variable_str = f"X[k-{time_shift_id[1]},{time_shift_id[0]}]"
@@ -822,13 +823,13 @@ def make_narx(
         ),
         0,
     )
-    time_shift_terms = make_time_shift_features(xy_hstack, time_shift_ids_all)
+    time_shift_vars = make_time_shift_features(xy_hstack, time_shift_ids_all)
 
     poly_ids_all = make_poly_ids(
         time_shift_ids_all.shape[0],
         poly_degree,
     )
-    poly_terms = make_poly_features(time_shift_terms, poly_ids_all)
+    poly_terms = make_poly_features(time_shift_vars, poly_ids_all)
 
     csf = FastCan(
         n_features_to_select,
-Original file line number
+Diff line change
@@ Expand Up / @@ -19,7 +19,7 @@ API Reference @@
        FastCan
        refine
-       extend
+       minibatch
        ssc
        ols
        make_poly_ids
@@ Expand Down @@