Skip to content

Commit

Permalink
MNT change extend to minibatch to split columns of both X and y into …
Browse files Browse the repository at this point in the history
…batches
  • Loading branch information
MatthewSZhang committed Nov 27, 2024
1 parent b53076a commit 4e42fa3
Show file tree
Hide file tree
Showing 9 changed files with 200 additions and 213 deletions.
2 changes: 1 addition & 1 deletion .readthedocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ version: 2
build:
os: ubuntu-22.04
tools:
python: "3.12"
python: "3.13"

sphinx:
configuration: doc/conf.py
Expand Down
2 changes: 1 addition & 1 deletion doc/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ API Reference

FastCan
refine
extend
minibatch
ssc
ols
make_poly_ids
Expand Down
4 changes: 2 additions & 2 deletions fastcan/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
The :mod:`fastcan` module implements algorithms, including
"""

from ._extend import extend
from ._fastcan import FastCan
from ._minibatch import minibatch
from ._narx import (
Narx,
make_narx,
Expand All @@ -21,7 +21,7 @@
"ssc",
"ols",
"refine",
"extend",
"minibatch",
"make_narx",
"print_narx",
"Narx",
Expand Down
120 changes: 0 additions & 120 deletions fastcan/_extend.py

This file was deleted.

1 change: 0 additions & 1 deletion fastcan/_fastcan.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,6 @@ def _get_support_mask(self):


def _prepare_search(n_features, n_features_to_select, indices_include, indices_exclude):
""" """
# initiated with -1
indices = np.full(n_features_to_select, -1, dtype=np.intc, order="F")
indices[: indices_include.size] = indices_include
Expand Down
124 changes: 124 additions & 0 deletions fastcan/_minibatch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
"""
Feature selection with mini-batch
"""

from copy import deepcopy
from numbers import Integral

import numpy as np
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
from sklearn.utils._param_validation import Interval, validate_params
from sklearn.utils.validation import check_X_y

from ._cancorr_fast import _forward_search # type: ignore
from ._fastcan import FastCan, _prepare_search


@validate_params(
    {
        "X": ["array-like"],
        "y": ["array-like"],
        "n_features_to_select": [
            Interval(Integral, 1, None, closed="left"),
        ],
        "batch_size": [
            Interval(Integral, 1, None, closed="left"),
        ],
    },
    prefer_skip_nested_validation=False,
)
def minibatch(X, y, n_features_to_select=1, batch_size=1):
    """FastCan selection with mini batches.

    It is suitable for selecting a very large number of features
    even larger than the number of samples.

    Similar to the correlation filter which selects each feature without
    considering the redundancy, the function selects features in mini-batch
    and the redundancy between the two mini-batches will be ignored.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.

    y : array-like of shape (n_samples, n_outputs)
        Target matrix.

    n_features_to_select : int, default=1
        The parameter is the absolute number of features to select.

    batch_size : int, default=1
        The number of features in a mini-batch.

    Returns
    -------
    indices : ndarray of shape (n_features_to_select,), dtype=int
        The indices of the selected features.

    Examples
    --------
    >>> from fastcan import minibatch
    >>> X = [[1, 1, 0], [0.01, 0, 0], [-1, 0, 1], [0, 0, 0]]
    >>> y = [1, 0, -1, 0]
    >>> indices = minibatch(X, y, 3, batch_size=2)
    >>> print(f"Indices: {indices}")
    Indices: [0 1 2]
    """
    X, y = check_X_y(X, y, ensure_2d=True, multi_output=True)
    # Normalize y to 2-D so the column-batching logic below works uniformly.
    if y.ndim == 1:
        y = y.reshape(-1, 1)

    n_features = X.shape[1]
    n_outputs = y.shape[1]

    if n_features_to_select > n_features:
        raise ValueError(
            f"n_features_to_select {n_features_to_select} "
            f"must be <= n_features {n_features}."
        )

    n_threads = _openmp_effective_n_threads()

    # Boundaries that split the columns of y into batches of `batch_size`
    # (the final entry `n_outputs` closes the last, possibly smaller, batch).
    output_arange = np.r_[np.arange(n_outputs, step=batch_size, dtype=int), n_outputs]
    # Near-equal split of the selection budget across the output batches:
    # element i is how many features to select against the i-th slice of y.
    n_to_select_split = np.diff(
        np.linspace(
            0, n_features_to_select, num=output_arange.size, endpoint=True, dtype=int
        )
    )
    # Accumulates the global indices of all features selected so far;
    # passed as `indices_exclude` so later batches never re-select them.
    indices_select = np.zeros(0, dtype=int)
    for i in range(n_to_select_split.size):
        # Target columns for this output batch.
        y_i = y[:, output_arange[i] : output_arange[i + 1]]
        # Split this batch's selection budget into feature mini-batches of
        # at most `batch_size`; element j is the size of mini-batch j.
        batch_split_i = np.diff(
            np.r_[
                np.arange(n_to_select_split[i], step=batch_size, dtype=int),
                n_to_select_split[i],
            ]
        )
        for j, batch_size_j in enumerate(batch_split_i):
            if j == 0:
                # First mini-batch: run a full FastCan fit, which also
                # produces the transformed matrices reused by later
                # mini-batches for this output batch.
                selector_j = FastCan(
                    batch_size_j, indices_exclude=indices_select, verbose=0
                ).fit(X, y_i)
                # Copy because `_forward_search` appears to modify the
                # feature matrix in place across the j-loop —
                # NOTE(review): confirm against _cancorr_fast.
                X_transformed_ = deepcopy(selector_j.X_transformed_)
                indices = selector_j.indices_
            else:
                # Subsequent mini-batches: skip refitting and continue the
                # greedy search directly on the cached transformed data,
                # excluding everything selected so far (this run included).
                indices, scores, mask = _prepare_search(
                    n_features,
                    batch_size_j,
                    selector_j.indices_include_,
                    np.r_[selector_j.indices_exclude_, indices_select],
                )
                # `indices` and `scores` are filled in place by the
                # Cython routine.
                _forward_search(
                    X=X_transformed_,
                    V=selector_j.y_transformed_,
                    t=batch_size_j,
                    tol=selector_j.tol,
                    num_threads=n_threads,
                    verbose=0,
                    mask=mask,
                    indices=indices,
                    scores=scores,
                )
            indices_select = np.r_[indices_select, indices]
    return indices_select
9 changes: 5 additions & 4 deletions fastcan/_narx.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,8 +453,8 @@ def fit(self, X, y, coef_init=None, **params):
# fit a one-step-ahead Narx model
xy_hstack = np.c_[X, y]
osa_narx = LinearRegression()
time_shift_terms = make_time_shift_features(xy_hstack, self.time_shift_ids_)
poly_terms = make_poly_features(time_shift_terms, self.poly_ids_)
time_shift_vars = make_time_shift_features(xy_hstack, self.time_shift_ids_)
poly_terms = make_poly_features(time_shift_vars, self.poly_ids_)

osa_narx.fit(poly_terms, y)
if coef_init is None:
Expand Down Expand Up @@ -644,6 +644,7 @@ def print_narx(
| X[k-0,9] | 68 |
"""
check_is_fitted(narx)

def _get_variable_str(time_shift_id):
if time_shift_id[0] < narx.n_features_in_:
variable_str = f"X[k-{time_shift_id[1]},{time_shift_id[0]}]"
Expand Down Expand Up @@ -822,13 +823,13 @@ def make_narx(
),
0,
)
time_shift_terms = make_time_shift_features(xy_hstack, time_shift_ids_all)
time_shift_vars = make_time_shift_features(xy_hstack, time_shift_ids_all)

poly_ids_all = make_poly_ids(
time_shift_ids_all.shape[0],
poly_degree,
)
poly_terms = make_poly_features(time_shift_terms, poly_ids_all)
poly_terms = make_poly_features(time_shift_vars, poly_ids_all)

csf = FastCan(
n_features_to_select,
Expand Down
Loading

0 comments on commit 4e42fa3

Please sign in to comment.