Skip to content

Commit

Permalink
Merge pull request #2 from MatthewSZhang/cython-zerodivision
Browse files Browse the repository at this point in the history
FIX zero division error in cython
  • Loading branch information
MatthewSZhang authored Aug 14, 2024
2 parents 2475b47 + a07c665 commit 438aa96
Show file tree
Hide file tree
Showing 5 changed files with 182 additions and 241 deletions.
148 changes: 54 additions & 94 deletions fastcan/_cancorr_fast.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,25 @@ from cython cimport floating
from cython.parallel import prange
from scipy.linalg.cython_blas cimport isamax, idamax
from sklearn.utils._cython_blas cimport ColMajor, NoTrans
from sklearn.utils._cython_blas cimport _dot, _scal, _nrm2, _ger, _gemm, _axpy
from sklearn.utils._typedefs cimport int32_t, uint8_t
from sklearn.utils._cython_blas cimport _dot, _scal, _nrm2, _gemm, _axpy
from sklearn.utils._typedefs cimport int32_t

cdef unsigned int _bsum(
    bint* x,
    unsigned int n,
) noexcept nogil:
    """Sum the first n elements of a boolean vector.
    x : Pointer to the first element of the bool vector.
    n : Number of elements to read from x.
    Return: The accumulated sum of x[0], ..., x[n-1].
    """
    cdef unsigned int count = 0
    cdef unsigned int k = 0

    # Walk the buffer once, accumulating every flag into the running total.
    while k < n:
        count = count + x[k]
        k += 1
    return count

cdef int _iamax(
int n, const floating *x,
int n,
const floating *x,
int incx,
) noexcept nogil:
"""
Expand All @@ -24,31 +38,36 @@ cdef int _iamax(
else:
return idamax(&n, <double *> x, &incx) - 1

cdef void _normv(
cdef bint _normv(
floating[::1] x, # IN/OUT
) except * nogil:
) noexcept nogil:
"""
Vector normalization by Euclidean norm.
x (IN) : (1, n_samples) Vector.
x (OUT) : (1, n_samples) Normalized vector.
x (IN) : (n_samples,) Vector.
x (OUT) : (n_samples,) Normalized vector.
Return: True if the norm of x is zero (x is left unchanged and should be masked), otherwise False.
"""
cdef:
unsigned int n_samples = x.shape[0]
floating x_norm

x_norm = _nrm2(n_samples, &x[0], 1)
if x_norm == 0.0:
raise ZeroDivisionError("Cannot normalize a vector of all zeros.")
return True
x_norm = 1.0/x_norm
_scal(n_samples, x_norm, &x[0], 1)
return False

cdef void _normm(
floating[::1, :] X, # IN/OUT
) except * nogil:
bint* m, # IN/OUT
) noexcept nogil:
"""
Matrix column-wise normalization by Euclidean norm.
X (IN) : (n_samples, nx) Matrix.
X (OUT) : (n_samples, nx) Column-wise normalized matrix.
m (IN): (n_features,) Mask contains only false.
m (OUT): (n_features,) Mask the constant vectors.
"""
cdef:
unsigned int n_samples = X.shape[0]
Expand All @@ -60,11 +79,10 @@ cdef void _normm(
for j in range(nx):
x_norm = _nrm2(n_samples, &X[0, j], 1)
if x_norm == 0.0:
raise ZeroDivisionError(
"Cannot normalize a matrix containing a vector of all zeros."
)
x_norm = 1.0/x_norm
_scal(n_samples, x_norm, &X[0, j], 1)
m[j] = True
else:
x_norm = 1.0/x_norm
_scal(n_samples, x_norm, &X[0, j], 1)


cdef floating _sscvm(
Expand All @@ -75,7 +93,7 @@ cdef floating _sscvm(
Sum of squared correlation coefficients.
w : (n_samples,) Centred orthogonalized feature vector.
V : (n_samples, nv) Centred orthogonalized target matrix.
r2 : (nw, ) Sum of squared correlation coefficients, where r2i means the
r2 : (nw,) Sum of squared correlation coefficients, where r2i means the
coefficient of determination between wi and V.
"""
cdef:
Expand All @@ -96,40 +114,15 @@ cdef floating _sscvm(
free(r)
return r2

cdef void _mgsvm(
    const floating[::1] w,  # IN
    floating[::1, :] X,  # IN/OUT
) noexcept nogil:
    """
    Modified Gram-Schmidt process. X = X - w.T*w*X
    w : (n_samples,) Centred orthonormal selected feature vector.
    X (IN) : (n_samples, nx) Centred remaining feature matrix.
    X (OUT) : (n_samples, nx) Centred remaining feature matrix,
    which is orthogonal to w.
    Note: the BLAS path needs a scratch row of nx coefficients. If that
    allocation fails, the same update is applied column by column with
    _dot/_axpy, so the scratch pointer is never dereferenced when NULL.
    """
    cdef:
        unsigned int n_samples = X.shape[0]
        unsigned int nx = X.shape[1]
        unsigned int j
        floating c
        # r (1, nx)
        floating* r = <floating*> malloc(sizeof(floating) * nx)

    if r == NULL:
        # This function is noexcept, so an allocation failure cannot be
        # reported by raising. Fall back to an allocation-free update:
        # for each column, c = w.x_j and x_j = x_j - c*w.
        for j in range(nx):
            c = _dot(n_samples, &w[0], 1, &X[0, j], 1)
            _axpy(n_samples, -c, &w[0], 1, &X[0, j], 1)
        return

    # r = w*X (w is treated as (1, n_samples))
    _gemm(ColMajor, NoTrans, NoTrans, 1, nx, n_samples, 1.0,
          &w[0], 1, &X[0, 0], n_samples, 0.0, r, 1)
    # X = X - w.T*r
    _ger(ColMajor, n_samples, nx, -1.0, &w[0], 1, r, 1, &X[0, 0], n_samples)

    free(r)

cdef void _mgsvv(
const floating[::1] w, # IN
floating[::1] x, # IN/OUT
) noexcept nogil:
"""
Modified Gram-Schmidt process. x = x - w*w.T*x
w : (n_samples, ) Centred orthonormal selected feature vector.
x (IN) : (n_samples, ) Centred remaining feature vector.
x (OUT) : (n_samples, ) Centred remaining feature vector, which is orthogonal to w.
w : (n_samples,) Centred orthonormal selected feature vector.
x (IN) : (n_samples,) Centred remaining feature vector.
x (OUT) : (n_samples,) Centred remaining feature vector, which is orthogonal to w.
"""
cdef:
unsigned int n_samples = x.shape[0]
Expand All @@ -141,105 +134,70 @@ cdef void _mgsvv(
_axpy(n_samples, -r, &w[0], 1, &x[0], 1)


cdef void _orth(
    floating[::1, :] X,  # IN/OUT
) except * nogil:
    """Orthonormalize the columns of a matrix in place by modified Gram-Schmidt.
    X (IN) : (n_samples, n_features) Matrix.
    X (OUT) : (n_samples, n_features) Orthonormal matrix.
    Note: scipy.linalg.orth is deliberately not used here. Gram-Schmidt
    orthogonalizes the columns in order, so the score of each column
    reflects its own importance, whereas a decomposition-based basis
    mixes the importance of the columns together.
    The matrix may have zero columns, in which case nothing is done.
    """
    cdef:
        unsigned int n_cols = X.shape[1]
        unsigned int k

    # Nothing to orthogonalize.
    if n_cols == 0:
        return

    # The first column only needs to be scaled to unit length.
    _normv(X[:, 0])

    for k in range(1, n_cols):
        # Remove the component along the previous (already orthonormal)
        # column from column k and every column after it, then normalize k.
        _mgsvm(X[:, k-1], X[:, k:])
        _normv(X[:, k])

cpdef void _forward_search(
cpdef int _forward_search(
floating[::1, :] X, # IN/OUT
floating[::1, :] V, # IN/OUT
floating[::1, :] V, # IN
const unsigned int t, # IN
const floating tol, # IN
const unsigned int num_threads, # IN
const unsigned int verbose, # IN
uint8_t[::1] mask, # IN/TEMP
int32_t[::1] indices, # OUT
floating[::1] scores, # OUT
) except * nogil:
) except -1 nogil:
"""
Greedy search with SSC.
X (IN) : (n_samples, n_features) Centered feature matrix.
V (IN) : (n_samples, n_outputs) Centered target matrix.
X (IN) : (n_samples, n_features) Feature matrix.
V (IN) : (n_samples, n_outputs) Orthonormal target matrix.
W (OUT) : (n_samples, n_features) Centered normalized feature matrix, which
is orthonormal to selected features and M.
V (OUT) : (n_samples, n_outputs) Centered orthonormal target matrix.
t : Non-negative integer. The number of features to be selected.
tol : Tolerance for linear dependence check.
mask (n_features, ) Mask for valid candidate features.
indices: (t, ) The indices vector of selected features, initiated with -1.
scores: (t, ) The h-correlation/eta-cosine of selected features.
indices: (t,) The indices vector of selected features, initiated with -1.
scores: (t,) The h-correlation/eta-cosine of selected features.
"""
cdef:
unsigned int n_samples = X.shape[0]
unsigned int n_features = X.shape[1]
floating* r2 = <floating*> malloc(sizeof(floating) * n_features)
unsigned int n_masked # The number of masked features
bint* mask = <bint*> malloc(sizeof(bint) * n_features)
floating g, ssc = 0.0
unsigned int i, j
int index = -1

memset(&r2[0], 0, n_features * sizeof(floating))
memset(&mask[0], False, n_features * sizeof(bint))

for i in range(t):
if i == 0:
# Preprocessing
_orth(V)
_normm(X)
_normm(X, mask)
else:
mask[index] = False
mask[index] = True
r2[index] = 0
# Make X orthogonal to X[:, indices[i-1]]
n_masked = n_features
for j in prange(n_features, nogil=True, schedule="static",
chunksize=1, num_threads=num_threads):
if mask[j]:
if not mask[j]:
_mgsvv(X[:, index], X[:, j])
_normv(X[:, j])
# Linear dependence check
g = _dot(n_samples, &X[0, index], 1, &X[0, j], 1)
if abs(g) > tol:
mask[j] = False
mask[j] = True
r2[j] = 0
else:
n_masked -= 1

if n_masked == n_features:
if _bsum(mask, n_features) == n_features:
raise RuntimeError(
"No candidate feature can be found to form a non-singular "
f"matrix with the selected {i} features."
f"matrix with the {i} selected features."
)
if indices[i] != -1:
index = indices[i]
scores[i] = _sscvm(X[:, index], V)
else:
# Score for X
for j in range(n_features):
if mask[j]:
if not mask[j]:
r2[j] = _sscvm(X[:, j], V)

# Find max scores and update indices, X, mask, and scores
Expand All @@ -253,3 +211,5 @@ cpdef void _forward_search(
print(f"Progress: {i+1}/{t}, SSC: {ssc:.5f}", end="\r")

free(r2)
free(mask)
return 0
13 changes: 6 additions & 7 deletions fastcan/_fastcan.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from numbers import Integral, Real

import numpy as np
from scipy.linalg import orth
from sklearn.base import BaseEstimator
from sklearn.feature_selection._base import SelectorMixin
from sklearn.utils import check_array, check_consistent_length
Expand Down Expand Up @@ -205,12 +206,12 @@ def fit(self, X, y):
qxy_transformed = singular_values.reshape(-1, 1) * unitary_arrays
qxy_transformed = np.asfortranarray(qxy_transformed)
X_transformed = qxy_transformed[:, :n_features]
y_transformed = qxy_transformed[:, n_features:]
y_transformed = orth(qxy_transformed[:, n_features:])
else:
X_transformed = X - X.mean(0)
y_transformed = y - y.mean(0)
y_transformed = orth(y - y.mean(0))

mask, indices, scores = self._prepare_data(
indices, scores = self._prepare_data(
indices_include,
)
n_threads = _openmp_effective_n_threads()
Expand All @@ -221,7 +222,6 @@ def fit(self, X, y):
tol=self.tol,
num_threads=n_threads,
verbose=self.verbose,
mask=mask,
indices=indices,
scores=scores,
)
Expand All @@ -245,7 +245,7 @@ def _prepare_data(self, indices_include):
Returns
-------
mask : ndarray of shape (n_features,), dtype=np.ubyte, order='F'
Mask for valid candidate features.
Mask for invalid candidate features.
The data type is unsigned char.
indices: ndarray of shape (n_features_to_select,), dtype=np.intc, order='F'
Expand All @@ -255,12 +255,11 @@ def _prepare_data(self, indices_include):
scores: ndarray of shape (n_features_to_select,), dtype=float, order='F'
The h-correlation/eta-cosine of selected features.
"""
mask = np.ones(self.n_features_in_, dtype=np.ubyte, order="F")
# initiated with -1
indices = np.full(self.n_features_to_select, -1, dtype=np.intc, order="F")
indices[: indices_include.size] = indices_include
scores = np.zeros(self.n_features_to_select, dtype=float, order="F")
return mask, indices, scores
return indices, scores

def _get_support_mask(self):
check_is_fitted(self)
Expand Down
Loading

0 comments on commit 438aa96

Please sign in to comment.