Cleaning, formatting, commenting
fcharras committed Nov 21, 2023
1 parent acd0f60 commit c84731a
Showing 7 changed files with 227 additions and 149 deletions.
2 changes: 1 addition & 1 deletion doc/modules/array_api.rst
@@ -95,7 +95,7 @@ Estimators

- :class:`decomposition.PCA` (with `svd_solver="full"`,
`svd_solver="randomized"` and `power_iteration_normalizer="QR"`)
- :class:`linear_model.Ridge` (with `solver="TODO"`)
- :class:`linear_model.Ridge` (with `solver="svd"`)
- :class:`discriminant_analysis.LinearDiscriminantAnalysis` (with `solver="svd"`)
- :class:`preprocessing.KernelCenterer`
- :class:`preprocessing.MaxAbsScaler`
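After this change, Ridge joins the estimators above that can run end to end on Array API inputs. A minimal usage sketch, assuming PyTorch and the array-api-compat package are installed:

import torch
from sklearn import config_context
from sklearn.linear_model import Ridge

X = torch.randn(100, 5)
y = torch.randn(100)

with config_context(array_api_dispatch=True):
    # With solver="svd" the computation stays in the input's namespace,
    # so fitted attributes such as coef_ come back as torch tensors.
    model = Ridge(solver="svd").fit(X, y)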
4 changes: 2 additions & 2 deletions doc/whats_new/v1.4.rst
@@ -283,8 +283,8 @@ Changelog
:user:`Olivier Grisel <ogrisel>` and :user:`Edoardo Abati <EdAbati>`.

- |Enhancement| :class:`linear_model.Ridge` now supports the Array API for the
`TODO` and `TODO` solvers. See :ref:`array_api` for more details.
:pr:`TODO` :user:`Franck Charras <fcharras>`, :user:`TODO <TODO>` and
`svd` solver. See :ref:`array_api` for more details.
:pr:`27800` by :user:`Franck Charras <fcharras>`, :user:`TODO <TODO>` and
:user:`TODO <TODO>`.

- |Feature| :class:`decomposition.PCA` now supports :class:`scipy.sparse.sparray`
8 changes: 2 additions & 6 deletions sklearn/decomposition/_pca.py
@@ -22,7 +22,7 @@
from ..base import _fit_context
from ..utils import check_random_state
from ..utils._arpack import _init_arpack_v0
from ..utils._array_api import _convert_to_numpy, _is_numpy_namespace, get_namespace
from ..utils._array_api import _convert_to_numpy, get_namespace
from ..utils._param_validation import Interval, RealNotInt, StrOptions
from ..utils.deprecation import deprecated
from ..utils.extmath import fast_logdet, randomized_svd, stable_cumsum, svd_flip
@@ -486,11 +486,7 @@ def _fit(self, X):
" alternative."
)
# Raise an error for non-Numpy input and arpack solver.
if (
self.svd_solver == "arpack"
and is_array_api_compliant
and not _is_numpy_namespace(xp)
):
if self.svd_solver == "arpack" and is_array_api_compliant:
raise ValueError(
"PCA with svd_solver='arpack' is not supported for Array API inputs."
)
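This tightens the guard: the earlier NumPy-namespace exemption is dropped, so any array-api-compliant input is rejected for this solver. A sketch of the resulting behavior, assuming PyTorch inputs with dispatch enabled:

import torch
from sklearn import config_context
from sklearn.decomposition import PCA

with config_context(array_api_dispatch=True):
    PCA(n_components=2, svd_solver="arpack").fit(torch.randn(20, 5))
# raises ValueError: PCA with svd_solver='arpack' is not supported for
# Array API inputs.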
31 changes: 17 additions & 14 deletions sklearn/linear_model/_base.py
@@ -34,20 +34,23 @@
)
from ..preprocessing._data import _is_constant_feature
from ..utils import check_array, check_random_state
from ..utils._array_api import (
_asarray_with_order,
_safe_per_col_average,
device,
get_namespace,
supported_float_dtypes,
)
from ..utils._seq_dataset import (
ArrayDataset32,
ArrayDataset64,
CSRDataset32,
CSRDataset64,
)
from ..utils.extmath import _incremental_mean_and_var, safe_sparse_dot
from ..utils._array_api import (
_asarray_with_order,
device,
get_namespace,
supported_float_dtypes,
)
from ..utils.extmath import (
_incremental_mean_and_var,
_safe_average_axis0,
safe_sparse_dot,
)
from ..utils.parallel import Parallel, delayed
from ..utils.sparsefuncs import inplace_column_scale, mean_variance_axis
from ..utils.validation import _check_sample_weight, check_is_fitted
@@ -272,11 +275,11 @@ def _preprocess_data(
sample_weight=sample_weight,
)
else:
# NB: linear models do not work with missing values
# so we don't worry about the different support for
# missing values for `X_offset` depending on if
# `normalize` is `True` or `False`
X_offset = _safe_per_col_average(X, sample_weight, xp=xp)
# NB: linear models will filter out inputs with missing values
# earlier in the pipeline so it can be assumed here that X does not
# contain any. Hence we don't have to worry that missing values would
# be handled differently when `normalize` is `True` or `False`.
X_offset = _safe_average_axis0(X, sample_weight, xp=xp)

X_offset = xp.astype(X_offset, X.dtype, copy=False)
X -= X_offset
@@ -300,7 +303,7 @@
else:
X_scale = xp.ones(n_features, dtype=dtype_, device=device_)

y_offset = _safe_per_col_average(y, sample_weight)
y_offset = _safe_average_axis0(y, sample_weight)
y -= y_offset
else:
X_offset = xp.zeros(n_features, dtype=dtype_, device=device_)
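For reference, a NumPy-only sketch of the averaging the renamed helper performs, mirroring the `_safe_per_col_average` body removed from `_array_api.py` further down in this commit (the function name here is ours, not scikit-learn's):

import numpy as np

def safe_average_axis0_sketch(X, sample_weight=None):
    # Accumulate in float64 so that float32/float16 inputs do not lose
    # precision over long sums (see numpy/numpy#9393).
    if sample_weight is not None:
        # Same as (X * sample_weight[:, None]).sum(axis=0), but summed
        # in float64 via matmul's dtype argument.
        per_col_sum = np.matmul(sample_weight, X, dtype=np.float64)
        total_weight = sample_weight.sum(dtype=np.float64)
    else:
        per_col_sum = X.sum(axis=0, dtype=np.float64)
        total_weight = X.shape[0]
    return per_col_sum / total_weight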
8 changes: 3 additions & 5 deletions sklearn/linear_model/_ridge.py
@@ -33,7 +33,6 @@
)
from ..utils._array_api import (
_asarray_with_order,
_is_numpy_namespace,
device,
get_namespace,
)
@@ -287,8 +286,7 @@ def _solve_svd(X, y, alpha, xp):
d = xp.zeros((s.size, alpha.size), dtype=X.dtype)
d[idx] = s_nnz / (s_nnz**2 + alpha)
d_UT_y = d * UTy
result = (Vt.T @ d_UT_y).T
return result
return (Vt.T @ d_UT_y).T


def _solve_lbfgs(
@@ -600,7 +598,7 @@ def _ridge_regression(
has_sw = sample_weight is not None

if solver == "auto":
if is_array_api_compliant and not _is_numpy_namespace(xp):
if is_array_api_compliant:
solver = "svd"
elif positive:
solver = "lbfgs"
@@ -1181,7 +1179,7 @@ def fit(self, X, y, sample_weight=None):
X,
y,
accept_sparse=_accept_sparse,
dtype=[np.float64, np.float32],
dtype=[xp.float64, xp.float32],
multi_output=True,
y_numeric=True,
)
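The `_solve_svd` cleanup above leaves the classic SVD route for ridge intact: with X = U diag(s) Vt, the minimizer of ||Xw - y||^2 + alpha * ||w||^2 is w = V diag(s / (s^2 + alpha)) U.T y. A self-contained NumPy sketch (single alpha, no intercept or sample weights):

import numpy as np

def ridge_svd_sketch(X, y, alpha):
    # Thin SVD of the design matrix.
    U, s, Vt = np.linalg.svd(X, full_matrices=False)
    d = s / (s**2 + alpha)  # per-singular-value shrinkage factor
    return Vt.T @ (d * (U.T @ y))

rng = np.random.default_rng(0)
X, y = rng.standard_normal((50, 3)), rng.standard_normal(50)
w = ridge_svd_sketch(X, y, alpha=1.0)
# Sanity check against the normal equations (X.T X + alpha I) w = X.T y:
assert np.allclose((X.T @ X + np.eye(3)) @ w, X.T @ y)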
120 changes: 5 additions & 115 deletions sklearn/utils/_array_api.py
@@ -209,30 +209,6 @@ def __eq__(self, other):
def newaxis(self):
return None

def take(self, X, indices, *, axis=0):
# TODO: Now that array_api supports `take` we should use this directly
# https://github.com/data-apis/array-api/issues/177
if self._namespace.__name__ == "numpy.array_api":
X_np = numpy.take(X, indices, axis=axis)
return self._namespace.asarray(X_np)

# We only support axis in (0, 1) and ndim in (1, 2) because that is all we need
# in scikit-learn
if axis not in {0, 1}:
raise ValueError(f"Only axis in (0, 1) is supported. Got {axis}")

if X.ndim not in {1, 2}:
raise ValueError(f"Only X.ndim in (1, 2) is supported. Got {X.ndim}")

if axis == 0:
if X.ndim == 1:
selected = [X[i] for i in indices]
else: # X.ndim == 2
selected = [X[i, :] for i in indices]
else: # axis == 1
selected = [X[:, i] for i in indices]
return self._namespace.stack(selected, axis=axis)
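The wrapper above is removed in line with its own TODO: `take` is now part of the Array API standard, so the namespace's native implementation can be used directly. The NumPy equivalent of what it emulated:

import numpy as np

X = np.asarray([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
np.take(X, np.asarray([0, 2]), axis=0)  # selects rows 0 and 2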

def isdtype(self, dtype, kind):
return isdtype(dtype, kind, xp=self._namespace)

@@ -498,53 +474,6 @@ def _weighted_sum(sample_score, sample_weight, normalize=False, xp=None):
return float(xp.sum(sample_score))


# Use at least float64 for the accumulating functions to avoid precision issues;
# see https://github.com/numpy/numpy/issues/9393. float64 is also retained as-is
# in case the accumulated values would overflow a smaller float type.
def _safe_per_col_weighted_accumulator(X, sample_weight, xp=None):
if xp is None:
xp, _ = get_namespace(X)

if _is_numpy_namespace(xp):
if numpy.issubdtype(X.dtype, numpy.floating) and X.dtype.itemsize < 8:
return numpy.matmul(sample_weight, X, dtype=numpy.float64)

_is_float16 = hasattr(xp, "float16") and xp.isdtype(X.dtype, xp.float16)

if xp.isdtype(X.dtype, xp.float32) or _is_float16:
sample_weight = xp.asarray(sample_weight, dtype=xp.float64, device=device(X))

return xp.matmul(sample_weight, X)


def _safe_per_col_average(X, sample_weight, xp=None):
if xp is None:
xp, _ = get_namespace(X)

if sample_weight is not None:
# equivalent to xp.sum(X * sample_weight, axis=0)
# safer because xp.float64(X*W) != xp.float64(X)*xp.float64(W)
per_col_sum = _safe_per_col_weighted_accumulator(X, sample_weight)
total_weight = _safe_accumulator_op(xp.sum, sample_weight, axis=0)
else:
per_col_sum = _safe_accumulator_op(xp.sum, X, axis=0)
total_weight = X.shape[0]

return per_col_sum / total_weight


def _safe_accumulator_op(op, X, axis=None, xp=None):
if xp is None:
xp, _ = get_namespace(X)

_is_float16 = hasattr(xp, "float16") and xp.isdtype(X.dtype, xp.float16)

if xp.isdtype(X.dtype, xp.float32) or _is_float16:
return op(X, axis=axis, dtype=xp.float64)

return op(X, axis=axis)
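These helpers leave `_array_api.py` here, consistent with `_base.py` above now importing `_safe_average_axis0` from `..utils.extmath`. The precision problem they guard against is easy to reproduce in plain NumPy:

import numpy as np

rng = np.random.default_rng(0)
X = rng.random((1_000_000, 3), dtype=np.float32)
w = rng.random(1_000_000, dtype=np.float32)

naive = np.matmul(w, X)                   # accumulates in float32
safe = np.matmul(w, X, dtype=np.float64)  # the upcast the helper requests
# naive typically agrees with safe to only about 6-7 significant digits,
# while the float64 accumulation is accurate to roughly 15.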


def _nanmin(X, axis=None, xp=None):
# TODO: refactor once nan-aware reductions are standardized:
# https://github.com/data-apis/array-api/issues/621
@@ -599,46 +528,7 @@ def _nansum(X, axis=None, dtype=None, xp=None):
)


def _float_itemwise_divide_and_ignore_errors(dividend, divisor, xp=None):
if xp is None:
xp, _ = get_namespace(dividend)

if _is_numpy_namespace(xp):
with numpy.errstate(divide="ignore", invalid="ignore"):
return xp.asarray(dividend / divisor)

device_ = device(dividend)
dtype = divisor.dtype
one_ = xp.asarray(1.0, dtype=dtype, device=device_)
nan_ = xp.asarray(xp.nan, dtype=dtype, device=device_)
inf_ = xp.asarray(xp.inf, dtype=dtype, device=device_)

dividend_isinf = xp.isinf(dividend)
divisor_iszero = ~xp.astype(divisor, xp.bool)
divisor_isinf = xp.isinf(divisor)

invalid_result = (dividend_isinf & divisor_isinf) | (
(~xp.astype(dividend, xp.bool)) & divisor_iszero
)

divisor = xp.where(invalid_result, one_, divisor)

division_by_zero = (~invalid_result) & divisor_iszero # & (~divisor_isinf)
division_by_zero_pinf = division_by_zero & (
_signbit(X=divisor, X_isinf=divisor_isinf)
== _signbit(X=dividend, X_isinf=dividend_isinf)
)
division_by_zero_ninf = division_by_zero & (~division_by_zero_pinf)

divisor = xp.where(division_by_zero, one_, divisor)

result = dividend / divisor
result = xp.where(invalid_result, nan_, result)
result = xp.where(division_by_zero_pinf, inf_, result)
return xp.where(division_by_zero_ninf, -inf_, result)
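This removed helper replicated NumPy's silent IEEE division semantics (x/0 -> +-inf for x != 0, 0/0 -> nan) for namespaces without an `errstate` equivalent. Its NumPy fast path shows the target behavior:

import numpy as np

with np.errstate(divide="ignore", invalid="ignore"):
    print(np.asarray([1.0, 0.0, -1.0]) / np.asarray([0.0, 0.0, 0.0]))
# [ inf  nan -inf]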


def _signbit(X, X_isnan=None, X_isinf=None, xp=None):
def _signbit(X, X_isinf=None, xp=None):
# TODO: refactor once signbit is standardized:
# https://github.com/data-apis/array-api/issues/670
if xp is None:
@@ -647,11 +537,11 @@ def _signbit(X, X_isnan=None, X_isinf=None, xp=None):
if _is_numpy_namespace(xp):
return numpy.signbit(X)

one = xp.asarray(1.0, device=device(X), dtype=X.dtype)

X = xp.where(X_isnan or xp.isnan(X), one, X)
X = xp.where(X_isinf or xp.isinf(X), xp.sign(X), X)
return (xp.sign(X) < 0) | (xp.sign(xp.inf / X) < 0)

# NB: this trick is needed because the sign bit of a zero can be set (-0.0)
# or unset (+0.0), and it determines the sign of +-inf / 0, even though
# +0.0 == -0.0 always holds.
return (xp.inf / X) < 0
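A quick NumPy check of the inf-division trick on finite values (silencing the divide-by-zero warning that plain NumPy would emit):

import numpy as np

x = np.asarray([3.0, -3.0, 0.0, -0.0])
with np.errstate(divide="ignore"):
    print((np.inf / x) < 0)  # [False  True False  True]
print(np.signbit(x))         # [False  True False  True]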


def _asarray_with_order(array, dtype=None, order=None, copy=None, *, xp=None):