diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 0540f469d8..406789fffc 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -50,7 +50,6 @@ dependencies: - numpydoc - nvcc_linux-64=11.8 - packaging -- pip - pydata-sphinx-theme!=0.14.2 - pylibraft==24.10.*,>=0.0.0a0 - pynndescent @@ -78,6 +77,4 @@ dependencies: - sysroot_linux-64==2.17 - treelite==4.3.0 - umap-learn==0.5.6 -- pip: - - dask-glm==0.3.0 name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index ad8d12f1a3..28c9197192 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -46,7 +46,6 @@ dependencies: - numpy>=1.23,<3.0a0 - numpydoc - packaging -- pip - pydata-sphinx-theme!=0.14.2 - pylibraft==24.10.*,>=0.0.0a0 - pynndescent @@ -74,6 +73,4 @@ dependencies: - sysroot_linux-64==2.17 - treelite==4.3.0 - umap-learn==0.5.6 -- pip: - - dask-glm==0.3.0 name: all_cuda-125_arch-x86_64 diff --git a/dependencies.yaml b/dependencies.yaml index d176e382ad..687c0bd9aa 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -518,14 +518,6 @@ dependencies: - umap-learn==0.5.6 - pynndescent - setuptools # Needed on Python 3.12 for dask-glm, which requires pkg_resources but Python 3.12 doesn't have setuptools by default - - output_types: conda - packages: - - pip - - pip: - - dask-glm==0.3.0 - - output_types: pyproject - packages: - - dask-glm==0.3.0 test_notebooks: common: - output_types: [conda, requirements] diff --git a/python/cuml/cuml/dask/extended/__init__.py b/python/cuml/cuml/dask/extended/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/python/cuml/cuml/dask/extended/linear_model/__init__.py b/python/cuml/cuml/dask/extended/linear_model/__init__.py deleted file mode 100644 index 
8f8cba28a1..0000000000 --- a/python/cuml/cuml/dask/extended/linear_model/__init__.py +++ /dev/null @@ -1,27 +0,0 @@ -# -# Copyright (c) 2021-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from cuml.internals.import_utils import has_daskglm -import warnings - -if has_daskglm(): - from cuml.dask.extended.linear_model.logistic_regression import ( - LogisticRegression, - ) -else: - warnings.warn( - "Dask-glm not found. Multi-GPU logistic regression is disabled." - ) diff --git a/python/cuml/cuml/dask/extended/linear_model/logistic_regression.py b/python/cuml/cuml/dask/extended/linear_model/logistic_regression.py deleted file mode 100644 index cbaccdc193..0000000000 --- a/python/cuml/cuml/dask/extended/linear_model/logistic_regression.py +++ /dev/null @@ -1,219 +0,0 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from dask.utils import is_dataframe_like, is_series_like, is_arraylike -from cuml.internals.safe_imports import cpu_only_import -from cuml.dask.common.base import BaseEstimator -from cuml.common import with_cupy_rmm -from cuml.internals.import_utils import has_daskglm - -from cuml.internals.safe_imports import gpu_only_import - -cp = gpu_only_import("cupy") -np = cpu_only_import("numpy") -cudf = gpu_only_import("cudf") - - -class LogisticRegression(BaseEstimator): - """ - Distributed Logistic Regression for Binary classification. - - - Parameters - ---------- - fit_intercept: boolean (default = True) - If True, the model tries to correct for the global mean of y. - If False, the model expects that you have centered the data. - solver : 'admm' - Solver to use. Only admm is supported currently. - penalty : {'l1', 'l2', 'elastic_net'} (default = 'l2') - Regularization technique for the solver. - C: float (default = 1.0) - Inverse of regularization strength; must be a positive float. - max_iter: int (default = 100) - Maximum number of iterations taken for the solvers to converge. - verbose : int or boolean (default=False) - Sets logging level. It must be one of `cuml.common.logger.level_*`. - See :ref:`verbosity-levels` for more info. - - Attributes - ---------- - coef_: device array (n_features, 1) - The estimated coefficients for the logistic regression model. - intercept_: device array (1,) - The independent term. If `fit_intercept` is False, will be 0. - solver: string - Algorithm to use in the optimization process. Currently only `admm` is - supported. - - Notes - ------ - - This estimator is a wrapper class around Dask-GLM's - Logistic Regression estimator. Several methods in this wrapper class - duplicate code from Dask-GLM to create a user-friendly namespace. 
- """ - - def __init__( - self, - *, - client=None, - fit_intercept=True, - solver="admm", - penalty="l2", - C=1.0, - max_iter=100, - verbose=False, - **kwargs, - ): - super(LogisticRegression, self).__init__( - client=client, verbose=verbose, **kwargs - ) - - if not has_daskglm("0.2.1.dev"): - raise ImportError( - "dask-glm >= 0.2.1.dev was not found, please install it" - " to use multi-GPU logistic regression." - ) - - from dask_glm.estimators import ( - LogisticRegression as LogisticRegressionGLM, - ) - - self.fit_intercept = fit_intercept - self.solver = solver - self.penalty = penalty - self.C = C - self.max_iter = max_iter - - if self.penalty not in ("l2", "l1", "elastic_net"): - raise TypeError( - "Only l2, l1, and elastic_net penalties are" - " currently supported." - ) - - self.solver_model = LogisticRegressionGLM( - solver=self.solver, - fit_intercept=self.fit_intercept, - regularizer=self.penalty, - max_iter=self.max_iter, - lamduh=1 / self.C, - ) - - @with_cupy_rmm - def fit(self, X, y): - """ - Fit the model with X and y. - - Parameters - ---------- - X : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, n_features) - Features for regression - y : Dask cuDF Series or CuPy backed Dask Array (n_rows,) - Label (outcome values) - """ - - X = self._input_to_dask_cupy_array(X) - y = self._input_to_dask_cupy_array(y) - self.solver_model.fit(X, y) - self._finalize_coefs() - return self - - @with_cupy_rmm - def predict(self, X): - """ - Predicts the ŷ for X. - - Parameters - ---------- - X : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, n_features) - Distributed dense matrix (floats or doubles) of shape - (n_samples, n_features). 
- - Returns - ------- - y : Dask cuDF Series or CuPy backed Dask Array (n_rows,) - """ - return self.predict_proba(X) > 0.5 - - @with_cupy_rmm - def predict_proba(self, X): - from dask_glm.utils import sigmoid - - X = self._input_to_dask_cupy_array(X) - return sigmoid(self.decision_function(X)) - - @with_cupy_rmm - def decision_function(self, X): - X = self._input_to_dask_cupy_array(X) - X_ = self._maybe_add_intercept(X) - return np.dot(X_, self._coef) - - @with_cupy_rmm - def score(self, X, y): - from dask_glm.utils import accuracy_score - - X = self._input_to_dask_cupy_array(X) - y = self._input_to_dask_cupy_array(y) - return accuracy_score(y, self.predict(X)) - - @with_cupy_rmm - def _finalize_coefs(self): - # _coef contains coefficients and (potentially) intercept - self._coef = cp.asarray(self.solver_model._coef) - if self.fit_intercept: - self.coef_ = self._coef[:-1] - self.intercept_ = self.solver_model._coef[-1] - else: - self.coef_ = self._coef - - @with_cupy_rmm - def _maybe_add_intercept(self, X): - from dask_glm.utils import add_intercept - - if self.fit_intercept: - return add_intercept(X) - else: - return X - - @with_cupy_rmm - def _input_to_dask_cupy_array(self, X): - if (is_dataframe_like(X) or is_series_like(X)) and hasattr(X, "dask"): - - if not isinstance(X._meta, (cudf.Series, cudf.DataFrame)): - raise TypeError( - "Please convert your Dask DataFrame" - " to a Dask-cuDF DataFrame using dask_cudf." - ) - X = X.values - X._meta = cp.asarray(X._meta) - - elif is_arraylike(X) and hasattr(X, "dask"): - if not isinstance(X._meta, cp.ndarray): - raise TypeError( - "Please convert your CPU Dask Array" - " to a GPU Dask Array using" - " arr.map_blocks(cp.asarray)." - ) - else: - raise TypeError( - "Please pass a GPU backed Dask DataFrame" " or Dask Array." 
- ) - - X.compute_chunk_sizes() - return X - - def get_param_names(self): - return list(self.kwargs.keys()) diff --git a/python/cuml/cuml/tests/dask/test_dask_logistic_regression.py b/python/cuml/cuml/tests/dask/test_dask_logistic_regression.py index 2d07329b4f..f208d5a330 100644 --- a/python/cuml/cuml/tests/dask/test_dask_logistic_regression.py +++ b/python/cuml/cuml/tests/dask/test_dask_logistic_regression.py @@ -103,92 +103,6 @@ def make_classification_dataset( return X, y -def select_sk_solver(cuml_solver): - if cuml_solver == "newton": - return "newton-cg" - elif cuml_solver in ["admm", "lbfgs"]: - return "lbfgs" - else: - pytest.xfail("No matched sklearn solver") - - -@pytest.mark.mg -@pytest.mark.parametrize("nrows", [1e5]) -@pytest.mark.parametrize("ncols", [20]) -@pytest.mark.parametrize("n_parts", [2, 6]) -@pytest.mark.parametrize("fit_intercept", [False, True]) -@pytest.mark.parametrize("datatype", [np.float32, np.float64]) -@pytest.mark.parametrize("gpu_array_input", [False, True]) -@pytest.mark.parametrize( - "solver", ["admm", "gradient_descent", "newton", "lbfgs", "proximal_grad"] -) -def test_lr_fit_predict_score( - nrows, - ncols, - n_parts, - fit_intercept, - datatype, - gpu_array_input, - solver, - client, -): - sk_solver = select_sk_solver(cuml_solver=solver) - - def imp(): - import cuml.comm.serialize # NOQA - - client.run(imp) - - from cuml.dask.extended.linear_model import ( - LogisticRegression as cumlLR_dask, - ) - - n_info = 5 - nrows = int(nrows) - ncols = int(ncols) - X, y = make_classification_dataset(datatype, nrows, ncols, n_info) - - gX, gy = _prep_training_data(client, X, y, n_parts) - - if gpu_array_input: - gX = gX.values - gX._meta = cp.asarray(gX._meta) - gy = gy.values - gy._meta = cp.asarray(gy._meta) - - cuml_model = cumlLR_dask( - fit_intercept=fit_intercept, solver=solver, max_iter=10 - ) - - # test fit and predict - cuml_model.fit(gX, gy) - cu_preds = cuml_model.predict(gX) - accuracy_cuml = accuracy_score(y, 
cu_preds.compute().get()) - - sk_model = skLR(fit_intercept=fit_intercept, solver=sk_solver, max_iter=10) - sk_model.fit(X, y) - sk_preds = sk_model.predict(X) - accuracy_sk = accuracy_score(y, sk_preds) - - assert (accuracy_cuml >= accuracy_sk) | ( - np.abs(accuracy_cuml - accuracy_sk) < 1e-3 - ) - - # score - accuracy_cuml = cuml_model.score(gX, gy).compute().item() - accuracy_sk = sk_model.score(X, y) - - assert (accuracy_cuml >= accuracy_sk) | ( - np.abs(accuracy_cuml - accuracy_sk) < 1e-3 - ) - - # predicted probabilities should differ by <= 5% - # even with different solvers (arbitrary) - probs_cuml = cuml_model.predict_proba(gX).compute() - probs_sk = sk_model.predict_proba(X)[:, 1] - assert np.abs(probs_sk - probs_cuml.get()).max() <= 0.05 - - @pytest.mark.mg @pytest.mark.parametrize("n_parts", [2]) @pytest.mark.parametrize("datatype", [np.float32, np.float64]) diff --git a/python/cuml/pyproject.toml b/python/cuml/pyproject.toml index 8934a0f226..228cb92b5c 100644 --- a/python/cuml/pyproject.toml +++ b/python/cuml/pyproject.toml @@ -111,7 +111,6 @@ classifiers = [ [project.optional-dependencies] test = [ "cython>=3.0.0", - "dask-glm==0.3.0", "dask-ml", "hdbscan>=0.8.38,<0.8.39", "hypothesis>=6.0,<7",