Remove old dask-glm based logistic regression #6028

Open · wants to merge 7 commits into base: branch-24.10
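Context for reviewers: the removed wrapper lived at cuml.dask.extended.linear_model.LogisticRegression and delegated its solvers to dask-glm. The maintained multi-node multi-GPU estimator is cuml.dask.linear_model.LogisticRegression. A minimal usage sketch of that maintained path, assuming a dask-cuda cluster is available and that the estimator accepts CuPy-backed Dask arrays (shapes and parameters here are illustrative, not part of this diff):

# Sketch of the maintained MNMG API, not part of this PR's diff.
# Assumes dask-cuda, cupy, and cuml are installed.
import cupy as cp
import dask.array as da
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

from cuml.dask.linear_model import LogisticRegression  # maintained path

if __name__ == "__main__":
    cluster = LocalCUDACluster()
    client = Client(cluster)

    # Toy data, partitioned across workers and moved to GPU memory.
    X = da.random.random((10_000, 20), chunks=(5_000, 20)).astype("float32")
    y = (da.random.random((10_000,), chunks=(5_000,)) > 0.5).astype("float32")
    X, y = X.map_blocks(cp.asarray), y.map_blocks(cp.asarray)

    model = LogisticRegression(fit_intercept=True, max_iter=100)
    model.fit(X, y)
    preds = model.predict(X).compute()

    client.close()
    cluster.close()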
3 changes: 0 additions & 3 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -50,7 +50,6 @@ dependencies:
- numpydoc
- nvcc_linux-64=11.8
- packaging
- pip
- pydata-sphinx-theme!=0.14.2
- pylibraft==24.10.*,>=0.0.0a0
- pynndescent
@@ -78,6 +77,4 @@ dependencies:
- sysroot_linux-64==2.17
- treelite==4.3.0
- umap-learn==0.5.6
- pip:
- dask-glm==0.3.0
name: all_cuda-118_arch-x86_64
3 changes: 0 additions & 3 deletions conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -46,7 +46,6 @@ dependencies:
- numpy>=1.23,<3.0a0
- numpydoc
- packaging
- pip
- pydata-sphinx-theme!=0.14.2
- pylibraft==24.10.*,>=0.0.0a0
- pynndescent
@@ -74,6 +73,4 @@ dependencies:
- sysroot_linux-64==2.17
- treelite==4.3.0
- umap-learn==0.5.6
- pip:
- dask-glm==0.3.0
name: all_cuda-125_arch-x86_64
9 changes: 0 additions & 9 deletions dependencies.yaml
@@ -517,15 +517,6 @@ dependencies:
- statsmodels
- umap-learn==0.5.6
- pynndescent
- setuptools # Needed on Python 3.12 for dask-glm, which requires pkg_resources but Python 3.12 doesn't have setuptools by default
- output_types: conda
packages:
- pip
- pip:
- dask-glm==0.3.0
- output_types: pyproject
packages:
- dask-glm==0.3.0
test_notebooks:
common:
- output_types: [conda, requirements]
Empty file.
27 changes: 0 additions & 27 deletions python/cuml/cuml/dask/extended/linear_model/__init__.py

This file was deleted.

219 changes: 0 additions & 219 deletions python/cuml/cuml/dask/extended/linear_model/logistic_regression.py

This file was deleted.

86 changes: 0 additions & 86 deletions python/cuml/cuml/tests/dask/test_dask_logistic_regression.py
@@ -103,92 +103,6 @@ def make_classification_dataset(
    return X, y


def select_sk_solver(cuml_solver):
    if cuml_solver == "newton":
        return "newton-cg"
    elif cuml_solver in ["admm", "lbfgs"]:
        return "lbfgs"
    else:
        pytest.xfail("No matched sklearn solver")


@pytest.mark.mg
@pytest.mark.parametrize("nrows", [1e5])
@pytest.mark.parametrize("ncols", [20])
@pytest.mark.parametrize("n_parts", [2, 6])
@pytest.mark.parametrize("fit_intercept", [False, True])
@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize("gpu_array_input", [False, True])
@pytest.mark.parametrize(
    "solver", ["admm", "gradient_descent", "newton", "lbfgs", "proximal_grad"]
)
def test_lr_fit_predict_score(
    nrows,
    ncols,
    n_parts,
    fit_intercept,
    datatype,
    gpu_array_input,
    solver,
    client,
):
    sk_solver = select_sk_solver(cuml_solver=solver)

    def imp():
        import cuml.comm.serialize  # NOQA

    client.run(imp)

    from cuml.dask.extended.linear_model import (
        LogisticRegression as cumlLR_dask,
    )

    n_info = 5
    nrows = int(nrows)
    ncols = int(ncols)
    X, y = make_classification_dataset(datatype, nrows, ncols, n_info)

    gX, gy = _prep_training_data(client, X, y, n_parts)

    if gpu_array_input:
        gX = gX.values
        gX._meta = cp.asarray(gX._meta)
        gy = gy.values
        gy._meta = cp.asarray(gy._meta)

    cuml_model = cumlLR_dask(
        fit_intercept=fit_intercept, solver=solver, max_iter=10
    )

    # test fit and predict
    cuml_model.fit(gX, gy)
    cu_preds = cuml_model.predict(gX)
    accuracy_cuml = accuracy_score(y, cu_preds.compute().get())

    sk_model = skLR(fit_intercept=fit_intercept, solver=sk_solver, max_iter=10)
    sk_model.fit(X, y)
    sk_preds = sk_model.predict(X)
    accuracy_sk = accuracy_score(y, sk_preds)

    assert (accuracy_cuml >= accuracy_sk) | (
        np.abs(accuracy_cuml - accuracy_sk) < 1e-3
    )

    # score
    accuracy_cuml = cuml_model.score(gX, gy).compute().item()
    accuracy_sk = sk_model.score(X, y)

    assert (accuracy_cuml >= accuracy_sk) | (
        np.abs(accuracy_cuml - accuracy_sk) < 1e-3
    )

    # predicted probabilities should differ by <= 5%
    # even with different solvers (arbitrary)
    probs_cuml = cuml_model.predict_proba(gX).compute()
    probs_sk = sk_model.predict_proba(X)[:, 1]
    assert np.abs(probs_sk - probs_cuml.get()).max() <= 0.05
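If equivalent coverage against the maintained estimator is wanted, a sketch along these lines could stand in for the deleted test. It is an assumption-laden outline, not part of this diff: it presumes the client fixture and the make_classification_dataset and _prep_training_data helpers above remain, that cuml.dask.linear_model.LogisticRegression accepts these keywords, and that the predict-output idiom of the old test still applies.

@pytest.mark.mg
@pytest.mark.parametrize("fit_intercept", [False, True])
@pytest.mark.parametrize("datatype", [np.float32, np.float64])
def test_lr_fit_predict_score_maintained(fit_intercept, datatype, client):
    # Sketch only: targets the maintained MNMG estimator in
    # cuml.dask.linear_model instead of the removed dask-glm wrapper.
    from cuml.dask.linear_model import LogisticRegression as cumlLR_dask

    nrows, ncols, n_info, n_parts = 100_000, 20, 5, 2
    X, y = make_classification_dataset(datatype, nrows, ncols, n_info)
    gX, gy = _prep_training_data(client, X, y, n_parts)

    cuml_model = cumlLR_dask(fit_intercept=fit_intercept, max_iter=10)
    cuml_model.fit(gX, gy)
    # Assumes predict returns a CuPy-backed collection, as in the old test.
    accuracy_cuml = accuracy_score(y, cuml_model.predict(gX).compute().get())

    sk_model = skLR(fit_intercept=fit_intercept, max_iter=10)
    sk_model.fit(X, y)
    accuracy_sk = accuracy_score(y, sk_model.predict(X))

    # Same tolerance the deleted test used.
    assert (accuracy_cuml >= accuracy_sk) | (
        np.abs(accuracy_cuml - accuracy_sk) < 1e-3
    )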


@pytest.mark.mg
@pytest.mark.parametrize("n_parts", [2])
@pytest.mark.parametrize("datatype", [np.float32, np.float64])