[ENH] Dask: Logistic Regression #6512

Merged · 2 commits · Jul 26, 2023
2 changes: 1 addition & 1 deletion Orange/base.py
@@ -141,7 +141,7 @@ def __call__(self, data, progress_callback=None):

         progress_callback(0.1, "Fitting...")
         model = self._fit_model(data)
-        model.used_vals = [np.unique(y).astype(int) for y in data.Y[:, None].T]
+        model.used_vals = [np.asarray(np.unique(y), dtype=int) for y in data.Y[:, None].T]
         if not hasattr(model, "domain") or model.domain is None:
             # some models set domain themself and it should be respected
             # e.g. calibration learners set the base_learner's domain which
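The switch from `.astype(int)` to `np.asarray(..., dtype=int)` is what makes this line dask-safe: `np.unique` on a dask array dispatches to dask and stays lazy, so `.astype` would leave a lazy array in `used_vals`, whereas `np.asarray` forces materialization to a plain ndarray. A minimal sketch of the difference (illustrative data, assumes dask is installed):

import numpy as np
import dask.array as da

y = da.from_array(np.array([0.0, 1.0, 1.0, 2.0]), chunks=2)

lazy = np.unique(y).astype(int)              # still a lazy dask array
eager = np.asarray(np.unique(y), dtype=int)  # concrete numpy array([0, 1, 2])

print(type(lazy).__module__)   # dask.array.core
print(type(eager).__module__)  # numpy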
29 changes: 26 additions & 3 deletions Orange/classification/logistic_regression.py
@@ -1,11 +1,20 @@
+import warnings
+
 import numpy as np
+import dask.array as da
 import sklearn.linear_model as skl_linear_model
+
+try:
+    import dask_ml.linear_model as dask_linear_model
+except ImportError:
+    dask_linear_model = skl_linear_model
+
 from Orange.classification import SklLearner, SklModel
 from Orange.preprocess import Normalize
 from Orange.preprocess.score import LearnerScorer
 from Orange.data import Variable, DiscreteVariable


 __all__ = ["LogisticRegressionLearner"]

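A note on the except branch above: aliasing `skl_linear_model` under the `dask_ml` name keeps every later call site valid and lets the learner detect the missing optional dependency with a plain identity check, as `_initialize_wrapped` does below. In miniature (a sketch of the pattern, not additional Orange API):

if dask_linear_model is skl_linear_model:
    print("dask_ml is missing; dask inputs will fall back to scikit-learn")
else:
    print("dask_ml is available:", dask_linear_model.LogisticRegression)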
@@ -22,11 +31,11 @@
 class LogisticRegressionClassifier(SklModel):
     @property
     def intercept(self):
-        return self.skl_model.intercept_
+        return np.atleast_1d(self.skl_model.intercept_)

     @property
     def coefficients(self):
-        return self.skl_model.coef_
+        return np.atleast_2d(self.skl_model.coef_)


 class LogisticRegressionLearner(SklLearner, _FeatureScorerMixin):
@@ -43,9 +52,23 @@
     def _initialize_wrapped(self, X=None, Y=None):
         params = self.params.copy()
+        solver = params.pop("solver")
+        penalty = params.get("penalty") or "none"
+
+        if isinstance(X, da.Array) or isinstance(Y, da.Array):
+            if dask_linear_model is skl_linear_model:
+                warnings.warn("dask_ml is not installed, using sklearn instead.")
+            else:
+                if solver == "auto":
+                    if penalty == "none":
+                        solver = "gradient_descent"
+                    else:
+                        solver = "admm"
+                params["solver"], params["penalty"] = solver, penalty
+                return dask_linear_model.LogisticRegression(**params)

         # The default scikit-learn solver `lbfgs` (v0.22) does not support the
         # l1 penalty.
-        solver, penalty = params.pop("solver"), params.get("penalty")
         if solver == "auto":
             if penalty == "l1":
                 solver = "liblinear"
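Taken together, `_initialize_wrapped` now picks the backend from the input type and maps the "auto" solver accordingly: liblinear or the default lbfgs on the scikit-learn side, admm or gradient_descent on the dask_ml side. A minimal usage sketch mirroring the tests below (array shapes are illustrative; assumes dask and dask_ml are installed):

import numpy as np
import dask.array as da
from Orange.classification import LogisticRegressionLearner

lr = LogisticRegressionLearner(penalty="l2", solver="auto")

# numpy input -> scikit-learn estimator (default lbfgs, per the comment above)
skl_clf = lr._initialize_wrapped(np.zeros((10, 3)))

# dask input -> dask_ml estimator ("admm" for a penalized fit)
dask_clf = lr._initialize_wrapped(da.zeros((10, 3)))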
58 changes: 55 additions & 3 deletions Orange/tests/test_logistic_regression.py
@@ -9,6 +9,7 @@
 from Orange.data import Table, ContinuousVariable, Domain
 from Orange.classification import LogisticRegressionLearner, Model
 from Orange.evaluation import CrossValidation, CA
+from Orange.tests.test_dasktable import temp_dasktable


 class TestLogisticRegressionLearner(unittest.TestCase):
@@ -114,9 +115,9 @@ def test_coefficients(self):

     def test_predict_on_instance(self):
         lr = LogisticRegressionLearner()
-        m = lr(self.zoo)
-        probs = m(self.zoo[50], m.Probs)
-        probs2 = m(self.zoo[50, :], m.Probs)
+        m = lr(self.heart_disease)
+        probs = m(self.heart_disease[50], m.Probs)
+        probs2 = m(self.heart_disease[50, :], m.Probs)
         np.testing.assert_almost_equal(probs, probs2[0])

     def test_single_class(self):
@@ -151,3 +152,54 @@ def test_auto_solver(self):
         skl_clf = lr._initialize_wrapped()
         self.assertEqual(skl_clf.solver, "liblinear")
         self.assertEqual(skl_clf.penalty, "l1")
+
+
+class TestLogisticRegressionOnDask(TestLogisticRegressionLearner):
+    @classmethod
+    def setUpClass(cls):
+        cls.iris = temp_dasktable(Table('iris'))
+        cls.heart_disease = temp_dasktable(Table('heart_disease.tab'))
+
+    def test_learner_scorer(self):
+        # dask_ml's solver yields different scores than sklearn's,
+        # so the expected top-ranked feature differs here
+        learner = LogisticRegressionLearner()
+        scores = learner.score_data(self.heart_disease)
+        self.assertEqual('major vessels colored',
+                         self.heart_disease.domain.attributes[np.argmax(scores)].name)
+        self.assertEqual(scores.shape, (1, len(self.heart_disease.domain.attributes)))
+
+    @unittest.skip("Discretizer not yet implemented")
+    def test_learner_scorer_previous_transformation(self):
+        super().test_learner_scorer_previous_transformation()
+
+    @unittest.skip("Dask-ML does not support multiclass regression")
+    def test_learner_scorer_multiclass(self):
+        super().test_learner_scorer_multiclass()
+
+    @unittest.skip("Dask-ML does not support multiclass regression")
+    def test_learner_scorer_multiclass_feature(self):
+        super().test_learner_scorer_multiclass_feature()
+
+    @unittest.skip("Dask-ML accepts single class")
+    def test_single_class(self):
+        super().test_single_class()
+
+    @unittest.skip("Dask-ML accepts single class")
+    def test_sklearn_single_class(self):
+        super().test_sklearn_single_class()
+
+    def test_auto_solver(self):
+        lr = LogisticRegressionLearner(penalty="l2", solver="auto")
+        skl_clf = lr._initialize_wrapped(self.iris.X)
+        self.assertEqual(skl_clf.solver, "admm")
+        self.assertEqual(skl_clf.penalty, "l2")
+
+        lr = LogisticRegressionLearner(penalty="l1", solver="auto")
+        skl_clf = lr._initialize_wrapped(self.iris.X)
+        self.assertEqual(skl_clf.solver, "admm")
+        self.assertEqual(skl_clf.penalty, "l1")
+
+        lr = LogisticRegressionLearner(penalty=None, solver="auto")
+        skl_clf = lr._initialize_wrapped(self.iris.X)
+        self.assertEqual(skl_clf.solver, "gradient_descent")
+        self.assertEqual(skl_clf.penalty, "none")
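The dask suite above reuses the whole scikit-learn-backed test case through inheritance: setUpClass swaps the fixtures for dask-backed tables, every inherited test then exercises the dask code path unchanged, and only the unsupported cases are overridden with skips. The pattern in miniature (names here are illustrative, not from the PR):

import unittest

class TestOnNumpy(unittest.TestCase):
    backend = "numpy"

    def test_predict(self):
        # runs twice: once here, once again via the subclass
        self.assertIn(self.backend, ("numpy", "dask"))

class TestOnDask(TestOnNumpy):
    backend = "dask"

    @unittest.skip("not supported on the dask backend")
    def test_predict(self):
        super().test_predict()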
2 changes: 1 addition & 1 deletion Orange/widgets/evaluate/owtestandscore.py
@@ -950,7 +950,7 @@ def __update(self):
                do_stratify = False
            elif self.data.domain.class_var.is_discrete:
                least = min(filter(None,
-                                  np.bincount(self.data.Y.astype(int))))
+                                  np.bincount(np.asarray(self.data.Y, dtype=int))))
                if least < k:
                    self.Warning.cant_stratify(k, least)
                    do_stratify = False
10 changes: 10 additions & 0 deletions Orange/widgets/model/owlogisticregression.py
@@ -5,6 +5,7 @@
 from orangewidget.report import bool_str

 from Orange.data import Table, Domain, ContinuousVariable, StringVariable
+from Orange.data.dask import DaskTable
 from Orange.classification.logistic_regression import LogisticRegressionLearner
 from Orange.widgets import settings, gui
 from Orange.widgets.utils.owlearnerwidget import OWBaseLearner
@@ -139,6 +140,15 @@
             self.penalty_types[self.penalty_type], self.C_s[self.C_index],
             bool_str(self.class_weight))),)

+    def check_data(self):
+        valid = super().check_data()
+        if valid and isinstance(self.data, DaskTable) \
+                and len(self.data.domain.class_var.values) > 2 \
+                and len(np.unique(self.data.Y).compute()) > 2:
+            self.Error.data_error("Data contains too many target values.")
+            valid = False
+        return valid
+

 def create_coef_table(classifier):
     i = classifier.intercept
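The check_data guard above encodes the same dask_ml limitation the tests skip around: its LogisticRegression handles binary targets only, so a DaskTable whose class variable actually uses more than two values is rejected before fitting (the extra np.unique pass catches domains that declare unused values). A minimal sketch of that counting step (illustrative data; assumes dask is installed):

import numpy as np
import dask.array as da

# a class column with three distinct values would trigger the widget error
Y = da.from_array(np.array([0, 1, 2, 1, 0]), chunks=2)
too_many = len(np.unique(Y).compute()) > 2  # np.unique stays lazy on dask arrays
print(too_many)  # True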