From c8e8d29fff36f263863396a62150b4bd09cad19b Mon Sep 17 00:00:00 2001 From: noahnovsak Date: Tue, 30 May 2023 16:27:46 +0200 Subject: [PATCH 1/2] dask: logistic regression # Conflicts: # Orange/data/dask.py --- Orange/base.py | 2 +- Orange/classification/logistic_regression.py | 29 ++++++++++++++++++-- Orange/widgets/evaluate/owtestandscore.py | 2 +- Orange/widgets/model/owlogisticregression.py | 10 +++++++ 4 files changed, 38 insertions(+), 5 deletions(-) diff --git a/Orange/base.py b/Orange/base.py index 3376b1080f1..47e125ff1f6 100644 --- a/Orange/base.py +++ b/Orange/base.py @@ -141,7 +141,7 @@ def __call__(self, data, progress_callback=None): progress_callback(0.1, "Fitting...") model = self._fit_model(data) - model.used_vals = [np.unique(y).astype(int) for y in data.Y[:, None].T] + model.used_vals = [np.asarray(np.unique(y), dtype=int) for y in data.Y[:, None].T] if not hasattr(model, "domain") or model.domain is None: # some models set domain themself and it should be respected # e.g. 
calibration learners set the base_learner's domain which diff --git a/Orange/classification/logistic_regression.py b/Orange/classification/logistic_regression.py index cd886dea7fd..cc4d0af5688 100644 --- a/Orange/classification/logistic_regression.py +++ b/Orange/classification/logistic_regression.py @@ -1,11 +1,20 @@ +import warnings + import numpy as np +import dask.array as da import sklearn.linear_model as skl_linear_model +try: + import dask_ml.linear_model as dask_linear_model +except ImportError: + dask_linear_model = skl_linear_model + from Orange.classification import SklLearner, SklModel from Orange.preprocess import Normalize from Orange.preprocess.score import LearnerScorer from Orange.data import Variable, DiscreteVariable + __all__ = ["LogisticRegressionLearner"] @@ -22,11 +31,11 @@ def score(self, data): class LogisticRegressionClassifier(SklModel): @property def intercept(self): - return self.skl_model.intercept_ + return np.atleast_1d(self.skl_model.intercept_) @property def coefficients(self): - return self.skl_model.coef_ + return np.atleast_2d(self.skl_model.coef_) class LogisticRegressionLearner(SklLearner, _FeatureScorerMixin): @@ -43,9 +52,23 @@ def __init__(self, penalty="l2", dual=False, tol=0.0001, C=1.0, def _initialize_wrapped(self, X=None, Y=None): params = self.params.copy() + solver = params.pop("solver") + penalty = params.get("penalty") or "none" + + if isinstance(X, da.Array) or isinstance(Y, da.Array): + if dask_linear_model is skl_linear_model: + warnings.warn("dask_ml is not installed, using sklearn instead.") + else: + if solver == "auto": + if penalty == "none": + solver = "gradient_descent" + else: + solver = "admm" + params["solver"], params["penalty"] = solver, penalty + return dask_linear_model.LogisticRegression(**params) + # The default scikit-learn solver `lbfgs` (v0.22) does not support the # l1 penalty. 
- solver, penalty = params.pop("solver"), params.get("penalty") if solver == "auto": if penalty == "l1": solver = "liblinear" diff --git a/Orange/widgets/evaluate/owtestandscore.py b/Orange/widgets/evaluate/owtestandscore.py index ac0c839d592..b67ed0faf8c 100644 --- a/Orange/widgets/evaluate/owtestandscore.py +++ b/Orange/widgets/evaluate/owtestandscore.py @@ -950,7 +950,7 @@ def __update(self): do_stratify = False elif self.data.domain.class_var.is_discrete: least = min(filter(None, - np.bincount(self.data.Y.astype(int)))) + np.bincount(np.asarray(self.data.Y, dtype=int)))) if least < k: self.Warning.cant_stratify(k, least) do_stratify = False diff --git a/Orange/widgets/model/owlogisticregression.py b/Orange/widgets/model/owlogisticregression.py index 63c401b7797..7acae849bfd 100644 --- a/Orange/widgets/model/owlogisticregression.py +++ b/Orange/widgets/model/owlogisticregression.py @@ -5,6 +5,7 @@ from orangewidget.report import bool_str from Orange.data import Table, Domain, ContinuousVariable, StringVariable +from Orange.data.dask import DaskTable from Orange.classification.logistic_regression import LogisticRegressionLearner from Orange.widgets import settings, gui from Orange.widgets.utils.owlearnerwidget import OWBaseLearner @@ -139,6 +140,15 @@ def get_learner_parameters(self): self.penalty_types[self.penalty_type], self.C_s[self.C_index], bool_str(self.class_weight))),) + def check_data(self): + valid = super().check_data() + if valid and isinstance(self.data, DaskTable) \ + and len(self.data.domain.class_var.values) > 2 \ + and len(np.unique(self.data.Y).compute()) > 2: + self.Error.data_error("Data contains too many target values.") + valid = False + return valid + def create_coef_table(classifier): i = classifier.intercept From 7a016f863b1ad63ea3a680238dbc99600c1c63d7 Mon Sep 17 00:00:00 2001 From: noahnovsak Date: Tue, 25 Jul 2023 14:07:09 +0200 Subject: [PATCH 2/2] tests --- Orange/tests/test_logistic_regression.py | 58 ++++++++++++++++++++++-- 1 file 
changed, 55 insertions(+), 3 deletions(-) diff --git a/Orange/tests/test_logistic_regression.py b/Orange/tests/test_logistic_regression.py index 4ab3cdc3a68..518b280cfa6 100644 --- a/Orange/tests/test_logistic_regression.py +++ b/Orange/tests/test_logistic_regression.py @@ -9,6 +9,7 @@ from Orange.data import Table, ContinuousVariable, Domain from Orange.classification import LogisticRegressionLearner, Model from Orange.evaluation import CrossValidation, CA +from Orange.tests.test_dasktable import temp_dasktable class TestLogisticRegressionLearner(unittest.TestCase): @@ -114,9 +115,9 @@ def test_coefficients(self): def test_predict_on_instance(self): lr = LogisticRegressionLearner() - m = lr(self.zoo) - probs = m(self.zoo[50], m.Probs) - probs2 = m(self.zoo[50, :], m.Probs) + m = lr(self.heart_disease) + probs = m(self.heart_disease[50], m.Probs) + probs2 = m(self.heart_disease[50, :], m.Probs) np.testing.assert_almost_equal(probs, probs2[0]) def test_single_class(self): @@ -151,3 +152,54 @@ def test_auto_solver(self): skl_clf = lr._initialize_wrapped() self.assertEqual(skl_clf.solver, "liblinear") self.assertEqual(skl_clf.penalty, "l1") + + +class TestLogisticRegressionOnDask(TestLogisticRegressionLearner): + @classmethod + def setUpClass(cls): + cls.iris = temp_dasktable(Table('iris')) + cls.heart_disease = temp_dasktable(Table('heart_disease.tab')) + + def test_learner_scorer(self): + # for some reason dask_ml and sklearn yield different results + learner = LogisticRegressionLearner() + scores = learner.score_data(self.heart_disease) + self.assertEqual('major vessels colored', + self.heart_disease.domain.attributes[np.argmax(scores)].name) + self.assertEqual(scores.shape, (1, len(self.heart_disease.domain.attributes))) + + @unittest.skip("Discretizer not yet implemented") + def test_learner_scorer_previous_transformation(self): + super().test_learner_scorer_previous_transformation() + + @unittest.skip("Dask-ML does not support multiclass regression") + def 
test_learner_scorer_multiclass(self): + super().test_learner_scorer_multiclass() + + @unittest.skip("Dask-ML does not support multiclass regression") + def test_learner_scorer_multiclass_feature(self): + super().test_learner_scorer_multiclass_feature() + + @unittest.skip("Dask-ML accepts single class") + def test_single_class(self): + super().test_single_class() + + @unittest.skip("Dask-ML accepts single class") + def test_sklearn_single_class(self): + super().test_sklearn_single_class() + + def test_auto_solver(self): + lr = LogisticRegressionLearner(penalty="l2", solver="auto") + skl_clf = lr._initialize_wrapped(self.iris.X) + self.assertEqual(skl_clf.solver, "admm") + self.assertEqual(skl_clf.penalty, "l2") + + lr = LogisticRegressionLearner(penalty="l1", solver="auto") + skl_clf = lr._initialize_wrapped(self.iris.X) + self.assertEqual(skl_clf.solver, "admm") + self.assertEqual(skl_clf.penalty, "l1") + + lr = LogisticRegressionLearner(penalty=None, solver="auto") + skl_clf = lr._initialize_wrapped(self.iris.X) + self.assertEqual(skl_clf.solver, "gradient_descent") + self.assertEqual(skl_clf.penalty, "none")