Skip to content

Commit

Permalink
Merge pull request #6512 from noahnovsak/dask-logisticregression
Browse files Browse the repository at this point in the history
[ENH] Dask: Logistic Regression
  • Loading branch information
markotoplak committed Sep 18, 2023
2 parents c3a5538 + c807e0c commit f293f2a
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 8 deletions.
2 changes: 1 addition & 1 deletion Orange/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def __call__(self, data, progress_callback=None):

progress_callback(0.1, "Fitting...")
model = self._fit_model(data)
model.used_vals = [np.unique(y).astype(int) for y in data.Y[:, None].T]
model.used_vals = [np.asarray(np.unique(y), dtype=int) for y in data.Y[:, None].T]
if not hasattr(model, "domain") or model.domain is None:
# some models set domain themself and it should be respected
# e.g. calibration learners set the base_learner's domain which
Expand Down
29 changes: 26 additions & 3 deletions Orange/classification/logistic_regression.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,20 @@
import warnings

import numpy as np
import dask.array as da
import sklearn.linear_model as skl_linear_model

try:
import dask_ml.linear_model as dask_linear_model
except ImportError:
dask_linear_model = skl_linear_model

from Orange.classification import SklLearner, SklModel
from Orange.preprocess import Normalize
from Orange.preprocess.score import LearnerScorer
from Orange.data import Variable, DiscreteVariable


__all__ = ["LogisticRegressionLearner"]


Expand All @@ -22,11 +31,11 @@ def score(self, data):
class LogisticRegressionClassifier(SklModel):
    """Wrapper over a fitted logistic-regression model (scikit-learn or
    dask-ml backend).

    The ``atleast_1d``/``atleast_2d`` coercions normalize attribute shapes
    across backends — presumably dask-ml can expose lower-dimensional
    ``intercept_``/``coef_`` than scikit-learn does (NOTE(review): confirm
    against dask-ml; the scrape showed both the bare and the coerced
    variants of these return statements, this keeps the coerced post-change
    version).
    """

    @property
    def intercept(self):
        # Guaranteed at least 1-D regardless of backend.
        return np.atleast_1d(self.skl_model.intercept_)

    @property
    def coefficients(self):
        # Guaranteed at least 2-D: rows per class (or one row for the
        # binary case), columns per feature.
        return np.atleast_2d(self.skl_model.coef_)


class LogisticRegressionLearner(SklLearner, _FeatureScorerMixin):
Expand All @@ -43,9 +52,23 @@ def __init__(self, penalty="l2", dual=False, tol=0.0001, C=1.0,

def _initialize_wrapped(self, X=None, Y=None):
params = self.params.copy()
solver = params.pop("solver")
penalty = params.get("penalty") or "none"

if isinstance(X, da.Array) or isinstance(Y, da.Array):
if dask_linear_model is skl_linear_model:
warnings.warn("dask_ml is not installed, using sklearn instead.")
else:
if solver == "auto":
if penalty in "none":
solver = "gradient_descent"
else:
solver = "admm"
params["solver"], params["penalty"] = solver, penalty
return dask_linear_model.LogisticRegression(**params)

# The default scikit-learn solver `lbfgs` (v0.22) does not support the
# l1 penalty.
solver, penalty = params.pop("solver"), params.get("penalty")
if solver == "auto":
if penalty == "l1":
solver = "liblinear"
Expand Down
58 changes: 55 additions & 3 deletions Orange/tests/test_logistic_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from Orange.data import Table, ContinuousVariable, Domain
from Orange.classification import LogisticRegressionLearner, Model
from Orange.evaluation import CrossValidation, CA
from Orange.tests.test_dasktable import temp_dasktable


class TestLogisticRegressionLearner(unittest.TestCase):
Expand Down Expand Up @@ -114,9 +115,9 @@ def test_coefficients(self):

def test_predict_on_instance(self):
    """Predicting on a single instance must agree with predicting on a
    one-row slice of the same table.

    Uses ``heart_disease`` (the post-change fixture; the pre-change
    ``zoo``-based lines in the scraped diff are the old version and are
    dropped here).
    """
    lr = LogisticRegressionLearner()
    model = lr(self.heart_disease)
    # Row access vs. explicit row-slice access should yield identical
    # probability vectors.
    probs = model(self.heart_disease[50], model.Probs)
    probs2 = model(self.heart_disease[50, :], model.Probs)
    np.testing.assert_almost_equal(probs, probs2[0])

def test_single_class(self):
Expand Down Expand Up @@ -151,3 +152,54 @@ def test_auto_solver(self):
skl_clf = lr._initialize_wrapped()
self.assertEqual(skl_clf.solver, "liblinear")
self.assertEqual(skl_clf.penalty, "l1")


class TestLogisticRegressionOnDask(TestLogisticRegressionLearner):
    """Re-runs the logistic-regression suite with dask-backed tables.

    Tests whose sklearn behavior dask-ml does not reproduce are either
    overridden (different expected values) or skipped.
    """

    @classmethod
    def setUpClass(cls):
        # Same fixtures as the parent class, converted to DaskTables.
        cls.iris = temp_dasktable(Table('iris'))
        cls.heart_disease = temp_dasktable(Table('heart_disease.tab'))

    def test_learner_scorer(self):
        # for some reason dask_ml and sklearn yield different results
        learner = LogisticRegressionLearner()
        scores = learner.score_data(self.heart_disease)
        attributes = self.heart_disease.domain.attributes
        top_attribute = attributes[np.argmax(scores)]
        self.assertEqual('major vessels colored', top_attribute.name)
        self.assertEqual(scores.shape, (1, len(attributes)))

    @unittest.skip("Discretizer not yet implemented")
    def test_learner_scorer_previous_transformation(self):
        super().test_learner_scorer_previous_transformation()

    @unittest.skip("Dask-ML does not support multiclass regression")
    def test_learner_scorer_multiclass(self):
        super().test_learner_scorer_multiclass()

    @unittest.skip("Dask-ML does not support multiclass regression")
    def test_learner_scorer_multiclass_feature(self):
        super().test_learner_scorer_multiclass_feature()

    @unittest.skip("Dask-ML accepts single class")
    def test_single_class(self):
        super().test_single_class()

    @unittest.skip("Dask-ML accepts single class")
    def test_sklearn_single_class(self):
        super().test_sklearn_single_class()

    def test_auto_solver(self):
        # (penalty, solver dask-ml's "auto" should resolve to); a falsy
        # penalty resolves to "none" / gradient descent.
        cases = [
            ("l2", "admm"),
            ("l1", "admm"),
            (None, "gradient_descent"),
        ]
        for penalty, expected_solver in cases:
            lr = LogisticRegressionLearner(penalty=penalty, solver="auto")
            wrapped = lr._initialize_wrapped(self.iris.X)
            self.assertEqual(wrapped.solver, expected_solver)
            self.assertEqual(wrapped.penalty, penalty or "none")
2 changes: 1 addition & 1 deletion Orange/widgets/evaluate/owtestandscore.py
Original file line number Diff line number Diff line change
Expand Up @@ -950,7 +950,7 @@ def __update(self):
do_stratify = False
elif self.data.domain.class_var.is_discrete:
least = min(filter(None,
np.bincount(self.data.Y.astype(int))))
np.bincount(np.asarray(self.data.Y, dtype=int))))
if least < k:
self.Warning.cant_stratify(k, least)
do_stratify = False
Expand Down
10 changes: 10 additions & 0 deletions Orange/widgets/model/owlogisticregression.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from orangewidget.report import bool_str

from Orange.data import Table, Domain, ContinuousVariable, StringVariable
from Orange.data.dask import DaskTable
from Orange.classification.logistic_regression import LogisticRegressionLearner
from Orange.widgets import settings, gui
from Orange.widgets.utils.owlearnerwidget import OWBaseLearner
Expand Down Expand Up @@ -139,6 +140,15 @@ def get_learner_parameters(self):
self.penalty_types[self.penalty_type], self.C_s[self.C_index],
bool_str(self.class_weight))),)

def check_data(self):
    """Validate input data, extending the base widget's checks.

    Dask-ML's logistic regression handles only binary targets, so a
    DaskTable whose class actually takes more than two values is
    rejected with a widget error.

    Returns:
        bool: True if the data is usable by this learner.
    """
    valid = super().check_data()
    # Fix: uniques must be taken over the target column Y, not over the
    # whole table — `np.unique(self.data)` would count distinct values
    # across every column. The domain may declare more class values than
    # actually occur in the data, hence the second, data-level check.
    # (np.unique on a dask array dispatches to dask and returns a lazy
    # array, so .compute() is required — TODO confirm dask version
    # supports __array_function__ dispatch.)
    if valid and isinstance(self.data, DaskTable) \
            and len(self.data.domain.class_var.values) > 2 \
            and len(np.unique(self.data.Y).compute()) > 2:
        self.Error.data_error("Data contains too many target values.")
        valid = False
    return valid


def create_coef_table(classifier):
i = classifier.intercept
Expand Down

0 comments on commit f293f2a

Please sign in to comment.