Dask-compatible logistic regression
noahnovsak committed Jul 20, 2023
1 parent 27f8e75 commit 6f14e87
Showing 10 changed files with 68 additions and 14 deletions.
11 changes: 8 additions & 3 deletions Orange/base.py
@@ -7,6 +7,7 @@

import numpy as np
import scipy
import dask.array as da

from Orange.data import Table, Storage, Instance, Value, Domain
from Orange.data.filter import HasClass
@@ -507,7 +508,9 @@ def __init__(self, skl_model):
self.skl_model = skl_model

def predict(self, X):
value = self.skl_model.predict(X)
if isinstance(X, da.Array):
X = X.rechunk({0: "auto", 1: -1})
value = np.asarray(self.skl_model.predict(X))
        # SVM has a probability attribute which defines whether the method computes probabilities
has_prob_attr = hasattr(self.skl_model, "probability")
if (has_prob_attr and self.skl_model.probability
@@ -581,12 +584,14 @@ def __call__(self, data, progress_callback=None):
m.params = self.params
return m

def _initialize_wrapped(self):
# pylint: disable=unused-argument
def _initialize_wrapped(self, X=None, Y=None):
# wrap sklearn/dask_ml according to type of X/Y
# pylint: disable=not-callable
return self.__wraps__(**self.params)

def fit(self, X, Y, W=None):
clf = self._initialize_wrapped()
clf = self._initialize_wrapped(X, Y)
Y = Y.reshape(-1)
if W is None or not self.supports_weights:
return self.__returns__(clf.fit(X, Y))
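
The rechunk-then-predict pattern added to `SklModel.predict` can be exercised on its own. A minimal sketch, assuming dask and scikit-learn are installed; the data and estimator below are made up for illustration and are not part of the commit:

```python
import numpy as np
import dask.array as da
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
X_np = rng.normal(size=(1000, 4))
y = (X_np[:, 0] > 0).astype(int)

clf = LogisticRegression().fit(X_np, y)

# A dask array whose columns are split across chunks.
X = da.from_array(X_np, chunks=(100, 2))

# rechunk({0: "auto", 1: -1}): auto-sized row chunks, while -1 merges all
# columns into a single chunk, so every block holds complete rows.
X = X.rechunk({0: "auto", 1: -1})

# np.asarray materializes the prediction as a concrete numpy array even
# when the wrapped model hands back a lazy dask result.
value = np.asarray(clf.predict(X))
print(value[:5])
```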
24 changes: 23 additions & 1 deletion Orange/classification/logistic_regression.py
@@ -1,11 +1,15 @@
import warnings

import numpy as np
import dask.array as da
import sklearn.linear_model as skl_linear_model

from Orange.classification import SklLearner, SklModel
from Orange.preprocess import Normalize
from Orange.preprocess.score import LearnerScorer
from Orange.data import Variable, DiscreteVariable


__all__ = ["LogisticRegressionLearner"]


@@ -41,8 +45,21 @@ def __init__(self, penalty="l2", dual=False, tol=0.0001, C=1.0,
super().__init__(preprocessors=preprocessors)
self.params = vars()

def _initialize_wrapped(self):
def _initialize_wrapped(self, X=None, Y=None):
params = self.params.copy()

if isinstance(X, da.Array) or isinstance(Y, da.Array):
try:
import dask_ml.linear_model

params["solver"] = "admm"
if params["penalty"] == "none":
params["solver"] = "gradient_descent"

return dask_ml.linear_model.LogisticRegression(**params)
except ImportError:
warnings.warn("dask_ml is not installed, using sklearn instead.")

# The default scikit-learn solver `lbfgs` (v0.22) does not support the
# l1 penalty.
solver, penalty = params.pop("solver"), params.get("penalty")
@@ -55,3 +72,8 @@ def _initialize_wrapped(self):

return self.__wraps__(**params)

def fit(self, X, Y, W=None):
if isinstance(X, da.Array) or isinstance(Y, da.Array):
X = X.rechunk({0: "auto", 1: -1})
Y = Y.rechunk({0: X.chunksize[0]})
return super().fit(X, Y, W)
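
The dispatch in `_initialize_wrapped` boils down to: construct dask_ml's estimator (with its own solver names) when the inputs are dask arrays, otherwise fall back to scikit-learn. A standalone sketch of that idea, using a hypothetical `make_logreg` helper that is not part of the commit and omitting the sklearn solver fix-ups the real method applies afterwards:

```python
import warnings
import dask.array as da
import sklearn.linear_model as skl_linear_model

def make_logreg(X, **params):
    """Hypothetical helper mirroring the dask_ml/sklearn dispatch above."""
    if isinstance(X, da.Array):
        try:
            import dask_ml.linear_model
            # dask_ml uses different solvers than sklearn: "admm" in
            # general, "gradient_descent" when no penalty is requested.
            params["solver"] = ("gradient_descent"
                                if params.get("penalty") == "none"
                                else "admm")
            return dask_ml.linear_model.LogisticRegression(**params)
        except ImportError:
            warnings.warn("dask_ml is not installed, using sklearn instead.")
    return skl_linear_model.LogisticRegression(**params)
```

The `fit` override above complements this: before handing data to either backend, rows are auto-chunked, columns merged into one chunk, and Y rechunked to align with X's row blocks.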
2 changes: 1 addition & 1 deletion Orange/classification/neural_network.py
@@ -26,7 +26,7 @@ class MLPClassifierWCallback(skl_nn.MLPClassifier, NIterCallbackMixin):
class NNClassificationLearner(NNBase, SklLearner):
__wraps__ = MLPClassifierWCallback

def _initialize_wrapped(self):
def _initialize_wrapped(self, X=None, Y=None):
clf = SklLearner._initialize_wrapped(self)
clf.orange_callback = getattr(self, "callback", None)
return clf
8 changes: 8 additions & 0 deletions Orange/data/dask.py
@@ -260,6 +260,14 @@ def __len__(self):
self.X.compute_chunk_sizes()
return self.X.shape[0]

def _filter_has_class(self, negate=False):
if self._Y.ndim == 1:
retain = np.isnan(self._Y)
else:
retain = np.any(np.isnan(self._Y), axis=1)
if not negate:
retain = np.logical_not(retain)
return self.from_table_rows(self, np.asarray(retain))

def dask_stats(X, compute_variance=False):
is_numeric = np.issubdtype(X.dtype, np.number)
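
`_filter_has_class` builds a boolean row mask from NaN checks on the class column(s), then negates it to keep labelled rows. The same logic in plain numpy, with a made-up Y for illustration:

```python
import numpy as np

Y = np.array([0.0, np.nan, 1.0, 0.0])  # one row is missing its class
retain = np.isnan(Y) if Y.ndim == 1 else np.any(np.isnan(Y), axis=1)
retain = np.logical_not(retain)        # negate=False: keep labelled rows
print(np.flatnonzero(retain))          # [0 2 3]
```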
2 changes: 1 addition & 1 deletion Orange/regression/neural_network.py
@@ -13,7 +13,7 @@ class MLPRegressorWCallback(skl_nn.MLPRegressor, NIterCallbackMixin):
class NNRegressionLearner(NNBase, SklLearner):
__wraps__ = MLPRegressorWCallback

def _initialize_wrapped(self):
def _initialize_wrapped(self, X=None, Y=None):
clf = SklLearner._initialize_wrapped(self)
clf.orange_callback = getattr(self, "callback", None)
return clf
2 changes: 1 addition & 1 deletion Orange/statistics/util.py
@@ -562,7 +562,7 @@ def nanmode(x, axis=0):
def unique(x, return_counts=False):
""" Equivalent of np.unique that supports sparse or dense matrices. """
if not sp.issparse(x):
return np.unique(x, return_counts=return_counts)
return np.unique(np.asarray(x), return_counts=return_counts)

implicit_zeros = sparse_count_implicit_zeros(x)
explicit_zeros = not np.all(x.data)
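
The small change to `unique` matters for dask inputs: `np.unique` dispatches to dask's lazy implementation on a dask array, while callers of this helper expect concrete numpy results. A short sketch, assuming dask is installed:

```python
import numpy as np
import dask.array as da

x = da.from_array(np.array([1, 2, 2, 3]), chunks=2)

# Without asarray, np.unique would dispatch to dask and return lazy arrays;
# forcing a numpy array first keeps the helper's return values concrete.
vals, counts = np.unique(np.asarray(x), return_counts=True)
print(vals, counts)  # [1 2 3] [1 2 1]
```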
9 changes: 9 additions & 0 deletions Orange/tests/test_logistic_regression.py
@@ -9,6 +9,7 @@
from Orange.data import Table, ContinuousVariable, Domain
from Orange.classification import LogisticRegressionLearner, Model
from Orange.evaluation import CrossValidation, CA
from Orange.tests.test_dasktable import temp_dasktable


class TestLogisticRegressionLearner(unittest.TestCase):
@@ -151,3 +152,11 @@ def test_auto_solver(self):
skl_clf = lr._initialize_wrapped()
self.assertEqual(skl_clf.solver, "liblinear")
self.assertEqual(skl_clf.penalty, "l1")


class TestLRLOnDask(TestLogisticRegressionLearner):
@classmethod
def setUpClass(cls):
cls.iris = temp_dasktable(Table('iris'))
cls.heart_disease = temp_dasktable(Table('heart_disease.tab'))
cls.zoo = temp_dasktable(Table('zoo'))
2 changes: 1 addition & 1 deletion Orange/widgets/evaluate/owpredictions.py
@@ -397,7 +397,7 @@ def _call_predictors(self):
results.domain = self.data.domain
results.row_indices = numpy.arange(len(self.data))
results.folds = (Ellipsis, )
results.actual = self.data.Y
results.actual = numpy.asarray(self.data.Y)
results.unmapped_probabilities = prob
results.unmapped_predicted = pred
results.probabilities = results.predicted = None
2 changes: 1 addition & 1 deletion Orange/widgets/evaluate/owtestandscore.py
@@ -950,7 +950,7 @@ def __update(self):
do_stratify = False
elif self.data.domain.class_var.is_discrete:
least = min(filter(None,
np.bincount(self.data.Y.astype(int))))
np.bincount(np.asarray(self.data.Y, dtype=int))))
if least < k:
self.Warning.cant_stratify(k, least)
do_stratify = False
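
Same idea in the stratification check: `np.bincount` needs concrete integer labels, so a dask-backed Y column is materialized first. A toy illustration with made-up labels:

```python
import numpy as np
import dask.array as da

Y = da.from_array(np.array([0.0, 1.0, 1.0, 2.0]), chunks=2)

# np.asarray(..., dtype=int) computes the dask column into integer labels.
least = min(filter(None, np.bincount(np.asarray(Y, dtype=int))))
print(least)  # 1 -- the smallest non-empty class
```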
20 changes: 15 additions & 5 deletions Orange/widgets/model/owlogisticregression.py
@@ -5,6 +5,7 @@
from orangewidget.report import bool_str

from Orange.data import Table, Domain, ContinuousVariable, StringVariable
from Orange.data.dask import DaskTable
from Orange.classification.logistic_regression import LogisticRegressionLearner
from Orange.widgets import settings, gui
from Orange.widgets.utils.owlearnerwidget import OWBaseLearner
@@ -139,14 +140,23 @@ def get_learner_parameters(self):
self.penalty_types[self.penalty_type], self.C_s[self.C_index],
bool_str(self.class_weight))),)

def check_data(self):
valid = super().check_data()
if valid and isinstance(self.data, DaskTable) \
and len(self.data.domain.class_var.values) > 2:
self.Error.data_error("Data contains too many target values.")
valid = False
return valid


def create_coef_table(classifier):
i = classifier.intercept
c = classifier.coefficients
if c.shape[0] > 2:
values = [classifier.domain.class_var.values[int(i)] for i in classifier.used_vals[0]]
i = np.atleast_1d(classifier.intercept)
c = np.atleast_2d(classifier.coefficients)
if c.shape[0] > 2: # multi-class
values = [classifier.domain.class_var.values[int(i)]
for i in np.asarray(classifier.used_vals[0])]
else:
values = [classifier.domain.class_var.values[int(classifier.used_vals[0][1])]]
values = [classifier.domain.class_var.values[int(np.asarray(classifier.used_vals[0])[1])]]
domain = Domain([ContinuousVariable(value) for value in values],
metas=[StringVariable("name")])
coefs = np.vstack((i.reshape(1, len(i)), c.T))
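
The `np.atleast_1d`/`np.atleast_2d` guards in `create_coef_table` normalize shapes so the binary case (where a backend may hand back a scalar intercept and a flat coefficient vector, as dask_ml's model can) stacks the same way as sklearn's 2-d output. A shape-only sketch with made-up values:

```python
import numpy as np

for i, c in [(0.3, np.array([1.0, -2.0])),                 # flat, binary
             (np.array([0.3]), np.array([[1.0, -2.0]]))]:  # sklearn-style
    i = np.atleast_1d(i)
    c = np.atleast_2d(c)
    coefs = np.vstack((i.reshape(1, len(i)), c.T))
    print(coefs.shape)  # (3, 1) either way
```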
