Dask-compatible logistic regression
noahnovsak committed Jul 20, 2023
1 parent 27f8e75 commit 6f14e87
Showing 10 changed files with 68 additions and 14 deletions.
11 changes: 8 additions & 3 deletions Orange/base.py
@@ -7,6 +7,7 @@

import numpy as np
import scipy
import dask.array as da

from Orange.data import Table, Storage, Instance, Value, Domain
from Orange.data.filter import HasClass
@@ -507,7 +508,9 @@ def __init__(self, skl_model):
self.skl_model = skl_model

def predict(self, X):
value = self.skl_model.predict(X)
if isinstance(X, da.Array):
X = X.rechunk({0: "auto", 1: -1})
value = np.asarray(self.skl_model.predict(X))
        # SVM has a probability attribute which defines whether the method computes probabilities
has_prob_attr = hasattr(self.skl_model, "probability")
if (has_prob_attr and self.skl_model.probability
@@ -581,12 +584,14 @@ def __call__(self, data, progress_callback=None):
m.params = self.params
return m

def _initialize_wrapped(self):
# pylint: disable=unused-argument
def _initialize_wrapped(self, X=None, Y=None):
# wrap sklearn/dask_ml according to type of X/Y
# pylint: disable=not-callable
return self.__wraps__(**self.params)

def fit(self, X, Y, W=None):
clf = self._initialize_wrapped()
clf = self._initialize_wrapped(X, Y)
Y = Y.reshape(-1)
if W is None or not self.supports_weights:
return self.__returns__(clf.fit(X, Y))
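
The rechunk-then-predict pattern added to `SklModel.predict` can be exercised on its own. A minimal sketch, assuming dask and scikit-learn are installed; the data and estimator below are made up for illustration and are not part of the commit:

```python
import numpy as np
import dask.array as da
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
X_np = rng.normal(size=(1000, 4))
y = (X_np[:, 0] > 0).astype(int)

clf = LogisticRegression().fit(X_np, y)

# A dask array whose columns are split across chunks.
X = da.from_array(X_np, chunks=(100, 2))

# rechunk({0: "auto", 1: -1}): auto-sized row chunks, while -1 merges all
# columns into a single chunk, so every block holds complete rows.
X = X.rechunk({0: "auto", 1: -1})

# np.asarray materializes the prediction as a concrete numpy array even
# when the wrapped model hands back a lazy dask result.
value = np.asarray(clf.predict(X))
print(value[:5])
```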
24 changes: 23 additions & 1 deletion Orange/classification/logistic_regression.py
@@ -1,11 +1,15 @@
import warnings

import numpy as np
import dask.array as da
import sklearn.linear_model as skl_linear_model

from Orange.classification import SklLearner, SklModel
from Orange.preprocess import Normalize
from Orange.preprocess.score import LearnerScorer
from Orange.data import Variable, DiscreteVariable


__all__ = ["LogisticRegressionLearner"]


@@ -41,8 +45,21 @@ def __init__(self, penalty="l2", dual=False, tol=0.0001, C=1.0,
super().__init__(preprocessors=preprocessors)
self.params = vars()

def _initialize_wrapped(self):
def _initialize_wrapped(self, X=None, Y=None):
params = self.params.copy()

if isinstance(X, da.Array) or isinstance(Y, da.Array):
try:
import dask_ml.linear_model

params["solver"] = "admm"
if params["penalty"] == "none":
params["solver"] = "gradient_descent"

return dask_ml.linear_model.LogisticRegression(**params)
except ImportError:
warnings.warn("dask_ml is not installed, using sklearn instead.")

# The default scikit-learn solver `lbfgs` (v0.22) does not support the
# l1 penalty.
solver, penalty = params.pop("solver"), params.get("penalty")
@@ -55,3 +72,8 @@ def _initialize_wrapped(self):

return self.__wraps__(**params)

def fit(self, X, Y, W=None):
if isinstance(X, da.Array) or isinstance(Y, da.Array):
X = X.rechunk({0: "auto", 1: -1})
Y = Y.rechunk({0: X.chunksize[0]})
return super().fit(X, Y, W)
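
The dispatch in `_initialize_wrapped` boils down to: construct dask_ml's estimator (with its own solver names) when the inputs are dask arrays, otherwise fall back to scikit-learn. A standalone sketch of that idea, using a hypothetical `make_logreg` helper that is not part of the commit and omitting the sklearn solver fix-ups the real method applies afterwards:

```python
import warnings
import dask.array as da
import sklearn.linear_model as skl_linear_model

def make_logreg(X, **params):
    """Hypothetical helper mirroring the dask_ml/sklearn dispatch above."""
    if isinstance(X, da.Array):
        try:
            import dask_ml.linear_model
            # dask_ml uses different solvers than sklearn: "admm" in
            # general, "gradient_descent" when no penalty is requested.
            params["solver"] = ("gradient_descent"
                                if params.get("penalty") == "none"
                                else "admm")
            return dask_ml.linear_model.LogisticRegression(**params)
        except ImportError:
            warnings.warn("dask_ml is not installed, using sklearn instead.")
    return skl_linear_model.LogisticRegression(**params)
```

The `fit` override above complements this: before handing data to either backend, rows are auto-chunked, columns merged into one chunk, and Y rechunked to align with X's row blocks.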
2 changes: 1 addition & 1 deletion Orange/classification/neural_network.py
@@ -26,7 +26,7 @@ class MLPClassifierWCallback(skl_nn.MLPClassifier, NIterCallbackMixin):
class NNClassificationLearner(NNBase, SklLearner):
__wraps__ = MLPClassifierWCallback

def _initialize_wrapped(self):
def _initialize_wrapped(self, X=None, Y=None):
clf = SklLearner._initialize_wrapped(self)
clf.orange_callback = getattr(self, "callback", None)
return clf
8 changes: 8 additions & 0 deletions Orange/data/dask.py
@@ -260,6 +260,14 @@ def __len__(self):
self.X.compute_chunk_sizes()
return self.X.shape[0]

def _filter_has_class(self, negate=False):
if self._Y.ndim == 1:
retain = np.isnan(self._Y)
else:
retain = np.any(np.isnan(self._Y), axis=1)
if not negate:
retain = np.logical_not(retain)
return self.from_table_rows(self, np.asarray(retain))

def dask_stats(X, compute_variance=False):
is_numeric = np.issubdtype(X.dtype, np.number)
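
`_filter_has_class` builds a boolean row mask from NaN checks on the class column(s), then negates it to keep labelled rows. The same logic in plain numpy, with a made-up Y for illustration:

```python
import numpy as np

Y = np.array([0.0, np.nan, 1.0, 0.0])  # one row is missing its class
retain = np.isnan(Y) if Y.ndim == 1 else np.any(np.isnan(Y), axis=1)
retain = np.logical_not(retain)        # negate=False: keep labelled rows
print(np.flatnonzero(retain))          # [0 2 3]
```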
2 changes: 1 addition & 1 deletion Orange/regression/neural_network.py
@@ -13,7 +13,7 @@ class MLPRegressorWCallback(skl_nn.MLPRegressor, NIterCallbackMixin):
class NNRegressionLearner(NNBase, SklLearner):
__wraps__ = MLPRegressorWCallback

def _initialize_wrapped(self):
def _initialize_wrapped(self, X=None, Y=None):
clf = SklLearner._initialize_wrapped(self)
clf.orange_callback = getattr(self, "callback", None)
return clf
2 changes: 1 addition & 1 deletion Orange/statistics/util.py
@@ -562,7 +562,7 @@ def nanmode(x, axis=0):
def unique(x, return_counts=False):
""" Equivalent of np.unique that supports sparse or dense matrices. """
if not sp.issparse(x):
return np.unique(x, return_counts=return_counts)
return np.unique(np.asarray(x), return_counts=return_counts)

implicit_zeros = sparse_count_implicit_zeros(x)
explicit_zeros = not np.all(x.data)
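
The small change to `unique` matters for dask inputs: `np.unique` dispatches to dask's lazy implementation on a dask array, while callers of this helper expect concrete numpy results. A short sketch, assuming dask is installed:

```python
import numpy as np
import dask.array as da

x = da.from_array(np.array([1, 2, 2, 3]), chunks=2)

# Without asarray, np.unique would dispatch to dask and return lazy arrays;
# forcing a numpy array first keeps the helper's return values concrete.
vals, counts = np.unique(np.asarray(x), return_counts=True)
print(vals, counts)  # [1 2 3] [1 2 1]
```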
9 changes: 9 additions & 0 deletions Orange/tests/test_logistic_regression.py
@@ -9,6 +9,7 @@
from Orange.data import Table, ContinuousVariable, Domain
from Orange.classification import LogisticRegressionLearner, Model
from Orange.evaluation import CrossValidation, CA
from Orange.tests.test_dasktable import temp_dasktable


class TestLogisticRegressionLearner(unittest.TestCase):
@@ -151,3 +152,11 @@ def test_auto_solver(self):
skl_clf = lr._initialize_wrapped()
self.assertEqual(skl_clf.solver, "liblinear")
self.assertEqual(skl_clf.penalty, "l1")


class TestLRLOnDask(TestLogisticRegressionLearner):
@classmethod
def setUpClass(cls):
cls.iris = temp_dasktable(Table('iris'))
cls.heart_disease = temp_dasktable(Table('heart_disease.tab'))
cls.zoo = temp_dasktable(Table('zoo'))
2 changes: 1 addition & 1 deletion Orange/widgets/evaluate/owpredictions.py
@@ -397,7 +397,7 @@ def _call_predictors(self):
results.domain = self.data.domain
results.row_indices = numpy.arange(len(self.data))
results.folds = (Ellipsis, )
results.actual = self.data.Y
results.actual = numpy.asarray(self.data.Y)
results.unmapped_probabilities = prob
results.unmapped_predicted = pred
results.probabilities = results.predicted = None
2 changes: 1 addition & 1 deletion Orange/widgets/evaluate/owtestandscore.py
@@ -950,7 +950,7 @@ def __update(self):
do_stratify = False
elif self.data.domain.class_var.is_discrete:
least = min(filter(None,
np.bincount(self.data.Y.astype(int))))
np.bincount(np.asarray(self.data.Y, dtype=int))))
if least < k:
self.Warning.cant_stratify(k, least)
do_stratify = False
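
Same idea in the stratification check: `np.bincount` needs concrete integer labels, so a dask-backed Y column is materialized first. A toy illustration with made-up labels:

```python
import numpy as np
import dask.array as da

Y = da.from_array(np.array([0.0, 1.0, 1.0, 2.0]), chunks=2)

# np.asarray(..., dtype=int) computes the dask column into integer labels.
least = min(filter(None, np.bincount(np.asarray(Y, dtype=int))))
print(least)  # 1 -- the smallest non-empty class
```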
20 changes: 15 additions & 5 deletions Orange/widgets/model/owlogisticregression.py
@@ -5,6 +5,7 @@
from orangewidget.report import bool_str

from Orange.data import Table, Domain, ContinuousVariable, StringVariable
from Orange.data.dask import DaskTable
from Orange.classification.logistic_regression import LogisticRegressionLearner
from Orange.widgets import settings, gui
from Orange.widgets.utils.owlearnerwidget import OWBaseLearner
@@ -139,14 +140,23 @@ def get_learner_parameters(self):
self.penalty_types[self.penalty_type], self.C_s[self.C_index],
bool_str(self.class_weight))),)

def check_data(self):
valid = super().check_data()
if valid and isinstance(self.data, DaskTable) \
and len(self.data.domain.class_var.values) > 2:
self.Error.data_error("Data contains too many target values.")
valid = False
return valid


def create_coef_table(classifier):
i = classifier.intercept
c = classifier.coefficients
if c.shape[0] > 2:
values = [classifier.domain.class_var.values[int(i)] for i in classifier.used_vals[0]]
i = np.atleast_1d(classifier.intercept)
c = np.atleast_2d(classifier.coefficients)
if c.shape[0] > 2: # multi-class
values = [classifier.domain.class_var.values[int(i)]
for i in np.asarray(classifier.used_vals[0])]
else:
values = [classifier.domain.class_var.values[int(classifier.used_vals[0][1])]]
values = [classifier.domain.class_var.values[int(np.asarray(classifier.used_vals[0])[1])]]
domain = Domain([ContinuousVariable(value) for value in values],
metas=[StringVariable("name")])
coefs = np.vstack((i.reshape(1, len(i)), c.T))
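
The `np.atleast_1d`/`np.atleast_2d` guards in `create_coef_table` normalize shapes so the binary case (where a backend may hand back a scalar intercept and a flat coefficient vector, as dask_ml's model can) stacks the same way as sklearn's 2-d output. A shape-only sketch with made-up values:

```python
import numpy as np

for i, c in [(0.3, np.array([1.0, -2.0])),                 # flat, binary
             (np.array([0.3]), np.array([[1.0, -2.0]]))]:  # sklearn-style
    i = np.atleast_1d(i)
    c = np.atleast_2d(c)
    coefs = np.vstack((i.reshape(1, len(i)), c.T))
    print(coefs.shape)  # (3, 1) either way
```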
