diff --git a/Orange/base.py b/Orange/base.py index 07500b96a5b..8f518aa3af4 100644 --- a/Orange/base.py +++ b/Orange/base.py @@ -7,6 +7,7 @@ import numpy as np import scipy +import dask.array as da from Orange.data import Table, Storage, Instance, Value, Domain from Orange.data.filter import HasClass @@ -507,7 +508,9 @@ def __init__(self, skl_model): self.skl_model = skl_model def predict(self, X): - value = self.skl_model.predict(X) + if isinstance(X, da.Array): + X = X.rechunk({0: "auto", 1: -1}) + value = np.asarray(self.skl_model.predict(X)) # SVM has probability attribute which defines if method compute probs has_prob_attr = hasattr(self.skl_model, "probability") if (has_prob_attr and self.skl_model.probability @@ -581,12 +584,14 @@ def __call__(self, data, progress_callback=None): m.params = self.params return m - def _initialize_wrapped(self): + # pylint: disable=unused-argument + def _initialize_wrapped(self, X=None, Y=None): + # wrap sklearn/dask_ml according to type of X/Y # pylint: disable=not-callable return self.__wraps__(**self.params) def fit(self, X, Y, W=None): - clf = self._initialize_wrapped() + clf = self._initialize_wrapped(X, Y) Y = Y.reshape(-1) if W is None or not self.supports_weights: return self.__returns__(clf.fit(X, Y)) diff --git a/Orange/classification/logistic_regression.py b/Orange/classification/logistic_regression.py index aeb4fbfc1cb..b9f84575211 100644 --- a/Orange/classification/logistic_regression.py +++ b/Orange/classification/logistic_regression.py @@ -1,4 +1,7 @@ +import warnings + import numpy as np +import dask.array as da import sklearn.linear_model as skl_linear_model from Orange.classification import SklLearner, SklModel @@ -6,6 +9,7 @@ from Orange.preprocess.score import LearnerScorer from Orange.data import Variable, DiscreteVariable + __all__ = ["LogisticRegressionLearner"] @@ -41,8 +45,21 @@ def __init__(self, penalty="l2", dual=False, tol=0.0001, C=1.0, super().__init__(preprocessors=preprocessors) self.params = vars() - def _initialize_wrapped(self): + def _initialize_wrapped(self, X=None, Y=None): params = self.params.copy() + + if isinstance(X, da.Array) or isinstance(Y, da.Array): + try: + import dask_ml.linear_model + + params["solver"] = "admm" + if params["penalty"] == "none": + params["solver"] = "gradient_descent" + + return dask_ml.linear_model.LogisticRegression(**params) + except ImportError: + warnings.warn("dask_ml is not installed, using sklearn instead.") + # The default scikit-learn solver `lbfgs` (v0.22) does not support the # l1 penalty. solver, penalty = params.pop("solver"), params.get("penalty") @@ -55,3 +72,8 @@ def _initialize_wrapped(self): return self.__wraps__(**params) + def fit(self, X, Y, W=None): + if isinstance(X, da.Array) or isinstance(Y, da.Array): + X = X.rechunk({0: "auto", 1: -1}) + Y = Y.rechunk({0: X.chunksize[0]}) + return super().fit(X, Y, W) diff --git a/Orange/classification/neural_network.py b/Orange/classification/neural_network.py index 53dff79bed4..ee29cfff330 100644 --- a/Orange/classification/neural_network.py +++ b/Orange/classification/neural_network.py @@ -26,7 +26,7 @@ class MLPClassifierWCallback(skl_nn.MLPClassifier, NIterCallbackMixin): class NNClassificationLearner(NNBase, SklLearner): __wraps__ = MLPClassifierWCallback - def _initialize_wrapped(self): + def _initialize_wrapped(self, X=None, Y=None): clf = SklLearner._initialize_wrapped(self) clf.orange_callback = getattr(self, "callback", None) return clf diff --git a/Orange/data/dask.py b/Orange/data/dask.py index 6bb0f077fd1..5c2dc6286d9 100644 --- a/Orange/data/dask.py +++ b/Orange/data/dask.py @@ -260,6 +260,14 @@ def __len__(self): self.X.compute_chunk_sizes() return self.X.shape[0] + def _filter_has_class(self, negate=False): + if self._Y.ndim == 1: + retain = np.isnan(self._Y) + else: + retain = np.any(np.isnan(self._Y), axis=1) + if not negate: + retain = np.logical_not(retain) + return self.from_table_rows(self, np.asarray(retain)) def dask_stats(X, compute_variance=False): is_numeric = np.issubdtype(X.dtype, np.number) diff --git a/Orange/regression/neural_network.py b/Orange/regression/neural_network.py index 7a8b553756d..4c384411ad3 100644 --- a/Orange/regression/neural_network.py +++ b/Orange/regression/neural_network.py @@ -13,7 +13,7 @@ class MLPRegressorWCallback(skl_nn.MLPRegressor, NIterCallbackMixin): class NNRegressionLearner(NNBase, SklLearner): __wraps__ = MLPRegressorWCallback - def _initialize_wrapped(self): + def _initialize_wrapped(self, X=None, Y=None): clf = SklLearner._initialize_wrapped(self) clf.orange_callback = getattr(self, "callback", None) return clf diff --git a/Orange/statistics/util.py b/Orange/statistics/util.py index e3b9374aaf3..48f75e9ef83 100644 --- a/Orange/statistics/util.py +++ b/Orange/statistics/util.py @@ -562,7 +562,7 @@ def nanmode(x, axis=0): def unique(x, return_counts=False): """ Equivalent of np.unique that supports sparse or dense matrices. """ if not sp.issparse(x): - return np.unique(x, return_counts=return_counts) + return np.unique(np.asarray(x), return_counts=return_counts) implicit_zeros = sparse_count_implicit_zeros(x) explicit_zeros = not np.all(x.data) diff --git a/Orange/tests/test_logistic_regression.py b/Orange/tests/test_logistic_regression.py index 4ab3cdc3a68..2fd4298e96b 100644 --- a/Orange/tests/test_logistic_regression.py +++ b/Orange/tests/test_logistic_regression.py @@ -9,6 +9,7 @@ from Orange.data import Table, ContinuousVariable, Domain from Orange.classification import LogisticRegressionLearner, Model from Orange.evaluation import CrossValidation, CA +from Orange.tests.test_dasktable import temp_dasktable class TestLogisticRegressionLearner(unittest.TestCase): @@ -151,3 +152,11 @@ def test_auto_solver(self): skl_clf = lr._initialize_wrapped() self.assertEqual(skl_clf.solver, "liblinear") self.assertEqual(skl_clf.penalty, "l1") + + +class TestLRLOnDask(TestLogisticRegressionLearner): + @classmethod + def setUpClass(cls): + cls.iris = temp_dasktable(Table('iris')) + cls.heart_disease = temp_dasktable(Table('heart_disease.tab')) + cls.zoo = temp_dasktable(Table('zoo')) diff --git a/Orange/widgets/evaluate/owpredictions.py b/Orange/widgets/evaluate/owpredictions.py index b9b5ec36e74..72c478a961e 100644 --- a/Orange/widgets/evaluate/owpredictions.py +++ b/Orange/widgets/evaluate/owpredictions.py @@ -397,7 +397,7 @@ def _call_predictors(self): results.domain = self.data.domain results.row_indices = numpy.arange(len(self.data)) results.folds = (Ellipsis, ) - results.actual = self.data.Y + results.actual = numpy.asarray(self.data.Y) results.unmapped_probabilities = prob results.unmapped_predicted = pred results.probabilities = results.predicted = None diff --git a/Orange/widgets/evaluate/owtestandscore.py b/Orange/widgets/evaluate/owtestandscore.py index ac0c839d592..b67ed0faf8c 100644 --- a/Orange/widgets/evaluate/owtestandscore.py +++ b/Orange/widgets/evaluate/owtestandscore.py @@ -950,7 +950,7 @@ def __update(self): do_stratify = False elif self.data.domain.class_var.is_discrete: least = min(filter(None, - np.bincount(self.data.Y.astype(int)))) + np.bincount(np.asarray(self.data.Y, dtype=int)))) if least < k: self.Warning.cant_stratify(k, least) do_stratify = False diff --git a/Orange/widgets/model/owlogisticregression.py b/Orange/widgets/model/owlogisticregression.py index 63c401b7797..7782c0baddc 100644 --- a/Orange/widgets/model/owlogisticregression.py +++ b/Orange/widgets/model/owlogisticregression.py @@ -5,6 +5,7 @@ from orangewidget.report import bool_str from Orange.data import Table, Domain, ContinuousVariable, StringVariable +from Orange.data.dask import DaskTable from Orange.classification.logistic_regression import LogisticRegressionLearner from Orange.widgets import settings, gui from Orange.widgets.utils.owlearnerwidget import OWBaseLearner @@ -139,14 +140,23 @@ def get_learner_parameters(self): self.penalty_types[self.penalty_type], self.C_s[self.C_index], bool_str(self.class_weight))),) + def check_data(self): + valid = super().check_data() + if valid and isinstance(self.data, DaskTable) \ + and len(self.data.domain.class_var.values) > 2: + self.Error.data_error("Data contains too many target values.") + valid = False + return valid + def create_coef_table(classifier): - i = classifier.intercept - c = classifier.coefficients - if c.shape[0] > 2: - values = [classifier.domain.class_var.values[int(i)] for i in classifier.used_vals[0]] + i = np.atleast_1d(classifier.intercept) + c = np.atleast_2d(classifier.coefficients) + if c.shape[0] > 2: # multi-class + values = [classifier.domain.class_var.values[int(i)] + for i in np.asarray(classifier.used_vals[0])] else: - values = [classifier.domain.class_var.values[int(classifier.used_vals[0][1])]] + values = [classifier.domain.class_var.values[int(np.asarray(classifier.used_vals[0])[1])]] domain = Domain([ContinuousVariable(value) for value in values], metas=[StringVariable("name")]) coefs = np.vstack((i.reshape(1, len(i)), c.T))