BouchardLab · JesseLivezey · Jun 17, 2019 · May 7, 2019 · May 8, 2019 · May 8, 2019
diff --git a/Makefile b/Makefile
@@ -38,7 +38,7 @@ test:
 	tox
 
 flake:
-	$(FLAKE) src/
+	$(FLAKE) pyuoi/
 	$(FLAKE) tests/
 	$(FLAKE) --ignore E402,W504 docs/gallery
 

diff --git a/pytest.ini b/pytest.ini
@@ -1,2 +1,5 @@
 [pytest]
 norecursedirs = mpi
+markers =
+    fast: mark a test as a fast test e.g. unit test
+    slow: mark a test as a slow test e.g. end-to-end test
diff --git a/pyuoi/decomposition/NMF.py b/pyuoi/decomposition/NMF.py
@@ -1,8 +1,11 @@
 import scipy.optimize as spo
 import numpy as np
+import logging
 
 from .base import AbstractDecompositionModel
 
+from ..utils import check_logger
+
 from sklearn.decomposition import NMF as skNMF
 from sklearn.cluster import DBSCAN
 from sklearn.preprocessing import normalize
@@ -41,10 +44,22 @@ class UoI_NMF_Base(AbstractDecompositionModel):
     cons_meth : function
         The method for computing consensus bases after clustering. If None,
         uses np.median.
+
+    random_state : int, RandomState instance or None, default None
+        The seed of the pseudo random number generator used to draw the
+        bootstrap samples.  If int, random_state is the seed used by the
+        random number generator; if RandomState instance, random_state is the
+        random number generator; if None, the random number generator is the
+        RandomState instance used by `np.random`.
+
+    logger : Logger, default None
+        The logger to use for messages when ``verbose=True`` in ``fit``.
+        If *None* is passed, a logger that writes to ``sys.stdout`` will be
+        used.
     """
     def __init__(
         self, n_boots=10, ranks=None, nmf=None, cluster=None, nnreg=None,
-        cons_meth=None, random_state=None
+        cons_meth=None, random_state=None, logger=None
     ):
         self.__initialize(
             n_boots=n_boots,
@@ -53,6 +68,7 @@ def __init__(
             cluster=cluster,
             nnreg=nnreg,
             cons_meth=cons_meth,
+            logger=logger,
             random_state=random_state
         )
 
@@ -71,6 +87,7 @@ def __initialize(self, **kwargs):
 
         self.n_boots = n_boots
         self.components_ = None
+        logger = kwargs['logger']
 
         # initialize NMF ranks to use
         if ranks is not None:
@@ -128,10 +145,15 @@ def __initialize(self, **kwargs):
         self.components_ = None
         self.bases_samples_ = None
         self.bases_samples_labels_ = None
-        self.boostraps_ = None
+        self.bootstraps_ = None
+
+        self.comm = None
 
-    def fit(self, X):
-        """Perform first phase of UoI NMF decomposition.
+        self._logger = check_logger(logger, 'uoi_decomposition', self.comm)
+
+    def fit(self, X, y=None, verbose=False):
+        """
+        Perform first phase of UoI NMF decomposition.
 
         Compute H matrix.
 
@@ -142,6 +164,11 @@ def fit(self, X):
         X : ndarray, shape (n_samples, n_features)
             Data matrix to be decomposed.
         """
+        if verbose:
+            self._logger.setLevel(logging.DEBUG)
+        else:
+            self._logger.setLevel(logging.WARNING)
+
         check_non_negative(X, 'UoI NMF')
         n_samples, n_features = X.shape
 
@@ -151,6 +178,7 @@ def fit(self, X):
 
         rep_idx = self._rand.randint(n_samples, size=(self.n_boots, n_samples))
         for i in range(self.n_boots):
+            self._logger.info("bootstrap %d" % i)
             # compute NMF bases for k across bootstrap replicates
             H_i = i * k_tot
             sample = X[rep_idx[i]]
@@ -165,11 +193,14 @@ def fit(self, X):
         H_samples = normalize(H_samples, norm='l2', axis=1)
 
         # cluster all bases
+        self._logger.info("clustering bases samples")
         labels = self.cluster.fit_predict(H_samples)
 
         # compute consensus bases from clusters
         cluster_ids = np.unique(labels[labels != -1])
         n_clusters = cluster_ids.size
+        self._logger.info("found %d bases, computing consensus bases" %
+                          n_clusters)
         H_cons = np.zeros((n_clusters, n_features))
 
         for c_id in cluster_ids:
@@ -180,6 +211,9 @@ def fit(self, X):
 
         self.components_ = H_cons
         self.n_components = self.components_.shape[0]
+        self.bases_samples_ = H_samples
+        self.bases_samples_labels_ = labels
+        self.bootstraps_ = rep_idx
         self.reconstruction_err_ = None
         return self
 
@@ -222,23 +256,22 @@ def transform(self, X, reconstruction_err=True):
 
         return W
 
-    def fit_transform(self, X, reconstruction_err=True):
-        """Transform the data X according to the fitted UoI-NMF model.
-
-        Parameters
-        ----------
-        X : array-like, shape (n_samples, n_features)
-            Data matrix to be decomposed.
-
-        reconstruction_err : bool, default True
-            True to compute reconstruction error, False otherwise.
-
-        Returns
-        -------
-        W : array-like, shape (n_samples, n_components)
-            Transformed data.
+    def fit_transform(self, X, y=None, reconstruction_err=True, verbose=None):
+        """
+        Fit the UoI-NMF model to X, then transform X with the fitted model.
+
+        Args:
+            X : array-like; shape (n_samples, n_features)
+            y : ignored
+            reconstruction_err : bool, default True
+                True to compute reconstruction error, False otherwise.
+            verbose : bool, default None
+                Passed to ``fit``; if true, progress messages are logged.
+        Returns:
+            W : array-like; shape (n_samples, n_components)
+                Transformed data.
         """
-        self.fit(X)
+        self.fit(X, verbose=verbose)
         return self.transform(X, reconstruction_err=reconstruction_err)
 
     def inverse_transform(self, W):
@@ -352,14 +385,26 @@ class UoI_NMF(UoI_NMF_Base):
         of the construction and query, as well as the memory required
         to store the tree. The optimal value depends
         on the nature of the problem.
+
+    random_state : int, RandomState instance or None, default None
+        The seed of the pseudo random number generator used to draw the
+        bootstrap samples.  If int, random_state is the seed used by the
+        random number generator; if RandomState instance, random_state is the
+        random number generator; if None, the random number generator is the
+        RandomState instance used by `np.random`.
+
+    logger : Logger, default None
+        The logger to use for messages when ``verbose=True`` in ``fit``.
+        If *None* is passed, a logger that writes to ``sys.stdout`` will be
+        used.
     """
     def __init__(
         self, n_boots, ranks=None,
         nmf_init='random', nmf_solver='mu', nmf_beta_loss='kullback-leibler',
         nmf_tol=0.0001, nmf_max_iter=400,
         db_eps=0.5, db_min_samples=None, db_metric='euclidean',
         db_metric_params=None, db_algorithm='auto', db_leaf_size=30,
-        random_state=None
+        random_state=None, logger=None,
     ):
         # create NMF solver
         nmf = skNMF(init=nmf_init,
@@ -384,5 +429,6 @@ def __init__(
             cluster=dbscan,
             nnreg=None,
             cons_meth=np.median,
-            random_state=random_state
+            random_state=random_state,
+            logger=logger
         )
diff --git a/pyuoi/linear_model/base.py b/pyuoi/linear_model/base.py
@@ -1,17 +1,21 @@
 import abc as _abc
 import six as _six
 import numpy as np
+import logging
 
 from sklearn.linear_model.base import SparseCoefMixin
 from sklearn.metrics import r2_score, accuracy_score, log_loss
 from sklearn.model_selection import train_test_split
 from sklearn.utils import check_X_y
 from sklearn.preprocessing import StandardScaler
 
+from scipy.sparse import issparse, csr_matrix
+
 from pyuoi import utils
 from pyuoi.mpi_utils import (Gatherv_rows, Bcast_from_root)
 
 from .utils import stability_selection_to_threshold, intersection
+from ..utils import check_logger
 
 
 class AbstractUoILinearModel(
@@ -75,6 +79,11 @@ class AbstractUoILinearModel(
     comm : MPI communicator, default None
         If passed, the selection and estimation steps are parallelized.
 
+    logger : Logger, default None
+        The logger to use for messages when ``verbose=True`` in ``fit``.
+        If *None* is passed, a logger that writes to ``sys.stdout`` will be
+        used.
+
     Attributes
     ----------
     coef_ : array, shape (n_features,) or (n_targets, n_features)
@@ -92,7 +101,7 @@ def __init__(self, n_boots_sel=48, n_boots_est=48, selection_frac=0.9,
                  estimation_frac=0.9, stability_selection=1.,
                  fit_intercept=True, standardize=True,
                  shared_support=True, max_iter=None, random_state=None,
-                 comm=None):
+                 comm=None, logger=None):
         # data split fractions
         self.selection_frac = selection_frac
         self.estimation_frac = estimation_frac
@@ -124,6 +133,8 @@ def __init__(self, n_boots_sel=48, n_boots_est=48, selection_frac=0.9,
 
         self.n_supports_ = None
 
+        self._logger = check_logger(logger, 'uoi_linear_model', self.comm)
+
     @_abc.abstractproperty
     def estimation_score(self):
         pass
@@ -145,6 +156,10 @@ def _pre_fit(self, X, y):
         """Perform class-specific setup for fit().
         """
         if self.standardize:
+            if self.fit_intercept and issparse(X):
+                msg = ("Cannot center sparse matrices: "
+                       "pass `fit_intercept=False`")
+                raise ValueError(msg)
             self._X_scaler = StandardScaler(with_mean=self.fit_intercept)
             X = self._X_scaler.fit_transform(X)
         return X, y
@@ -196,6 +211,11 @@ def fit(self, X, y, stratify=None, verbose=False):
             A switch indicating whether the fitting should print out messages
             displaying progress.
         """
+        if verbose:
+            self._logger.setLevel(logging.DEBUG)
+        else:
+            self._logger.setLevel(logging.WARNING)
+
         X, y = self._pre_fit(X, y)
 
         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
@@ -235,7 +255,7 @@ def fit(self, X, y, stratify=None, verbose=False):
             my_boots = dict((task_idx, None) for task_idx in tasks)
 
         for boot in range(self.n_boots_sel):
-            if self.comm is not None:
+            if size > 1:
                 if rank == 0:
                     rvals = train_test_split(np.arange(X.shape[0]),
                                              test_size=1 - self.selection_frac,
@@ -280,12 +300,20 @@ def fit(self, X, y, stratify=None, verbose=False):
             y_test = y[idxs_test]
 
             # fit the coefficients
+            if size > self.n_boots_sel:
+                msg = ("selection bootstrap %d, "
+                       "regularization parameter set %d"
+                       % (boot_idx, reg_idx))
+                self._logger.info(msg)
+
+            else:
+                self._logger.info("selection bootstrap %d" % (boot_idx))
             selection_coefs[ii] = np.squeeze(
                 self.uoi_selection_sweep(X_rep, y_rep, my_reg_params))
 
         # if distributed, gather selection coefficients to 0,
         # perform intersection, and broadcast results
-        if self.comm is not None:
+        if size > 1:
             selection_coefs = Gatherv_rows(selection_coefs, self.comm, root=0)
             if rank == 0:
                 if size > self.n_boots_sel:
@@ -306,6 +334,9 @@ def fit(self, X, y, stratify=None, verbose=False):
 
         self.n_supports_ = self.supports_.shape[0]
 
+        if rank == 0:
+            self._logger.info("Found %d supports" % self.n_supports_)
+
         #####################
         # Estimation Module #
         #####################
@@ -317,7 +348,7 @@ def fit(self, X, y, stratify=None, verbose=False):
         estimates = np.zeros((tasks.size, n_coef))
 
         for boot in range(self.n_boots_est):
-            if self.comm is not None:
+            if size > 1:
                 if rank == 0:
                     rvals = train_test_split(np.arange(X.shape[0]),
                                              test_size=1 - self.estimation_frac,
@@ -350,6 +381,8 @@ def fit(self, X, y, stratify=None, verbose=False):
             X_test = X[idxs_test]
             y_rep = y[idxs_train]
             y_test = y[idxs_test]
+            self._logger.info("estimation bootstrap %d, support %d"
+                              % (boot_idx, support_idx))
             if np.any(support):
 
                 # compute the estimate and store the fitted coefficients
@@ -368,13 +401,17 @@ def fit(self, X, y, stratify=None, verbose=False):
                     support=support)
             else:
                 fitter = self._fit_intercept_no_features(y_rep)
+                if issparse(X_test):
+                    X_test = csr_matrix(X_test.shape, dtype=X_test.dtype)
+                else:
+                    X_test = np.zeros_like(X_test)
                 scores[ii] = self._score_predictions(
                     metric=self.estimation_score,
                     fitter=fitter,
-                    X=np.zeros_like(X_test), y=y_test,
+                    X=X_test, y=y_test,
                     support=np.zeros(X_test.shape[1], dtype=bool))
 
-        if self.comm is not None:
+        if size > 1:
             estimates = Gatherv_rows(send=estimates, comm=self.comm,
                                      root=0)
             scores = Gatherv_rows(send=scores, comm=self.comm,
@@ -475,7 +512,7 @@ def __init__(self, n_boots_sel=48, n_boots_est=48, selection_frac=0.9,
                  estimation_frac=0.9, stability_selection=1.,
                  estimation_score='r2', copy_X=True, fit_intercept=True,
                  standardize=True, random_state=None, max_iter=None,
-                 comm=None):
+                 comm=None, logger=None):
         super(AbstractUoILinearRegressor, self).__init__(
             n_boots_sel=n_boots_sel,
             n_boots_est=n_boots_est,
@@ -487,6 +524,7 @@ def __init__(self, n_boots_sel=48, n_boots_est=48, selection_frac=0.9,
             max_iter=max_iter,
             random_state=random_state,
             comm=comm,
+            logger=logger
         )
 
         if estimation_score not in self._valid_estimation_metrics:
@@ -614,7 +652,7 @@ def __init__(self, n_boots_sel=48, n_boots_est=48, selection_frac=0.9,
                  estimation_score='acc',
                  copy_X=True, fit_intercept=True, standardize=True,
                  random_state=None, max_iter=None, shared_support=True,
-                 comm=None):
+                 comm=None, logger=None):
         super(AbstractUoIGeneralizedLinearRegressor, self).__init__(
             n_boots_sel=n_boots_sel,
             n_boots_est=n_boots_est,
@@ -627,6 +665,7 @@ def __init__(self, n_boots_sel=48, n_boots_est=48, selection_frac=0.9,
             shared_support=shared_support,
             max_iter=max_iter,
             comm=comm,
+            logger=logger
         )
 
         if estimation_score not in self._valid_estimation_metrics: