Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add logger support #116

Merged
merged 18 commits into from
Jun 17, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ test:
tox

flake:
$(FLAKE) src/
$(FLAKE) pyuoi/
$(FLAKE) tests/
$(FLAKE) --ignore E402,W504 docs/gallery

Expand Down
3 changes: 3 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
[pytest]
norecursedirs = mpi
markers =
fast: mark a test as a fast test e.g. unit test
slow: mark a test as a slow test e.g. end-to-end test
90 changes: 68 additions & 22 deletions pyuoi/decomposition/NMF.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import scipy.optimize as spo
import numpy as np
import logging

from .base import AbstractDecompositionModel

from ..utils import check_logger

from sklearn.decomposition import NMF as skNMF
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import normalize
Expand Down Expand Up @@ -41,10 +44,22 @@ class UoI_NMF_Base(AbstractDecompositionModel):
cons_meth : function
The method for computing consensus bases after clustering. If None,
uses np.median.

random_state : int, RandomState instance or None, default None
The seed of the pseudo random number generator used to draw the
bootstrap samples. If int, random_state is the seed used by the random
number generator; If RandomState instance, random_state is the random
number generator; If None, the random number generator is the
RandomState instance used by `np.random`.

logger : Logger, default None
The logger to use for messages when ``verbose=True`` in ``fit``.
If *None* is passed, a logger that writes to ``sys.stdout`` will be
used.
"""
def __init__(
self, n_boots=10, ranks=None, nmf=None, cluster=None, nnreg=None,
cons_meth=None, random_state=None
cons_meth=None, random_state=None, logger=None
):
self.__initialize(
n_boots=n_boots,
Expand All @@ -53,6 +68,7 @@ def __init__(
cluster=cluster,
nnreg=nnreg,
cons_meth=cons_meth,
logger=logger,
random_state=random_state
)

Expand All @@ -71,6 +87,7 @@ def __initialize(self, **kwargs):

self.n_boots = n_boots
self.components_ = None
logger = kwargs['logger']

# initialize NMF ranks to use
if ranks is not None:
Expand Down Expand Up @@ -128,10 +145,15 @@ def __initialize(self, **kwargs):
self.components_ = None
self.bases_samples_ = None
self.bases_samples_labels_ = None
self.boostraps_ = None
self.bootstraps_ = None

self.comm = None

def fit(self, X):
"""Perform first phase of UoI NMF decomposition.
self._logger = check_logger(logger, 'uoi_decomposition', self.comm)

def fit(self, X, y=None, verbose=False):
"""
Perform first phase of UoI NMF decomposition.

Compute H matrix.

Expand All @@ -142,6 +164,11 @@ def fit(self, X):
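X : ndarray, shape (n_samples, n_features)
Data matrix to be decomposed.

y : None
Ignored.

verbose : bool, default False
If True, the logger level is set to ``logging.DEBUG`` so that progress
messages are emitted; otherwise it is set to ``logging.WARNING``.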
"""
if verbose:
self._logger.setLevel(logging.DEBUG)
else:
self._logger.setLevel(logging.WARNING)

check_non_negative(X, 'UoI NMF')
n_samples, n_features = X.shape

Expand All @@ -151,6 +178,7 @@ def fit(self, X):

rep_idx = self._rand.randint(n_samples, size=(self.n_boots, n_samples))
for i in range(self.n_boots):
self._logger.info("bootstrap %d" % i)
# compute NMF bases for k across bootstrap replicates
H_i = i * k_tot
sample = X[rep_idx[i]]
Expand All @@ -165,11 +193,14 @@ def fit(self, X):
H_samples = normalize(H_samples, norm='l2', axis=1)

# cluster all bases
self._logger.info("clustering bases samples")
labels = self.cluster.fit_predict(H_samples)

# compute consensus bases from clusters
cluster_ids = np.unique(labels[labels != -1])
n_clusters = cluster_ids.size
self._logger.info("found %d bases, computing consensus bases" %
n_clusters)
H_cons = np.zeros((n_clusters, n_features))

for c_id in cluster_ids:
Expand All @@ -180,6 +211,9 @@ def fit(self, X):

self.components_ = H_cons
self.n_components = self.components_.shape[0]
self.bases_samples_ = H_samples
self.bases_samples_labels_ = labels
self.bootstraps_ = rep_idx
self.reconstruction_err_ = None
return self

Expand Down Expand Up @@ -222,23 +256,22 @@ def transform(self, X, reconstruction_err=True):

return W

def fit_transform(self, X, reconstruction_err=True):
"""Transform the data X according to the fitted UoI-NMF model.

Parameters
----------
X : array-like, shape (n_samples, n_features)
Data matrix to be decomposed.

reconstruction_err : bool, default True
True to compute reconstruction error, False otherwise.

Returns
-------
W : array-like, shape (n_samples, n_components)
Transformed data.
def fit_transform(self, X, y=None, reconstruction_err=True, verbose=None):
"""
Fit the UoI-NMF model to X, then transform X with the fitted model.

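Args:
X : array-like; shape (n_samples, n_features)
Data matrix to be decomposed.
y
Ignored.
reconstruction_err : bool
True to compute reconstruction error, False otherwise.
default True.
verbose : bool
True to emit progress messages while fitting; passed through to
``fit``. default None.
Returns:
W : array-like; shape (n_samples, n_components)
Transformed data.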
"""
self.fit(X)
self.fit(X, verbose=verbose)
return self.transform(X, reconstruction_err=reconstruction_err)

def inverse_transform(self, W):
Expand Down Expand Up @@ -352,14 +385,26 @@ class UoI_NMF(UoI_NMF_Base):
of the construction and query, as well as the memory required
to store the tree. The optimal value depends
on the nature of the problem.

random_state : int, RandomState instance or None, default None
The seed of the pseudo random number generator used to draw the
bootstrap samples. If int, random_state is the seed used by the random
number generator; If RandomState instance, random_state is the random
number generator; If None, the random number generator is the
RandomState instance used by `np.random`.

logger : Logger, default None
The logger to use for messages when ``verbose=True`` in ``fit``.
If *None* is passed, a logger that writes to ``sys.stdout`` will be
used.
"""
def __init__(
self, n_boots, ranks=None,
nmf_init='random', nmf_solver='mu', nmf_beta_loss='kullback-leibler',
nmf_tol=0.0001, nmf_max_iter=400,
db_eps=0.5, db_min_samples=None, db_metric='euclidean',
db_metric_params=None, db_algorithm='auto', db_leaf_size=30,
random_state=None
random_state=None, logger=None,
):
# create NMF solver
nmf = skNMF(init=nmf_init,
Expand All @@ -384,5 +429,6 @@ def __init__(
cluster=dbscan,
nnreg=None,
cons_meth=np.median,
random_state=random_state
random_state=random_state,
logger=logger
)
55 changes: 47 additions & 8 deletions pyuoi/linear_model/base.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
import abc as _abc
import six as _six
import numpy as np
import logging

from sklearn.linear_model.base import SparseCoefMixin
from sklearn.metrics import r2_score, accuracy_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.utils import check_X_y
from sklearn.preprocessing import StandardScaler

from scipy.sparse import issparse, csr_matrix

from pyuoi import utils
from pyuoi.mpi_utils import (Gatherv_rows, Bcast_from_root)

from .utils import stability_selection_to_threshold, intersection
from ..utils import check_logger


class AbstractUoILinearModel(
Expand Down Expand Up @@ -75,6 +79,11 @@ class AbstractUoILinearModel(
comm : MPI communicator, default None
If passed, the selection and estimation steps are parallelized.

logger : Logger, default None
The logger to use for messages when ``verbose=True`` in ``fit``.
If *None* is passed, a logger that writes to ``sys.stdout`` will be
used.

Attributes
----------
coef_ : array, shape (n_features,) or (n_targets, n_features)
Expand All @@ -92,7 +101,7 @@ def __init__(self, n_boots_sel=48, n_boots_est=48, selection_frac=0.9,
estimation_frac=0.9, stability_selection=1.,
fit_intercept=True, standardize=True,
shared_support=True, max_iter=None, random_state=None,
comm=None):
comm=None, logger=None):
# data split fractions
self.selection_frac = selection_frac
self.estimation_frac = estimation_frac
Expand Down Expand Up @@ -124,6 +133,8 @@ def __init__(self, n_boots_sel=48, n_boots_est=48, selection_frac=0.9,

self.n_supports_ = None

self._logger = check_logger(logger, 'uoi_linear_model', self.comm)

@_abc.abstractproperty
def estimation_score(self):
pass
Expand All @@ -145,6 +156,10 @@ def _pre_fit(self, X, y):
"""Perform class-specific setup for fit().
"""
if self.standardize:
if self.fit_intercept and issparse(X):
msg = ("Cannot center sparse matrices: "
"pass `fit_intercept=False`")
raise ValueError(msg)
self._X_scaler = StandardScaler(with_mean=self.fit_intercept)
X = self._X_scaler.fit_transform(X)
return X, y
Expand Down Expand Up @@ -196,6 +211,11 @@ def fit(self, X, y, stratify=None, verbose=False):
A switch indicating whether the fitting should print out messages
displaying progress.
"""
if verbose:
self._logger.setLevel(logging.DEBUG)
else:
self._logger.setLevel(logging.WARNING)

X, y = self._pre_fit(X, y)

X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
Expand Down Expand Up @@ -235,7 +255,7 @@ def fit(self, X, y, stratify=None, verbose=False):
my_boots = dict((task_idx, None) for task_idx in tasks)

for boot in range(self.n_boots_sel):
if self.comm is not None:
if size > 1:
if rank == 0:
rvals = train_test_split(np.arange(X.shape[0]),
test_size=1 - self.selection_frac,
Expand Down Expand Up @@ -280,12 +300,20 @@ def fit(self, X, y, stratify=None, verbose=False):
y_test = y[idxs_test]

# fit the coefficients
if size > self.n_boots_sel:
msg = ("selection bootstrap %d, "
"regularization parameter set %d"
% (boot_idx, reg_idx))
self._logger.info(msg)

else:
self._logger.info("selection bootstrap %d" % (boot_idx))
selection_coefs[ii] = np.squeeze(
self.uoi_selection_sweep(X_rep, y_rep, my_reg_params))

# if distributed, gather selection coefficients to 0,
# perform intersection, and broadcast results
if self.comm is not None:
if size > 1:
selection_coefs = Gatherv_rows(selection_coefs, self.comm, root=0)
if rank == 0:
if size > self.n_boots_sel:
Expand All @@ -306,6 +334,9 @@ def fit(self, X, y, stratify=None, verbose=False):

self.n_supports_ = self.supports_.shape[0]

if rank == 0:
self._logger.info("Found %d supports" % self.n_supports_)

#####################
# Estimation Module #
#####################
Expand All @@ -317,7 +348,7 @@ def fit(self, X, y, stratify=None, verbose=False):
estimates = np.zeros((tasks.size, n_coef))

for boot in range(self.n_boots_est):
if self.comm is not None:
if size > 1:
if rank == 0:
rvals = train_test_split(np.arange(X.shape[0]),
test_size=1 - self.estimation_frac,
Expand Down Expand Up @@ -350,6 +381,8 @@ def fit(self, X, y, stratify=None, verbose=False):
X_test = X[idxs_test]
y_rep = y[idxs_train]
y_test = y[idxs_test]
self._logger.info("estimation bootstrap %d, support %d"
% (boot_idx, support_idx))
if np.any(support):

# compute the estimate and store the fitted coefficients
Expand All @@ -368,13 +401,17 @@ def fit(self, X, y, stratify=None, verbose=False):
support=support)
else:
fitter = self._fit_intercept_no_features(y_rep)
if issparse(X_test):
X_test = csr_matrix(X_test.shape, dtype=X_test.dtype)
else:
X_test = np.zeros_like(X_test)
scores[ii] = self._score_predictions(
metric=self.estimation_score,
fitter=fitter,
X=np.zeros_like(X_test), y=y_test,
X=X_test, y=y_test,
support=np.zeros(X_test.shape[1], dtype=bool))

if self.comm is not None:
if size > 1:
estimates = Gatherv_rows(send=estimates, comm=self.comm,
root=0)
scores = Gatherv_rows(send=scores, comm=self.comm,
Expand Down Expand Up @@ -475,7 +512,7 @@ def __init__(self, n_boots_sel=48, n_boots_est=48, selection_frac=0.9,
estimation_frac=0.9, stability_selection=1.,
estimation_score='r2', copy_X=True, fit_intercept=True,
standardize=True, random_state=None, max_iter=None,
comm=None):
comm=None, logger=None):
super(AbstractUoILinearRegressor, self).__init__(
n_boots_sel=n_boots_sel,
n_boots_est=n_boots_est,
Expand All @@ -487,6 +524,7 @@ def __init__(self, n_boots_sel=48, n_boots_est=48, selection_frac=0.9,
max_iter=max_iter,
random_state=random_state,
comm=comm,
logger=logger
)

if estimation_score not in self._valid_estimation_metrics:
Expand Down Expand Up @@ -614,7 +652,7 @@ def __init__(self, n_boots_sel=48, n_boots_est=48, selection_frac=0.9,
estimation_score='acc',
copy_X=True, fit_intercept=True, standardize=True,
random_state=None, max_iter=None, shared_support=True,
comm=None):
comm=None, logger=None):
super(AbstractUoIGeneralizedLinearRegressor, self).__init__(
n_boots_sel=n_boots_sel,
n_boots_est=n_boots_est,
Expand All @@ -627,6 +665,7 @@ def __init__(self, n_boots_sel=48, n_boots_est=48, selection_frac=0.9,
shared_support=shared_support,
max_iter=max_iter,
comm=comm,
logger=logger
)

if estimation_score not in self._valid_estimation_metrics:
Expand Down
Loading