From 5209f1342586286907256554fe6002edac6c19bb Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Sun, 13 Oct 2019 14:25:50 -0400 Subject: [PATCH 1/2] refactor tasklogger interface --- graphtools/api.py | 13 +- graphtools/base.py | 103 +++++----- graphtools/graphs.py | 446 +++++++++++++++++++++---------------------- requirements.txt | 2 +- setup.py | 2 +- 5 files changed, 278 insertions(+), 288 deletions(-) diff --git a/graphtools/api.py b/graphtools/api.py index 7fdae5a..80b7746 100644 --- a/graphtools/api.py +++ b/graphtools/api.py @@ -1,12 +1,13 @@ import numpy as np import warnings -import tasklogger from scipy import sparse import pickle import pygsp +import tasklogger + +from . import base, graphs -from . import base -from . import graphs +_logger = tasklogger.get_tasklogger('graphtools') def Graph(data, @@ -173,7 +174,7 @@ def Graph(data, “Numerical Recipes (3rd edition)”, Cambridge University Press, 2007, page 795. """ - tasklogger.set_level(verbose) + _logger.set_level(verbose) if sample_idx is not None and len(np.unique(sample_idx)) == 1: warnings.warn("Only one unique sample. " "Not using MNNGraph") @@ -239,7 +240,7 @@ def Graph(data, else: msg = msg + " and PyGSP inheritance" - tasklogger.log_debug(msg) + _logger.debug(msg) class_names = [p.__name__.replace("Graph", "") for p in parent_classes] try: @@ -257,7 +258,7 @@ def Graph(data, pass # build graph and return - tasklogger.log_debug("Initializing {} with arguments {}".format( + _logger.debug("Initializing {} with arguments {}".format( parent_classes, ", ".join(["{}='{}'".format(key, value) for key, value in params.items() diff --git a/graphtools/base.py b/graphtools/base.py index 58e941b..c7c72d6 100644 --- a/graphtools/base.py +++ b/graphtools/base.py @@ -11,9 +11,9 @@ from scipy import sparse import warnings import numbers -import tasklogger import pickle import sys +import tasklogger try: import pandas as pd @@ -29,6 +29,8 @@ from . import utils +_logger = tasklogger.get_tasklogger('graphtools') + class Base(object): """Class that deals with key-word arguments but is otherwise @@ -179,7 +181,7 @@ def _parse_n_pca_threshold(self, data, n_pca, rank_threshold): n_pca = None elif n_pca is True: # notify that we're going to estimate rank. n_pca = 'auto' - tasklogger.log_info("Estimating n_pca from matrix rank. " + _logger.info("Estimating n_pca from matrix rank. 
" "Supply an integer n_pca " "for fixed amount.") if not any([isinstance(n_pca, numbers.Number), @@ -233,45 +235,44 @@ def _reduce_data(self): Reduced data matrix """ if self.n_pca is not None and (self.n_pca == 'auto' or self.n_pca < self.data.shape[1]): - tasklogger.log_start("PCA") - n_pca = self.data.shape[1] - 1 if self.n_pca == 'auto' else self.n_pca - if sparse.issparse(self.data): - if isinstance(self.data, sparse.coo_matrix) or \ - isinstance(self.data, sparse.lil_matrix) or \ - isinstance(self.data, sparse.dok_matrix): - self.data = self.data.tocsr() - self.data_pca = TruncatedSVD(n_pca, random_state=self.random_state) - else: - self.data_pca = PCA(n_pca, - svd_solver='randomized', - random_state=self.random_state) - self.data_pca.fit(self.data) - if self.n_pca == 'auto': - s = self.data_pca.singular_values_ - smax = s.max() - if self.rank_threshold == 'auto': - threshold = smax * \ - np.finfo(self.data.dtype).eps * max(self.data.shape) - self.rank_threshold = threshold - threshold = self.rank_threshold - gate = np.where(s >= threshold)[0] - self.n_pca = gate.shape[0] - if self.n_pca == 0: - raise ValueError("Supplied threshold {} was greater than " - "maximum singular value {} " - "for the data matrix".format(threshold, smax)) - tasklogger.log_info( - "Using rank estimate of {} as n_pca".format(self.n_pca)) - # reset the sklearn operator - op = self.data_pca # for line-width brevity.. - op.components_ = op.components_[gate, :] - op.explained_variance_ = op.explained_variance_[gate] - op.explained_variance_ratio_ = op.explained_variance_ratio_[ - gate] - op.singular_values_ = op.singular_values_[gate] - self.data_pca = op # im not clear if this is needed due to assignment rules - data_nu = self.data_pca.transform(self.data) - tasklogger.log_complete("PCA") + with _logger.task("PCA"): + n_pca = self.data.shape[1] - 1 if self.n_pca == 'auto' else self.n_pca + if sparse.issparse(self.data): + if isinstance(self.data, sparse.coo_matrix) or \ + isinstance(self.data, sparse.lil_matrix) or \ + isinstance(self.data, sparse.dok_matrix): + self.data = self.data.tocsr() + self.data_pca = TruncatedSVD(n_pca, random_state=self.random_state) + else: + self.data_pca = PCA(n_pca, + svd_solver='randomized', + random_state=self.random_state) + self.data_pca.fit(self.data) + if self.n_pca == 'auto': + s = self.data_pca.singular_values_ + smax = s.max() + if self.rank_threshold == 'auto': + threshold = smax * \ + np.finfo(self.data.dtype).eps * max(self.data.shape) + self.rank_threshold = threshold + threshold = self.rank_threshold + gate = np.where(s >= threshold)[0] + self.n_pca = gate.shape[0] + if self.n_pca == 0: + raise ValueError("Supplied threshold {} was greater than " + "maximum singular value {} " + "for the data matrix".format(threshold, smax)) + _logger.info( + "Using rank estimate of {} as n_pca".format(self.n_pca)) + # reset the sklearn operator + op = self.data_pca # for line-width brevity.. 
+ op.components_ = op.components_[gate, :] + op.explained_variance_ = op.explained_variance_[gate] + op.explained_variance_ratio_ = op.explained_variance_ratio_[ + gate] + op.singular_values_ = op.singular_values_[gate] + self.data_pca = op # im not clear if this is needed due to assignment rules + data_nu = self.data_pca.transform(self.data) return data_nu else: data_nu = self.data @@ -472,10 +473,10 @@ def __init__(self, self.anisotropy = anisotropy if initialize: - tasklogger.log_debug("Initializing kernel...") + _logger.debug("Initializing kernel...") self.K else: - tasklogger.log_debug("Not initializing kernel.") + _logger.debug("Not initializing kernel.") super().__init__(**kwargs) def _check_symmetrization(self, kernel_symm, theta): @@ -524,18 +525,18 @@ def _build_kernel(self): def symmetrize_kernel(self, K): # symmetrize if self.kernel_symm == "+": - tasklogger.log_debug("Using addition symmetrization.") + _logger.debug("Using addition symmetrization.") K = (K + K.T) / 2 elif self.kernel_symm == "*": - tasklogger.log_debug("Using multiplication symmetrization.") + _logger.debug("Using multiplication symmetrization.") K = K.multiply(K.T) elif self.kernel_symm == 'mnn': - tasklogger.log_debug( + _logger.debug( "Using mnn symmetrization (theta = {}).".format(self.theta)) K = self.theta * utils.elementwise_minimum(K, K.T) + \ (1 - self.theta) * utils.elementwise_maximum(K, K.T) elif self.kernel_symm is None: - tasklogger.log_debug("Using no symmetrization.") + _logger.debug("Using no symmetrization.") pass else: # this should never happen @@ -787,10 +788,10 @@ def _check_shortest_path_distance(self, distance): def _default_shortest_path_distance(self): if not self.weighted: distance = 'data' - tasklogger.log_info("Using ambient data distances.") + _logger.info("Using ambient data distances.") else: distance = 'affinity' - tasklogger.log_info("Using negative log affinity distances.") + _logger.info("Using negative log affinity distances.") return distance def shortest_path(self, method='auto', distance=None): @@ -954,7 +955,7 @@ def __init__(self, data, # kwargs are ignored self.n_jobs = n_jobs self.verbose = verbose - tasklogger.set_level(verbose) + _logger.set_level(verbose) super().__init__(data, **kwargs) def get_params(self): @@ -1117,6 +1118,6 @@ def set_params(self, **params): self.n_jobs = params['n_jobs'] if 'verbose' in params: self.verbose = params['verbose'] - tasklogger.set_level(self.verbose) + _logger.set_level(self.verbose) super().set_params(**params) return self diff --git a/graphtools/graphs.py b/graphtools/graphs.py index 2d4c035..6a1a021 100644 --- a/graphtools/graphs.py +++ b/graphtools/graphs.py @@ -15,6 +15,8 @@ from . 
import utils from .base import DataGraph, PyGSPGraph +_logger = tasklogger.get_tasklogger('graphtools') + class kNNGraph(DataGraph): """ @@ -290,90 +292,88 @@ def build_kernel_to_data(self, Y, knn=None, bandwidth=None, k=knn, n=self.data.shape[0])) Y = self._check_extension_shape(Y) - tasklogger.log_start("KNN search") if self.decay is None or self.thresh == 1: - # binary connectivity matrix - K = self.knn_tree.kneighbors_graph( - Y, n_neighbors=knn, - mode='connectivity') - tasklogger.log_complete("KNN search") + with _logger.task("KNN search"): + # binary connectivity matrix + K = self.knn_tree.kneighbors_graph( + Y, n_neighbors=knn, + mode='connectivity') else: - # sparse fast alpha decay - knn_tree = self.knn_tree - search_knn = min(knn * 20, self.data_nu.shape[0]) - distances, indices = knn_tree.kneighbors( - Y, n_neighbors=search_knn) - self._check_duplicates(distances, indices) - tasklogger.log_complete("KNN search") - tasklogger.log_start("affinities") - if bandwidth is None: - bandwidth = distances[:, knn - 1] - - bandwidth = bandwidth * bandwidth_scale - - # check for zero bandwidth - bandwidth = np.maximum(bandwidth, np.finfo(float).eps) - - radius = bandwidth * np.power(-1 * np.log(self.thresh), - 1 / self.decay) - update_idx = np.argwhere( - np.max(distances, axis=1) < radius).reshape(-1) - tasklogger.log_debug("search_knn = {}; {} remaining".format( - search_knn, len(update_idx))) - if len(update_idx) > 0: - distances = [d for d in distances] - indices = [i for i in indices] - while len(update_idx) > Y.shape[0] // 10 and \ - search_knn < self.data_nu.shape[0] / 2: - # increase the knn search - search_knn = min(search_knn * 20, self.data_nu.shape[0]) - dist_new, ind_new = knn_tree.kneighbors( - Y[update_idx], n_neighbors=search_knn) - for i, idx in enumerate(update_idx): - distances[idx] = dist_new[i] - indices[idx] = ind_new[i] - update_idx = [i for i, d in enumerate(distances) if np.max(d) < - (radius if isinstance(bandwidth, numbers.Number) - else radius[i])] - tasklogger.log_debug("search_knn = {}; {} remaining".format( - search_knn, - len(update_idx))) - if search_knn > self.data_nu.shape[0] / 2: - knn_tree = NearestNeighbors( - search_knn, algorithm='brute', - n_jobs=self.n_jobs).fit(self.data_nu) - if len(update_idx) > 0: - tasklogger.log_debug( - "radius search on {}".format(len(update_idx))) - # give up - radius search - dist_new, ind_new = knn_tree.radius_neighbors( - Y[update_idx, :], - radius=radius - if isinstance(bandwidth, numbers.Number) - else np.max(radius[update_idx])) - for i, idx in enumerate(update_idx): - distances[idx] = dist_new[i] - indices[idx] = ind_new[i] - if isinstance(bandwidth, numbers.Number): - data = np.concatenate(distances) / bandwidth - else: - data = np.concatenate([distances[i] / bandwidth[i] - for i in range(len(distances))]) - - indices = np.concatenate(indices) - indptr = np.concatenate( - [[0], np.cumsum([len(d) for d in distances])]) - K = sparse.csr_matrix((data, indices, indptr), - shape=(Y.shape[0], self.data_nu.shape[0])) - K.data = np.exp(-1 * np.power(K.data, self.decay)) - # handle nan - K.data = np.where(np.isnan(K.data), 1, K.data) - # TODO: should we zero values that are below thresh? 
- K.data[K.data < self.thresh] = 0 - K = K.tocoo() - K.eliminate_zeros() - K = K.tocsr() - tasklogger.log_complete("affinities") + with _logger.task("KNN search"): + # sparse fast alpha decay + knn_tree = self.knn_tree + search_knn = min(knn * 20, self.data_nu.shape[0]) + distances, indices = knn_tree.kneighbors( + Y, n_neighbors=search_knn) + self._check_duplicates(distances, indices) + with _logger.task("affinities"): + if bandwidth is None: + bandwidth = distances[:, knn - 1] + + bandwidth = bandwidth * bandwidth_scale + + # check for zero bandwidth + bandwidth = np.maximum(bandwidth, np.finfo(float).eps) + + radius = bandwidth * np.power(-1 * np.log(self.thresh), + 1 / self.decay) + update_idx = np.argwhere( + np.max(distances, axis=1) < radius).reshape(-1) + _logger.debug("search_knn = {}; {} remaining".format( + search_knn, len(update_idx))) + if len(update_idx) > 0: + distances = [d for d in distances] + indices = [i for i in indices] + while len(update_idx) > Y.shape[0] // 10 and \ + search_knn < self.data_nu.shape[0] / 2: + # increase the knn search + search_knn = min(search_knn * 20, self.data_nu.shape[0]) + dist_new, ind_new = knn_tree.kneighbors( + Y[update_idx], n_neighbors=search_knn) + for i, idx in enumerate(update_idx): + distances[idx] = dist_new[i] + indices[idx] = ind_new[i] + update_idx = [i for i, d in enumerate(distances) if np.max(d) < + (radius if isinstance(bandwidth, numbers.Number) + else radius[i])] + _logger.debug("search_knn = {}; {} remaining".format( + search_knn, + len(update_idx))) + if search_knn > self.data_nu.shape[0] / 2: + knn_tree = NearestNeighbors( + search_knn, algorithm='brute', + n_jobs=self.n_jobs).fit(self.data_nu) + if len(update_idx) > 0: + _logger.debug( + "radius search on {}".format(len(update_idx))) + # give up - radius search + dist_new, ind_new = knn_tree.radius_neighbors( + Y[update_idx, :], + radius=radius + if isinstance(bandwidth, numbers.Number) + else np.max(radius[update_idx])) + for i, idx in enumerate(update_idx): + distances[idx] = dist_new[i] + indices[idx] = ind_new[i] + if isinstance(bandwidth, numbers.Number): + data = np.concatenate(distances) / bandwidth + else: + data = np.concatenate([distances[i] / bandwidth[i] + for i in range(len(distances))]) + + indices = np.concatenate(indices) + indptr = np.concatenate( + [[0], np.cumsum([len(d) for d in distances])]) + K = sparse.csr_matrix((data, indices, indptr), + shape=(Y.shape[0], self.data_nu.shape[0])) + K.data = np.exp(-1 * np.power(K.data, self.decay)) + # handle nan + K.data = np.where(np.isnan(K.data), 1, K.data) + # TODO: should we zero values that are below thresh? + K.data[K.data < self.thresh] = 0 + K = K.tocoo() + K.eliminate_zeros() + K = K.tocsr() return K @@ -566,40 +566,36 @@ def build_landmark_op(self): probabilities between cluster centers by using transition probabilities between samples assigned to each cluster. 
""" - tasklogger.log_start("landmark operator") - is_sparse = sparse.issparse(self.kernel) - # spectral clustering - tasklogger.log_start("SVD") - _, _, VT = randomized_svd(self.diff_aff, - n_components=self.n_svd, - random_state=self.random_state) - tasklogger.log_complete("SVD") - tasklogger.log_start("KMeans") - kmeans = MiniBatchKMeans( - self.n_landmark, - init_size=3 * self.n_landmark, - batch_size=10000, - random_state=self.random_state) - self._clusters = kmeans.fit_predict( - self.diff_op.dot(VT.T)) - # some clusters are not assigned - tasklogger.log_complete("KMeans") - - # transition matrices - pmn = self._landmarks_to_data() - - # row normalize - pnm = pmn.transpose() - pmn = normalize(pmn, norm='l1', axis=1) - pnm = normalize(pnm, norm='l1', axis=1) - landmark_op = pmn.dot(pnm) # sparsity agnostic matrix multiplication - if is_sparse: - # no need to have a sparse landmark operator - landmark_op = landmark_op.toarray() - # store output - self._landmark_op = landmark_op - self._transitions = pnm - tasklogger.log_complete("landmark operator") + with _logger.task("landmark operator"): + is_sparse = sparse.issparse(self.kernel) + # spectral clustering + with _logger.task("SVD"): + _, _, VT = randomized_svd(self.diff_aff, + n_components=self.n_svd, + random_state=self.random_state) + with _logger.task("KMeans"): + kmeans = MiniBatchKMeans( + self.n_landmark, + init_size=3 * self.n_landmark, + batch_size=10000, + random_state=self.random_state) + self._clusters = kmeans.fit_predict( + self.diff_op.dot(VT.T)) + + # transition matrices + pmn = self._landmarks_to_data() + + # row normalize + pnm = pmn.transpose() + pmn = normalize(pmn, norm='l1', axis=1) + pnm = normalize(pnm, norm='l1', axis=1) + landmark_op = pmn.dot(pnm) # sparsity agnostic matrix multiplication + if is_sparse: + # no need to have a sparse landmark operator + landmark_op = landmark_op.toarray() + # store output + self._landmark_op = landmark_op + self._transitions = pnm def extend_to_data(self, data, **kwargs): """Build transition matrix from new data to the graph @@ -873,46 +869,45 @@ def build_kernel(self): K = K.tolil() K = utils.set_diagonal(K, 1) else: - tasklogger.log_start("affinities") - if sparse.issparse(self.data_nu): - self.data_nu = self.data_nu.toarray() - if self.precomputed == "distance": - pdx = self.data_nu - elif self.precomputed is None: - pdx = pdist(self.data_nu, metric=self.distance) - if np.any(pdx == 0): - pdx = squareform(pdx) - duplicate_ids = np.array( - [i for i in np.argwhere(pdx == 0) - if i[1] > i[0]]) - duplicate_names = ", ".join(["{} and {}".format(i[0], i[1]) - for i in duplicate_ids]) - warnings.warn( - "Detected zero distance between samples {}. " - "Consider removing duplicates to avoid errors in " - "downstream processing.".format(duplicate_names), - RuntimeWarning) + with _logger.task("affinities"): + if sparse.issparse(self.data_nu): + self.data_nu = self.data_nu.toarray() + if self.precomputed == "distance": + pdx = self.data_nu + elif self.precomputed is None: + pdx = pdist(self.data_nu, metric=self.distance) + if np.any(pdx == 0): + pdx = squareform(pdx) + duplicate_ids = np.array( + [i for i in np.argwhere(pdx == 0) + if i[1] > i[0]]) + duplicate_names = ", ".join(["{} and {}".format(i[0], i[1]) + for i in duplicate_ids]) + warnings.warn( + "Detected zero distance between samples {}. 
" + "Consider removing duplicates to avoid errors in " + "downstream processing.".format(duplicate_names), + RuntimeWarning) + else: + pdx = squareform(pdx) else: - pdx = squareform(pdx) - else: - raise ValueError( - "precomputed='{}' not recognized. " - "Choose from ['affinity', 'adjacency', 'distance', " - "None]".format(self.precomputed)) - if self.bandwidth is None: - knn_dist = np.partition( - pdx, self.knn + 1, axis=1)[:, :self.knn + 1] - bandwidth = np.max(knn_dist, axis=1) - elif callable(self.bandwidth): - bandwidth = self.bandwidth(pdx) - else: - bandwidth = self.bandwidth - bandwidth = bandwidth * self.bandwidth_scale - pdx = (pdx.T / bandwidth).T - K = np.exp(-1 * np.power(pdx, self.decay)) - # handle nan - K = np.where(np.isnan(K), 1, K) - tasklogger.log_complete("affinities") + raise ValueError( + "precomputed='{}' not recognized. " + "Choose from ['affinity', 'adjacency', 'distance', " + "None]".format(self.precomputed)) + if self.bandwidth is None: + knn_dist = np.partition( + pdx, self.knn + 1, axis=1)[:, :self.knn + 1] + bandwidth = np.max(knn_dist, axis=1) + elif callable(self.bandwidth): + bandwidth = self.bandwidth(pdx) + else: + bandwidth = self.bandwidth + bandwidth = bandwidth * self.bandwidth_scale + pdx = (pdx.T / bandwidth).T + K = np.exp(-1 * np.power(pdx, self.decay)) + # handle nan + K = np.where(np.isnan(K), 1, K) # truncate if sparse.issparse(K): if not (isinstance(K, sparse.csr_matrix) or @@ -966,21 +961,20 @@ def build_kernel_to_data(self, Y, knn=None, bandwidth=None, bandwidth_scale=None if self.precomputed is not None: raise ValueError("Cannot extend kernel on precomputed graph") else: - tasklogger.log_start("affinities") - Y = self._check_extension_shape(Y) - pdx = cdist(Y, self.data_nu, metric=self.distance) - if bandwidth is None: - knn_dist = np.partition(pdx, knn, axis=1)[:, :knn] - bandwidth = np.max(knn_dist, axis=1) - elif callable(bandwidth): - bandwidth = bandwidth(pdx) - bandwidth = bandwidth_scale * bandwidth - pdx = (pdx.T / bandwidth).T - K = np.exp(-1 * pdx**self.decay) - # handle nan - K = np.where(np.isnan(K), 1, K) - K[K < self.thresh] = 0 - tasklogger.log_complete("affinities") + with _logger.task("affinities"): + Y = self._check_extension_shape(Y) + pdx = cdist(Y, self.data_nu, metric=self.distance) + if bandwidth is None: + knn_dist = np.partition(pdx, knn, axis=1)[:, :knn] + bandwidth = np.max(knn_dist, axis=1) + elif callable(bandwidth): + bandwidth = bandwidth(pdx) + bandwidth = bandwidth_scale * bandwidth + pdx = (pdx.T / bandwidth).T + K = np.exp(-1 * pdx**self.decay) + # handle nan + K = np.where(np.isnan(K), 1, K) + K[K < self.thresh] = 0 return K @property @@ -1003,7 +997,7 @@ def _check_shortest_path_distance(self, distance): def _default_shortest_path_distance(self): if self.precomputed is not None and not self.weighted: distance = 'constant' - tasklogger.log_info("Using constant distances.") + _logger.info("Using constant distances.") else: distance = super()._default_shortest_path_distance() return distance @@ -1155,66 +1149,60 @@ def build_kernel(self): symmetric matrix with ones down the diagonal with no non-negative entries. 
""" - tasklogger.log_start("subgraphs") - self.subgraphs = [] - from .api import Graph - # iterate through sample ids - for i, idx in enumerate(self.samples): - tasklogger.log_debug("subgraph {}: sample {}, " - "n = {}, knn = {}".format( - i, idx, np.sum(self.sample_idx == idx), - self.knn)) - # select data for sample - data = self.data_nu[self.sample_idx == idx] - # build a kNN graph for cells within sample - graph = Graph(data, n_pca=None, - knn=self.knn, - decay=self.decay, - bandwidth=self.bandwidth, - distance=self.distance, - thresh=self.thresh, - verbose=self.verbose, - random_state=self.random_state, - n_jobs=self.n_jobs, - kernel_symm='+', - initialize=True) - self.subgraphs.append(graph) # append to list of subgraphs - tasklogger.log_complete("subgraphs") - - tasklogger.log_start("MNN kernel") - if self.thresh > 0 or self.decay is None: - K = sparse.lil_matrix( - (self.data_nu.shape[0], self.data_nu.shape[0])) - else: - K = np.zeros([self.data_nu.shape[0], self.data_nu.shape[0]]) - for i, X in enumerate(self.subgraphs): - K = utils.set_submatrix( - K, self.sample_idx == self.samples[i], - self.sample_idx == self.samples[i], X.K) - within_batch_norm = np.array(np.sum(X.K, 1)).flatten() - for j, Y in enumerate(self.subgraphs): - if i == j: - continue - tasklogger.log_start( - "kernel from sample {} to {}".format(self.samples[i], - self.samples[j])) - Kij = Y.build_kernel_to_data( - X.data_nu, - knn=self.knn) - between_batch_norm = np.array(np.sum(Kij, 1)).flatten() - scale = np.minimum(1, within_batch_norm / - between_batch_norm) * self.beta - if sparse.issparse(Kij): - Kij = Kij.multiply(scale[:, None]) - else: - Kij = Kij * scale[:, None] + with _logger.task("subgraphs"): + self.subgraphs = [] + from .api import Graph + # iterate through sample ids + for i, idx in enumerate(self.samples): + _logger.debug("subgraph {}: sample {}, " + "n = {}, knn = {}".format( + i, idx, np.sum(self.sample_idx == idx), + self.knn)) + # select data for sample + data = self.data_nu[self.sample_idx == idx] + # build a kNN graph for cells within sample + graph = Graph(data, n_pca=None, + knn=self.knn, + decay=self.decay, + bandwidth=self.bandwidth, + distance=self.distance, + thresh=self.thresh, + verbose=self.verbose, + random_state=self.random_state, + n_jobs=self.n_jobs, + kernel_symm='+', + initialize=True) + self.subgraphs.append(graph) # append to list of subgraphs + + with _logger.task("MNN kernel"): + if self.thresh > 0 or self.decay is None: + K = sparse.lil_matrix( + (self.data_nu.shape[0], self.data_nu.shape[0])) + else: + K = np.zeros([self.data_nu.shape[0], self.data_nu.shape[0]]) + for i, X in enumerate(self.subgraphs): K = utils.set_submatrix( K, self.sample_idx == self.samples[i], - self.sample_idx == self.samples[j], Kij) - tasklogger.log_complete( - "kernel from sample {} to {}".format(self.samples[i], - self.samples[j])) - tasklogger.log_complete("MNN kernel") + self.sample_idx == self.samples[i], X.K) + within_batch_norm = np.array(np.sum(X.K, 1)).flatten() + for j, Y in enumerate(self.subgraphs): + if i == j: + continue + with _logger.task( + "kernel from sample {} to {}".format(self.samples[i], self.samples[j])): + Kij = Y.build_kernel_to_data( + X.data_nu, + knn=self.knn) + between_batch_norm = np.array(np.sum(Kij, 1)).flatten() + scale = np.minimum(1, within_batch_norm / + between_batch_norm) * self.beta + if sparse.issparse(Kij): + Kij = Kij.multiply(scale[:, None]) + else: + Kij = Kij * scale[:, None] + K = utils.set_submatrix( + K, self.sample_idx == self.samples[i], + 
self.sample_idx == self.samples[j], Kij) return K def build_kernel_to_data(self, Y, theta=None): diff --git a/requirements.txt b/requirements.txt index f584815..08fcd1b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,4 @@ scipy>=1.1.0 pygsp>=>=0.5.1 scikit-learn>=0.20.0 future -tasklogger>=0.4.0 +tasklogger>=1.0 diff --git a/setup.py b/setup.py index 0d21022..cf358d5 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ 'pygsp>=0.5.1', 'scikit-learn>=0.20.0', 'future', - 'tasklogger>=0.4.0', + 'tasklogger>=1.0', ] test_requires = [ From 209076a03dbbda3a8730851ba67f9f38ce4ebf66 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 15 Oct 2019 11:57:29 -0400 Subject: [PATCH 2/2] don't coerce to dense on to_igraph --- graphtools/base.py | 13 +++++++++---- graphtools/version.py | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/graphtools/base.py b/graphtools/base.py index c7c72d6..a960d02 100644 --- a/graphtools/base.py +++ b/graphtools/base.py @@ -730,12 +730,12 @@ def to_pygsp(self, **kwargs): def to_igraph(self, attribute="weight", **kwargs): """Convert to an igraph Graph - Uses the igraph.Graph.Weighted_Adjacency constructor + Uses the igraph.Graph constructor Parameters ---------- attribute : str, optional (default: "weight") - kwargs : additional arguments for igraph.Graph.Weighted_Adjacency + kwargs : additional arguments for igraph.Graph """ try: import igraph as ig @@ -748,8 +748,13 @@ def to_igraph(self, attribute="weight", **kwargs): # not a pygsp graph W = self.K.copy() W = utils.set_diagonal(W, 0) - return ig.Graph.Weighted_Adjacency(utils.to_array(W).tolist(), - attr=attribute, **kwargs) + sources, targets = W.nonzero() + edgelist = list(zip(sources, targets)) + g = ig.Graph(W.shape[0], edgelist, **kwargs) + weights = W[W.nonzero()] + weights = utils.to_array(weights) + g.es[attribute] = weights.flatten().tolist() + return g def to_pickle(self, path): """Save the current Graph to a pickle. diff --git a/graphtools/version.py b/graphtools/version.py index 67bc602..9c73af2 100644 --- a/graphtools/version.py +++ b/graphtools/version.py @@ -1 +1 @@ -__version__ = "1.3.0" +__version__ = "1.3.1"
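For reference, a minimal sketch of the tasklogger>=1.0 interface that PATCH 1/2 migrates to: a named logger replaces the module-level functions, paired log_start()/log_complete() calls become a task() context manager, and log_debug()/log_info() become debug()/info(). The logger name matches the patch; the task name and verbosity level below are illustrative only.

    import tasklogger

    # named logger shared across the package (replaces module-level tasklogger calls)
    _logger = tasklogger.get_tasklogger('graphtools')
    _logger.set_level(1)  # replaces tasklogger.set_level(verbose)

    # replaces paired tasklogger.log_start("...") / tasklogger.log_complete("...") calls
    with _logger.task("example computation"):
        result = sum(i * i for i in range(1000))

    # replaces tasklogger.log_info(...) / tasklogger.log_debug(...)
    _logger.info("finished: {}".format(result))
    _logger.debug("hidden unless the verbosity level is raised")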
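PATCH 2/2 avoids densifying the kernel by building the igraph object from the nonzero entries of the sparse matrix instead of passing a dense adjacency list to igraph.Graph.Weighted_Adjacency. A standalone sketch of the same idea, using a random scipy sparse matrix as a stand-in for the graph kernel (the graphtools utils.set_diagonal / utils.to_array helpers are omitted here):

    import numpy as np
    from scipy import sparse
    import igraph as ig

    # stand-in for the kernel matrix self.K
    W = sparse.random(50, 50, density=0.05, format='csr', random_state=42)

    # edge list taken directly from the sparse structure -- no dense conversion
    sources, targets = W.nonzero()
    edgelist = list(zip(sources.tolist(), targets.tolist()))
    g = ig.Graph(W.shape[0], edgelist)

    # attach the nonzero kernel values as edge weights
    g.es['weight'] = np.asarray(W[W.nonzero()]).flatten().tolist()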