diff --git a/.travis.yml b/.travis.yml index d4accf9..64d5223 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,8 @@ sudo: required + cache: pip + addons: apt: packages: diff --git a/README.rst b/README.rst index 0b8ccee..7e3dd0b 100644 --- a/README.rst +++ b/README.rst @@ -5,6 +5,9 @@ graphtools .. image:: https://img.shields.io/pypi/v/graphtools.svg :target: https://pypi.org/project/graphtools/ :alt: Latest PyPi version +.. image:: https://anaconda.org/conda-forge/graphtools/badges/version.svg + :target: https://anaconda.org/conda-forge/graphtools/ + :alt: Latest Conda version .. image:: https://api.travis-ci.com/KrishnaswamyLab/graphtools.svg?branch=master :target: https://travis-ci.com/KrishnaswamyLab/graphtools :alt: Travis CI Build @@ -28,7 +31,11 @@ Installation graphtools is available on `pip`. Install by running the following in a terminal:: - pip install --user graphtools + pip install --user graphtools + +Alternatively, graphtools can be installed using `Conda <https://conda.io/docs/>`_ (most easily obtained via the `Miniconda Python distribution <https://conda.io/miniconda.html>`_):: + + conda install -c conda-forge graphtools Or, to install the latest version from github:: @@ -45,14 +52,14 @@ The `graphtools.Graph` class provides an all-in-one interface for k-nearest neig Use it as follows:: - from sklearn import datasets - import graphtools - digits = datasets.load_digits() - G = graphtools.Graph(digits['data']) - K = G.kernel - P = G.diff_op - G = graphtools.Graph(digits['data'], n_landmark=300) - L = G.landmark_op + from sklearn import datasets + import graphtools + digits = datasets.load_digits() + G = graphtools.Graph(digits['data']) + K = G.kernel + P = G.diff_op + G = graphtools.Graph(digits['data'], n_landmark=300) + L = G.landmark_op Help ---- diff --git a/graphtools/__init__.py b/graphtools/__init__.py index 05d693d..8fc8a50 100644 --- a/graphtools/__init__.py +++ b/graphtools/__init__.py @@ -1,2 +1,2 @@ -from .api import Graph +from .api import Graph, from_igraph from .version import 
__version__ diff --git a/graphtools/api.py b/graphtools/api.py index ede4f39..9e5d31b 100644 --- a/graphtools/api.py +++ b/graphtools/api.py @@ -1,6 +1,7 @@ import numpy as np import warnings import tasklogger +from scipy import sparse from . import base from . import graphs @@ -9,14 +10,15 @@ def Graph(data, n_pca=None, sample_idx=None, - adaptive_k='sqrt', + adaptive_k=None, precomputed=None, knn=5, decay=10, + bandwidth=None, distance='euclidean', thresh=1e-4, kernel_symm='+', - gamma=None, + theta=None, n_landmark=None, n_svd=100, beta=1, @@ -61,6 +63,11 @@ def Graph(data, decay : `int` or `None`, optional (default: 10) Rate of alpha decay to use. If `None`, alpha decay is not used. + bandwidth : `float`, list-like or `None`, optional (default: `None`) + Fixed bandwidth to use. If given, overrides `knn`. Can be a single + bandwidth or a list-like (shape=[n_samples]) of bandwidths for each + sample. + distance : `str`, optional (default: `'euclidean'`) Any metric from `scipy.spatial.distance` can be used distance metric for building kNN graph. @@ -75,12 +82,12 @@ def Graph(data, Defines method of MNN symmetrization. '+' : additive '*' : multiplicative - 'gamma' : min-max + 'theta' : min-max 'none' : no symmetrization - gamma: float (default: None) - Min-max symmetrization constant or matrix. Only used if kernel_symm='gamma'. - K = `gamma * min(K, K.T) + (1 - gamma) * max(K, K.T)` + theta: float (default: None) + Min-max symmetrization constant or matrix. Only used if kernel_symm='theta'. + K = `theta * min(K, K.T) + (1 - theta) * max(K, K.T)` precomputed : {'distance', 'affinity', 'adjacency', `None`}, optional (default: `None`) If the graph is precomputed, this variable denotes which graph @@ -88,12 +95,12 @@ def Graph(data, Only one of `precomputed` and `n_pca` can be set. 
beta: float, optional(default: 1) - Multiply within - batch connections by(1 - beta) + Multiply between - batch connections by beta sample_idx: array-like Batch index for MNN kernel - adaptive_k : `{'min', 'mean', 'sqrt', 'none'}` (default: 'sqrt') + adaptive_k : `{'min', 'mean', 'sqrt', 'none'}` (default: None) Weights MNN kernel adaptively using the number of cells in each sample according to the selected method. @@ -221,3 +228,31 @@ def Graph(data, for key, value in params.items() if key != "data"]))) return Graph(**params) + + +def from_igraph(G, **kwargs): + """Convert an igraph.Graph to a graphtools.Graph + + Creates a graphtools.graphs.TraditionalGraph with a + precomputed adjacency matrix + + Parameters + ---------- + G : igraph.Graph + Graph to be converted + kwargs + keyword arguments for graphtools.Graph + + Returns + ------- + G : graphtools.graphs.TraditionalGraph + """ + if 'precomputed' in kwargs: + if kwargs['precomputed'] != 'adjacency': + warnings.warn( + "Cannot build graph from igraph with precomputed={}. " + "Use 'adjacency' instead.".format(kwargs['precomputed']), + UserWarning) + del kwargs['precomputed'] + return Graph(sparse.coo_matrix(G.get_adjacency().data), + precomputed='adjacency', **kwargs) diff --git a/graphtools/base.py b/graphtools/base.py index edd678a..5f1a2ab 100644 --- a/graphtools/base.py +++ b/graphtools/base.py @@ -311,12 +311,12 @@ class BaseGraph(with_metaclass(abc.ABCMeta, Base)): Defines method of MNN symmetrization. '+' : additive '*' : multiplicative - 'gamma' : min-max + 'theta' : min-max 'none' : no symmetrization - gamma: float (default: 0.5) + theta: float (default: 0.5) Min-max symmetrization constant. - K = `gamma * min(K, K.T) + (1 - gamma) * max(K, K.T)` + K = `theta * min(K, K.T) + (1 - theta) * max(K, K.T)` initialize : `bool`, optional (default : `True`) if false, don't create the kernel matrix. 
@@ -337,11 +337,20 @@ class BaseGraph(with_metaclass(abc.ABCMeta, Base)): """ def __init__(self, kernel_symm='+', + theta=None, gamma=None, initialize=True, **kwargs): + if gamma is not None: + warnings.warn("gamma is deprecated. " + "Setting theta={}".format(gamma), FutureWarning) + theta = gamma + if kernel_symm == 'gamma': + warnings.warn("kernel_symm='gamma' is deprecated. " + "Setting kernel_symm='theta'", FutureWarning) + kernel_symm = 'theta' self.kernel_symm = kernel_symm - self.gamma = gamma - self._check_symmetrization(kernel_symm, gamma) + self.theta = theta + self._check_symmetrization(kernel_symm, theta) if initialize: tasklogger.log_debug("Initializing kernel...") @@ -350,25 +359,25 @@ def __init__(self, kernel_symm='+', tasklogger.log_debug("Not initializing kernel.") super().__init__(**kwargs) - def _check_symmetrization(self, kernel_symm, gamma): - if kernel_symm not in ['+', '*', 'gamma', None]: + def _check_symmetrization(self, kernel_symm, theta): + if kernel_symm not in ['+', '*', 'theta', None]: raise ValueError( "kernel_symm '{}' not recognized. Choose from " - "'+', '*', 'gamma', or 'none'.".format(kernel_symm)) - elif kernel_symm != 'gamma' and gamma is not None: - warnings.warn("kernel_symm='{}' but gamma is not None. " - "Setting kernel_symm='gamma'.".format(kernel_symm)) - self.kernel_symm = kernel_symm = 'gamma' - - if kernel_symm == 'gamma': - if gamma is None: - warnings.warn("kernel_symm='gamma' but gamma not given. " - "Defaulting to gamma=0.5.") - self.gamma = gamma = 0.5 - elif not isinstance(gamma, numbers.Number) or \ - gamma < 0 or gamma > 1: - raise ValueError("gamma {} not recognized. Expected " - "a float between 0 and 1".format(gamma)) + "'+', '*', 'theta', or 'none'.".format(kernel_symm)) + elif kernel_symm != 'theta' and theta is not None: + warnings.warn("kernel_symm='{}' but theta is not None. 
" + "Setting kernel_symm='theta'.".format(kernel_symm)) + self.kernel_symm = kernel_symm = 'theta' + + if kernel_symm == 'theta': + if theta is None: + warnings.warn("kernel_symm='theta' but theta not given. " + "Defaulting to theta=0.5.") + self.theta = theta = 0.5 + elif not isinstance(theta, numbers.Number) or \ + theta < 0 or theta > 1: + raise ValueError("theta {} not recognized. Expected " + "a float between 0 and 1".format(theta)) def _build_kernel(self): """Private method to build kernel matrix @@ -400,26 +409,26 @@ def symmetrize_kernel(self, K): elif self.kernel_symm == "*": tasklogger.log_debug("Using multiplication symmetrization.") K = K.multiply(K.T) - elif self.kernel_symm == 'gamma': + elif self.kernel_symm == 'theta': tasklogger.log_debug( - "Using gamma symmetrization (gamma = {}).".format(self.gamma)) - K = self.gamma * elementwise_minimum(K, K.T) + \ - (1 - self.gamma) * elementwise_maximum(K, K.T) + "Using theta symmetrization (theta = {}).".format(self.theta)) + K = self.theta * elementwise_minimum(K, K.T) + \ + (1 - self.theta) * elementwise_maximum(K, K.T) elif self.kernel_symm is None: tasklogger.log_debug("Using no symmetrization.") pass else: # this should never happen raise ValueError( - "Expected kernel_symm in ['+', '*', 'gamma' or None]. " - "Got {}".format(self.gamma)) + "Expected kernel_symm in ['+', '*', 'theta' or None]. " + "Got {}".format(self.theta)) return K def get_params(self): """Get parameters from this object """ return {'kernel_symm': self.kernel_symm, - 'gamma': self.gamma} + 'theta': self.theta} def set_params(self, **params): """Set parameters on this object @@ -429,7 +438,7 @@ def set_params(self, **params): Valid parameters: Invalid parameters: (these would require modifying the kernel matrix) - kernel_symm - - gamma + - theta Parameters ---------- @@ -439,8 +448,8 @@ def set_params(self, **params): ------- self """ - if 'gamma' in params and params['gamma'] != self.gamma: - raise ValueError("Cannot update gamma. 
Please create a new graph") + if 'theta' in params and params['theta'] != self.theta: + raise ValueError("Cannot update theta. Please create a new graph") if 'kernel_symm' in params and \ params['kernel_symm'] != self.kernel_symm: raise ValueError( @@ -535,6 +544,42 @@ def build_kernel(self): """ raise NotImplementedError + def to_pygsp(self, **kwargs): + """Convert to a PyGSP graph + + For use only when the user means to create the graph using + the flag `use_pygsp=True`, and doesn't wish to recompute the kernel. + Creates a graphtools.graphs.TraditionalGraph with a precomputed + affinity matrix which also inherits from pygsp.graphs.Graph. + + Parameters + ---------- + kwargs + keyword arguments for graphtools.Graph + + Returns + ------- + G : graphtools.base.PyGSPGraph, graphtools.graphs.TraditionalGraph + """ + from . import api + if 'precomputed' in kwargs: + if kwargs['precomputed'] != 'affinity': + warnings.warn( + "Cannot build PyGSPGraph with precomputed={}. " + "Using 'affinity' instead.".format(kwargs['precomputed']), + UserWarning) + del kwargs['precomputed'] + if 'use_pygsp' in kwargs: + if kwargs['use_pygsp'] is not True: + warnings.warn( + "Cannot build PyGSPGraph with use_pygsp={}. " + "Use True instead.".format(kwargs['use_pygsp']), + UserWarning) + del kwargs['use_pygsp'] + return api.Graph(self.K, + precomputed="affinity", use_pygsp=True, + **kwargs) + class PyGSPGraph(with_metaclass(abc.ABCMeta, pygsp.graphs.Graph, Base)): """Interface between BaseGraph and PyGSP. diff --git a/graphtools/graphs.py b/graphtools/graphs.py index ce8f9ae..a0810da 100644 --- a/graphtools/graphs.py +++ b/graphtools/graphs.py @@ -1,3 +1,4 @@ +from __future__ import division from builtins import super import numpy as np from sklearn.neighbors import NearestNeighbors @@ -35,6 +36,12 @@ class kNNGraph(DataGraph): decay : `int` or `None`, optional (default: `None`) Rate of alpha decay to use. If `None`, alpha decay is not used. 
+ bandwidth : `float`, list-like or `None`, optional (default: `None`) + Fixed bandwidth to use. If given, overrides `knn`. Can be a single + bandwidth or a list-like (shape=[n_samples]) of bandwidths for each + sample. + TODO: implement `callable` bandwidth + distance : `str`, optional (default: `'euclidean'`) Any metric from `scipy.spatial.distance` can be used distance metric for building kNN graph. @@ -55,25 +62,27 @@ class kNNGraph(DataGraph): """ def __init__(self, data, knn=5, decay=None, - distance='euclidean', + bandwidth=None, distance='euclidean', thresh=1e-4, n_pca=None, **kwargs): - self.knn = knn - self.decay = decay - self.distance = distance - self.thresh = thresh if decay is not None and thresh <= 0: raise ValueError("Cannot instantiate a kNNGraph with `decay=None` " "and `thresh=0`. Use a TraditionalGraph instead.") if knn > data.shape[0]: warnings.warn("Cannot set knn ({k}) to be greater than " - "data.shape[0] ({n}). Setting knn={n}".format( + "n_samples ({n}). Setting knn={n}".format( k=knn, n=data.shape[0])) + knn = data.shape[0] if n_pca is None and data.shape[1] > 500: warnings.warn("Building a kNNGraph on data of shape {} is " "expensive. Consider setting n_pca.".format( data.shape), UserWarning) + self.knn = knn + self.decay = decay + self.bandwidth = bandwidth + self.distance = distance + self.thresh = thresh super().__init__(data, n_pca=n_pca, **kwargs) def get_params(self): @@ -82,6 +91,7 @@ def get_params(self): params = super().get_params() params.update({'knn': self.knn, 'decay': self.decay, + 'bandwidth': self.bandwidth, 'distance': self.distance, 'thresh': self.thresh, 'n_jobs': self.n_jobs, @@ -101,6 +111,7 @@ def set_params(self, **params): Invalid parameters: (these would require modifying the kernel matrix) - knn - decay + - bandwidth - distance - thresh @@ -116,6 +127,9 @@ def set_params(self, **params): raise ValueError("Cannot update knn. 
Please create a new graph") if 'decay' in params and params['decay'] != self.decay: raise ValueError("Cannot update decay. Please create a new graph") + if 'bandwidth' in params and params['bandwidth'] != self.bandwidth: + raise ValueError( + "Cannot update bandwidth. Please create a new graph") if 'distance' in params and params['distance'] != self.distance: raise ValueError("Cannot update distance. " "Please create a new graph") @@ -184,7 +198,7 @@ def build_kernel(self): K = self.build_kernel_to_data(self.data_nu) return K - def build_kernel_to_data(self, Y, knn=None): + def build_kernel_to_data(self, Y, knn=None, bandwidth=None): """Build a kernel from new input data `Y` to the `self.data` Parameters @@ -198,6 +212,9 @@ def build_kernel_to_data(self, Y, knn=None): knn : `int` or `None`, optional (default: `None`) If `None`, defaults to `self.knn` + bandwidth : `int` or `None`, optional (default: `None`) + If `None`, defaults to `self.bandwidth` + Returns ------- @@ -212,9 +229,11 @@ def build_kernel_to_data(self, Y, knn=None): """ if knn is None: knn = self.knn + if bandwidth is None: + bandwidth = self.bandwidth if knn > self.data.shape[0]: warnings.warn("Cannot set knn ({k}) to be greater than " - "data.shape[0] ({n}). Setting knn={n}".format( + "n_samples ({n}). 
Setting knn={n}".format( k=knn, n=self.data.shape[0])) Y = self._check_extension_shape(Y) @@ -247,7 +266,8 @@ def build_kernel_to_data(self, Y, knn=None): RuntimeWarning) tasklogger.log_complete("KNN search") tasklogger.log_start("affinities") - bandwidth = distances[:, knn - 1] + if bandwidth is None: + bandwidth = distances[:, knn - 1] radius = bandwidth * np.power(-1 * np.log(self.thresh), 1 / self.decay) update_idx = np.argwhere( @@ -266,8 +286,9 @@ def build_kernel_to_data(self, Y, knn=None): for i, idx in enumerate(update_idx): distances[idx] = dist_new[i] indices[idx] = ind_new[i] - update_idx = [i for i, d in enumerate(distances) - if np.max(d) < radius[i]] + update_idx = [i for i, d in enumerate(distances) if np.max(d) < + (radius if isinstance(bandwidth, numbers.Number) + else radius[i])] tasklogger.log_debug("search_knn = {}; {} remaining".format( search_knn, len(update_idx))) @@ -281,12 +302,18 @@ def build_kernel_to_data(self, Y, knn=None): # give up - radius search dist_new, ind_new = knn_tree.radius_neighbors( Y[update_idx, :], - radius=np.max(radius[update_idx])) + radius=radius + if isinstance(bandwidth, numbers.Number) + else np.max(radius[update_idx])) for i, idx in enumerate(update_idx): distances[idx] = dist_new[i] indices[idx] = ind_new[i] - data = np.concatenate([distances[i] / bandwidth[i] - for i in range(len(distances))]) + if isinstance(bandwidth, numbers.Number): + data = np.concatenate(distances) / bandwidth + else: + data = np.concatenate([distances[i] / bandwidth[i] + for i in range(len(distances))]) + indices = np.concatenate(indices) indptr = np.concatenate( [[0], np.cumsum([len(d) for d in distances])]) @@ -335,8 +362,14 @@ class LandmarkGraph(DataGraph): transitions : array-like, shape=[n_samples, n_landmark] Transition probabilities between samples and landmarks. - _clusters : array-like, shape=[n_samples] + clusters : array-like, shape=[n_samples] Private attribute. Cluster assignments for each sample. 
+ + Examples + -------- + >>> G = graphtools.Graph(data, n_landmark=1000) + >>> X_landmark = transform(G.landmark_op) + >>> X_full = G.interpolate(X_landmark) """ def __init__(self, data, n_landmark=2000, n_svd=100, **kwargs): @@ -431,6 +464,23 @@ def landmark_op(self): self.build_landmark_op() return self._landmark_op + @property + def clusters(self): + """Cluster assignments for each sample. + + Compute or return the cluster assignments + + Returns + ------- + clusters : list-like, shape=[n_samples] + Cluster assignments for each sample. + """ + try: + return self._clusters + except AttributeError: + self.build_landmark_op() + return self._clusters + @property def transitions(self): """Transition matrix from samples to landmarks @@ -450,13 +500,13 @@ def transitions(self): return self._transitions def _landmarks_to_data(self): - landmarks = np.unique(self._clusters) + landmarks = np.unique(self.clusters) if sparse.issparse(self.kernel): pmn = sparse.vstack( - [sparse.csr_matrix(self.kernel[self._clusters == i, :].sum( + [sparse.csr_matrix(self.kernel[self.clusters == i, :].sum( axis=0)) for i in landmarks]) else: - pmn = np.array([np.sum(self.kernel[self._clusters == i, :], axis=0) + pmn = np.array([np.sum(self.kernel[self.clusters == i, :], axis=0) for i in landmarks]) return pmn @@ -532,12 +582,12 @@ def extend_to_data(self, data, **kwargs): kernel = self.build_kernel_to_data(data, **kwargs) if sparse.issparse(kernel): pnm = sparse.hstack( - [sparse.csr_matrix(kernel[:, self._clusters == i].sum( - axis=1)) for i in np.unique(self._clusters)]) + [sparse.csr_matrix(kernel[:, self.clusters == i].sum( + axis=1)) for i in np.unique(self.clusters)]) else: pnm = np.array([np.sum( - kernel[:, self._clusters == i], - axis=1).T for i in np.unique(self._clusters)]).transpose() + kernel[:, self.clusters == i], + axis=1).T for i in np.unique(self.clusters)]).transpose() pnm = normalize(pnm, norm='l1', axis=1) return pnm @@ -590,6 +640,12 @@ class 
TraditionalGraph(DataGraph): decay : `int` or `None`, optional (default: `None`) Rate of alpha decay to use. If `None`, alpha decay is not used. + bandwidth : `float`, list-like or `None`, optional (default: `None`) + Fixed bandwidth to use. If given, overrides `knn`. Can be a single + bandwidth or a list-like (shape=[n_samples]) of bandwidths for each + sample. + TODO: implement `callable` bandwidth + distance : `str`, optional (default: `'euclidean'`) Any metric from `scipy.spatial.distance` can be used distance metric for building kNN graph. @@ -613,19 +669,27 @@ class TraditionalGraph(DataGraph): Only one of `precomputed` and `n_pca` can be set. """ - def __init__(self, data, knn=5, decay=10, - distance='euclidean', n_pca=None, + def __init__(self, data, + knn=5, decay=10, + bandwidth=None, + distance='euclidean', + n_pca=None, thresh=1e-4, precomputed=None, **kwargs): + if decay is None and precomputed not in ['affinity', 'adjacency']: + # decay high enough is basically a binary kernel + raise ValueError("`decay` must be provided for a TraditionalGraph" + ". For kNN kernel, use kNNGraph.") if precomputed is not None and n_pca is not None: # the data itself is a matrix of distances / affinities n_pca = None warnings.warn("n_pca cannot be given on a precomputed graph." " Setting n_pca=None", RuntimeWarning) - if decay is None and precomputed not in ['affinity', 'adjacency']: - # decay high enough is basically a binary kernel - raise ValueError("`decay` must be provided for a TraditionalGraph" - ". For kNN kernel, use kNNGraph.") + if knn > data.shape[0]: + warnings.warn("Cannot set knn ({k}) to be greater than or equal to" + " n_samples ({n}). Setting knn={n}".format( + k=knn, n=data.shape[0] - 1)) + knn = data.shape[0] - 1 if precomputed is not None: if precomputed not in ["distance", "affinity", "adjacency"]: raise ValueError("Precomputed value {} not recognized. 
" @@ -640,6 +704,7 @@ def __init__(self, data, knn=5, decay=10, "non-negative".format(precomputed)) self.knn = knn self.decay = decay + self.bandwidth = bandwidth self.distance = distance self.thresh = thresh self.precomputed = precomputed @@ -653,6 +718,7 @@ def get_params(self): params = super().get_params() params.update({'knn': self.knn, 'decay': self.decay, + 'bandwidth': self.bandwidth, 'distance': self.distance, 'precomputed': self.precomputed}) return params @@ -667,6 +733,7 @@ def set_params(self, **params): - distance - knn - decay + - bandwidth Parameters ---------- @@ -690,6 +757,10 @@ def set_params(self, **params): if 'decay' in params and params['decay'] != self.decay and \ self.precomputed is None: raise ValueError("Cannot update decay. Please create a new graph") + if 'bandwidth' in params and params['bandwidth'] != self.bandwidth and \ + self.precomputed is None: + raise ValueError( + "Cannot update bandwidth. Please create a new graph") # update superclass parameters super().set_params(**params) return self @@ -752,9 +823,12 @@ def build_kernel(self): "precomputed='{}' not recognized. 
" "Choose from ['affinity', 'adjacency', 'distance', " "None]".format(self.precomputed)) - knn_dist = np.partition(pdx, self.knn, axis=1)[:, :self.knn] - epsilon = np.max(knn_dist, axis=1) - pdx = (pdx.T / epsilon).T + if self.bandwidth is None: + knn_dist = np.partition(pdx, self.knn, axis=1)[:, :self.knn] + bandwidth = np.max(knn_dist, axis=1) + else: + bandwidth = self.bandwidth + pdx = (pdx.T / bandwidth).T K = np.exp(-1 * np.power(pdx, self.decay)) # handle nan K = np.where(np.isnan(K), 1, K) @@ -773,7 +847,7 @@ def build_kernel(self): K[K < self.thresh] = 0 return K - def build_kernel_to_data(self, Y, knn=None): + def build_kernel_to_data(self, Y, knn=None, bandwidth=None): """Build transition matrix from new data to the graph Creates a transition matrix such that `Y` can be approximated by @@ -805,15 +879,18 @@ def build_kernel_to_data(self, Y, knn=None): """ if knn is None: knn = self.knn + if bandwidth is None: + bandwidth = self.bandwidth if self.precomputed is not None: raise ValueError("Cannot extend kernel on precomputed graph") else: tasklogger.log_start("affinities") Y = self._check_extension_shape(Y) pdx = cdist(Y, self.data_nu, metric=self.distance) - knn_dist = np.partition(pdx, knn, axis=1)[:, :knn] - epsilon = np.max(knn_dist, axis=1) - pdx = (pdx.T / epsilon).T + if bandwidth is None: + knn_dist = np.partition(pdx, knn, axis=1)[:, :knn] + bandwidth = np.max(knn_dist, axis=1) + pdx = (pdx.T / bandwidth).T K = np.exp(-1 * pdx**self.decay) # handle nan K = np.where(np.isnan(K), 1, K) @@ -841,9 +918,9 @@ class MNNGraph(DataGraph): Batch index beta : `float`, optional (default: 1) - Downweight within-batch affinities by beta + Downweight between-batch affinities by beta - adaptive_k : {'min', 'mean', 'sqrt', `None`} (default: 'sqrt') + adaptive_k : {'min', 'mean', 'sqrt', `None`} (default: None) Weights MNN kernel adaptively using the number of cells in each sample according to the selected method. 
@@ -855,8 +932,9 @@ class MNNGraph(DataGraph): def __init__(self, data, sample_idx, knn=5, beta=1, n_pca=None, - adaptive_k='sqrt', + adaptive_k=None, decay=None, + bandwidth=None, distance='euclidean', thresh=1e-4, n_jobs=1, @@ -869,6 +947,7 @@ def __init__(self, data, sample_idx, self.knn = knn self.decay = decay self.distance = distance + self.bandwidth = bandwidth self.thresh = thresh self.n_jobs = n_jobs self.weighted_knn = self._weight_knn() @@ -886,33 +965,33 @@ def __init__(self, data, sample_idx, super().__init__(data, n_pca=n_pca, **kwargs) - def _check_symmetrization(self, kernel_symm, gamma): - if kernel_symm == 'gamma' and gamma is not None and \ - not isinstance(gamma, numbers.Number): - # matrix gamma + def _check_symmetrization(self, kernel_symm, theta): + if kernel_symm == 'theta' and theta is not None and \ + not isinstance(theta, numbers.Number): + # matrix theta try: - gamma.shape + theta.shape except AttributeError: - raise ValueError("gamma {} not recognized. " + raise ValueError("theta {} not recognized. 
" "Expected a float between 0 and 1 " "or a [n_batch,n_batch] matrix of " - "floats between 0 and 1".format(gamma)) - if not np.shape(gamma) == (len(self.samples), + "floats between 0 and 1".format(theta)) + if not np.shape(theta) == (len(self.samples), len(self.samples)): raise ValueError( - "Matrix gamma must be of shape " + "Matrix theta must be of shape " "({}), got ({})".format( (len(self.samples), - len(self.samples)), gamma.shape)) - elif np.max(gamma) > 1 or np.min(gamma) < 0: + len(self.samples)), theta.shape)) + elif np.max(theta) > 1 or np.min(theta) < 0: raise ValueError( - "Values in matrix gamma must be between" + "Values in matrix theta must be between" " 0 and 1, got values between {} and {}".format( - np.max(gamma), np.min(gamma))) - elif np.any(gamma != gamma.T): - raise ValueError("gamma must be a symmetric matrix") + np.max(theta), np.min(theta))) + elif np.any(theta != theta.T): + raise ValueError("theta must be a symmetric matrix") else: - super()._check_symmetrization(kernel_symm, gamma) + super()._check_symmetrization(kernel_symm, theta) def _weight_knn(self, sample_size=None): """Select adaptive values of knn @@ -957,6 +1036,7 @@ def get_params(self): 'adaptive_k': self.adaptive_k, 'knn': self.knn, 'decay': self.decay, + 'bandwidth': self.bandwidth, 'distance': self.distance, 'thresh': self.thresh, 'n_jobs': self.n_jobs}) @@ -995,7 +1075,7 @@ def set_params(self, **params): "Cannot update adaptive_k. 
Please create a new graph") # knn arguments - knn_kernel_args = ['knn', 'decay', 'distance', 'thresh'] + knn_kernel_args = ['knn', 'decay', 'distance', 'thresh', 'bandwidth'] knn_other_args = ['n_jobs', 'random_state', 'verbose'] for arg in knn_kernel_args: if arg in params and params[arg] != getattr(self, arg): @@ -1037,12 +1117,13 @@ def build_kernel(self): graph = Graph(data, n_pca=None, knn=self.weighted_knn[i], decay=self.decay, + bandwidth=self.bandwidth, distance=self.distance, thresh=self.thresh, verbose=self.verbose, random_state=self.random_state, n_jobs=self.n_jobs, - initialize=False) + initialize=True) self.subgraphs.append(graph) # append to list of subgraphs tasklogger.log_complete("subgraphs") @@ -1052,16 +1133,25 @@ def build_kernel(self): else: K = np.zeros([self.data_nu.shape[0], self.data_nu.shape[0]]) for i, X in enumerate(self.subgraphs): + K = set_submatrix(K, self.sample_idx == self.samples[i], + self.sample_idx == self.samples[i], X.K) + within_batch_norm = np.array(np.sum(X.K, 1)).flatten() for j, Y in enumerate(self.subgraphs): + if i == j: + continue tasklogger.log_start( "kernel from sample {} to {}".format(self.samples[i], self.samples[j])) Kij = Y.build_kernel_to_data( X.data_nu, knn=self.weighted_knn[i]) - if i == j: - # downweight within-batch affinities by beta - Kij = Kij * self.beta + between_batch_norm = np.array(np.sum(Kij, 1)).flatten() + scale = np.minimum(1, within_batch_norm / + between_batch_norm) * self.beta + if sparse.issparse(Kij): + Kij = Kij.multiply(scale[:, None]) + else: + Kij = Kij * scale[:, None] K = set_submatrix(K, self.sample_idx == self.samples[i], self.sample_idx == self.samples[j], Kij) tasklogger.log_complete( @@ -1070,14 +1160,14 @@ def build_kernel(self): return K def symmetrize_kernel(self, K): - if self.kernel_symm == 'gamma' and self.gamma is not None and \ - not isinstance(self.gamma, numbers.Number): - # matrix gamma - # Gamma can be a matrix with specific values transitions for + if 
self.kernel_symm == 'theta' and self.theta is not None and \ + not isinstance(self.theta, numbers.Number): + # matrix theta + # Theta can be a matrix with specific values transitions for # each batch. This allows for technical replicates and # experimental samples to be corrected simultaneously - tasklogger.log_debug("Using gamma symmetrization. " - "Gamma:\n{}".format(self.gamma)) + tasklogger.log_debug("Using theta symmetrization. " + "Theta:\n{}".format(self.theta)) for i, sample_i in enumerate(self.samples): for j, sample_j in enumerate(self.samples): if j < i: @@ -1086,9 +1176,9 @@ def symmetrize_kernel(self, K): self.sample_idx == sample_j)] Kji = K[np.ix_(self.sample_idx == sample_j, self.sample_idx == sample_i)] - Kij_symm = self.gamma[i, j] * \ + Kij_symm = self.theta[i, j] * \ elementwise_minimum(Kij, Kji.T) + \ - (1 - self.gamma[i, j]) * \ + (1 - self.theta[i, j]) * \ elementwise_maximum(Kij, Kji.T) K = set_submatrix(K, self.sample_idx == sample_i, self.sample_idx == sample_j, Kij_symm) @@ -1100,7 +1190,7 @@ def symmetrize_kernel(self, K): K = super().symmetrize_kernel(K) return K - def build_kernel_to_data(self, Y, gamma=None): + def build_kernel_to_data(self, Y, theta=None): """Build transition matrix from new data to the graph Creates a transition matrix such that `Y` can be approximated by @@ -1120,8 +1210,8 @@ def build_kernel_to_data(self, Y, gamma=None): to the existing data. 
`n_features` must match either the ambient or PCA dimensions - gamma : array-like or `None`, optional (default: `None`) - if `self.gamma` is a matrix, gamma values must be explicitly + theta : array-like or `None`, optional (default: `None`) + if `self.theta` is a matrix, theta values must be explicitly specified between `Y` and each sample in `self.data` Returns @@ -1131,15 +1221,15 @@ def build_kernel_to_data(self, Y, gamma=None): Transition matrix from `Y` to `self.data` """ raise NotImplementedError - tasklogger.log_warning("building MNN kernel to gamma is experimental") - if not isinstance(self.gamma, str) and \ - not isinstance(self.gamma, numbers.Number): - if gamma is None: + tasklogger.log_warning("building MNN kernel to theta is experimental") + if not isinstance(self.theta, str) and \ + not isinstance(self.theta, numbers.Number): + if theta is None: raise ValueError( - "self.gamma is a matrix but gamma is not provided.") - elif len(gamma) != len(self.samples): + "self.theta is a matrix but theta is not provided.") + elif len(theta) != len(self.samples): raise ValueError( - "gamma should have one value for every sample") + "theta should have one value for every sample") Y = self._check_extension_shape(Y) kernel_xy = [] @@ -1156,26 +1246,26 @@ def build_kernel_to_data(self, Y, gamma=None): kernel_yx = sparse.vstack(kernel_yx) # n_cells_x x n_cells_y # symmetrize - if gamma is not None: + if theta is not None: # Gamma can be a vector with specific values transitions for # each batch. 
This allows for technical replicates and # experimental samples to be corrected simultaneously K = np.empty_like(kernel_xy) for i, sample in enumerate(self.samples): sample_idx = self.sample_idx == sample - K[:, sample_idx] = gamma[i] * \ + K[:, sample_idx] = theta[i] * \ kernel_xy[:, sample_idx].minimum( kernel_yx[sample_idx, :].T) + \ - (1 - gamma[i]) * \ + (1 - theta[i]) * \ kernel_xy[:, sample_idx].maximum( kernel_yx[sample_idx, :].T) - if self.gamma == "+": + if self.theta == "+": K = (kernel_xy + kernel_yx.T) / 2 - elif self.gamma == "*": + elif self.theta == "*": K = kernel_xy.multiply(kernel_yx.T) else: - K = self.gamma * kernel_xy.minimum(kernel_yx.T) + \ - (1 - self.gamma) * kernel_xy.maximum(kernel_yx.T) + K = self.theta * kernel_xy.minimum(kernel_yx.T) + \ + (1 - self.theta) * kernel_xy.maximum(kernel_yx.T) return K diff --git a/graphtools/version.py b/graphtools/version.py index 569b121..d3ec452 100644 --- a/graphtools/version.py +++ b/graphtools/version.py @@ -1 +1 @@ -__version__ = "0.1.10" +__version__ = "0.2.0" diff --git a/requirements.txt b/requirements.txt index c19f67d..08e1515 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,4 @@ scipy>=1.1.0 pygsp>=>=0.5.1 scikit-learn>=0.19.1 future -tasklogger>=0.2 +tasklogger>=0.4.0 diff --git a/setup.py b/setup.py index fca28af..f67b380 100644 --- a/setup.py +++ b/setup.py @@ -8,14 +8,15 @@ 'pygsp>=0.5.1', 'scikit-learn>=0.19.1', 'future', - 'tasklogger>=0.2', + 'tasklogger>=0.4.0', ] test_requires = [ 'nose2', 'pandas', 'coverage', - 'coveralls' + 'coveralls', + 'python-igraph' ] if sys.version_info[0] == 3: diff --git a/test/load_tests/__init__.py b/test/load_tests/__init__.py index 3f62c9d..f018957 100644 --- a/test/load_tests/__init__.py +++ b/test/load_tests/__init__.py @@ -16,6 +16,7 @@ def reset_warnings(): warnings.resetwarnings() warnings.simplefilter("error") ignore_numpy_warning() + ignore_igraph_warning() def ignore_numpy_warning(): @@ -25,6 +26,14 @@ def ignore_numpy_warning(): 
"matrices or deal with linear algebra ") +def ignore_igraph_warning(): + warnings.filterwarnings( + "ignore", category=DeprecationWarning, + message="The SafeConfigParser class has been renamed to ConfigParser " + "in Python 3.2. This alias will be removed in future versions. Use " + "ConfigParser directly instead") + + reset_warnings() global digits @@ -38,7 +47,7 @@ def generate_swiss_roll(n_samples=1000, noise=0.5, seed=42): t = 1.5 * np.pi * (1 + 2 * generator.rand(1, n_samples)) x = t * np.cos(t) y = t * np.sin(t) - sample_idx = np.random.choice([0, 1], n_samples, replace=True) + sample_idx = generator.choice([0, 1], n_samples, replace=True) z = sample_idx t = np.squeeze(t) X = np.concatenate((x, y)) diff --git a/test/test_api.py b/test/test_api.py index c099086..49d2126 100644 --- a/test/test_api.py +++ b/test/test_api.py @@ -1,10 +1,60 @@ +from __future__ import print_function from load_tests import ( nose2, data, build_graph, raises, + warns, ) +import warnings +import igraph +import numpy as np +import graphtools + + +def test_from_igraph(): + n = 100 + m = 500 + K = np.zeros((n, n)) + for _ in range(m): + e = np.random.choice(n, 2, replace=False) + K[e[0], e[1]] = K[e[1], e[0]] = 1 + g = igraph.Graph.Adjacency(K.tolist()) + G = graphtools.from_igraph(g) + G2 = graphtools.Graph(K, precomputed='adjacency') + assert np.all(G.K == G2.K) + + +@warns(UserWarning) +def test_from_igraph_invalid_precomputed(): + n = 100 + m = 500 + K = np.zeros((n, n)) + for _ in range(m): + e = np.random.choice(n, 2, replace=False) + K[e[0], e[1]] = K[e[1], e[0]] = 1 + g = igraph.Graph.Adjacency(K.tolist()) + G = graphtools.from_igraph(g, precomputed='affinity') + + +def test_to_pygsp(): + G = build_graph(data) + G2 = G.to_pygsp() + assert isinstance(G2, graphtools.graphs.PyGSPGraph) + assert np.all(G2.K == G.K) + + +@warns(UserWarning) +def test_to_pygsp_invalid_precomputed(): + G = build_graph(data) + G2 = G.to_pygsp(precomputed='adjacency') + + +@warns(UserWarning) +def 
test_to_pygsp_invalid_use_pygsp(): + G = build_graph(data) + G2 = G.to_pygsp(use_pygsp=False) ##################################################### # Check parameters diff --git a/test/test_data.py b/test/test_data.py index 39d7966..dfa0889 100644 --- a/test/test_data.py +++ b/test/test_data.py @@ -1,3 +1,4 @@ +from __future__ import print_function from load_tests import ( np, sp, diff --git a/test/test_exact.py b/test/test_exact.py index 542e625..80d84f0 100644 --- a/test/test_exact.py +++ b/test/test_exact.py @@ -1,3 +1,4 @@ +from __future__ import print_function from load_tests import ( graphtools, np, @@ -68,6 +69,13 @@ def test_precomputed_negative(): n_pca=None) +@raises(ValueError) +def test_precomputed_invalid(): + build_graph(np.random.uniform(0, 1, [200, 200]), + precomputed='invalid', + n_pca=None) + + @warns(RuntimeWarning) def test_duplicate_data(): build_graph(np.vstack([data, data[:10]]), @@ -76,6 +84,15 @@ def test_duplicate_data(): thresh=0) +@warns(UserWarning) +def test_k_too_large(): + build_graph(data, + n_pca=20, + decay=10, + knn=len(data) + 1, + thresh=0) + + ##################################################### # Check kernel ##################################################### @@ -289,6 +306,48 @@ def test_truncated_exact_graph_no_pca(): assert(isinstance(G2, graphtools.graphs.TraditionalGraph)) +def test_exact_graph_fixed_bandwidth(): + decay = 5 + bandwidth = 2 + n_pca = 20 + pca = PCA(n_pca, svd_solver='randomized', random_state=42).fit(data) + data_nu = pca.transform(data) + pdx = squareform(pdist(data_nu, metric='euclidean')) + K = np.exp(-1 * (pdx / bandwidth)**decay) + K = K + K.T + W = np.divide(K, 2) + np.fill_diagonal(W, 0) + G = pygsp.graphs.Graph(W) + G2 = build_graph(data, n_pca=n_pca, + graphtype='exact', + decay=decay, bandwidth=bandwidth, + random_state=42, + thresh=0, + use_pygsp=True) + assert(isinstance(G2, graphtools.graphs.TraditionalGraph)) + assert(G.N == G2.N) + assert(np.all(G.d == G2.d)) + assert((G2.W != 
G.W).sum() == 0) + assert((G.W != G2.W).nnz == 0) + bandwidth = np.random.gamma(5, 0.5, len(data)) + K = np.exp(-1 * (pdx.T / bandwidth).T**decay) + K = K + K.T + W = np.divide(K, 2) + np.fill_diagonal(W, 0) + G = pygsp.graphs.Graph(W) + G2 = build_graph(data, n_pca=n_pca, + graphtype='exact', + decay=decay, bandwidth=bandwidth, + random_state=42, + thresh=0, + use_pygsp=True) + assert(isinstance(G2, graphtools.graphs.TraditionalGraph)) + assert(G.N == G2.N) + assert(np.all(G.d == G2.d)) + assert((G2.W != G.W).sum() == 0) + assert((G.W != G2.W).nnz == 0) + + ##################################################### # Check interpolation ##################################################### @@ -348,15 +407,17 @@ def test_set_params(): assert G.get_params() == {'n_pca': 20, 'random_state': 42, 'kernel_symm': '+', - 'gamma': None, + 'theta': None, 'knn': 3, 'decay': 10, + 'bandwidth': None, 'distance': 'euclidean', 'precomputed': None} assert_raises(ValueError, G.set_params, knn=15) assert_raises(ValueError, G.set_params, decay=15) assert_raises(ValueError, G.set_params, distance='manhattan') assert_raises(ValueError, G.set_params, precomputed='distance') + assert_raises(ValueError, G.set_params, bandwidth=5) G.set_params(knn=G.knn, decay=G.decay, distance=G.distance, diff --git a/test/test_knn.py b/test/test_knn.py index b8682c2..7d15b0d 100644 --- a/test/test_knn.py +++ b/test/test_knn.py @@ -1,3 +1,4 @@ +from __future__ import print_function from load_tests import ( graphtools, np, @@ -45,6 +46,24 @@ def test_duplicate_data(): thresh=1e-4) +@warns(UserWarning) +def test_balltree_cosine(): + build_graph(data, + n_pca=20, + decay=10, + distance='cosine', + thresh=1e-4) + + +@warns(UserWarning) +def test_k_too_large(): + build_graph(data, + n_pca=20, + decay=10, + knn=len(data) + 1, + thresh=1e-4) + + ##################################################### # Check kernel ##################################################### @@ -124,6 +143,52 @@ def 
test_sparse_alpha_knn_graph(): assert(isinstance(G2, graphtools.graphs.kNNGraph)) +def test_knn_graph_fixed_bandwidth(): + k = 3 + decay = 5 + bandwidth = 10 + n_pca = 20 + thresh = 1e-4 + pca = PCA(n_pca, svd_solver='randomized', random_state=42).fit(data) + data_nu = pca.transform(data) + pdx = squareform(pdist(data_nu, metric='euclidean')) + K = np.exp(-1 * np.power(pdx / bandwidth, decay)) + K[K < thresh] = 0 + K = K + K.T + W = np.divide(K, 2) + np.fill_diagonal(W, 0) + G = pygsp.graphs.Graph(W) + G2 = build_graph(data, n_pca=n_pca, + decay=decay, bandwidth=bandwidth, + knn=k, random_state=42, + thresh=thresh, + use_pygsp=True) + assert(isinstance(G2, graphtools.graphs.kNNGraph)) + np.testing.assert_array_equal(G.N, G2.N) + np.testing.assert_array_equal(G.d, G2.d) + np.testing.assert_allclose( + (G.W - G2.W).data, + np.zeros_like((G.W - G2.W).data), atol=1e-14) + bandwidth = np.random.gamma(20, 0.5, len(data)) + K = np.exp(-1 * (pdx.T / bandwidth).T**decay) + K[K < thresh] = 0 + K = K + K.T + W = np.divide(K, 2) + np.fill_diagonal(W, 0) + G = pygsp.graphs.Graph(W) + G2 = build_graph(data, n_pca=n_pca, + decay=decay, bandwidth=bandwidth, + knn=k, random_state=42, + thresh=thresh, + use_pygsp=True) + assert(isinstance(G2, graphtools.graphs.kNNGraph)) + np.testing.assert_array_equal(G.N, G2.N) + np.testing.assert_allclose(G.dw, G2.dw, atol=1e-14) + np.testing.assert_allclose( + (G.W - G2.W).data, + np.zeros_like((G.W - G2.W).data), atol=1e-14) + + @warns(UserWarning) def test_knn_graph_sparse_no_pca(): build_graph(sp.coo_matrix(data), n_pca=None, # n_pca, @@ -184,9 +249,10 @@ def test_set_params(): 'n_pca': 20, 'random_state': 42, 'kernel_symm': '+', - 'gamma': None, + 'theta': None, 'knn': 3, 'decay': None, + 'bandwidth': None, 'distance': 'euclidean', 'thresh': 0, 'n_jobs': -1, @@ -204,11 +270,12 @@ def test_set_params(): assert_raises(ValueError, G.set_params, decay=10) assert_raises(ValueError, G.set_params, distance='manhattan') assert_raises(ValueError, 
G.set_params, thresh=1e-3) - assert_raises(ValueError, G.set_params, gamma=0.99) + assert_raises(ValueError, G.set_params, theta=0.99) assert_raises(ValueError, G.set_params, kernel_symm='*') + assert_raises(ValueError, G.set_params, bandwidth=5) G.set_params(knn=G.knn, decay=G.decay, thresh=G.thresh, distance=G.distance, - gamma=G.gamma, + theta=G.theta, kernel_symm=G.kernel_symm) diff --git a/test/test_landmark.py b/test/test_landmark.py index 42d9025..da0fbbc 100644 --- a/test/test_landmark.py +++ b/test/test_landmark.py @@ -1,3 +1,4 @@ +from __future__ import print_function from load_tests import ( graphtools, np, @@ -42,6 +43,15 @@ def test_landmark_exact_graph(): assert(G.landmark_op.shape == (n_landmark, n_landmark)) assert(isinstance(G, graphtools.graphs.TraditionalGraph)) assert(isinstance(G, graphtools.graphs.LandmarkGraph)) + assert(G.transitions.shape == (data.shape[0], n_landmark)) + assert(G.clusters.shape == (data.shape[0],)) + assert(len(np.unique(G.clusters)) <= n_landmark) + signal = np.random.normal(0, 1, [n_landmark, 10]) + interpolated_signal = G.interpolate(signal) + assert interpolated_signal.shape == (data.shape[0], signal.shape[1]) + G._reset_landmarks() + # no error on double delete + G._reset_landmarks() def test_landmark_knn_graph(): @@ -49,6 +59,7 @@ def test_landmark_knn_graph(): # knn graph G = build_graph(data, n_landmark=n_landmark, n_pca=20, decay=None, knn=5, random_state=42) + assert(G.transitions.shape == (data.shape[0], n_landmark)) assert(G.landmark_op.shape == (n_landmark, n_landmark)) assert(isinstance(G, graphtools.graphs.kNNGraph)) assert(isinstance(G, graphtools.graphs.LandmarkGraph)) @@ -62,6 +73,7 @@ def test_landmark_mnn_graph(): thresh=1e-5, n_pca=None, decay=10, knn=5, random_state=42, sample_idx=sample_idx) + assert(G.clusters.shape == (X.shape[0],)) assert(G.landmark_op.shape == (n_landmark, n_landmark)) assert(isinstance(G, graphtools.graphs.MNNGraph)) assert(isinstance(G, graphtools.graphs.LandmarkGraph)) @@ 
-132,18 +144,19 @@ def test_verbose(): def test_set_params(): G = build_graph(data, n_landmark=500, decay=None) G.landmark_op - assert G.get_params() == {'n_pca': 20, - 'random_state': 42, - 'kernel_symm': '+', - 'gamma': None, - 'n_landmark': 500, - 'knn': 3, - 'decay': None, - 'distance': - 'euclidean', - 'thresh': 0, - 'n_jobs': -1, - 'verbose': 0} + assert G.get_params() == { + 'n_pca': 20, + 'random_state': 42, + 'kernel_symm': '+', + 'theta': None, + 'n_landmark': 500, + 'knn': 3, + 'decay': None, + 'bandwidth': None, + 'distance': 'euclidean', + 'thresh': 0, + 'n_jobs': -1, + 'verbose': 0} G.set_params(n_landmark=300) assert G.landmark_op.shape == (300, 300) G.set_params(n_landmark=G.n_landmark, n_svd=G.n_svd) diff --git a/test/test_mnn.py b/test/test_mnn.py index 330a4e4..be78437 100644 --- a/test/test_mnn.py +++ b/test/test_mnn.py @@ -1,3 +1,4 @@ +from __future__ import print_function from load_tests import ( graphtools, np, @@ -10,8 +11,10 @@ generate_swiss_roll, assert_raises, raises, + warns, cdist, ) +from scipy.linalg import norm ##################################################### @@ -49,38 +52,97 @@ def test_build_mnn_with_precomputed(): @raises(ValueError) -def test_mnn_with_square_gamma_wrong_length(): +def test_mnn_with_square_theta_wrong_length(): n_sample = len(np.unique(digits['target'])) - # square matrix gamma of the wrong size + # square matrix theta of the wrong size build_graph( data, thresh=0, n_pca=20, decay=10, knn=5, random_state=42, sample_idx=digits['target'], - kernel_symm='gamma', - gamma=np.tile(np.linspace(0, 1, n_sample - 1), + kernel_symm='theta', + theta=np.tile(np.linspace(0, 1, n_sample - 1), n_sample).reshape(n_sample - 1, n_sample)) @raises(ValueError) -def test_mnn_with_vector_gamma(): +def test_mnn_with_vector_theta(): n_sample = len(np.unique(digits['target'])) - # vector gamma + # vector theta + build_graph( + data, thresh=0, n_pca=20, + decay=10, knn=5, random_state=42, + sample_idx=digits['target'], + 
kernel_symm='theta', + theta=np.linspace(0, 1, n_sample - 1)) + + +@raises(ValueError) +def test_mnn_with_unbounded_theta(): + build_graph( + data, thresh=0, n_pca=20, + decay=10, knn=5, random_state=42, + sample_idx=digits['target'], + kernel_symm='theta', + theta=2) + + +@raises(ValueError) +def test_mnn_with_string_theta(): + build_graph( + data, thresh=0, n_pca=20, + decay=10, knn=5, random_state=42, + sample_idx=digits['target'], + kernel_symm='theta', + theta='invalid') + + +@warns(FutureWarning) +def test_mnn_with_gamma(): + build_graph( + data, thresh=0, n_pca=20, + decay=10, knn=5, random_state=42, + sample_idx=digits['target'], + kernel_symm='theta', + gamma=0.9) + + +@warns(FutureWarning) +def test_mnn_with_kernel_symm_gamma(): build_graph( data, thresh=0, n_pca=20, decay=10, knn=5, random_state=42, sample_idx=digits['target'], kernel_symm='gamma', - gamma=np.linspace(0, 1, n_sample - 1)) + theta=0.9) + + +@warns(UserWarning) +def test_mnn_with_theta_and_kernel_symm_not_theta(): + build_graph( + data, thresh=0, n_pca=20, + decay=10, knn=5, random_state=42, + sample_idx=digits['target'], + kernel_symm='+', + theta=0.9) + + +@warns(UserWarning) +def test_mnn_with_kernel_symmm_theta_and_no_theta(): + build_graph( + data, thresh=0, n_pca=20, + decay=10, knn=5, random_state=42, + sample_idx=digits['target'], + kernel_symm='theta') def test_mnn_with_non_zero_indexed_sample_idx(): X, sample_idx = generate_swiss_roll() G = build_graph(X, sample_idx=sample_idx, - kernel_symm='gamma', gamma=0.5, + kernel_symm='theta', theta=0.5, n_pca=None, use_pygsp=True) sample_idx += 1 G2 = build_graph(X, sample_idx=sample_idx, - kernel_symm='gamma', gamma=0.5, + kernel_symm='theta', theta=0.5, n_pca=None, use_pygsp=True) assert G.N == G2.N assert np.all(G.d == G2.d) @@ -92,11 +154,11 @@ def test_mnn_with_non_zero_indexed_sample_idx(): def test_mnn_with_string_sample_idx(): X, sample_idx = generate_swiss_roll() G = build_graph(X, sample_idx=sample_idx, - kernel_symm='gamma', 
gamma=0.5, + kernel_symm='theta', theta=0.5, n_pca=None, use_pygsp=True) sample_idx = np.where(sample_idx == 0, 'a', 'b') G2 = build_graph(X, sample_idx=sample_idx, - kernel_symm='gamma', gamma=0.5, + kernel_symm='theta', theta=0.5, n_pca=None, use_pygsp=True) assert G.N == G2.N assert np.all(G.d == G2.d) @@ -110,13 +172,13 @@ def test_mnn_with_string_sample_idx(): ##################################################### -def test_mnn_graph_float_gamma(): +def test_mnn_graph_float_theta(): X, sample_idx = generate_swiss_roll() - gamma = 0.9 + theta = 0.9 k = 10 a = 20 metric = 'euclidean' - beta = 0 + beta = 0.5 samples = np.unique(sample_idx) K = np.zeros((len(X), len(X))) @@ -133,31 +195,46 @@ def test_mnn_graph_float_gamma(): pdxe_ij = pdx_ij / e_ij[:, np.newaxis] # normalize k_ij = np.exp(-1 * (pdxe_ij ** a)) # apply alpha-decaying kernel if si == sj: - K.iloc[sample_idx == si, sample_idx == sj] = k_ij * \ - (1 - beta) # fill out values in K for NN on diagonal + K.iloc[sample_idx == si, sample_idx == sj] = ( + k_ij + k_ij.T) / 2 else: # fill out values in K for NN on diagonal K.iloc[sample_idx == si, sample_idx == sj] = k_ij - - W = np.array((gamma * np.minimum(K, K.T)) + - ((1 - gamma) * np.maximum(K, K.T))) + Kn = K.copy() + for i in samples: + curr_K = K.iloc[sample_idx == i, sample_idx == i] + i_norm = norm(curr_K, 1, axis=1) + for j in samples: + if i == j: + continue + else: + curr_K = K.iloc[sample_idx == i, sample_idx == j] + curr_norm = norm(curr_K, 1, axis=1) + scale = np.minimum( + np.ones(len(curr_norm)), i_norm / curr_norm) * beta + Kn.iloc[sample_idx == i, sample_idx == j] = ( + curr_K.T * scale).T + + K = Kn + W = np.array((theta * np.minimum(K, K.T)) + + ((1 - theta) * np.maximum(K, K.T))) np.fill_diagonal(W, 0) G = pygsp.graphs.Graph(W) - G2 = graphtools.Graph(X, knn=k + 1, decay=a, beta=1 - beta, - kernel_symm='gamma', gamma=gamma, + G2 = graphtools.Graph(X, knn=k + 1, decay=a, beta=beta, + kernel_symm='theta', theta=theta, distance=metric, 
sample_idx=sample_idx, thresh=0, use_pygsp=True) assert G.N == G2.N - assert np.all(G.d == G2.d) + np.testing.assert_array_equal(G.dw, G2.dw) assert (G.W != G2.W).nnz == 0 assert (G2.W != G.W).sum() == 0 assert isinstance(G2, graphtools.graphs.MNNGraph) -def test_mnn_graph_matrix_gamma(): +def test_mnn_graph_matrix_theta(): X, sample_idx = generate_swiss_roll() bs = 0.8 - gamma = np.array([[1, bs], # 0 + theta = np.array([[1, bs], # 0 [bs, 1]]) # 3 k = 10 a = 20 @@ -179,30 +256,46 @@ def test_mnn_graph_matrix_gamma(): pdxe_ij = pdx_ij / e_ij[:, np.newaxis] # normalize k_ij = np.exp(-1 * (pdxe_ij ** a)) # apply alpha-decaying kernel if si == sj: - K.iloc[sample_idx == si, sample_idx == sj] = k_ij * \ - (1 - beta) # fill out values in K for NN on diagonal + K.iloc[sample_idx == si, sample_idx == sj] = ( + k_ij + k_ij.T) / 2 else: # fill out values in K for NN on diagonal K.iloc[sample_idx == si, sample_idx == sj] = k_ij + Kn = K.copy() + for i in samples: + curr_K = K.iloc[sample_idx == i, sample_idx == i] + i_norm = norm(curr_K, 1, axis=1) + for j in samples: + if i == j: + continue + else: + curr_K = K.iloc[sample_idx == i, sample_idx == j] + curr_norm = norm(curr_K, 1, axis=1) + scale = np.minimum( + np.ones(len(curr_norm)), i_norm / curr_norm) * beta + Kn.iloc[sample_idx == i, sample_idx == j] = ( + curr_K.T * scale).T + + K = Kn K = np.array(K) - matrix_gamma = pd.DataFrame(np.zeros((len(sample_idx), len(sample_idx)))) + matrix_theta = pd.DataFrame(np.zeros((len(sample_idx), len(sample_idx)))) for ix, si in enumerate(set(sample_idx)): for jx, sj in enumerate(set(sample_idx)): - matrix_gamma.iloc[sample_idx == si, - sample_idx == sj] = gamma[ix, jx] + matrix_theta.iloc[sample_idx == si, + sample_idx == sj] = theta[ix, jx] - W = np.array((matrix_gamma * np.minimum(K, K.T)) + - ((1 - matrix_gamma) * np.maximum(K, K.T))) + W = np.array((matrix_theta * np.minimum(K, K.T)) + + ((1 - matrix_theta) * np.maximum(K, K.T))) np.fill_diagonal(W, 0) G = pygsp.graphs.Graph(W) 
- G2 = graphtools.Graph(X, knn=k + 1, decay=a, beta=1 - beta, - kernel_symm='gamma', gamma=gamma, + G2 = graphtools.Graph(X, knn=k + 1, decay=a, beta=beta, + kernel_symm='theta', theta=theta, distance=metric, sample_idx=sample_idx, thresh=0, use_pygsp=True) assert G.N == G2.N - assert np.all(G.d == G2.d) + np.testing.assert_array_equal(G.dw, G2.dw) assert (G.W != G2.W).nnz == 0 assert (G2.W != G.W).sum() == 0 assert isinstance(G2, graphtools.graphs.MNNGraph) @@ -220,25 +313,26 @@ def test_verbose(): print() print("Verbose test: MNN") build_graph(X, sample_idx=sample_idx, - kernel_symm='gamma', gamma=0.5, + kernel_symm='theta', theta=0.5, n_pca=None, verbose=True) def test_set_params(): X, sample_idx = generate_swiss_roll() G = build_graph(X, sample_idx=sample_idx, - kernel_symm='gamma', gamma=0.5, + kernel_symm='theta', theta=0.5, n_pca=None, thresh=1e-4) assert G.get_params() == { 'n_pca': None, 'random_state': 42, - 'kernel_symm': 'gamma', - 'gamma': 0.5, + 'kernel_symm': 'theta', + 'theta': 0.5, 'beta': 1, - 'adaptive_k': 'sqrt', + 'adaptive_k': None, 'knn': 3, 'decay': 10, + 'bandwidth': None, 'distance': 'euclidean', 'thresh': 1e-4, 'n_jobs': 1 diff --git a/unittest.cfg b/unittest.cfg index 0f1a4ec..85c81ba 100644 --- a/unittest.cfg +++ b/unittest.cfg @@ -3,4 +3,4 @@ verbose = True [coverage] always-on = True -coverage = graphtools \ No newline at end of file +coverage = graphtools