diff --git a/.gitignore b/.gitignore index c2f0954..2ae892c 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ build dist *egg-info .coverage +.eggs #syncthing .syncthing.* diff --git a/.travis.yml b/.travis.yml index 7640295..e7159a0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,9 +16,11 @@ libjs-mathjax script: - - pip install -U .[test,doc] + - pip install -U .[test] - python setup.py test - - cd doc; make html; cd .. + - pip install -U .[doc] + - cd doc; make html + - cd .. deploy: provider: pypi diff --git a/doc/source/requirements.txt b/doc/source/requirements.txt index 7ee549c..6c2acd8 100644 --- a/doc/source/requirements.txt +++ b/doc/source/requirements.txt @@ -7,4 +7,3 @@ sphinx sphinxcontrib-napoleon sphinxcontrib-bibtex tasklogger - diff --git a/graphtools/api.py b/graphtools/api.py index ed43d04..7fdae5a 100644 --- a/graphtools/api.py +++ b/graphtools/api.py @@ -11,11 +11,12 @@ def Graph(data, n_pca=None, + rank_threshold=None, sample_idx=None, adaptive_k=None, precomputed=None, knn=5, - decay=10, + decay=40, bandwidth=None, bandwidth_scale=1.0, anisotropy=0, @@ -53,19 +54,29 @@ def Graph(data, ---------- data : array-like, shape=[n_samples,n_features] accepted types: `numpy.ndarray`, `scipy.sparse.spmatrix`. TODO: accept pandas dataframes - n_pca : `int` or `None`, optional (default: `None`) + n_pca : {`int`, `None`, `bool`, 'auto'}, optional (default: `None`) number of PC dimensions to retain for graph building. - If `None`, uses the original data. + If n_pca in `[None, False, 0]`, uses the original data. + If 'auto' or `True` then estimate using a singular value threshold Note: if data is sparse, uses SVD instead of PCA TODO: should we subtract and store the mean? + rank_threshold : `float`, 'auto', optional (default: 'auto') + threshold to use when estimating rank for + `n_pca in [True, 'auto']`. + If 'auto', this threshold is + s_max * eps * max(n_samples, n_features) + where s_max is the maximum singular value of the data matrix + and eps is numerical precision. [press2007]_. + knn : `int`, optional (default: 5) Number of nearest neighbors (including self) to use to build the graph - decay : `int` or `None`, optional (default: 10) - Rate of alpha decay to use. If `None`, alpha decay is not used. + decay : `int` or `None`, optional (default: 40) + Rate of alpha decay to use. If `None`, alpha decay is not used and a vanilla + k-Nearest Neighbors graph is returned. bandwidth : `float`, list-like,`callable`, or `None`, optional (default: `None`) Fixed bandwidth to use. If given, overrides `knn`. Can be a single @@ -91,14 +102,14 @@ def Graph(data, on time and memory constraints. kernel_symm : string, optional (default: '+') - Defines method of MNN symmetrization. + Defines method of kernel symmetrization. '+' : additive '*' : multiplicative - 'theta' : min-max + 'mnn' : min-max MNN symmetrization 'none' : no symmetrization theta: float (default: None) - Min-max symmetrization constant or matrix. Only used if kernel_symm='theta'. + Min-max symmetrization constant or matrix. Only used if kernel_symm='mnn'. K = `theta * min(K, K.T) + (1 - theta) * max(K, K.T)` precomputed : {'distance', 'affinity', 'adjacency', `None`}, optional (default: `None`) @@ -155,6 +166,12 @@ def Graph(data, Raises ------ ValueError : if selected parameters are incompatible. + + References + ---------- + .. [press2007] W. Press, S. Teukolsky, W. Vetterling and B. Flannery, + “Numerical Recipes (3rd edition)”, + Cambridge University Press, 2007, page 795. 
""" tasklogger.set_level(verbose) if sample_idx is not None and len(np.unique(sample_idx)) == 1: diff --git a/graphtools/base.py b/graphtools/base.py index d0e1fa4..58e941b 100644 --- a/graphtools/base.py +++ b/graphtools/base.py @@ -1,11 +1,13 @@ from future.utils import with_metaclass from builtins import super +from copy import copy as shallow_copy import numpy as np import abc import pygsp from inspect import signature from sklearn.decomposition import PCA, TruncatedSVD from sklearn.preprocessing import normalize +from sklearn.utils.graph import graph_shortest_path from scipy import sparse import warnings import numbers @@ -81,12 +83,21 @@ class Data(Base): accepted types: `numpy.ndarray`, `scipy.sparse.spmatrix`. `pandas.DataFrame`, `pandas.SparseDataFrame`. - n_pca : `int` or `None`, optional (default: `None`) + n_pca : {`int`, `None`, `bool`, 'auto'}, optional (default: `None`) number of PC dimensions to retain for graph building. - If `None`, uses the original data. + If n_pca in `[None, False, 0]`, uses the original data. + If 'auto' or `True` then estimate using a singular value threshold Note: if data is sparse, uses SVD instead of PCA TODO: should we subtract and store the mean? + rank_threshold : `float`, 'auto', optional (default: 'auto') + threshold to use when estimating rank for + `n_pca in [True, 'auto']`. + If 'auto', this threshold is + s_max * eps * max(n_samples, n_features) + where s_max is the maximum singular value of the data matrix + and eps is numerical precision. [press2007]_. + random_state : `int` or `None`, optional (default: `None`) Random state for random PCA @@ -104,20 +115,20 @@ class Data(Base): sklearn PCA operator """ - def __init__(self, data, n_pca=None, random_state=None, **kwargs): + def __init__(self, data, n_pca=None, rank_threshold=None, + random_state=None, **kwargs): self._check_data(data) - if n_pca is not None and np.min(data.shape) <= n_pca: - warnings.warn("Cannot perform PCA to {} dimensions on " - "data with min(n_samples, n_features) = {}".format( - n_pca, np.min(data.shape)), - RuntimeWarning) - n_pca = None + n_pca, rank_threshold = self._parse_n_pca_threshold( + data, n_pca, rank_threshold) try: if isinstance(data, pd.SparseDataFrame): data = data.to_coo() elif isinstance(data, pd.DataFrame): - data = np.array(data) + try: + data = data.sparse.to_coo() + except AttributeError: + data = np.array(data) except NameError: # pandas not installed pass @@ -130,10 +141,75 @@ def __init__(self, data, n_pca=None, random_state=None, **kwargs): pass self.data = data self.n_pca = n_pca + self.rank_threshold = rank_threshold self.random_state = random_state self.data_nu = self._reduce_data() super().__init__(**kwargs) + def _parse_n_pca_threshold(self, data, n_pca, rank_threshold): + if isinstance(n_pca, str): + n_pca = n_pca.lower() + if n_pca != "auto": + raise ValueError("n_pca must be an integer " + "0 <= n_pca < min(n_samples,n_features), " + "or in [None,False,True,'auto'].") + if isinstance(n_pca, numbers.Number): + if not float(n_pca).is_integer(): # cast it to integer + n_pcaR = np.round(n_pca).astype(int) + warnings.warn( + "Cannot perform PCA to fractional {} dimensions. " + "Rounding to {}".format( + n_pca, n_pcaR), RuntimeWarning) + n_pca = n_pcaR + + if n_pca < 0: + raise ValueError( + "n_pca cannot be negative. 
" + "Please supply an integer " + "0 <= n_pca < min(n_samples,n_features) or None") + elif np.min(data.shape) <= n_pca: + warnings.warn( + "Cannot perform PCA to {} dimensions on " + "data with min(n_samples, n_features) = {}".format( + n_pca, np.min( + data.shape)), RuntimeWarning) + n_pca = 0 + + if n_pca in [0, False, None]: # cast 0, False to None. + n_pca = None + elif n_pca is True: # notify that we're going to estimate rank. + n_pca = 'auto' + tasklogger.log_info("Estimating n_pca from matrix rank. " + "Supply an integer n_pca " + "for fixed amount.") + if not any([isinstance(n_pca, numbers.Number), + n_pca is None, + n_pca == 'auto']): + raise ValueError( + "n_pca was not an instance of numbers.Number, " + "could not be cast to False, and not None. " + "Please supply an integer " + "0 <= n_pca < min(n_samples,n_features) or None") + if rank_threshold is not None and n_pca != 'auto': + warnings.warn("n_pca = {}, therefore rank_threshold of {} " + "will not be used. To use rank thresholding, " + "set n_pca = True".format(n_pca, rank_threshold), + RuntimeWarning) + if n_pca == 'auto': + if isinstance(rank_threshold, str): + rank_threshold = rank_threshold.lower() + if rank_threshold is None: + rank_threshold = 'auto' + if isinstance(rank_threshold, numbers.Number): + if rank_threshold <= 0: + raise ValueError( + "rank_threshold must be positive float or 'auto'. ") + else: + if rank_threshold != 'auto': + raise ValueError( + "rank_threshold must be positive float or 'auto'. ") + return n_pca, rank_threshold + def _check_data(self, data): if len(data.shape) != 2: msg = "ValueError: Expected 2D array, got {}D array " \ @@ -150,25 +226,50 @@ def _reduce_data(self): If data is dense, uses randomized PCA. If data is sparse, uses randomized SVD. TODO: should we subtract and store the mean? + TODO: Fix the rank estimation so we do not compute the full SVD. Returns ------- Reduced data matrix """ - if self.n_pca is not None and self.n_pca < self.data.shape[1]: + if self.n_pca is not None and (self.n_pca == 'auto' or self.n_pca < self.data.shape[1]): tasklogger.log_start("PCA") + n_pca = self.data.shape[1] - 1 if self.n_pca == 'auto' else self.n_pca if sparse.issparse(self.data): if isinstance(self.data, sparse.coo_matrix) or \ isinstance(self.data, sparse.lil_matrix) or \ isinstance(self.data, sparse.dok_matrix): self.data = self.data.tocsr() - self.data_pca = TruncatedSVD(self.n_pca, - random_state=self.random_state) + self.data_pca = TruncatedSVD(n_pca, random_state=self.random_state) else: - self.data_pca = PCA(self.n_pca, + self.data_pca = PCA(n_pca, svd_solver='randomized', random_state=self.random_state) self.data_pca.fit(self.data) + if self.n_pca == 'auto': + s = self.data_pca.singular_values_ + smax = s.max() + if self.rank_threshold == 'auto': + threshold = smax * \ + np.finfo(self.data.dtype).eps * max(self.data.shape) + self.rank_threshold = threshold + threshold = self.rank_threshold + gate = np.where(s >= threshold)[0] + self.n_pca = gate.shape[0] + if self.n_pca == 0: + raise ValueError("Supplied threshold {} was greater than " + "maximum singular value {} " + "for the data matrix".format(threshold, smax)) + tasklogger.log_info( + "Using rank estimate of {} as n_pca".format(self.n_pca)) + # reset the sklearn operator + op = self.data_pca # for line-width brevity.. 
+ op.components_ = op.components_[gate, :] + op.explained_variance_ = op.explained_variance_[gate] + op.explained_variance_ratio_ = op.explained_variance_ratio_[ gate] + op.singular_values_ = op.singular_values_[gate] + self.data_pca = op # op aliases self.data_pca, so this reassignment is a no-op kept for clarity data_nu = self.data_pca.transform(self.data) tasklogger.log_complete("PCA") return data_nu @@ -259,6 +360,7 @@ def inverse_transform(self, Y, columns=None): ---------- Y : array-like, shape=[n_samples_y, n_pca] n_features must be the same as `self.data_nu`. + columns : list-like list of integers referring to column indices in the original data space to be returned. Avoids recomputing the full matrix where only @@ -311,10 +413,10 @@ class BaseGraph(with_metaclass(abc.ABCMeta, Base)): ---------- kernel_symm : string, optional (default: '+') - Defines method of MNN symmetrization. + Defines method of kernel symmetrization. '+' : additive '*' : multiplicative - 'theta' : min-max + 'mnn' : min-max MNN symmetrization 'none' : no symmetrization theta: float (default: 1) @@ -355,8 +457,12 @@ def __init__(self, theta = gamma if kernel_symm == 'gamma': warnings.warn("kernel_symm='gamma' is deprecated. " - "Setting kernel_symm='theta'", FutureWarning) - kernel_symm = 'theta' + "Setting kernel_symm='mnn'", FutureWarning) + kernel_symm = 'mnn' + if kernel_symm == 'theta': + warnings.warn("kernel_symm='theta' is deprecated. " + "Setting kernel_symm='mnn'", FutureWarning) + kernel_symm = 'mnn' self.kernel_symm = kernel_symm self.theta = theta self._check_symmetrization(kernel_symm, theta) @@ -373,20 +479,20 @@ def __init__(self, super().__init__(**kwargs) def _check_symmetrization(self, kernel_symm, theta): - if kernel_symm not in ['+', '*', 'theta', None]: + if kernel_symm not in ['+', '*', 'mnn', None]: raise ValueError( "kernel_symm '{}' not recognized. Choose from " - "'+', '*', 'theta', or 'none'.".format(kernel_symm)) - elif kernel_symm != 'theta' and theta is not None: + "'+', '*', 'mnn', or 'none'.".format(kernel_symm)) + elif kernel_symm != 'mnn' and theta is not None: warnings.warn("kernel_symm='{}' but theta is not None. " - "Setting kernel_symm='theta'.".format(kernel_symm)) - self.kernel_symm = kernel_symm = 'theta' + "Setting kernel_symm='mnn'.".format(kernel_symm)) + self.kernel_symm = kernel_symm = 'mnn' - if kernel_symm == 'theta': + if kernel_symm == 'mnn': if theta is None: - warnings.warn("kernel_symm='theta' but theta not given. " - "Defaulting to theta=0.5.") self.theta = theta = 1 + warnings.warn("kernel_symm='mnn' but theta not given. " + "Defaulting to theta={}.".format(self.theta)) elif not isinstance(theta, numbers.Number) or \ theta < 0 or theta > 1: raise ValueError("theta {} not recognized. Expected " @@ -423,9 +529,9 @@ def symmetrize_kernel(self, K): elif self.kernel_symm == "*": tasklogger.log_debug("Using multiplication symmetrization.") K = K.multiply(K.T) - elif self.kernel_symm == 'theta': + elif self.kernel_symm == 'mnn': tasklogger.log_debug( - "Using theta symmetrization (theta = {}).".format(self.theta)) + "Using mnn symmetrization (theta = {}).".format(self.theta)) K = self.theta * utils.elementwise_minimum(K, K.T) + \ (1 - self.theta) * utils.elementwise_maximum(K, K.T) elif self.kernel_symm is None: @@ -434,7 +540,7 @@ def symmetrize_kernel(self, K): else: # this should never happen raise ValueError( - "Expected kernel_symm in ['+', '*', 'theta' or None]. " + "Expected kernel_symm in ['+', '*', 'mnn' or None]. 
" "Got {}".format(self.theta)) return K @@ -526,12 +632,15 @@ def diff_aff(self): symmetric diffusion affinity matrix defined as a doubly-stochastic form of the kernel matrix """ - row_degrees = np.array(self.kernel.sum(axis=1)).reshape(-1, 1) - col_degrees = np.array(self.kernel.sum(axis=0)).reshape(1, -1) + row_degrees = utils.to_array(self.kernel.sum(axis=1)) if sparse.issparse(self.kernel): - return self.kernel.multiply(1 / np.sqrt(row_degrees)).multiply( - 1 / np.sqrt(col_degrees)) + # diagonal matrix + degrees = sparse.csr_matrix((1 / np.sqrt(row_degrees.flatten()), + np.arange(len(row_degrees)), + np.arange(len(row_degrees) + 1))) + return degrees @ self.kernel @ degrees else: + col_degrees = row_degrees.T return (self.kernel / np.sqrt(row_degrees)) / np.sqrt(col_degrees) @property @@ -562,6 +671,10 @@ def kernel(self): """ return self.K + @property + def weighted(self): + return self.decay is not None + @abc.abstractmethod def build_kernel(self): """Build the kernel matrix @@ -634,7 +747,7 @@ def to_igraph(self, attribute="weight", **kwargs): # not a pygsp graph W = self.K.copy() W = utils.set_diagonal(W, 0) - return ig.Graph.Weighted_Adjacency(utils.to_dense(W).tolist(), + return ig.Graph.Weighted_Adjacency(utils.to_array(W).tolist(), attr=attribute, **kwargs) def to_pickle(self, path): """Save the current Graph to a pickle. Parameters ---------- path : str File path where the pickled object will be stored. """ - if int(sys.version.split(".")[1]) < 7 and isinstance(self, pygsp.graphs.Graph): - # python 3.5, 3.6 - logger = self.logger - self.logger = logger.name + pickle_obj = shallow_copy(self) + is_oldpygsp = all([isinstance(self, pygsp.graphs.Graph), + int(sys.version.split(".")[1]) < 7]) + if is_oldpygsp: + pickle_obj.logger = pickle_obj.logger.name with open(path, 'wb') as f: - pickle.dump(self, f) - if int(sys.version.split(".")[1]) < 7 and isinstance(self, pygsp.graphs.Graph): - self.logger = logger + pickle.dump(pickle_obj, f, protocol=pickle.HIGHEST_PROTOCOL) + + def _check_shortest_path_distance(self, distance): + if distance == 'data' and self.weighted: + raise NotImplementedError( + "Graph shortest path with constant or data distance only " + "implemented for unweighted graphs. " + "For weighted graphs, use `distance='affinity'`.") + elif distance == 'constant' and self.weighted: + raise NotImplementedError( + "Graph shortest path with constant distance only " + "implemented for unweighted graphs. " + "For weighted graphs, use `distance='affinity'`.") + elif distance == 'affinity' and not self.weighted: + raise ValueError( + "Graph shortest path with affinity distance only " + "valid for weighted graphs. " + "For unweighted graphs, use `distance='constant'` " + "or `distance='data'`.") + + def _default_shortest_path_distance(self): + if not self.weighted: + distance = 'data' + tasklogger.log_info("Using ambient data distances.") + else: + distance = 'affinity' + tasklogger.log_info("Using negative log affinity distances.") + return distance + + def shortest_path(self, method='auto', distance=None): + """ + Find the length of the shortest path between every pair of vertices on the graph + + Parameters + ---------- + method : string ['auto'|'FW'|'D'] + method to use. Options are + 'auto' : attempt to choose the best method for the current problem + 'FW' : Floyd-Warshall algorithm. O[N^3] + 'D' : Dijkstra's algorithm with Fibonacci heaps. O[(k+log(N))N^2] + distance : {'constant', 'data', 'affinity'}, optional (default: `None`) + Distances along kNN edges. 
+ 'constant' gives constant edge lengths. + 'data' gives distances in ambient data space. + 'affinity' gives distances as negative log affinities. + If `None`, defaults to 'data' for unweighted and 'affinity' for weighted graphs. + Returns + ------- + D : np.ndarray, float, shape = [N,N] + D[i,j] gives the shortest distance from point i to point j + along the graph. If no path exists, the distance is np.inf + Notes + ----- + Shortest paths with `distance='constant'` or `distance='data'` are only + available for unweighted graphs (`decay=None`); weighted graphs must + use `distance='affinity'` + """ + if distance is None: + distance = self._default_shortest_path_distance() + + self._check_shortest_path_distance(distance) + + if distance == 'constant': + D = self.K + elif distance == 'data': + D = sparse.coo_matrix(self.K) + D.data = np.sqrt(np.sum(( + self.data_nu[D.row] - self.data_nu[D.col])**2, axis=1)) + elif distance == 'affinity': + D = sparse.csr_matrix(self.K) + D.data = -1 * np.log(D.data) + else: + raise ValueError( + "Expected `distance` in ['constant', 'data', 'affinity']. " + "Got {}".format(distance)) + + P = graph_shortest_path(D, method=method) + # symmetrize for numerical error + P = (P + P.T) / 2 + # sklearn returns 0 if no path exists + P[np.where(P == 0)] = np.inf + # diagonal should actually be zero + P[(np.arange(P.shape[0]), np.arange(P.shape[0]))] = 0 + return P class PyGSPGraph(with_metaclass(abc.ABCMeta, pygsp.graphs.Graph, Base)): @@ -671,7 +864,7 @@ def __init__(self, lap_type='combinatorial', coords=None, plotting = {} W = self._build_weight_from_kernel(self.K) - super().__init__(W=W, + super().__init__(W, lap_type=lap_type, coords=coords, plotting=plotting, **kwargs) @@ -721,10 +914,25 @@ class DataGraph(with_metaclass(abc.ABCMeta, Data, BaseGraph)): data : array-like, shape=[n_samples,n_features] accepted types: `numpy.ndarray`, `scipy.sparse.spmatrix`. - n_pca : `int` or `None`, optional (default: `None`) + n_pca : {`int`, `None`, `bool`, 'auto'}, optional (default: `None`) number of PC dimensions to retain for graph building. - If `None`, uses the original data. + If n_pca in `[None,False,0]`, uses the original data. + If `True` then estimate using a singular value threshold Note: if data is sparse, uses SVD instead of PCA + TODO: should we subtract and store the mean? + + rank_threshold : `float`, 'auto', optional (default: 'auto') + threshold to use when estimating rank for + `n_pca in [True, 'auto']`. + Note that the default kwarg is `None` for this parameter. + It is subsequently parsed to 'auto' if necessary. + If 'auto', this threshold is + smax * np.finfo(data.dtype).eps * max(data.shape) + where smax is the maximum singular value of the data matrix. + For reference, see, e.g. + W. Press, S. Teukolsky, W. Vetterling and B. Flannery, + “Numerical Recipes (3rd edition)”, + Cambridge University Press, 2007, page 795. random_state : `int` or `None`, optional (default: `None`) Random state for random PCA and graph building diff --git a/graphtools/graphs.py b/graphtools/graphs.py index 157da60..2d4c035 100644 --- a/graphtools/graphs.py +++ b/graphtools/graphs.py @@ -5,7 +5,6 @@ from sklearn.utils.extmath import randomized_svd from sklearn.preprocessing import normalize from sklearn.cluster import MiniBatchKMeans -from sklearn.utils.graph import graph_shortest_path from scipy.spatial.distance import pdist, cdist from scipy.spatial.distance import squareform from scipy import sparse @@ -13,8 +12,7 @@ import warnings import tasklogger -from .utils import (set_diagonal, - set_submatrix) +from . import utils from .base import DataGraph, PyGSPGraph @@ -88,7 +86,7 @@ def __init__(self, data, knn=5, decay=None, "n_samples ({n}). 
Setting knn={n}".format( k=knn, n=data.shape[0] - 2)) knn = data.shape[0] - 2 - if n_pca is None and data.shape[1] > 500: + if n_pca in [None,0,False] and data.shape[1] > 500: warnings.warn("Building a kNNGraph on data of shape {} is " "expensive. Consider setting n_pca.".format( data.shape), UserWarning) @@ -313,6 +311,9 @@ def build_kernel_to_data(self, Y, knn=None, bandwidth=None, bandwidth = bandwidth * bandwidth_scale + # check for zero bandwidth + bandwidth = np.maximum(bandwidth, np.finfo(float).eps) + radius = bandwidth * np.power(-1 * np.log(self.thresh), 1 / self.decay) update_idx = np.argwhere( @@ -375,40 +376,6 @@ def build_kernel_to_data(self, Y, knn=None, bandwidth=None, tasklogger.log_complete("affinities") return K - def shortest_path(self, method='auto'): - """ - Find the length of the shortest path between every pair of vertices on the graph - - Parameters - ---------- - method : string ['auto'|'FW'|'D'] - method to use. Options are - 'auto' : attempt to choose the best method for the current problem - 'FW' : Floyd-Warshall algorithm. O[N^3] - 'D' : Dijkstra's algorithm with Fibonacci stacks. O[(k+log(N))N^2] - Returns - ------- - D : np.ndarray, float, shape = [N,N] - D[i,j] gives the shortest distance from point i to point j - along the graph. If no path exists, the distance is np.inf - Notes - ----- - Currently, shortest paths can only be calculated on kNNGraphs with - `decay=None` - """ - if self.decay is None: - D = self.K - else: - raise NotImplementedError( - "Graph shortest path currently only " - "implemented for kNNGraph with `decay=None`.") - P = graph_shortest_path(D, method=method) - # sklearn returns 0 if no path exists - P[np.where(P == 0)] = np.inf - # diagonal should actually be zero - P[(np.arange(P.shape[0]), np.arange(P.shape[0]))] = 0 - return P - class LandmarkGraph(DataGraph): """Landmark graph @@ -716,7 +683,7 @@ class TraditionalGraph(DataGraph): knn : `int`, optional (default: 5) Number of nearest neighbors (including self) to use to build the graph - decay : `int` or `None`, optional (default: `None`) + decay : `int` or `None`, optional (default: 40) Rate of alpha decay to use. If `None`, alpha decay is not used. bandwidth : `float`, list-like,`callable`, or `None`, optional (default: `None`) @@ -733,11 +700,25 @@ class TraditionalGraph(DataGraph): distance metric for building kNN graph. TODO: actually sklearn.neighbors has even more choices - n_pca : `int` or `None`, optional (default: `None`) + n_pca : {`int`, `None`, `bool`, 'auto'}, optional (default: `None`) number of PC dimensions to retain for graph building. - If `None`, uses the original data. - Note: if data is sparse, uses SVD instead of PCA. - Only one of `precomputed` and `n_pca` can be set. + If n_pca in `[None,False,0]`, uses the original data. + If `True` then estimate using a singular value threshold + Note: if data is sparse, uses SVD instead of PCA + TODO: should we subtract and store the mean? + + rank_threshold : `float`, 'auto', optional (default: 'auto') + threshold to use when estimating rank for + `n_pca in [True, 'auto']`. + Note that the default kwarg is `None` for this parameter. + It is subsequently parsed to 'auto' if necessary. + If 'auto', this threshold is + smax * np.finfo(data.dtype).eps * max(data.shape) + where smax is the maximum singular value of the data matrix. + For reference, see, e.g. + W. Press, S. Teukolsky, W. Vetterling and B. Flannery, + “Numerical Recipes (3rd edition)”, + Cambridge University Press, 2007, page 795. 
thresh : `float`, optional (default: `1e-4`) Threshold above which to calculate alpha decay kernel. @@ -752,7 +733,7 @@ class TraditionalGraph(DataGraph): """ def __init__(self, data, - knn=5, decay=10, + knn=5, decay=40, bandwidth=None, bandwidth_scale=1.0, distance='euclidean', @@ -764,7 +745,7 @@ def __init__(self, data, raise ValueError( "`decay` must be provided for a " "TraditionalGraph. For kNN kernel, use kNNGraph.") - if precomputed is not None and n_pca is not None: + if precomputed is not None and n_pca not in [None,0,False]: # the data itself is a matrix of distances / affinities n_pca = None warnings.warn("n_pca cannot be given on a precomputed graph." @@ -890,7 +871,7 @@ def build_kernel(self): not (isinstance(K, sparse.dok_matrix) or isinstance(K, sparse.lil_matrix)): K = K.tolil() - K = set_diagonal(K, 1) + K = utils.set_diagonal(K, 1) else: tasklogger.log_start("affinities") if sparse.issparse(self.data_nu): @@ -1002,6 +983,31 @@ def build_kernel_to_data(self, Y, knn=None, bandwidth=None, bandwidth_scale=None tasklogger.log_complete("affinities") return K + @property + def weighted(self): + if self.precomputed is not None: + return not utils.nonzero_discrete(self.K, [0.5, 1]) + else: + return super().weighted + + def _check_shortest_path_distance(self, distance): + if self.precomputed is not None: + if distance == 'data': + raise ValueError( + "Graph shortest path with data distance not " + "valid for precomputed graphs. For precomputed graphs, " + "use `distance='constant'` for unweighted graphs and " + "`distance='affinity'` for weighted graphs.") + super()._check_shortest_path_distance(distance) + + def _default_shortest_path_distance(self): + if self.precomputed is not None and not self.weighted: + distance = 'constant' + tasklogger.log_info("Using constant distances.") + else: + distance = super()._default_shortest_path_distance() + return distance + class MNNGraph(DataGraph): """Mutual nearest neighbors graph @@ -1071,7 +1077,8 @@ def __init__(self, data, sample_idx, super().__init__(data, n_pca=n_pca, **kwargs) def _check_symmetrization(self, kernel_symm, theta): - if kernel_symm == 'theta' and theta is not None and \ + if (kernel_symm == 'theta' or kernel_symm == 'mnn') \ + and theta is not None and \ not isinstance(theta, numbers.Number): raise TypeError("Expected `theta` as a float. 
" "Got {}.".format(type(theta))) @@ -1181,8 +1188,9 @@ def build_kernel(self): else: K = np.zeros([self.data_nu.shape[0], self.data_nu.shape[0]]) for i, X in enumerate(self.subgraphs): - K = set_submatrix(K, self.sample_idx == self.samples[i], - self.sample_idx == self.samples[i], X.K) + K = utils.set_submatrix( + K, self.sample_idx == self.samples[i], + self.sample_idx == self.samples[i], X.K) within_batch_norm = np.array(np.sum(X.K, 1)).flatten() for j, Y in enumerate(self.subgraphs): if i == j: @@ -1200,8 +1208,9 @@ def build_kernel(self): Kij = Kij.multiply(scale[:, None]) else: Kij = Kij * scale[:, None] - K = set_submatrix(K, self.sample_idx == self.samples[i], - self.sample_idx == self.samples[j], Kij) + K = utils.set_submatrix( + K, self.sample_idx == self.samples[i], + self.sample_idx == self.samples[j], Kij) tasklogger.log_complete( "kernel from sample {} to {}".format(self.samples[i], self.samples[j])) diff --git a/graphtools/utils.py b/graphtools/utils.py index 4f1df04..db511ad 100644 --- a/graphtools/utils.py +++ b/graphtools/utils.py @@ -1,5 +1,6 @@ import numpy as np from scipy import sparse +import numbers def if_sparse(sparse_func, dense_func, *args, **kwargs): @@ -49,7 +50,31 @@ def set_submatrix(X, i, j, values): return X -def to_dense(X): +def sparse_nonzero_discrete(X, values): + if isinstance(X, (sparse.bsr_matrix, sparse.dia_matrix, + sparse.dok_matrix, sparse.lil_matrix)): + X = X.tocsr() + return dense_nonzero_discrete(X.data, values) + + +def dense_nonzero_discrete(X, values): + result = np.full_like(X, False, dtype=bool) + for value in values: + result = np.logical_or(result, X == value) + return np.all(result) + + +def nonzero_discrete(X, values): + if isinstance(values, numbers.Number): + values = [values] + if 0 not in values: + values.append(0) + return if_sparse(sparse_nonzero_discrete, dense_nonzero_discrete, X, values=values) + + +def to_array(X): if sparse.issparse(X): X = X.toarray() + elif isinstance(X, np.matrix): + X = X.A return X diff --git a/graphtools/version.py b/graphtools/version.py index 6849410..67bc602 100644 --- a/graphtools/version.py +++ b/graphtools/version.py @@ -1 +1 @@ -__version__ = "1.1.0" +__version__ = "1.3.0" diff --git a/setup.py b/setup.py index fe3176f..0d21022 100644 --- a/setup.py +++ b/setup.py @@ -12,11 +12,13 @@ ] test_requires = [ + 'nose', 'nose2', 'pandas', 'coverage', 'coveralls', - 'python-igraph' + 'python-igraph', + 'parameterized' ] if sys.version_info[0] == 3: @@ -41,8 +43,8 @@ setup(name='graphtools', version=version, description='graphtools', - author='Jay Stanley and Scott Gigante, Krishnaswamy Lab, Yale University', - author_email='jay.stanley@yale.edu', + author='Scott Gigante, Daniel Burkhardt, and Jay Stanley, Yale University', + author_email='scott.gigante@yale.edu', packages=['graphtools', ], license='GNU General Public License Version 2', install_requires=install_requires, diff --git a/test/load_tests/__init__.py b/test/load_tests/__init__.py index ce12fee..8105a4a 100644 --- a/test/load_tests/__init__.py +++ b/test/load_tests/__init__.py @@ -33,6 +33,14 @@ def ignore_igraph_warning(): message="The SafeConfigParser class has been renamed to ConfigParser " "in Python 3.2. This alias will be removed in future versions. 
Use " "ConfigParser directly instead") + warnings.filterwarnings( + "ignore", category=DeprecationWarning, + message="Using or importing the ABCs from 'collections' instead of from " + "'collections.abc' is deprecated since Python 3.3,and in 3.9 it will stop working") + warnings.filterwarnings( + "ignore", category=DeprecationWarning, + message="Using or importing the ABCs from 'collections' instead of from " + "'collections.abc' is deprecated, and in 3.8 it will stop working") def ignore_joblib_warning(): diff --git a/test/test_data.py b/test/test_data.py index aed139c..f809478 100644 --- a/test/test_data.py +++ b/test/test_data.py @@ -13,6 +13,7 @@ squareform, pdist, ) +import numbers import warnings try: @@ -39,13 +40,101 @@ def test_3d_data(): build_graph(data[:, :, None]) +def test_0_n_pca(): + assert build_graph(data, n_pca=0).n_pca is None + assert build_graph(data, n_pca=False).n_pca is None + + +@raises(ValueError) +def test_badstring_n_pca(): + build_graph(data, n_pca='foobar') + + +@raises(ValueError) +def test_uncastable_n_pca(): + build_graph(data, n_pca=[]) + + +@raises(ValueError) +def test_negative_n_pca(): + build_graph(data, n_pca=-1) + + +@raises(ValueError) +def test_badstring_rank_threshold(): + build_graph(data, n_pca=True, rank_threshold='foobar') + + +@raises(ValueError) +def test_negative_rank_threshold(): + build_graph(data, n_pca=True, rank_threshold=-1) + + +@raises(ValueError) +@warns(RuntimeWarning) +def test_True_n_pca_large_threshold(): + build_graph(data, n_pca=True, + rank_threshold=np.linalg.norm(data)**2) + + +@warns(RuntimeWarning) +def test_invalid_threshold1(): + assert build_graph(data, n_pca=10, rank_threshold=-1).n_pca == 10 + + +@raises(ValueError) +def test_invalid_threshold2(): + build_graph(data, n_pca=True, rank_threshold=-1) + + +@raises(ValueError) +def test_invalid_threshold3(): + build_graph(data, n_pca=True, rank_threshold=[]) + + +def test_True_n_pca(): + assert isinstance(build_graph(data, n_pca=True).n_pca, numbers.Number) + + +def test_True_n_pca_manual_rank_threshold(): + g = build_graph(data, n_pca=True, + rank_threshold=0.1) + assert isinstance(g.n_pca, numbers.Number) + assert isinstance(g.rank_threshold, numbers.Number) + + +def test_True_n_pca_auto_rank_threshold(): + g = build_graph(data, n_pca=True, + rank_threshold='auto') + assert isinstance(g.n_pca, numbers.Number) + assert isinstance(g.rank_threshold, numbers.Number) + next_threshold = np.sort(g.data_pca.singular_values_)[2] + g2 = build_graph(data, n_pca=True, rank_threshold=next_threshold) + assert g.n_pca > g2.n_pca + + +def test_goodstring_rank_threshold(): + build_graph(data, n_pca=True, rank_threshold='auto') + build_graph(data, n_pca=True, rank_threshold='AUTO') + + +def test_string_n_pca(): + build_graph(data, n_pca='auto') + build_graph(data, n_pca='AUTO') + + +@warns(RuntimeWarning) +def test_fractional_n_pca(): + build_graph(data, n_pca=1.5) + + @warns(RuntimeWarning) def test_too_many_n_pca(): build_graph(data, n_pca=data.shape[1]) @warns(RuntimeWarning) -def test_too_many_n_pca(): +def test_too_many_n_pca2(): build_graph(data[:data.shape[1] - 1], n_pca=data.shape[1] - 1) @@ -69,7 +158,11 @@ def test_pandas_dataframe(): def test_pandas_sparse_dataframe(): - G = build_graph(pd.SparseDataFrame(data)) + try: + X = pd.DataFrame(data).astype(pd.SparseDtype(float, fill_value=0)) + except AttributeError: + X = pd.SparseDataFrame(data, default_fill_value=0) + G = build_graph(X) assert isinstance(G, graphtools.base.BaseGraph) assert isinstance(G.data, sp.csr_matrix) @@ 
-192,6 +285,45 @@ def test_inverse_transform_sparse_no_pca(): sp.csr_matrix(G.data)[:, :15]) +##################################################### +# Check adaptive PCA with rank thresholding +##################################################### + + +def test_transform_adaptive_pca(): + G = build_graph(data, n_pca=True, random_state=42) + assert(np.all(G.data_nu == G.transform(G.data))) + assert_raises(ValueError, G.transform, G.data[:, 0]) + assert_raises(ValueError, G.transform, G.data[:, None, :15]) + assert_raises(ValueError, G.transform, G.data[:, :15]) + + G2 = build_graph(data, n_pca=True, + rank_threshold=G.rank_threshold, random_state=42) + assert(np.allclose(G2.data_nu, G2.transform(G2.data))) + assert(np.allclose(G2.data_nu, G.transform(G.data))) + + G3 = build_graph(data, n_pca=G2.n_pca, random_state=42) + + assert(np.allclose(G3.data_nu, G3.transform(G3.data))) + assert(np.allclose(G3.data_nu, G2.transform(G2.data))) + + +def test_transform_sparse_adaptive_pca(): + G = build_graph(data, sparse=True, n_pca=True, random_state=42) + assert(np.all(G.data_nu == G.transform(G.data))) + assert_raises(ValueError, G.transform, sp.csr_matrix(G.data)[:, 0]) + assert_raises(ValueError, G.transform, sp.csr_matrix(G.data)[:, :15]) + + G2 = build_graph(data, sparse=True, n_pca=True, + rank_threshold=G.rank_threshold, random_state=42) + assert(np.allclose(G2.data_nu, G2.transform(G2.data))) + assert(np.allclose(G2.data_nu, G.transform(G.data))) + + G3 = build_graph(data, sparse=True, n_pca=G2.n_pca, random_state=42) + assert(np.allclose(G3.data_nu, G3.transform(G3.data))) + assert(np.allclose(G3.data_nu, G2.transform(G2.data))) + + ############# # Test API ############# diff --git a/test/test_exact.py b/test/test_exact.py index 5373bcd..8dfbc66 100644 --- a/test/test_exact.py +++ b/test/test_exact.py @@ -1,4 +1,5 @@ from __future__ import print_function +from sklearn.utils.graph import graph_shortest_path from load_tests import ( graphtools, np, @@ -452,6 +453,74 @@ def test_exact_graph_anisotropy(): decay=a, knn=k - 1, random_state=42, use_pygsp=True, anisotropy='invalid') +##################################################### +# Check extra functionality +##################################################### + + +def test_shortest_path_affinity(): + data_small = data[np.random.choice( + len(data), len(data) // 4, replace=False)] + G = build_graph(data_small, knn=5, decay=15) + D = -1 * np.where(G.K != 0, np.log(np.where(G.K != 0, G.K, np.nan)), 0) + P = graph_shortest_path(D) + # sklearn returns 0 if no path exists + P[np.where(P == 0)] = np.inf + # diagonal should actually be zero + np.fill_diagonal(P, 0) + np.testing.assert_allclose(P, G.shortest_path(distance='affinity')) + np.testing.assert_allclose(P, G.shortest_path()) + + +def test_shortest_path_affinity_precomputed(): + data_small = data[np.random.choice( + len(data), len(data) // 4, replace=False)] + G = build_graph(data_small, knn=5, decay=15) + G = graphtools.Graph(G.K, precomputed='affinity') + D = -1 * np.where(G.K != 0, np.log(np.where(G.K != 0, G.K, np.nan)), 0) + P = graph_shortest_path(D) + # sklearn returns 0 if no path exists + P[np.where(P == 0)] = np.inf + # diagonal should actually be zero + np.fill_diagonal(P, 0) + np.testing.assert_allclose(P, G.shortest_path(distance='affinity')) + np.testing.assert_allclose(P, G.shortest_path()) + + +@raises(NotImplementedError) +def test_shortest_path_decay_constant(): + data_small = data[np.random.choice( + len(data), len(data) // 4, replace=False)] + G = 
build_graph(data_small, knn=5, decay=15) + G.shortest_path(distance='constant') + + +@raises(NotImplementedError) +def test_shortest_path_precomputed_decay_constant(): + data_small = data[np.random.choice( + len(data), len(data) // 4, replace=False)] + G = build_graph(data_small, knn=5, decay=15) + G = graphtools.Graph(G.K, precomputed='affinity') + G.shortest_path(distance='constant') + + +@raises(NotImplementedError) +def test_shortest_path_decay_data(): + data_small = data[np.random.choice( + len(data), len(data) // 4, replace=False)] + G = build_graph(data_small, knn=5, decay=15) + G.shortest_path(distance='data') + + +@raises(ValueError) +def test_shortest_path_precomputed_data(): + data_small = data[np.random.choice( + len(data), len(data) // 4, replace=False)] + G = build_graph(data_small, knn=5, decay=15) + G = graphtools.Graph(G.K, precomputed='affinity') + G.shortest_path(distance='data') + + ##################################################### # Check interpolation ##################################################### diff --git a/test/test_knn.py b/test/test_knn.py index 998f635..52e0223 100644 --- a/test/test_knn.py +++ b/test/test_knn.py @@ -1,19 +1,17 @@ from __future__ import print_function, division from sklearn.utils.graph import graph_shortest_path +from scipy.spatial.distance import pdist, squareform from load_tests import ( graphtools, np, sp, pygsp, - nose2, data, datasets, build_graph, assert_raises, warns, raises, - squareform, - pdist, PCA, TruncatedSVD, ) @@ -316,25 +314,78 @@ def test_knn_interpolate(): ################################################# -def test_shortest_path(): +def test_shortest_path_constant(): data_small = data[np.random.choice( len(data), len(data) // 4, replace=False)] G = build_graph(data_small, knn=5, decay=None) - K = G.K P = graph_shortest_path(G.K) # sklearn returns 0 if no path exists P[np.where(P == 0)] = np.inf # diagonal should actually be zero np.fill_diagonal(P, 0) + np.testing.assert_equal(P, G.shortest_path(distance='constant')) + + +def test_shortest_path_precomputed_constant(): + data_small = data[np.random.choice( + len(data), len(data) // 4, replace=False)] + G = build_graph(data_small, knn=5, decay=None) + G = graphtools.Graph(G.K, precomputed='affinity') + P = graph_shortest_path(G.K) + # sklearn returns 0 if no path exists + P[np.where(P == 0)] = np.inf + # diagonal should actually be zero + np.fill_diagonal(P, 0) + np.testing.assert_equal(P, G.shortest_path(distance='constant')) np.testing.assert_equal(P, G.shortest_path()) -@raises(NotImplementedError) -def test_shortest_path_decay(): +def test_shortest_path_data(): + data_small = data[np.random.choice( + len(data), len(data) // 4, replace=False)] + G = build_graph(data_small, knn=5, decay=None) + D = squareform(pdist(G.data_nu)) * np.where(G.K.toarray() > 0, 1, 0) + P = graph_shortest_path(D) + # sklearn returns 0 if no path exists + P[np.where(P == 0)] = np.inf + # diagonal should actually be zero + np.fill_diagonal(P, 0) + np.testing.assert_allclose(P, G.shortest_path(distance='data')) + np.testing.assert_allclose(P, G.shortest_path()) + + +@raises(ValueError) +def test_shortest_path_no_decay_affinity(): + data_small = data[np.random.choice( + len(data), len(data) // 4, replace=False)] + G = build_graph(data_small, knn=5, decay=None) + G.shortest_path(distance='affinity') + + +@raises(ValueError) +def test_shortest_path_precomputed_no_decay_affinity(): + data_small = data[np.random.choice( + len(data), len(data) // 4, replace=False)] + G = build_graph(data_small, 
knn=5, decay=None) + G = graphtools.Graph(G.K, precomputed='affinity') + G.shortest_path(distance='affinity') + + +@raises(ValueError) +def test_shortest_path_precomputed_no_decay_data(): + data_small = data[np.random.choice( + len(data), len(data) // 4, replace=False)] + G = build_graph(data_small, knn=5, decay=None) + G = graphtools.Graph(G.K, precomputed='affinity') + G.shortest_path(distance='data') + + +@raises(ValueError) +def test_shortest_path_invalid(): data_small = data[np.random.choice( len(data), len(data) // 4, replace=False)] - G = build_graph(data_small, knn=5, decay=15, thresh=1e-4) - G.shortest_path() + G = build_graph(data_small, knn=5, decay=None) + G.shortest_path(distance='invalid') #################### diff --git a/test/test_mnn.py b/test/test_mnn.py index 3cbdb2c..0e91aa4 100644 --- a/test/test_mnn.py +++ b/test/test_mnn.py @@ -59,7 +59,7 @@ def test_mnn_with_matrix_theta(): data, thresh=0, n_pca=20, decay=10, knn=5, random_state=42, sample_idx=digits['target'], - kernel_symm='theta', + kernel_symm='mnn', theta=np.tile(np.linspace(0, 1, n_sample), n_sample).reshape(n_sample, n_sample)) @@ -72,7 +72,7 @@ def test_mnn_with_vector_theta(): data, thresh=0, n_pca=20, decay=10, knn=5, random_state=42, sample_idx=digits['target'], - kernel_symm='theta', + kernel_symm='mnn', theta=np.linspace(0, 1, n_sample - 1)) @@ -82,7 +82,7 @@ def test_mnn_with_unbounded_theta(): data, thresh=0, n_pca=20, decay=10, knn=5, random_state=42, sample_idx=digits['target'], - kernel_symm='theta', + kernel_symm='mnn', theta=2) @@ -92,7 +92,7 @@ def test_mnn_with_string_theta(): data, thresh=0, n_pca=20, decay=10, knn=5, random_state=42, sample_idx=digits['target'], - kernel_symm='theta', + kernel_symm='mnn', theta='invalid') @@ -102,7 +102,7 @@ def test_mnn_with_gamma(): data, thresh=0, n_pca=20, decay=10, knn=5, random_state=42, sample_idx=digits['target'], - kernel_symm='theta', + kernel_symm='mnn', gamma=0.9) @@ -116,6 +116,26 @@ def test_mnn_with_kernel_symm_gamma(): theta=0.9) +@raises(ValueError) +def test_mnn_with_kernel_symm_invalid(): + build_graph( + data, thresh=0, n_pca=20, + decay=10, knn=5, random_state=42, + sample_idx=digits['target'], + kernel_symm='invalid', + theta=0.9) + + +@warns(FutureWarning) +def test_mnn_with_kernel_symm_theta(): + build_graph( + data, thresh=0, n_pca=20, + decay=10, knn=5, random_state=42, + sample_idx=digits['target'], + kernel_symm='theta', + theta=0.9) + + @warns(UserWarning) def test_mnn_with_theta_and_kernel_symm_not_theta(): build_graph( @@ -132,7 +152,7 @@ def test_mnn_with_kernel_symmm_theta_and_no_theta(): data, thresh=0, n_pca=20, decay=10, knn=5, random_state=42, sample_idx=digits['target'], - kernel_symm='theta') + kernel_symm='mnn') @warns(DeprecationWarning) @@ -141,18 +161,18 @@ def test_mnn_adaptive_k(): data, thresh=0, n_pca=20, decay=10, knn=5, random_state=42, sample_idx=digits['target'], - kernel_symm='theta', + kernel_symm='mnn', theta=0.9, adaptive_k='sqrt') def test_mnn_with_non_zero_indexed_sample_idx(): X, sample_idx = generate_swiss_roll() G = build_graph(X, sample_idx=sample_idx, - kernel_symm='theta', theta=0.5, + kernel_symm='mnn', theta=0.5, n_pca=None, use_pygsp=True) sample_idx += 1 G2 = build_graph(X, sample_idx=sample_idx, - kernel_symm='theta', theta=0.5, + kernel_symm='mnn', theta=0.5, n_pca=None, use_pygsp=True) assert G.N == G2.N assert np.all(G.d == G2.d) @@ -164,11 +184,11 @@ def test_mnn_with_non_zero_indexed_sample_idx(): def test_mnn_with_string_sample_idx(): X, sample_idx = generate_swiss_roll() G = 
build_graph(X, sample_idx=sample_idx, - kernel_symm='theta', theta=0.5, + kernel_symm='mnn', theta=0.5, n_pca=None, use_pygsp=True) sample_idx = np.where(sample_idx == 0, 'a', 'b') G2 = build_graph(X, sample_idx=sample_idx, - kernel_symm='theta', theta=0.5, + kernel_symm='mnn', theta=0.5, n_pca=None, use_pygsp=True) assert G.N == G2.N assert np.all(G.d == G2.d) @@ -230,7 +250,7 @@ def test_mnn_graph_no_decay(): np.fill_diagonal(W, 0) G = pygsp.graphs.Graph(W) G2 = graphtools.Graph(X, knn=k, decay=a, beta=beta, - kernel_symm='theta', theta=theta, + kernel_symm='mnn', theta=theta, distance=metric, sample_idx=sample_idx, thresh=0, use_pygsp=True) assert G.N == G2.N @@ -289,7 +309,7 @@ def test_mnn_graph_decay(): np.fill_diagonal(W, 0) G = pygsp.graphs.Graph(W) G2 = graphtools.Graph(X, knn=k, decay=a, beta=beta, - kernel_symm='theta', theta=theta, + kernel_symm='mnn', theta=theta, distance=metric, sample_idx=sample_idx, thresh=0, use_pygsp=True) assert G.N == G2.N @@ -310,20 +330,20 @@ def test_verbose(): print() print("Verbose test: MNN") build_graph(X, sample_idx=sample_idx, - kernel_symm='theta', theta=0.5, + kernel_symm='mnn', theta=0.5, n_pca=None, verbose=True) def test_set_params(): X, sample_idx = generate_swiss_roll() G = build_graph(X, sample_idx=sample_idx, - kernel_symm='theta', theta=0.5, + kernel_symm='mnn', theta=0.5, n_pca=None, thresh=1e-4) assert G.get_params() == { 'n_pca': None, 'random_state': 42, - 'kernel_symm': 'theta', + 'kernel_symm': 'mnn', 'theta': 0.5, 'anisotropy': 0, 'beta': 1, diff --git a/test/test_utils.py b/test/test_utils.py new file mode 100644 index 0000000..d26c612 --- /dev/null +++ b/test/test_utils.py @@ -0,0 +1,30 @@ +import graphtools.utils +from parameterized import parameterized +from scipy import sparse +import numpy as np +import graphtools +from load_tests import data + + +@parameterized( + [(np.array,), (sparse.csr_matrix,), (sparse.csc_matrix,), + (sparse.bsr_matrix,), (sparse.lil_matrix,), (sparse.coo_matrix,)]) +def test_nonzero_discrete(matrix_class): + X = np.random.choice([0, 1, 2], p=[0.95, 0.025, 0.025], size=(100, 100)) + X = matrix_class(X) + assert graphtools.utils.nonzero_discrete(X, [1, 2]) + assert not graphtools.utils.nonzero_discrete(X, [1, 3]) + + +@parameterized( + [(0,), (1e-4,)]) +def test_nonzero_discrete_knngraph(thresh): + G = graphtools.Graph(data, n_pca=10, knn=5, decay=None, thresh=thresh) + assert graphtools.utils.nonzero_discrete(G.K, [0.5, 1]) + + +@parameterized( + [(0,), (1e-4,)]) +def test_nonzero_discrete_decay_graph(thresh): + G = graphtools.Graph(data, n_pca=10, knn=5, decay=15, thresh=thresh) + assert not graphtools.utils.nonzero_discrete(G.K, [0.5, 1])
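
The adaptive PCA path added in this patch can be exercised end to end. A minimal usage sketch, assuming the patched `graphtools` is installed; the low-rank matrix `X` is purely illustrative:

    import numpy as np
    import graphtools

    # a 200 x 50 matrix of rank at most 10
    rng = np.random.default_rng(42)
    X = rng.normal(size=(200, 10)) @ rng.normal(size=(10, 50))

    # n_pca='auto' (or n_pca=True) triggers rank estimation;
    # rank_threshold is parsed to 'auto' when not supplied
    G = graphtools.Graph(X, n_pca='auto')
    print(G.n_pca)           # estimated rank, close to 10 here
    print(G.rank_threshold)  # numeric threshold resolved from 'auto'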
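The 'auto' threshold documented above, s_max * eps * max(n_samples, n_features) [press2007]_, can be cross-checked against a plain SVD. A sketch continuing from the previous snippet:

    s = np.linalg.svd(X, compute_uv=False)                      # singular values
    threshold = s.max() * np.finfo(X.dtype).eps * max(X.shape)
    rank = int(np.sum(s >= threshold))                          # should agree with G.n_pca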
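The `kernel_symm` rename is backwards compatible: 'theta' (like the older 'gamma') is rewritten to 'mnn' with a FutureWarning. A sketch using a hypothetical two-batch `sample_idx`:

    batch = np.repeat([0, 1], 100)  # two hypothetical batches of 100 samples
    G_mnn = graphtools.Graph(X, sample_idx=batch, knn=5, decay=10,
                             kernel_symm='mnn', theta=0.9)    # new spelling
    G_old = graphtools.Graph(X, sample_idx=batch, knn=5, decay=10,
                             kernel_symm='theta', theta=0.9)  # FutureWarning, same behaviour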
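The reworked `shortest_path` picks its default edge length from the new `weighted` property: 'data' for unweighted graphs, 'affinity' for weighted ones, and 'constant' for precomputed unweighted graphs. A sketch of the permitted combinations:

    G_knn = graphtools.Graph(X, knn=5, decay=None)      # unweighted kNN graph
    D_hops = G_knn.shortest_path(distance='constant')   # lengths taken straight from the kNN kernel
    D_data = G_knn.shortest_path(distance='data')       # ambient distances (the default here)

    G_alpha = graphtools.Graph(X, knn=5, decay=15)      # weighted alpha-decay graph
    D_aff = G_alpha.shortest_path(distance='affinity')  # -log(affinity) lengths (the default here)
    # G_alpha.shortest_path(distance='data') raises NotImplementedError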
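For precomputed graphs, `weighted` falls back to the new `utils.nonzero_discrete` helper, mirroring the tests above: an additively symmetrized kNN kernel contains only values in {0, 0.5, 1}, while an alpha-decay kernel does not. A sketch reusing the graphs from the previous snippet:

    from graphtools.utils import nonzero_discrete

    assert not G_knn.weighted                         # decay=None -> unweighted
    assert nonzero_discrete(G_knn.K, [0.5, 1])
    assert G_alpha.weighted
    assert not nonzero_discrete(G_alpha.K, [0.5, 1])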
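The new pandas handling first tries the modern sparse accessor and only falls back to the removed `SparseDataFrame` on older pandas, as exercised in `test_pandas_sparse_dataframe`. A sketch; `fill_value=0` keeps the frame genuinely sparse:

    import pandas as pd

    try:
        X_sp = pd.DataFrame(X).astype(pd.SparseDtype(float, fill_value=0))  # pandas >= 0.25
    except AttributeError:
        X_sp = pd.SparseDataFrame(X, default_fill_value=0)                  # older pandas
    G_sp = graphtools.Graph(X_sp)  # converted internally via .sparse.to_coo()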