From c1e84f48af2fbfac27f3a808b6d42d9112c03d7e Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 3 Sep 2018 12:46:39 -0400 Subject: [PATCH 01/26] bump tasklogger version --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index c19f67d..6fed4f1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,4 @@ scipy>=1.1.0 pygsp>=>=0.5.1 scikit-learn>=0.19.1 future -tasklogger>=0.2 +tasklogger>=0.2.1 diff --git a/setup.py b/setup.py index fca28af..1394bf8 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ 'pygsp>=0.5.1', 'scikit-learn>=0.19.1', 'future', - 'tasklogger>=0.2', + 'tasklogger>=0.2.1', ] test_requires = [ From 74897db2e798741a7b1489d6f5ff55a5e160b9ba Mon Sep 17 00:00:00 2001 From: Daniel Burkhardt Date: Thu, 6 Sep 2018 10:31:51 -0400 Subject: [PATCH 02/26] gamma -> theta --- graphtools/api.py | 10 +++--- graphtools/base.py | 67 ++++++++++++++++++------------------ graphtools/graphs.py | 80 +++++++++++++++++++++---------------------- test/test_exact.py | 2 +- test/test_knn.py | 6 ++-- test/test_landmark.py | 2 +- test/test_mnn.py | 58 +++++++++++++++---------------- unittest.cfg | 2 +- 8 files changed, 113 insertions(+), 114 deletions(-) diff --git a/graphtools/api.py b/graphtools/api.py index ede4f39..ee2a55f 100644 --- a/graphtools/api.py +++ b/graphtools/api.py @@ -16,7 +16,7 @@ def Graph(data, distance='euclidean', thresh=1e-4, kernel_symm='+', - gamma=None, + theta=None, n_landmark=None, n_svd=100, beta=1, @@ -75,12 +75,12 @@ def Graph(data, Defines method of MNN symmetrization. '+' : additive '*' : multiplicative - 'gamma' : min-max + 'theta' : min-max 'none' : no symmetrization - gamma: float (default: None) - Min-max symmetrization constant or matrix. Only used if kernel_symm='gamma'. - K = `gamma * min(K, K.T) + (1 - gamma) * max(K, K.T)` + theta: float (default: None) + Min-max symmetrization constant or matrix. Only used if kernel_symm='theta'. + K = `theta * min(K, K.T) + (1 - theta) * max(K, K.T)` precomputed : {'distance', 'affinity', 'adjacency', `None`}, optional (default: `None`) If the graph is precomputed, this variable denotes which graph diff --git a/graphtools/base.py b/graphtools/base.py index edd678a..53e8fb1 100644 --- a/graphtools/base.py +++ b/graphtools/base.py @@ -27,7 +27,6 @@ elementwise_maximum, set_diagonal) - class Base(object): """Class that deals with key-word arguments but is otherwise just an object. @@ -311,12 +310,12 @@ class BaseGraph(with_metaclass(abc.ABCMeta, Base)): Defines method of MNN symmetrization. '+' : additive '*' : multiplicative - 'gamma' : min-max + 'theta' : min-max 'none' : no symmetrization - gamma: float (default: 0.5) + theta: float (default: 0.5) Min-max symmetrization constant. - K = `gamma * min(K, K.T) + (1 - gamma) * max(K, K.T)` + K = `theta * min(K, K.T) + (1 - theta) * max(K, K.T)` initialize : `bool`, optional (default : `True`) if false, don't create the kernel matrix. 
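The min-max ('theta') rule introduced by this rename is an elementwise blend of the kernel with its transpose. A minimal dense-NumPy sketch of the three symmetrization modes (illustration only, not the sparse-aware graphtools implementation)::

    import numpy as np

    def symmetrize(K, kernel_symm='+', theta=0.5):
        # '+'     : additive symmetrization
        # '*'     : multiplicative (elementwise) symmetrization
        # 'theta' : K = theta * min(K, K.T) + (1 - theta) * max(K, K.T)
        if kernel_symm == '+':
            return (K + K.T) / 2
        elif kernel_symm == '*':
            return K * K.T
        elif kernel_symm == 'theta':
            return theta * np.minimum(K, K.T) + (1 - theta) * np.maximum(K, K.T)
        return K  # None: leave the kernel asymmetric

    K = np.random.rand(5, 5)
    K_symm = symmetrize(K, kernel_symm='theta', theta=0.9)
    assert np.allclose(K_symm, K_symm.T)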
@@ -337,11 +336,11 @@ class BaseGraph(with_metaclass(abc.ABCMeta, Base)): """ def __init__(self, kernel_symm='+', - gamma=None, + theta=None, initialize=True, **kwargs): self.kernel_symm = kernel_symm - self.gamma = gamma - self._check_symmetrization(kernel_symm, gamma) + self.theta = theta + self._check_symmetrization(kernel_symm, theta) if initialize: tasklogger.log_debug("Initializing kernel...") @@ -350,25 +349,25 @@ def __init__(self, kernel_symm='+', tasklogger.log_debug("Not initializing kernel.") super().__init__(**kwargs) - def _check_symmetrization(self, kernel_symm, gamma): - if kernel_symm not in ['+', '*', 'gamma', None]: + def _check_symmetrization(self, kernel_symm, theta): + if kernel_symm not in ['+', '*', 'theta', None]: raise ValueError( "kernel_symm '{}' not recognized. Choose from " - "'+', '*', 'gamma', or 'none'.".format(kernel_symm)) - elif kernel_symm != 'gamma' and gamma is not None: - warnings.warn("kernel_symm='{}' but gamma is not None. " - "Setting kernel_symm='gamma'.".format(kernel_symm)) - self.kernel_symm = kernel_symm = 'gamma' - - if kernel_symm == 'gamma': - if gamma is None: - warnings.warn("kernel_symm='gamma' but gamma not given. " - "Defaulting to gamma=0.5.") - self.gamma = gamma = 0.5 - elif not isinstance(gamma, numbers.Number) or \ - gamma < 0 or gamma > 1: - raise ValueError("gamma {} not recognized. Expected " - "a float between 0 and 1".format(gamma)) + "'+', '*', 'theta', or 'none'.".format(kernel_symm)) + elif kernel_symm != 'theta' and theta is not None: + warnings.warn("kernel_symm='{}' but theta is not None. " + "Setting kernel_symm='theta'.".format(kernel_symm)) + self.kernel_symm = kernel_symm = 'theta' + + if kernel_symm == 'theta': + if theta is None: + warnings.warn("kernel_symm='theta' but theta not given. " + "Defaulting to theta=0.5.") + self.theta = theta = 0.5 + elif not isinstance(theta, numbers.Number) or \ + theta < 0 or theta > 1: + raise ValueError("theta {} not recognized. Expected " + "a float between 0 and 1".format(theta)) def _build_kernel(self): """Private method to build kernel matrix @@ -400,26 +399,26 @@ def symmetrize_kernel(self, K): elif self.kernel_symm == "*": tasklogger.log_debug("Using multiplication symmetrization.") K = K.multiply(K.T) - elif self.kernel_symm == 'gamma': + elif self.kernel_symm == 'theta': tasklogger.log_debug( - "Using gamma symmetrization (gamma = {}).".format(self.gamma)) - K = self.gamma * elementwise_minimum(K, K.T) + \ - (1 - self.gamma) * elementwise_maximum(K, K.T) + "Using theta symmetrization (theta = {}).".format(self.theta)) + K = self.theta * elementwise_minimum(K, K.T) + \ + (1 - self.theta) * elementwise_maximum(K, K.T) elif self.kernel_symm is None: tasklogger.log_debug("Using no symmetrization.") pass else: # this should never happen raise ValueError( - "Expected kernel_symm in ['+', '*', 'gamma' or None]. " - "Got {}".format(self.gamma)) + "Expected kernel_symm in ['+', '*', 'theta' or None]. 
" + "Got {}".format(self.theta)) return K def get_params(self): """Get parameters from this object """ return {'kernel_symm': self.kernel_symm, - 'gamma': self.gamma} + 'theta': self.theta} def set_params(self, **params): """Set parameters on this object @@ -429,7 +428,7 @@ def set_params(self, **params): Valid parameters: Invalid parameters: (these would require modifying the kernel matrix) - kernel_symm - - gamma + - theta Parameters ---------- @@ -439,8 +438,8 @@ def set_params(self, **params): ------- self """ - if 'gamma' in params and params['gamma'] != self.gamma: - raise ValueError("Cannot update gamma. Please create a new graph") + if 'theta' in params and params['theta'] != self.theta: + raise ValueError("Cannot update theta. Please create a new graph") if 'kernel_symm' in params and \ params['kernel_symm'] != self.kernel_symm: raise ValueError( diff --git a/graphtools/graphs.py b/graphtools/graphs.py index ce8f9ae..65699fd 100644 --- a/graphtools/graphs.py +++ b/graphtools/graphs.py @@ -886,33 +886,33 @@ def __init__(self, data, sample_idx, super().__init__(data, n_pca=n_pca, **kwargs) - def _check_symmetrization(self, kernel_symm, gamma): - if kernel_symm == 'gamma' and gamma is not None and \ - not isinstance(gamma, numbers.Number): - # matrix gamma + def _check_symmetrization(self, kernel_symm, theta): + if kernel_symm == 'theta' and theta is not None and \ + not isinstance(theta, numbers.Number): + # matrix theta try: - gamma.shape + theta.shape except AttributeError: - raise ValueError("gamma {} not recognized. " + raise ValueError("theta {} not recognized. " "Expected a float between 0 and 1 " "or a [n_batch,n_batch] matrix of " - "floats between 0 and 1".format(gamma)) - if not np.shape(gamma) == (len(self.samples), + "floats between 0 and 1".format(theta)) + if not np.shape(theta) == (len(self.samples), len(self.samples)): raise ValueError( - "Matrix gamma must be of shape " + "Matrix theta must be of shape " "({}), got ({})".format( (len(self.samples), - len(self.samples)), gamma.shape)) - elif np.max(gamma) > 1 or np.min(gamma) < 0: + len(self.samples)), theta.shape)) + elif np.max(theta) > 1 or np.min(theta) < 0: raise ValueError( - "Values in matrix gamma must be between" + "Values in matrix theta must be between" " 0 and 1, got values between {} and {}".format( - np.max(gamma), np.min(gamma))) - elif np.any(gamma != gamma.T): - raise ValueError("gamma must be a symmetric matrix") + np.max(theta), np.min(theta))) + elif np.any(theta != theta.T): + raise ValueError("theta must be a symmetric matrix") else: - super()._check_symmetrization(kernel_symm, gamma) + super()._check_symmetrization(kernel_symm, theta) def _weight_knn(self, sample_size=None): """Select adaptive values of knn @@ -1070,14 +1070,14 @@ def build_kernel(self): return K def symmetrize_kernel(self, K): - if self.kernel_symm == 'gamma' and self.gamma is not None and \ - not isinstance(self.gamma, numbers.Number): - # matrix gamma + if self.kernel_symm == 'theta' and self.theta is not None and \ + not isinstance(self.theta, numbers.Number): + # matrix theta # Gamma can be a matrix with specific values transitions for # each batch. This allows for technical replicates and # experimental samples to be corrected simultaneously - tasklogger.log_debug("Using gamma symmetrization. " - "Gamma:\n{}".format(self.gamma)) + tasklogger.log_debug("Using theta symmetrization. 
" + "Gamma:\n{}".format(self.theta)) for i, sample_i in enumerate(self.samples): for j, sample_j in enumerate(self.samples): if j < i: @@ -1086,9 +1086,9 @@ def symmetrize_kernel(self, K): self.sample_idx == sample_j)] Kji = K[np.ix_(self.sample_idx == sample_j, self.sample_idx == sample_i)] - Kij_symm = self.gamma[i, j] * \ + Kij_symm = self.theta[i, j] * \ elementwise_minimum(Kij, Kji.T) + \ - (1 - self.gamma[i, j]) * \ + (1 - self.theta[i, j]) * \ elementwise_maximum(Kij, Kji.T) K = set_submatrix(K, self.sample_idx == sample_i, self.sample_idx == sample_j, Kij_symm) @@ -1100,7 +1100,7 @@ def symmetrize_kernel(self, K): K = super().symmetrize_kernel(K) return K - def build_kernel_to_data(self, Y, gamma=None): + def build_kernel_to_data(self, Y, theta=None): """Build transition matrix from new data to the graph Creates a transition matrix such that `Y` can be approximated by @@ -1120,8 +1120,8 @@ def build_kernel_to_data(self, Y, gamma=None): to the existing data. `n_features` must match either the ambient or PCA dimensions - gamma : array-like or `None`, optional (default: `None`) - if `self.gamma` is a matrix, gamma values must be explicitly + theta : array-like or `None`, optional (default: `None`) + if `self.theta` is a matrix, theta values must be explicitly specified between `Y` and each sample in `self.data` Returns @@ -1131,15 +1131,15 @@ def build_kernel_to_data(self, Y, gamma=None): Transition matrix from `Y` to `self.data` """ raise NotImplementedError - tasklogger.log_warning("building MNN kernel to gamma is experimental") - if not isinstance(self.gamma, str) and \ - not isinstance(self.gamma, numbers.Number): - if gamma is None: + tasklogger.log_warning("building MNN kernel to theta is experimental") + if not isinstance(self.theta, str) and \ + not isinstance(self.theta, numbers.Number): + if theta is None: raise ValueError( - "self.gamma is a matrix but gamma is not provided.") - elif len(gamma) != len(self.samples): + "self.theta is a matrix but theta is not provided.") + elif len(theta) != len(self.samples): raise ValueError( - "gamma should have one value for every sample") + "theta should have one value for every sample") Y = self._check_extension_shape(Y) kernel_xy = [] @@ -1156,26 +1156,26 @@ def build_kernel_to_data(self, Y, gamma=None): kernel_yx = sparse.vstack(kernel_yx) # n_cells_x x n_cells_y # symmetrize - if gamma is not None: + if theta is not None: # Gamma can be a vector with specific values transitions for # each batch. 
This allows for technical replicates and # experimental samples to be corrected simultaneously K = np.empty_like(kernel_xy) for i, sample in enumerate(self.samples): sample_idx = self.sample_idx == sample - K[:, sample_idx] = gamma[i] * \ + K[:, sample_idx] = theta[i] * \ kernel_xy[:, sample_idx].minimum( kernel_yx[sample_idx, :].T) + \ - (1 - gamma[i]) * \ + (1 - theta[i]) * \ kernel_xy[:, sample_idx].maximum( kernel_yx[sample_idx, :].T) - if self.gamma == "+": + if self.theta == "+": K = (kernel_xy + kernel_yx.T) / 2 - elif self.gamma == "*": + elif self.theta == "*": K = kernel_xy.multiply(kernel_yx.T) else: - K = self.gamma * kernel_xy.minimum(kernel_yx.T) + \ - (1 - self.gamma) * kernel_xy.maximum(kernel_yx.T) + K = self.theta * kernel_xy.minimum(kernel_yx.T) + \ + (1 - self.theta) * kernel_xy.maximum(kernel_yx.T) return K diff --git a/test/test_exact.py b/test/test_exact.py index 542e625..c5ef987 100644 --- a/test/test_exact.py +++ b/test/test_exact.py @@ -348,7 +348,7 @@ def test_set_params(): assert G.get_params() == {'n_pca': 20, 'random_state': 42, 'kernel_symm': '+', - 'gamma': None, + 'theta': None, 'knn': 3, 'decay': 10, 'distance': 'euclidean', diff --git a/test/test_knn.py b/test/test_knn.py index b8682c2..e274308 100644 --- a/test/test_knn.py +++ b/test/test_knn.py @@ -184,7 +184,7 @@ def test_set_params(): 'n_pca': 20, 'random_state': 42, 'kernel_symm': '+', - 'gamma': None, + 'theta': None, 'knn': 3, 'decay': None, 'distance': 'euclidean', @@ -204,11 +204,11 @@ def test_set_params(): assert_raises(ValueError, G.set_params, decay=10) assert_raises(ValueError, G.set_params, distance='manhattan') assert_raises(ValueError, G.set_params, thresh=1e-3) - assert_raises(ValueError, G.set_params, gamma=0.99) + assert_raises(ValueError, G.set_params, theta=0.99) assert_raises(ValueError, G.set_params, kernel_symm='*') G.set_params(knn=G.knn, decay=G.decay, thresh=G.thresh, distance=G.distance, - gamma=G.gamma, + theta=G.theta, kernel_symm=G.kernel_symm) diff --git a/test/test_landmark.py b/test/test_landmark.py index 42d9025..0c09a3e 100644 --- a/test/test_landmark.py +++ b/test/test_landmark.py @@ -135,7 +135,7 @@ def test_set_params(): assert G.get_params() == {'n_pca': 20, 'random_state': 42, 'kernel_symm': '+', - 'gamma': None, + 'theta': None, 'n_landmark': 500, 'knn': 3, 'decay': None, diff --git a/test/test_mnn.py b/test/test_mnn.py index 330a4e4..529dffa 100644 --- a/test/test_mnn.py +++ b/test/test_mnn.py @@ -49,38 +49,38 @@ def test_build_mnn_with_precomputed(): @raises(ValueError) -def test_mnn_with_square_gamma_wrong_length(): +def test_mnn_with_square_theta_wrong_length(): n_sample = len(np.unique(digits['target'])) - # square matrix gamma of the wrong size + # square matrix theta of the wrong size build_graph( data, thresh=0, n_pca=20, decay=10, knn=5, random_state=42, sample_idx=digits['target'], - kernel_symm='gamma', - gamma=np.tile(np.linspace(0, 1, n_sample - 1), + kernel_symm='theta', + theta=np.tile(np.linspace(0, 1, n_sample - 1), n_sample).reshape(n_sample - 1, n_sample)) @raises(ValueError) -def test_mnn_with_vector_gamma(): +def test_mnn_with_vector_theta(): n_sample = len(np.unique(digits['target'])) - # vector gamma + # vector theta build_graph( data, thresh=0, n_pca=20, decay=10, knn=5, random_state=42, sample_idx=digits['target'], - kernel_symm='gamma', - gamma=np.linspace(0, 1, n_sample - 1)) + kernel_symm='theta', + theta=np.linspace(0, 1, n_sample - 1)) def test_mnn_with_non_zero_indexed_sample_idx(): X, sample_idx = generate_swiss_roll() G = 
build_graph(X, sample_idx=sample_idx, - kernel_symm='gamma', gamma=0.5, + kernel_symm='theta', theta=0.5, n_pca=None, use_pygsp=True) sample_idx += 1 G2 = build_graph(X, sample_idx=sample_idx, - kernel_symm='gamma', gamma=0.5, + kernel_symm='theta', theta=0.5, n_pca=None, use_pygsp=True) assert G.N == G2.N assert np.all(G.d == G2.d) @@ -92,11 +92,11 @@ def test_mnn_with_non_zero_indexed_sample_idx(): def test_mnn_with_string_sample_idx(): X, sample_idx = generate_swiss_roll() G = build_graph(X, sample_idx=sample_idx, - kernel_symm='gamma', gamma=0.5, + kernel_symm='theta', theta=0.5, n_pca=None, use_pygsp=True) sample_idx = np.where(sample_idx == 0, 'a', 'b') G2 = build_graph(X, sample_idx=sample_idx, - kernel_symm='gamma', gamma=0.5, + kernel_symm='theta', theta=0.5, n_pca=None, use_pygsp=True) assert G.N == G2.N assert np.all(G.d == G2.d) @@ -110,9 +110,9 @@ def test_mnn_with_string_sample_idx(): ##################################################### -def test_mnn_graph_float_gamma(): +def test_mnn_graph_float_theta(): X, sample_idx = generate_swiss_roll() - gamma = 0.9 + theta = 0.9 k = 10 a = 20 metric = 'euclidean' @@ -139,12 +139,12 @@ def test_mnn_graph_float_gamma(): # fill out values in K for NN on diagonal K.iloc[sample_idx == si, sample_idx == sj] = k_ij - W = np.array((gamma * np.minimum(K, K.T)) + - ((1 - gamma) * np.maximum(K, K.T))) + W = np.array((theta * np.minimum(K, K.T)) + + ((1 - theta) * np.maximum(K, K.T))) np.fill_diagonal(W, 0) G = pygsp.graphs.Graph(W) G2 = graphtools.Graph(X, knn=k + 1, decay=a, beta=1 - beta, - kernel_symm='gamma', gamma=gamma, + kernel_symm='theta', theta=theta, distance=metric, sample_idx=sample_idx, thresh=0, use_pygsp=True) assert G.N == G2.N @@ -154,10 +154,10 @@ def test_mnn_graph_float_gamma(): assert isinstance(G2, graphtools.graphs.MNNGraph) -def test_mnn_graph_matrix_gamma(): +def test_mnn_graph_matrix_theta(): X, sample_idx = generate_swiss_roll() bs = 0.8 - gamma = np.array([[1, bs], # 0 + theta = np.array([[1, bs], # 0 [bs, 1]]) # 3 k = 10 a = 20 @@ -187,18 +187,18 @@ def test_mnn_graph_matrix_gamma(): K = np.array(K) - matrix_gamma = pd.DataFrame(np.zeros((len(sample_idx), len(sample_idx)))) + matrix_theta = pd.DataFrame(np.zeros((len(sample_idx), len(sample_idx)))) for ix, si in enumerate(set(sample_idx)): for jx, sj in enumerate(set(sample_idx)): - matrix_gamma.iloc[sample_idx == si, - sample_idx == sj] = gamma[ix, jx] + matrix_theta.iloc[sample_idx == si, + sample_idx == sj] = theta[ix, jx] - W = np.array((matrix_gamma * np.minimum(K, K.T)) + - ((1 - matrix_gamma) * np.maximum(K, K.T))) + W = np.array((matrix_theta * np.minimum(K, K.T)) + + ((1 - matrix_theta) * np.maximum(K, K.T))) np.fill_diagonal(W, 0) G = pygsp.graphs.Graph(W) G2 = graphtools.Graph(X, knn=k + 1, decay=a, beta=1 - beta, - kernel_symm='gamma', gamma=gamma, + kernel_symm='theta', theta=theta, distance=metric, sample_idx=sample_idx, thresh=0, use_pygsp=True) assert G.N == G2.N @@ -220,21 +220,21 @@ def test_verbose(): print() print("Verbose test: MNN") build_graph(X, sample_idx=sample_idx, - kernel_symm='gamma', gamma=0.5, + kernel_symm='theta', theta=0.5, n_pca=None, verbose=True) def test_set_params(): X, sample_idx = generate_swiss_roll() G = build_graph(X, sample_idx=sample_idx, - kernel_symm='gamma', gamma=0.5, + kernel_symm='theta', theta=0.5, n_pca=None, thresh=1e-4) assert G.get_params() == { 'n_pca': None, 'random_state': 42, - 'kernel_symm': 'gamma', - 'gamma': 0.5, + 'kernel_symm': 'theta', + 'theta': 0.5, 'beta': 1, 'adaptive_k': 'sqrt', 'knn': 3, 
diff --git a/unittest.cfg b/unittest.cfg index 0f1a4ec..85c81ba 100644 --- a/unittest.cfg +++ b/unittest.cfg @@ -3,4 +3,4 @@ verbose = True [coverage] always-on = True -coverage = graphtools \ No newline at end of file +coverage = graphtools From 049e4a82b37e2f0338d37146f5a6084b4583c060 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 17 Sep 2018 22:24:50 -0400 Subject: [PATCH 03/26] implement numeric and vector fixed bandwidth --- graphtools/api.py | 1 + graphtools/graphs.py | 82 +++++++++++++++++++++++++++++++++++--------- test/test_exact.py | 43 +++++++++++++++++++++++ test/test_knn.py | 43 +++++++++++++++++++++++ 4 files changed, 152 insertions(+), 17 deletions(-) diff --git a/graphtools/api.py b/graphtools/api.py index ee2a55f..9a9af33 100644 --- a/graphtools/api.py +++ b/graphtools/api.py @@ -13,6 +13,7 @@ def Graph(data, precomputed=None, knn=5, decay=10, + bandwidth=None, distance='euclidean', thresh=1e-4, kernel_symm='+', diff --git a/graphtools/graphs.py b/graphtools/graphs.py index 65699fd..8b3c95c 100644 --- a/graphtools/graphs.py +++ b/graphtools/graphs.py @@ -35,6 +35,12 @@ class kNNGraph(DataGraph): decay : `int` or `None`, optional (default: `None`) Rate of alpha decay to use. If `None`, alpha decay is not used. + bandwidth : `float`, list-like or `None`, optional (default: `None`) + Fixed bandwidth to use. If given, overrides `knn`. Can be a single + bandwidth or a list-like (shape=[n_samples]) or bandwidths for each + sample. + TODO: implement `callable` bandwidth + distance : `str`, optional (default: `'euclidean'`) Any metric from `scipy.spatial.distance` can be used distance metric for building kNN graph. @@ -55,10 +61,11 @@ class kNNGraph(DataGraph): """ def __init__(self, data, knn=5, decay=None, - distance='euclidean', + bandwidth=None, distance='euclidean', thresh=1e-4, n_pca=None, **kwargs): self.knn = knn self.decay = decay + self.bandwidth = bandwidth self.distance = distance self.thresh = thresh @@ -82,6 +89,7 @@ def get_params(self): params = super().get_params() params.update({'knn': self.knn, 'decay': self.decay, + 'bandwidth': self.bandwidth, 'distance': self.distance, 'thresh': self.thresh, 'n_jobs': self.n_jobs, @@ -101,6 +109,7 @@ def set_params(self, **params): Invalid parameters: (these would require modifying the kernel matrix) - knn - decay + - bandwidth - distance - thresh @@ -116,6 +125,9 @@ def set_params(self, **params): raise ValueError("Cannot update knn. Please create a new graph") if 'decay' in params and params['decay'] != self.decay: raise ValueError("Cannot update decay. Please create a new graph") + if 'bandwidth' in params and params['bandwidth'] != self.bandwidth: + raise ValueError( + "Cannot update bandwidth. Please create a new graph") if 'distance' in params and params['distance'] != self.distance: raise ValueError("Cannot update distance. 
" "Please create a new graph") @@ -184,7 +196,7 @@ def build_kernel(self): K = self.build_kernel_to_data(self.data_nu) return K - def build_kernel_to_data(self, Y, knn=None): + def build_kernel_to_data(self, Y, knn=None, bandwidth=None): """Build a kernel from new input data `Y` to the `self.data` Parameters @@ -198,6 +210,9 @@ def build_kernel_to_data(self, Y, knn=None): knn : `int` or `None`, optional (default: `None`) If `None`, defaults to `self.knn` + bandwidth : `int` or `None`, optional (default: `None`) + If `None`, defaults to `self.bandwidth` + Returns ------- @@ -212,6 +227,8 @@ def build_kernel_to_data(self, Y, knn=None): """ if knn is None: knn = self.knn + if bandwidth is None: + bandwidth = self.bandwidth if knn > self.data.shape[0]: warnings.warn("Cannot set knn ({k}) to be greater than " "data.shape[0] ({n}). Setting knn={n}".format( @@ -247,7 +264,8 @@ def build_kernel_to_data(self, Y, knn=None): RuntimeWarning) tasklogger.log_complete("KNN search") tasklogger.log_start("affinities") - bandwidth = distances[:, knn - 1] + if bandwidth is None: + bandwidth = distances[:, knn - 1] radius = bandwidth * np.power(-1 * np.log(self.thresh), 1 / self.decay) update_idx = np.argwhere( @@ -266,8 +284,9 @@ def build_kernel_to_data(self, Y, knn=None): for i, idx in enumerate(update_idx): distances[idx] = dist_new[i] indices[idx] = ind_new[i] - update_idx = [i for i, d in enumerate(distances) - if np.max(d) < radius[i]] + update_idx = [i for i, d in enumerate(distances) if np.max(d) < + (radius if isinstance(bandwidth, numbers.Number) + else radius[i])] tasklogger.log_debug("search_knn = {}; {} remaining".format( search_knn, len(update_idx))) @@ -281,12 +300,18 @@ def build_kernel_to_data(self, Y, knn=None): # give up - radius search dist_new, ind_new = knn_tree.radius_neighbors( Y[update_idx, :], - radius=np.max(radius[update_idx])) + radius=radius + if isinstance(bandwidth, numbers.Number) + else np.max(radius[update_idx])) for i, idx in enumerate(update_idx): distances[idx] = dist_new[i] indices[idx] = ind_new[i] - data = np.concatenate([distances[i] / bandwidth[i] - for i in range(len(distances))]) + if isinstance(bandwidth, numbers.Number): + data = np.concatenate(distances) / bandwidth + else: + data = np.concatenate([distances[i] / bandwidth[i] + for i in range(len(distances))]) + indices = np.concatenate(indices) indptr = np.concatenate( [[0], np.cumsum([len(d) for d in distances])]) @@ -590,6 +615,12 @@ class TraditionalGraph(DataGraph): decay : `int` or `None`, optional (default: `None`) Rate of alpha decay to use. If `None`, alpha decay is not used. + bandwidth : `float`, list-like or `None`, optional (default: `None`) + Fixed bandwidth to use. If given, overrides `knn`. Can be a single + bandwidth or a list-like (shape=[n_samples]) or bandwidths for each + sample. + TODO: implement `callable` bandwidth + distance : `str`, optional (default: `'euclidean'`) Any metric from `scipy.spatial.distance` can be used distance metric for building kNN graph. @@ -613,8 +644,11 @@ class TraditionalGraph(DataGraph): Only one of `precomputed` and `n_pca` can be set. 
""" - def __init__(self, data, knn=5, decay=10, - distance='euclidean', n_pca=None, + def __init__(self, data, + knn=5, decay=10, + bandwidth=None, + distance='euclidean', + n_pca=None, thresh=1e-4, precomputed=None, **kwargs): if precomputed is not None and n_pca is not None: @@ -640,6 +674,7 @@ def __init__(self, data, knn=5, decay=10, "non-negative".format(precomputed)) self.knn = knn self.decay = decay + self.bandwidth = bandwidth self.distance = distance self.thresh = thresh self.precomputed = precomputed @@ -653,6 +688,7 @@ def get_params(self): params = super().get_params() params.update({'knn': self.knn, 'decay': self.decay, + 'bandwidth': self.bandwidth, 'distance': self.distance, 'precomputed': self.precomputed}) return params @@ -667,6 +703,7 @@ def set_params(self, **params): - distance - knn - decay + - bandwidth Parameters ---------- @@ -690,6 +727,10 @@ def set_params(self, **params): if 'decay' in params and params['decay'] != self.decay and \ self.precomputed is None: raise ValueError("Cannot update decay. Please create a new graph") + if 'bandwidth' in params and params['bandwidth'] != self.bandwidth and \ + self.precomputed is None: + raise ValueError( + "Cannot update bandwidth. Please create a new graph") # update superclass parameters super().set_params(**params) return self @@ -752,9 +793,12 @@ def build_kernel(self): "precomputed='{}' not recognized. " "Choose from ['affinity', 'adjacency', 'distance', " "None]".format(self.precomputed)) - knn_dist = np.partition(pdx, self.knn, axis=1)[:, :self.knn] - epsilon = np.max(knn_dist, axis=1) - pdx = (pdx.T / epsilon).T + if self.bandwidth is None: + knn_dist = np.partition(pdx, self.knn, axis=1)[:, :self.knn] + bandwidth = np.max(knn_dist, axis=1) + else: + bandwidth = self.bandwidth + pdx = (pdx.T / bandwidth).T K = np.exp(-1 * np.power(pdx, self.decay)) # handle nan K = np.where(np.isnan(K), 1, K) @@ -773,7 +817,7 @@ def build_kernel(self): K[K < self.thresh] = 0 return K - def build_kernel_to_data(self, Y, knn=None): + def build_kernel_to_data(self, Y, knn=None, bandwidth=None): """Build transition matrix from new data to the graph Creates a transition matrix such that `Y` can be approximated by @@ -805,15 +849,18 @@ def build_kernel_to_data(self, Y, knn=None): """ if knn is None: knn = self.knn + if bandwidth is None: + bandwidth = self.bandwidth if self.precomputed is not None: raise ValueError("Cannot extend kernel on precomputed graph") else: tasklogger.log_start("affinities") Y = self._check_extension_shape(Y) pdx = cdist(Y, self.data_nu, metric=self.distance) - knn_dist = np.partition(pdx, knn, axis=1)[:, :knn] - epsilon = np.max(knn_dist, axis=1) - pdx = (pdx.T / epsilon).T + if bandwidth is None: + knn_dist = np.partition(pdx, knn, axis=1)[:, :knn] + bandwidth = np.max(knn_dist, axis=1) + pdx = (pdx.T / bandwidth).T K = np.exp(-1 * pdx**self.decay) # handle nan K = np.where(np.isnan(K), 1, K) @@ -957,6 +1004,7 @@ def get_params(self): 'adaptive_k': self.adaptive_k, 'knn': self.knn, 'decay': self.decay, + 'bandwidth': self.bandwidth, 'distance': self.distance, 'thresh': self.thresh, 'n_jobs': self.n_jobs}) diff --git a/test/test_exact.py b/test/test_exact.py index c5ef987..6a77ce4 100644 --- a/test/test_exact.py +++ b/test/test_exact.py @@ -289,6 +289,48 @@ def test_truncated_exact_graph_no_pca(): assert(isinstance(G2, graphtools.graphs.TraditionalGraph)) +def test_exact_graph_fixed_bandwidth(): + decay = 5 + bandwidth = 2 + n_pca = 20 + pca = PCA(n_pca, svd_solver='randomized', random_state=42).fit(data) + 
data_nu = pca.transform(data) + pdx = squareform(pdist(data_nu, metric='euclidean')) + K = np.exp(-1 * (pdx / bandwidth)**decay) + K = K + K.T + W = np.divide(K, 2) + np.fill_diagonal(W, 0) + G = pygsp.graphs.Graph(W) + G2 = build_graph(data, n_pca=n_pca, + graphtype='exact', + decay=decay, bandwidth=bandwidth, + random_state=42, + thresh=0, + use_pygsp=True) + assert(isinstance(G2, graphtools.graphs.TraditionalGraph)) + assert(G.N == G2.N) + assert(np.all(G.d == G2.d)) + assert((G2.W != G.W).sum() == 0) + assert((G.W != G2.W).nnz == 0) + bandwidth = np.random.gamma(5, 0.5, len(data)) + K = np.exp(-1 * (pdx.T / bandwidth).T**decay) + K = K + K.T + W = np.divide(K, 2) + np.fill_diagonal(W, 0) + G = pygsp.graphs.Graph(W) + G2 = build_graph(data, n_pca=n_pca, + graphtype='exact', + decay=decay, bandwidth=bandwidth, + random_state=42, + thresh=0, + use_pygsp=True) + assert(isinstance(G2, graphtools.graphs.TraditionalGraph)) + assert(G.N == G2.N) + assert(np.all(G.d == G2.d)) + assert((G2.W != G.W).sum() == 0) + assert((G.W != G2.W).nnz == 0) + + ##################################################### # Check interpolation ##################################################### @@ -351,6 +393,7 @@ def test_set_params(): 'theta': None, 'knn': 3, 'decay': 10, + 'bandwidth': None, 'distance': 'euclidean', 'precomputed': None} assert_raises(ValueError, G.set_params, knn=15) diff --git a/test/test_knn.py b/test/test_knn.py index e274308..1b2360f 100644 --- a/test/test_knn.py +++ b/test/test_knn.py @@ -124,6 +124,48 @@ def test_sparse_alpha_knn_graph(): assert(isinstance(G2, graphtools.graphs.kNNGraph)) +def test_knn_graph_fixed_bandwidth(): + k = 3 + decay = 5 + bandwidth = 10 + n_pca = 20 + thresh = 1e-4 + pca = PCA(n_pca, svd_solver='randomized', random_state=42).fit(data) + data_nu = pca.transform(data) + pdx = squareform(pdist(data_nu, metric='euclidean')) + K = np.exp(-1 * np.power(pdx / bandwidth, decay)) + K[K < thresh] = 0 + K = K + K.T + W = np.divide(K, 2) + np.fill_diagonal(W, 0) + G = pygsp.graphs.Graph(W) + G2 = build_graph(data, n_pca=n_pca, + decay=decay, bandwidth=bandwidth, + knn=k, random_state=42, + thresh=thresh, + use_pygsp=True) + assert(isinstance(G2, graphtools.graphs.kNNGraph)) + np.testing.assert_array_equal(G.N, G2.N) + np.testing.assert_array_equal(G.d, G2.d) + np.testing.assert_array_equal((G.W != G2.W).nnz, 0) + bandwidth = np.random.gamma(20, 0.5, len(data)) + K = np.exp(-1 * (pdx.T / bandwidth).T**decay) + K[K < thresh] = 0 + K = K + K.T + W = np.divide(K, 2) + np.fill_diagonal(W, 0) + G = pygsp.graphs.Graph(W) + G2 = build_graph(data, n_pca=n_pca, + decay=decay, bandwidth=bandwidth, + knn=k, random_state=42, + thresh=thresh, + use_pygsp=True) + assert(isinstance(G2, graphtools.graphs.kNNGraph)) + np.testing.assert_array_equal(G.N, G2.N) + np.testing.assert_array_equal(G.d, G2.d) + np.testing.assert_array_equal((G.W != G2.W).nnz, 0) + + @warns(UserWarning) def test_knn_graph_sparse_no_pca(): build_graph(sp.coo_matrix(data), n_pca=None, # n_pca, @@ -187,6 +229,7 @@ def test_set_params(): 'theta': None, 'knn': 3, 'decay': None, + 'bandwidth': None, 'distance': 'euclidean', 'thresh': 0, 'n_jobs': -1, From 40d41da22fe2ddbfc17e5f6c2eb4787c85cf37b6 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 17 Sep 2018 22:36:52 -0400 Subject: [PATCH 04/26] fix landmark and knn tests --- test/test_knn.py | 2 +- test/test_landmark.py | 25 +++++++++++++------------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/test/test_knn.py b/test/test_knn.py index 
1b2360f..96aba5d 100644 --- a/test/test_knn.py +++ b/test/test_knn.py @@ -163,7 +163,7 @@ def test_knn_graph_fixed_bandwidth(): assert(isinstance(G2, graphtools.graphs.kNNGraph)) np.testing.assert_array_equal(G.N, G2.N) np.testing.assert_array_equal(G.d, G2.d) - np.testing.assert_array_equal((G.W != G2.W).nnz, 0) + np.testing.assert_allclose(G.W.toarray(), G2.W.toarray(), atol=1e-4) @warns(UserWarning) diff --git a/test/test_landmark.py b/test/test_landmark.py index 0c09a3e..51a8740 100644 --- a/test/test_landmark.py +++ b/test/test_landmark.py @@ -132,18 +132,19 @@ def test_verbose(): def test_set_params(): G = build_graph(data, n_landmark=500, decay=None) G.landmark_op - assert G.get_params() == {'n_pca': 20, - 'random_state': 42, - 'kernel_symm': '+', - 'theta': None, - 'n_landmark': 500, - 'knn': 3, - 'decay': None, - 'distance': - 'euclidean', - 'thresh': 0, - 'n_jobs': -1, - 'verbose': 0} + assert G.get_params() == { + 'n_pca': 20, + 'random_state': 42, + 'kernel_symm': '+', + 'theta': None, + 'n_landmark': 500, + 'knn': 3, + 'decay': None, + 'bandwidth': None, + 'distance': 'euclidean', + 'thresh': 0, + 'n_jobs': -1, + 'verbose': 0} G.set_params(n_landmark=300) assert G.landmark_op.shape == (300, 300) G.set_params(n_landmark=G.n_landmark, n_svd=G.n_svd) From 5f2bd79746a3ff841f1ddef0b87e113a9ae38456 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 17 Sep 2018 22:43:02 -0400 Subject: [PATCH 05/26] implement fixed bandwidth mnn --- graphtools/graphs.py | 5 ++++- test/test_mnn.py | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/graphtools/graphs.py b/graphtools/graphs.py index 8b3c95c..4e53353 100644 --- a/graphtools/graphs.py +++ b/graphtools/graphs.py @@ -904,6 +904,7 @@ def __init__(self, data, sample_idx, knn=5, beta=1, n_pca=None, adaptive_k='sqrt', decay=None, + bandwidth=None, distance='euclidean', thresh=1e-4, n_jobs=1, @@ -916,6 +917,7 @@ def __init__(self, data, sample_idx, self.knn = knn self.decay = decay self.distance = distance + self.bandwidth = bandwidth self.thresh = thresh self.n_jobs = n_jobs self.weighted_knn = self._weight_knn() @@ -1043,7 +1045,7 @@ def set_params(self, **params): "Cannot update adaptive_k. 
Please create a new graph") # knn arguments - knn_kernel_args = ['knn', 'decay', 'distance', 'thresh'] + knn_kernel_args = ['knn', 'decay', 'distance', 'thresh', 'bandwidth'] knn_other_args = ['n_jobs', 'random_state', 'verbose'] for arg in knn_kernel_args: if arg in params and params[arg] != getattr(self, arg): @@ -1085,6 +1087,7 @@ def build_kernel(self): graph = Graph(data, n_pca=None, knn=self.weighted_knn[i], decay=self.decay, + bandwidth=self.bandwidth, distance=self.distance, thresh=self.thresh, verbose=self.verbose, diff --git a/test/test_mnn.py b/test/test_mnn.py index 529dffa..3ce8fce 100644 --- a/test/test_mnn.py +++ b/test/test_mnn.py @@ -239,6 +239,7 @@ def test_set_params(): 'adaptive_k': 'sqrt', 'knn': 3, 'decay': 10, + 'bandwidth': None, 'distance': 'euclidean', 'thresh': 1e-4, 'n_jobs': 1 From 264bd2dc6858acdf8335d86d286d735420f6d7a2 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 18 Sep 2018 12:35:42 -0400 Subject: [PATCH 06/26] fix check for equal kernels --- test/test_knn.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test/test_knn.py b/test/test_knn.py index 96aba5d..d24ed82 100644 --- a/test/test_knn.py +++ b/test/test_knn.py @@ -147,7 +147,9 @@ def test_knn_graph_fixed_bandwidth(): assert(isinstance(G2, graphtools.graphs.kNNGraph)) np.testing.assert_array_equal(G.N, G2.N) np.testing.assert_array_equal(G.d, G2.d) - np.testing.assert_array_equal((G.W != G2.W).nnz, 0) + np.testing.assert_allclose( + (G.W - G2.W).data, + np.zeros_like((G.W - G2.W).data), atol=1e-14) bandwidth = np.random.gamma(20, 0.5, len(data)) K = np.exp(-1 * (pdx.T / bandwidth).T**decay) K[K < thresh] = 0 @@ -163,7 +165,9 @@ def test_knn_graph_fixed_bandwidth(): assert(isinstance(G2, graphtools.graphs.kNNGraph)) np.testing.assert_array_equal(G.N, G2.N) np.testing.assert_array_equal(G.d, G2.d) - np.testing.assert_allclose(G.W.toarray(), G2.W.toarray(), atol=1e-4) + np.testing.assert_allclose( + (G.W - G2.W).data, + np.zeros_like((G.W - G2.W).data), atol=1e-14) @warns(UserWarning) From 56e16a42cb3a7ea448561b566d302beffd0d3bdd Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 18 Sep 2018 12:35:52 -0400 Subject: [PATCH 07/26] set random seed for swiss roll generation --- test/load_tests/__init__.py | 2 +- test/test_mnn.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/load_tests/__init__.py b/test/load_tests/__init__.py index 3f62c9d..f51e45a 100644 --- a/test/load_tests/__init__.py +++ b/test/load_tests/__init__.py @@ -38,7 +38,7 @@ def generate_swiss_roll(n_samples=1000, noise=0.5, seed=42): t = 1.5 * np.pi * (1 + 2 * generator.rand(1, n_samples)) x = t * np.cos(t) y = t * np.sin(t) - sample_idx = np.random.choice([0, 1], n_samples, replace=True) + sample_idx = generator.choice([0, 1], n_samples, replace=True) z = sample_idx t = np.squeeze(t) X = np.concatenate((x, y)) diff --git a/test/test_mnn.py b/test/test_mnn.py index 3ce8fce..b827470 100644 --- a/test/test_mnn.py +++ b/test/test_mnn.py @@ -148,7 +148,7 @@ def test_mnn_graph_float_theta(): distance=metric, sample_idx=sample_idx, thresh=0, use_pygsp=True) assert G.N == G2.N - assert np.all(G.d == G2.d) + np.testing.assert_array_equal(G.dw, G2.dw) assert (G.W != G2.W).nnz == 0 assert (G2.W != G.W).sum() == 0 assert isinstance(G2, graphtools.graphs.MNNGraph) @@ -202,7 +202,7 @@ def test_mnn_graph_matrix_theta(): distance=metric, sample_idx=sample_idx, thresh=0, use_pygsp=True) assert G.N == G2.N - assert np.all(G.d == G2.d) + np.testing.assert_array_equal(G.dw, G2.dw) assert 
(G.W != G2.W).nnz == 0 assert (G2.W != G.W).sum() == 0 assert isinstance(G2, graphtools.graphs.MNNGraph) From f5b19631a004986e07e7a951dc5a2f2c1fbd4d23 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 16 Oct 2018 11:18:44 -0400 Subject: [PATCH 08/26] add conda badge --- README.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.rst b/README.rst index d18d970..2a2a264 100644 --- a/README.rst +++ b/README.rst @@ -5,6 +5,9 @@ graphtools .. image:: https://img.shields.io/pypi/v/graphtools.svg :target: https://pypi.org/project/graphtools/ :alt: Latest PyPi version +.. image:: https://anaconda.org/conda-forge/tasklogger/badges/version.svg + :target: https://anaconda.org/conda-forge/tasklogger/ + :alt: Latest Conda version .. image:: https://api.travis-ci.com/KrishnaswamyLab/graphtools.svg?branch=master :target: https://travis-ci.com/KrishnaswamyLab/graphtools :alt: Travis CI Build From 5acc72429bb19bdb1ce0439e58dc60427e956c32 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 16 Oct 2018 11:19:30 -0400 Subject: [PATCH 09/26] expose landmarkgraph clusters --- graphtools/graphs.py | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/graphtools/graphs.py b/graphtools/graphs.py index 4e53353..2445711 100644 --- a/graphtools/graphs.py +++ b/graphtools/graphs.py @@ -360,8 +360,14 @@ class LandmarkGraph(DataGraph): transitions : array-like, shape=[n_samples, n_landmark] Transition probabilities between samples and landmarks. - _clusters : array-like, shape=[n_samples] + clusters : array-like, shape=[n_samples] Private attribute. Cluster assignments for each sample. + + Examples + -------- + >>> G = graphtools.Graph(data, n_landmark=1000) + >>> X_landmark = transform(G.landmark_op) + >>> X_full = G.interpolate(X_landmark) """ def __init__(self, data, n_landmark=2000, n_svd=100, **kwargs): @@ -456,6 +462,23 @@ def landmark_op(self): self.build_landmark_op() return self._landmark_op + @property + def clusters(self): + """Cluster assignments for each sample. + + Compute or return the cluster assignments + + Returns + ------- + clusters : list-like, shape=[n_samples] + Cluster assignments for each sample. 
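A short usage sketch of the newly exposed attribute (random data for illustration; shapes follow the docstrings in this patch)::

    import numpy as np
    import graphtools

    X = np.random.normal(size=(2000, 100))
    G = graphtools.Graph(X, n_landmark=500)

    L = G.landmark_op        # (500, 500) landmark diffusion operator
    clusters = G.clusters    # landmark (cluster) assignment per sample
    print(L.shape, len(clusters), len(np.unique(clusters)))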
+ """ + try: + return self._clusters + except AttributeError: + self.build_landmark_op() + return self._clusters + @property def transitions(self): """Transition matrix from samples to landmarks @@ -475,13 +498,13 @@ def transitions(self): return self._transitions def _landmarks_to_data(self): - landmarks = np.unique(self._clusters) + landmarks = np.unique(self.clusters) if sparse.issparse(self.kernel): pmn = sparse.vstack( - [sparse.csr_matrix(self.kernel[self._clusters == i, :].sum( + [sparse.csr_matrix(self.kernel[self.clusters == i, :].sum( axis=0)) for i in landmarks]) else: - pmn = np.array([np.sum(self.kernel[self._clusters == i, :], axis=0) + pmn = np.array([np.sum(self.kernel[self.clusters == i, :], axis=0) for i in landmarks]) return pmn @@ -557,12 +580,12 @@ def extend_to_data(self, data, **kwargs): kernel = self.build_kernel_to_data(data, **kwargs) if sparse.issparse(kernel): pnm = sparse.hstack( - [sparse.csr_matrix(kernel[:, self._clusters == i].sum( - axis=1)) for i in np.unique(self._clusters)]) + [sparse.csr_matrix(kernel[:, self.clusters == i].sum( + axis=1)) for i in np.unique(self.clusters)]) else: pnm = np.array([np.sum( - kernel[:, self._clusters == i], - axis=1).T for i in np.unique(self._clusters)]).transpose() + kernel[:, self.clusters == i], + axis=1).T for i in np.unique(self.clusters)]).transpose() pnm = normalize(pnm, norm='l1', axis=1) return pnm From 411b7bab403a7e408a4fa048640486648405c42e Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 16 Oct 2018 11:20:04 -0400 Subject: [PATCH 10/26] test weighted degrees --- test/test_knn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_knn.py b/test/test_knn.py index d24ed82..80c3f11 100644 --- a/test/test_knn.py +++ b/test/test_knn.py @@ -164,7 +164,7 @@ def test_knn_graph_fixed_bandwidth(): use_pygsp=True) assert(isinstance(G2, graphtools.graphs.kNNGraph)) np.testing.assert_array_equal(G.N, G2.N) - np.testing.assert_array_equal(G.d, G2.d) + np.testing.assert_allclose(G.dw, G2.dw, atol=1e-14) np.testing.assert_allclose( (G.W - G2.W).data, np.zeros_like((G.W - G2.W).data), atol=1e-14) From d63007b59d44d5b24e6dd1b3324e9c47578b4096 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 16 Nov 2018 20:06:50 -0500 Subject: [PATCH 11/26] add from_igraph and to_pygsp methods --- graphtools/__init__.py | 2 +- graphtools/api.py | 8 ++++++++ graphtools/base.py | 5 +++++ setup.py | 3 ++- test/test_api.py | 22 ++++++++++++++++++++++ 5 files changed, 38 insertions(+), 2 deletions(-) diff --git a/graphtools/__init__.py b/graphtools/__init__.py index 05d693d..8fc8a50 100644 --- a/graphtools/__init__.py +++ b/graphtools/__init__.py @@ -1,2 +1,2 @@ -from .api import Graph +from .api import Graph, from_igraph from .version import __version__ diff --git a/graphtools/api.py b/graphtools/api.py index 9a9af33..2181066 100644 --- a/graphtools/api.py +++ b/graphtools/api.py @@ -1,6 +1,7 @@ import numpy as np import warnings import tasklogger +from scipy import sparse from . import base from . import graphs @@ -222,3 +223,10 @@ def Graph(data, for key, value in params.items() if key != "data"]))) return Graph(**params) + + +def from_igraph(G, **kwargs): + if 'precomputed' in kwargs and kwargs['precomputed'] != 'adjacency': + raise ValueError("Cannot build graph from igraph with precomputed={}. 
" + "Use 'adjacency' instead.".format(kwargs['precomputed'])) + return Graph(sparse.coo_matrix(G.get_adjacency().data), precomputed='adjacency', **kwargs) diff --git a/graphtools/base.py b/graphtools/base.py index 53e8fb1..179b080 100644 --- a/graphtools/base.py +++ b/graphtools/base.py @@ -27,6 +27,7 @@ elementwise_maximum, set_diagonal) + class Base(object): """Class that deals with key-word arguments but is otherwise just an object. @@ -534,6 +535,10 @@ def build_kernel(self): """ raise NotImplementedError + def to_pygsp(self): + from . import api + return api.Graph(self.K, precomputed="affinity", use_pygsp=True) + class PyGSPGraph(with_metaclass(abc.ABCMeta, pygsp.graphs.Graph, Base)): """Interface between BaseGraph and PyGSP. diff --git a/setup.py b/setup.py index 1394bf8..cd27e15 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,8 @@ 'nose2', 'pandas', 'coverage', - 'coveralls' + 'coveralls', + 'python-igraph' ] if sys.version_info[0] == 3: diff --git a/test/test_api.py b/test/test_api.py index c099086..0ef9533 100644 --- a/test/test_api.py +++ b/test/test_api.py @@ -4,8 +4,30 @@ build_graph, raises, ) +import igraph +import numpy as np +import graphtools +def test_from_igraph(): + n = 100 + m = 500 + K = np.zeros((n, n)) + for _ in range(m): + e = np.random.choice(n, 2, replace=False) + K[e[0], e[1]] = K[e[1], e[0]] = 1 + g = igraph.Graph.Adjacency(K.tolist()) + G = graphtools.from_igraph(g) + G2 = graphtools.Graph(K, precomputed='adjacency') + assert np.all(G.K == G2.K) + + +def test_to_pygsp(): + G = build_graph(data) + G2 = G.to_pygsp() + assert isinstance(G2, graphtools.graphs.PyGSPGraph) + assert np.all(G2.K == G.K) + ##################################################### # Check parameters ##################################################### From 718d191989f19a0f9cde1350e2920e370048cde6 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 16 Nov 2018 20:48:12 -0500 Subject: [PATCH 12/26] normalize between-batch affinities by rowwise magnitude of within-batch affinities --- graphtools/graphs.py | 27 ++++++++++++++++--------- test/test_mnn.py | 48 ++++++++++++++++++++++++++++++++++++-------- 2 files changed, 58 insertions(+), 17 deletions(-) diff --git a/graphtools/graphs.py b/graphtools/graphs.py index 2445711..e556054 100644 --- a/graphtools/graphs.py +++ b/graphtools/graphs.py @@ -911,9 +911,9 @@ class MNNGraph(DataGraph): Batch index beta : `float`, optional (default: 1) - Downweight within-batch affinities by beta + Downweight between-batch affinities by beta - adaptive_k : {'min', 'mean', 'sqrt', `None`} (default: 'sqrt') + adaptive_k : {'min', 'mean', 'sqrt', `None`} (default: None) Weights MNN kernel adaptively using the number of cells in each sample according to the selected method. 
@@ -925,7 +925,7 @@ class MNNGraph(DataGraph): def __init__(self, data, sample_idx, knn=5, beta=1, n_pca=None, - adaptive_k='sqrt', + adaptive_k=None, decay=None, bandwidth=None, distance='euclidean', @@ -1116,7 +1116,7 @@ def build_kernel(self): verbose=self.verbose, random_state=self.random_state, n_jobs=self.n_jobs, - initialize=False) + initialize=True) self.subgraphs.append(graph) # append to list of subgraphs tasklogger.log_complete("subgraphs") @@ -1126,16 +1126,25 @@ def build_kernel(self): else: K = np.zeros([self.data_nu.shape[0], self.data_nu.shape[0]]) for i, X in enumerate(self.subgraphs): + K = set_submatrix(K, self.sample_idx == self.samples[i], + self.sample_idx == self.samples[i], X.K) + within_batch_norm = np.array(np.sum(X.K, 1)).flatten() for j, Y in enumerate(self.subgraphs): + if i == j: + continue tasklogger.log_start( "kernel from sample {} to {}".format(self.samples[i], self.samples[j])) Kij = Y.build_kernel_to_data( X.data_nu, knn=self.weighted_knn[i]) - if i == j: - # downweight within-batch affinities by beta - Kij = Kij * self.beta + between_batch_norm = np.array(np.sum(Kij, 1)).flatten() + scale = np.minimum(1, within_batch_norm / + between_batch_norm) * self.beta + if sparse.issparse(Kij): + Kij = Kij.multiply(scale[:, None]) + else: + Kij = Kij * scale[:, None] K = set_submatrix(K, self.sample_idx == self.samples[i], self.sample_idx == self.samples[j], Kij) tasklogger.log_complete( @@ -1147,11 +1156,11 @@ def symmetrize_kernel(self, K): if self.kernel_symm == 'theta' and self.theta is not None and \ not isinstance(self.theta, numbers.Number): # matrix theta - # Gamma can be a matrix with specific values transitions for + # Theta can be a matrix with specific values transitions for # each batch. This allows for technical replicates and # experimental samples to be corrected simultaneously tasklogger.log_debug("Using theta symmetrization. 
" - "Gamma:\n{}".format(self.theta)) + "Theta:\n{}".format(self.theta)) for i, sample_i in enumerate(self.samples): for j, sample_j in enumerate(self.samples): if j < i: diff --git a/test/test_mnn.py b/test/test_mnn.py index b827470..dd6c936 100644 --- a/test/test_mnn.py +++ b/test/test_mnn.py @@ -12,6 +12,7 @@ raises, cdist, ) +from scipy.linalg import norm ##################################################### @@ -116,7 +117,7 @@ def test_mnn_graph_float_theta(): k = 10 a = 20 metric = 'euclidean' - beta = 0 + beta = 0.5 samples = np.unique(sample_idx) K = np.zeros((len(X), len(X))) @@ -133,17 +134,32 @@ def test_mnn_graph_float_theta(): pdxe_ij = pdx_ij / e_ij[:, np.newaxis] # normalize k_ij = np.exp(-1 * (pdxe_ij ** a)) # apply alpha-decaying kernel if si == sj: - K.iloc[sample_idx == si, sample_idx == sj] = k_ij * \ - (1 - beta) # fill out values in K for NN on diagonal + K.iloc[sample_idx == si, sample_idx == sj] = ( + k_ij + k_ij.T) / 2 else: # fill out values in K for NN on diagonal K.iloc[sample_idx == si, sample_idx == sj] = k_ij - + Kn = K.copy() + for i in samples: + curr_K = K.iloc[sample_idx == i, sample_idx == i] + i_norm = norm(curr_K, 1, axis=1) + for j in samples: + if i == j: + continue + else: + curr_K = K.iloc[sample_idx == i, sample_idx == j] + curr_norm = norm(curr_K, 1, axis=1) + scale = np.minimum( + np.ones(len(curr_norm)), i_norm / curr_norm) * beta + Kn.iloc[sample_idx == i, sample_idx == j] = ( + curr_K.T * scale).T + + K = Kn W = np.array((theta * np.minimum(K, K.T)) + ((1 - theta) * np.maximum(K, K.T))) np.fill_diagonal(W, 0) G = pygsp.graphs.Graph(W) - G2 = graphtools.Graph(X, knn=k + 1, decay=a, beta=1 - beta, + G2 = graphtools.Graph(X, knn=k + 1, decay=a, beta=beta, kernel_symm='theta', theta=theta, distance=metric, sample_idx=sample_idx, thresh=0, use_pygsp=True) @@ -179,11 +195,27 @@ def test_mnn_graph_matrix_theta(): pdxe_ij = pdx_ij / e_ij[:, np.newaxis] # normalize k_ij = np.exp(-1 * (pdxe_ij ** a)) # apply alpha-decaying kernel if si == sj: - K.iloc[sample_idx == si, sample_idx == sj] = k_ij * \ - (1 - beta) # fill out values in K for NN on diagonal + K.iloc[sample_idx == si, sample_idx == sj] = ( + k_ij + k_ij.T) / 2 else: # fill out values in K for NN on diagonal K.iloc[sample_idx == si, sample_idx == sj] = k_ij + Kn = K.copy() + for i in samples: + curr_K = K.iloc[sample_idx == i, sample_idx == i] + i_norm = norm(curr_K, 1, axis=1) + for j in samples: + if i == j: + continue + else: + curr_K = K.iloc[sample_idx == i, sample_idx == j] + curr_norm = norm(curr_K, 1, axis=1) + scale = np.minimum( + np.ones(len(curr_norm)), i_norm / curr_norm) * beta + Kn.iloc[sample_idx == i, sample_idx == j] = ( + curr_K.T * scale).T + + K = Kn K = np.array(K) @@ -197,7 +229,7 @@ def test_mnn_graph_matrix_theta(): ((1 - matrix_theta) * np.maximum(K, K.T))) np.fill_diagonal(W, 0) G = pygsp.graphs.Graph(W) - G2 = graphtools.Graph(X, knn=k + 1, decay=a, beta=1 - beta, + G2 = graphtools.Graph(X, knn=k + 1, decay=a, beta=beta, kernel_symm='theta', theta=theta, distance=metric, sample_idx=sample_idx, thresh=0, use_pygsp=True) From b95df52224ef09068a38c4475b6e12f0677a0417 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 16 Nov 2018 20:50:26 -0500 Subject: [PATCH 13/26] ignore igraph warning --- test/load_tests/__init__.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/load_tests/__init__.py b/test/load_tests/__init__.py index f51e45a..f018957 100644 --- a/test/load_tests/__init__.py +++ b/test/load_tests/__init__.py @@ -16,6 +16,7 @@ def 
reset_warnings(): warnings.resetwarnings() warnings.simplefilter("error") ignore_numpy_warning() + ignore_igraph_warning() def ignore_numpy_warning(): @@ -25,6 +26,14 @@ def ignore_numpy_warning(): "matrices or deal with linear algebra ") +def ignore_igraph_warning(): + warnings.filterwarnings( + "ignore", category=DeprecationWarning, + message="The SafeConfigParser class has been renamed to ConfigParser " + "in Python 3.2. This alias will be removed in future versions. Use " + "ConfigParser directly instead") + + reset_warnings() global digits From d2741c3a522ef8467f36171764cfaf0e15af7f0e Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 16 Nov 2018 21:33:58 -0500 Subject: [PATCH 14/26] resolve python2 division error --- graphtools/graphs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/graphtools/graphs.py b/graphtools/graphs.py index e556054..e090dac 100644 --- a/graphtools/graphs.py +++ b/graphtools/graphs.py @@ -1,3 +1,4 @@ +from __future__ import division from builtins import super import numpy as np from sklearn.neighbors import NearestNeighbors From 324a54730a5c44b0c06a731544eb0cd1ac42cb84 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 16 Nov 2018 21:36:24 -0500 Subject: [PATCH 15/26] update beta docstring --- graphtools/api.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/graphtools/api.py b/graphtools/api.py index 2181066..1eaaa0b 100644 --- a/graphtools/api.py +++ b/graphtools/api.py @@ -10,7 +10,7 @@ def Graph(data, n_pca=None, sample_idx=None, - adaptive_k='sqrt', + adaptive_k=None, precomputed=None, knn=5, decay=10, @@ -90,12 +90,12 @@ def Graph(data, Only one of `precomputed` and `n_pca` can be set. beta: float, optional(default: 1) - Multiply within - batch connections by(1 - beta) + Multiply between - batch connections by beta sample_idx: array-like Batch index for MNN kernel - adaptive_k : `{'min', 'mean', 'sqrt', 'none'}` (default: 'sqrt') + adaptive_k : `{'min', 'mean', 'sqrt', 'none'}` (default: None) Weights MNN kernel adaptively using the number of cells in each sample according to the selected method. From a46804a9b4f04f5d994f850b690d7667f4020705 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 16 Nov 2018 21:40:28 -0500 Subject: [PATCH 16/26] update bandwidth docstring --- graphtools/api.py | 5 +++++ graphtools/graphs.py | 4 ++-- test/test_api.py | 2 ++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/graphtools/api.py b/graphtools/api.py index 1eaaa0b..c083bf3 100644 --- a/graphtools/api.py +++ b/graphtools/api.py @@ -63,6 +63,11 @@ def Graph(data, decay : `int` or `None`, optional (default: 10) Rate of alpha decay to use. If `None`, alpha decay is not used. + bandwidth : `float`, list-like or `None`, optional (default: `None`) + Fixed bandwidth to use. If given, overrides `knn`. Can be a single + bandwidth or a list-like (shape=[n_samples]) of bandwidths for each + sample. + distance : `str`, optional (default: `'euclidean'`) Any metric from `scipy.spatial.distance` can be used distance metric for building kNN graph. diff --git a/graphtools/graphs.py b/graphtools/graphs.py index e090dac..f1c54aa 100644 --- a/graphtools/graphs.py +++ b/graphtools/graphs.py @@ -38,7 +38,7 @@ class kNNGraph(DataGraph): bandwidth : `float`, list-like or `None`, optional (default: `None`) Fixed bandwidth to use. If given, overrides `knn`. Can be a single - bandwidth or a list-like (shape=[n_samples]) or bandwidths for each + bandwidth or a list-like (shape=[n_samples]) of bandwidths for each sample. 
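The fixed bandwidth replaces the adaptive k-nearest-neighbor distance as the kernel scale. A dense NumPy sketch of the resulting kernel, mirroring the exact-graph tests earlier in this series (scalar or per-sample bandwidth)::

    import numpy as np
    from scipy.spatial.distance import pdist, squareform

    X = np.random.normal(size=(300, 20))
    pdx = squareform(pdist(X, metric='euclidean'))
    decay = 5

    # Scalar bandwidth: one scale for every sample
    K_scalar = np.exp(-1 * (pdx / 2.0) ** decay)

    # Per-sample bandwidth: row i is scaled by bandwidth[i]
    bandwidth = np.random.gamma(5, 0.5, len(X))
    K_vector = np.exp(-1 * (pdx.T / bandwidth).T ** decay)

    # Symmetrize as in the tests and zero the diagonal
    W = (K_vector + K_vector.T) / 2
    np.fill_diagonal(W, 0)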
TODO: implement `callable` bandwidth @@ -641,7 +641,7 @@ class TraditionalGraph(DataGraph): bandwidth : `float`, list-like or `None`, optional (default: `None`) Fixed bandwidth to use. If given, overrides `knn`. Can be a single - bandwidth or a list-like (shape=[n_samples]) or bandwidths for each + bandwidth or a list-like (shape=[n_samples]) of bandwidths for each sample. TODO: implement `callable` bandwidth diff --git a/test/test_api.py b/test/test_api.py index 0ef9533..e46a60d 100644 --- a/test/test_api.py +++ b/test/test_api.py @@ -4,6 +4,8 @@ build_graph, raises, ) +import warnings + import igraph import numpy as np import graphtools From 6252eb1a68a3a879ff07d98a80f016db743905f5 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 16 Nov 2018 21:48:39 -0500 Subject: [PATCH 17/26] fix default params for mnngraph --- test/test_mnn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_mnn.py b/test/test_mnn.py index dd6c936..827531e 100644 --- a/test/test_mnn.py +++ b/test/test_mnn.py @@ -268,7 +268,7 @@ def test_set_params(): 'kernel_symm': 'theta', 'theta': 0.5, 'beta': 1, - 'adaptive_k': 'sqrt', + 'adaptive_k': None, 'knn': 3, 'decay': 10, 'bandwidth': None, From fcc3b4350a4ea1f663c9860a3ba7178c4d771432 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 16 Nov 2018 22:07:51 -0500 Subject: [PATCH 18/26] document and test api functions --- graphtools/api.py | 29 +++++++++++++++++++++++++---- graphtools/base.py | 36 ++++++++++++++++++++++++++++++++++-- test/test_api.py | 25 +++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 6 deletions(-) diff --git a/graphtools/api.py b/graphtools/api.py index c083bf3..9e5d31b 100644 --- a/graphtools/api.py +++ b/graphtools/api.py @@ -231,7 +231,28 @@ def Graph(data, def from_igraph(G, **kwargs): - if 'precomputed' in kwargs and kwargs['precomputed'] != 'adjacency': - raise ValueError("Cannot build graph from igraph with precomputed={}. " - "Use 'adjacency' instead.".format(kwargs['precomputed'])) - return Graph(sparse.coo_matrix(G.get_adjacency().data), precomputed='adjacency', **kwargs) + """Convert an igraph.Graph to a graphtools.Graph + + Creates a graphtools.graphs.TraditionalGraph with a + precomputed adjacency matrix + + Parameters + ---------- + G : igraph.Graph + Graph to be converted + kwargs + keyword arguments for graphtools.Graph + + Returns + ------- + G : graphtools.graphs.TraditionalGraph + """ + if 'precomputed' in kwargs: + if kwargs['precomputed'] != 'adjacency': + warnings.warn( + "Cannot build graph from igraph with precomputed={}. " + "Use 'adjacency' instead.".format(kwargs['precomputed']), + UserWarning) + del kwargs['precomputed'] + return Graph(sparse.coo_matrix(G.get_adjacency().data), + precomputed='adjacency', **kwargs) diff --git a/graphtools/base.py b/graphtools/base.py index 179b080..d08c741 100644 --- a/graphtools/base.py +++ b/graphtools/base.py @@ -535,9 +535,41 @@ def build_kernel(self): """ raise NotImplementedError - def to_pygsp(self): + def to_pygsp(self, **kwargs): + """Convert to a PyGSP graph + + For use only when the user means to create the graph using + the flag `use_pygsp=True`, and doesn't wish to recompute the kernel. + Creates a graphtools.graphs.TraditionalGraph with a precomputed + affinity matrix which also inherits from pygsp.graphs.Graph. + + Parameters + ---------- + kwargs + keyword arguments for graphtools.Graph + + Returns + ------- + G : graphtools.base.PyGSPGraph, graphtools.graphs.TraditionalGraph + """ from . 
import api - return api.Graph(self.K, precomputed="affinity", use_pygsp=True) + if 'precomputed' in kwargs: + if kwargs['precomputed'] != 'affinity': + warnings.warn( + "Cannot build PyGSPGraph with precomputed={}. " + "Using 'affinity' instead.".format(kwargs['precomputed']), + UserWarning) + del kwargs['precomputed'] + if 'use_pygsp' in kwargs: + if kwargs['use_pygsp'] is not True: + warnings.warn( + "Cannot build PyGSPGraph with use_pygsp={}. " + "Use True instead.".format(kwargs['use_pygsp']), + UserWarning) + del kwargs['use_pygsp'] + return api.Graph(self.K, + precomputed="affinity", use_pygsp=True, + **kwargs) class PyGSPGraph(with_metaclass(abc.ABCMeta, pygsp.graphs.Graph, Base)): diff --git a/test/test_api.py b/test/test_api.py index e46a60d..64b0f4f 100644 --- a/test/test_api.py +++ b/test/test_api.py @@ -3,6 +3,7 @@ data, build_graph, raises, + warns, ) import warnings @@ -24,12 +25,36 @@ def test_from_igraph(): assert np.all(G.K == G2.K) +@warns(UserWarning) +def test_from_igraph_invalid_precomputed(): + n = 100 + m = 500 + K = np.zeros((n, n)) + for _ in range(m): + e = np.random.choice(n, 2, replace=False) + K[e[0], e[1]] = K[e[1], e[0]] = 1 + g = igraph.Graph.Adjacency(K.tolist()) + G = graphtools.from_igraph(g, precomputed='affinity') + + def test_to_pygsp(): G = build_graph(data) G2 = G.to_pygsp() assert isinstance(G2, graphtools.graphs.PyGSPGraph) assert np.all(G2.K == G.K) + +@warns(UserWarning) +def test_to_pygsp_invalid_precomputed(): + G = build_graph(data) + G2 = G.to_pygsp(precomputed='adjacency') + + +@warns(UserWarning) +def test_to_pygsp_invalid_use_pygsp(): + G = build_graph(data) + G2 = G.to_pygsp(use_pygsp=False) + ##################################################### # Check parameters ##################################################### From 654cf467ccaaaf9d3c21e3560d726d1cb237dff1 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 21 Nov 2018 19:51:14 -0800 Subject: [PATCH 19/26] fix edc91ea0f01d8d49c8d153b54e9182454bf1fddb md rst issue --- README.rst | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/README.rst b/README.rst index 2a2a264..b343cbd 100644 --- a/README.rst +++ b/README.rst @@ -31,7 +31,11 @@ Installation graphtools is available on `pip`. 
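As a brief sketch of the conversion helpers documented in the patch above (the random adjacency matrix is purely illustrative), an `igraph` graph can be wrapped with `graphtools.from_igraph` and later exported again with `to_pygsp`::

    import numpy as np
    import igraph
    import graphtools

    # hypothetical random symmetric 0/1 adjacency matrix
    n = 100
    A = np.zeros((n, n))
    for _ in range(300):
        i, j = np.random.choice(n, 2, replace=False)
        A[i, j] = A[j, i] = 1

    g = igraph.Graph.Adjacency(A.tolist())
    G = graphtools.from_igraph(g)   # TraditionalGraph with a precomputed adjacency matrix
    G2 = G.to_pygsp()               # graph that also inherits from pygsp.graphs.Graph
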
Install by running the following in a terminal:: - pip install --user graphtools + pip install --user graphtools + +Alternatively, graphtools can be installed using `Conda `_ (most easily obtained via the `Miniconda Python distribution `_):: + + conda install -c conda-forge graphtools Usage example ------------- @@ -40,14 +44,14 @@ The `graphtools.Graph` class provides an all-in-one interface for k-nearest neig Use it as follows:: - from sklearn import datasets - import graphtools - digits = datasets.load_digits() - G = graphtools.Graph(digits['data']) - K = G.kernel - P = G.diff_op - G = graphtools.Graph(digits['data'], n_landmark=300) - L = G.landmark_op + from sklearn import datasets + import graphtools + digits = datasets.load_digits() + G = graphtools.Graph(digits['data']) + K = G.kernel + P = G.diff_op + G = graphtools.Graph(digits['data'], n_landmark=300) + L = G.landmark_op Help ---- From 305a861fa0827aeb78e53af27d0a3d379d04a512 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 22 Nov 2018 10:04:46 -0800 Subject: [PATCH 20/26] bump version --- graphtools/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphtools/version.py b/graphtools/version.py index 569b121..d3ec452 100644 --- a/graphtools/version.py +++ b/graphtools/version.py @@ -1 +1 @@ -__version__ = "0.1.10" +__version__ = "0.2.0" From 73528d9dd090d4855025b13f3f75ddc126899ee8 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 22 Nov 2018 10:14:18 -0800 Subject: [PATCH 21/26] test for gamma/theta parameter confusion --- graphtools/base.py | 9 +++++++++ test/test_mnn.py | 21 +++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/graphtools/base.py b/graphtools/base.py index d08c741..5f1a2ab 100644 --- a/graphtools/base.py +++ b/graphtools/base.py @@ -338,7 +338,16 @@ class BaseGraph(with_metaclass(abc.ABCMeta, Base)): def __init__(self, kernel_symm='+', theta=None, + gamma=None, initialize=True, **kwargs): + if gamma is not None: + warnings.warn("gamma is deprecated. " + "Setting theta={}".format(gamma), FutureWarning) + theta = gamma + if kernel_symm == 'gamma': + warnings.warn("kernel_symm='gamma' is deprecated. 
" + "Setting kernel_symm='theta'", FutureWarning) + kernel_symm = 'theta' self.kernel_symm = kernel_symm self.theta = theta self._check_symmetrization(kernel_symm, theta) diff --git a/test/test_mnn.py b/test/test_mnn.py index 827531e..898c721 100644 --- a/test/test_mnn.py +++ b/test/test_mnn.py @@ -10,6 +10,7 @@ generate_swiss_roll, assert_raises, raises, + warns, cdist, ) from scipy.linalg import norm @@ -74,6 +75,26 @@ def test_mnn_with_vector_theta(): theta=np.linspace(0, 1, n_sample - 1)) +@warns(FutureWarning) +def test_mnn_with_gamma(): + build_graph( + data, thresh=0, n_pca=20, + decay=10, knn=5, random_state=42, + sample_idx=digits['target'], + kernel_symm='theta', + gamma=0.9) + + +@warns(FutureWarning) +def test_mnn_with_kernel_symm_gamma(): + build_graph( + data, thresh=0, n_pca=20, + decay=10, knn=5, random_state=42, + sample_idx=digits['target'], + kernel_symm='gamma', + theta=0.9) + + def test_mnn_with_non_zero_indexed_sample_idx(): X, sample_idx = generate_swiss_roll() G = build_graph(X, sample_idx=sample_idx, From daba2c919abbd4a241aa7db61a23d29666cf964b Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 22 Nov 2018 10:19:10 -0800 Subject: [PATCH 22/26] bump tasklogger version --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6fed4f1..08e1515 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,4 @@ scipy>=1.1.0 pygsp>=>=0.5.1 scikit-learn>=0.19.1 future -tasklogger>=0.2.1 +tasklogger>=0.4.0 diff --git a/setup.py b/setup.py index cd27e15..f67b380 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ 'pygsp>=0.5.1', 'scikit-learn>=0.19.1', 'future', - 'tasklogger>=0.2.1', + 'tasklogger>=0.4.0', ] test_requires = [ From 67283697500a3ab3614aed2835884a9e9b730fff Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 22 Nov 2018 10:30:52 -0800 Subject: [PATCH 23/26] cache packages --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index d4accf9..e346d3b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,8 @@ sudo: required + cache: packages + addons: apt: packages: From 5b95ff9c60af10ba4a820e243c6aa71e6b042cea Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 22 Nov 2018 11:04:22 -0800 Subject: [PATCH 24/26] increase coverage --- test/test_exact.py | 8 ++++++++ test/test_knn.py | 19 +++++++++++++++++++ test/test_landmark.py | 11 +++++++++++ test/test_mnn.py | 39 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 77 insertions(+) diff --git a/test/test_exact.py b/test/test_exact.py index 6a77ce4..43216ac 100644 --- a/test/test_exact.py +++ b/test/test_exact.py @@ -68,6 +68,13 @@ def test_precomputed_negative(): n_pca=None) +@raises(ValueError) +def test_precomputed_invalid(): + build_graph(np.random.uniform(0, 1, [200, 200]), + precomputed='invalid', + n_pca=None) + + @warns(RuntimeWarning) def test_duplicate_data(): build_graph(np.vstack([data, data[:10]]), @@ -400,6 +407,7 @@ def test_set_params(): assert_raises(ValueError, G.set_params, decay=15) assert_raises(ValueError, G.set_params, distance='manhattan') assert_raises(ValueError, G.set_params, precomputed='distance') + assert_raises(ValueError, G.set_params, bandwidth=5) G.set_params(knn=G.knn, decay=G.decay, distance=G.distance, diff --git a/test/test_knn.py b/test/test_knn.py index 80c3f11..c359ede 100644 --- a/test/test_knn.py +++ b/test/test_knn.py @@ -45,6 +45,24 @@ def test_duplicate_data(): thresh=1e-4) +@warns(UserWarning) +def 
test_balltree_cosine(): + build_graph(data, + n_pca=20, + decay=10, + distance='cosine', + thresh=1e-4) + + +@warns(UserWarning) +def test_k_too_large(): + build_graph(data, + n_pca=20, + decay=10, + knn=len(data) + 1, + thresh=1e-4) + + ##################################################### # Check kernel ##################################################### @@ -253,6 +271,7 @@ def test_set_params(): assert_raises(ValueError, G.set_params, thresh=1e-3) assert_raises(ValueError, G.set_params, theta=0.99) assert_raises(ValueError, G.set_params, kernel_symm='*') + assert_raises(ValueError, G.set_params, bandwidth=5) G.set_params(knn=G.knn, decay=G.decay, thresh=G.thresh, diff --git a/test/test_landmark.py b/test/test_landmark.py index 51a8740..57903ce 100644 --- a/test/test_landmark.py +++ b/test/test_landmark.py @@ -42,6 +42,15 @@ def test_landmark_exact_graph(): assert(G.landmark_op.shape == (n_landmark, n_landmark)) assert(isinstance(G, graphtools.graphs.TraditionalGraph)) assert(isinstance(G, graphtools.graphs.LandmarkGraph)) + assert(G.transitions.shape == (data.shape[0], n_landmark)) + assert(G.clusters.shape == data.shape[0]) + assert(len(np.unique(G.clusters)) <= n_landmark) + signal = np.random.normal(0, 1, [n_landmark, 10]) + interpolated_signal = G.interpolate(signal) + assert interpolated_signal.shape == (data.shape[0], signal.shape[1]) + G._reset_landmarks() + # no error on double delete + G._reset_landmarks() def test_landmark_knn_graph(): @@ -49,6 +58,7 @@ def test_landmark_knn_graph(): # knn graph G = build_graph(data, n_landmark=n_landmark, n_pca=20, decay=None, knn=5, random_state=42) + assert(G.transitions.shape == (data.shape[0], n_landmark)) assert(G.landmark_op.shape == (n_landmark, n_landmark)) assert(isinstance(G, graphtools.graphs.kNNGraph)) assert(isinstance(G, graphtools.graphs.LandmarkGraph)) @@ -62,6 +72,7 @@ def test_landmark_mnn_graph(): thresh=1e-5, n_pca=None, decay=10, knn=5, random_state=42, sample_idx=sample_idx) + assert(G.clusters.shape == data.shape[0]) assert(G.landmark_op.shape == (n_landmark, n_landmark)) assert(isinstance(G, graphtools.graphs.MNNGraph)) assert(isinstance(G, graphtools.graphs.LandmarkGraph)) diff --git a/test/test_mnn.py b/test/test_mnn.py index 898c721..9bf284a 100644 --- a/test/test_mnn.py +++ b/test/test_mnn.py @@ -75,6 +75,26 @@ def test_mnn_with_vector_theta(): theta=np.linspace(0, 1, n_sample - 1)) +@raises(ValueError) +def test_mnn_with_unbounded_theta(): + build_graph( + data, thresh=0, n_pca=20, + decay=10, knn=5, random_state=42, + sample_idx=digits['target'], + kernel_symm='theta', + theta=2) + + +@raises(ValueError) +def test_mnn_with_string_theta(): + build_graph( + data, thresh=0, n_pca=20, + decay=10, knn=5, random_state=42, + sample_idx=digits['target'], + kernel_symm='theta', + theta='invalid') + + @warns(FutureWarning) def test_mnn_with_gamma(): build_graph( @@ -95,6 +115,25 @@ def test_mnn_with_kernel_symm_gamma(): theta=0.9) +@warns(UserWarning) +def test_mnn_with_theta_and_kernel_symm_not_theta(): + build_graph( + data, thresh=0, n_pca=20, + decay=10, knn=5, random_state=42, + sample_idx=digits['target'], + kernel_symm='+', + theta=0.9) + + +@warns(UserWarning) +def test_mnn_with_kernel_symmm_theta_and_no_theta(): + build_graph( + data, thresh=0, n_pca=20, + decay=10, knn=5, random_state=42, + sample_idx=digits['target'], + kernel_symm='theta') + + def test_mnn_with_non_zero_indexed_sample_idx(): X, sample_idx = generate_swiss_roll() G = build_graph(X, sample_idx=sample_idx, From 
5104b74ed0b5c07d60764cfc8e5090bcf7b733f2 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 22 Nov 2018 11:21:33 -0800 Subject: [PATCH 25/26] cache pip --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index e346d3b..64d5223 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,7 +6,7 @@ sudo: required - cache: packages + cache: pip addons: apt: From 76ba4f0189985418ff189bcafa7e7e5d84e8c753 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 22 Nov 2018 11:21:54 -0800 Subject: [PATCH 26/26] make tests pass --- graphtools/graphs.py | 28 +++++++++++++++++----------- test/test_api.py | 1 + test/test_data.py | 1 + test/test_exact.py | 10 ++++++++++ test/test_knn.py | 1 + test/test_landmark.py | 5 +++-- test/test_mnn.py | 1 + 7 files changed, 34 insertions(+), 13 deletions(-) diff --git a/graphtools/graphs.py b/graphtools/graphs.py index f1c54aa..a0810da 100644 --- a/graphtools/graphs.py +++ b/graphtools/graphs.py @@ -64,24 +64,25 @@ class kNNGraph(DataGraph): def __init__(self, data, knn=5, decay=None, bandwidth=None, distance='euclidean', thresh=1e-4, n_pca=None, **kwargs): - self.knn = knn - self.decay = decay - self.bandwidth = bandwidth - self.distance = distance - self.thresh = thresh if decay is not None and thresh <= 0: raise ValueError("Cannot instantiate a kNNGraph with `decay=None` " "and `thresh=0`. Use a TraditionalGraph instead.") if knn > data.shape[0]: warnings.warn("Cannot set knn ({k}) to be greater than " - "data.shape[0] ({n}). Setting knn={n}".format( + "n_samples ({n}). Setting knn={n}".format( k=knn, n=data.shape[0])) + knn = data.shape[0] if n_pca is None and data.shape[1] > 500: warnings.warn("Building a kNNGraph on data of shape {} is " "expensive. Consider setting n_pca.".format( data.shape), UserWarning) + self.knn = knn + self.decay = decay + self.bandwidth = bandwidth + self.distance = distance + self.thresh = thresh super().__init__(data, n_pca=n_pca, **kwargs) def get_params(self): @@ -232,7 +233,7 @@ def build_kernel_to_data(self, Y, knn=None, bandwidth=None): bandwidth = self.bandwidth if knn > self.data.shape[0]: warnings.warn("Cannot set knn ({k}) to be greater than " - "data.shape[0] ({n}). Setting knn={n}".format( + "n_samples ({n}). Setting knn={n}".format( k=knn, n=self.data.shape[0])) Y = self._check_extension_shape(Y) @@ -675,15 +676,20 @@ def __init__(self, data, n_pca=None, thresh=1e-4, precomputed=None, **kwargs): + if decay is None and precomputed not in ['affinity', 'adjacency']: + # decay high enough is basically a binary kernel + raise ValueError("`decay` must be provided for a TraditionalGraph" + ". For kNN kernel, use kNNGraph.") if precomputed is not None and n_pca is not None: # the data itself is a matrix of distances / affinities n_pca = None warnings.warn("n_pca cannot be given on a precomputed graph." " Setting n_pca=None", RuntimeWarning) - if decay is None and precomputed not in ['affinity', 'adjacency']: - # decay high enough is basically a binary kernel - raise ValueError("`decay` must be provided for a TraditionalGraph" - ". For kNN kernel, use kNNGraph.") + if knn > data.shape[0]: + warnings.warn("Cannot set knn ({k}) to be greater than or equal to" + " n_samples ({n}). Setting knn={n}".format( + k=knn, n=data.shape[0] - 1)) + knn = data.shape[0] - 1 if precomputed is not None: if precomputed not in ["distance", "affinity", "adjacency"]: raise ValueError("Precomputed value {} not recognized. 
" diff --git a/test/test_api.py b/test/test_api.py index 64b0f4f..49d2126 100644 --- a/test/test_api.py +++ b/test/test_api.py @@ -1,3 +1,4 @@ +from __future__ import print_function from load_tests import ( nose2, data, diff --git a/test/test_data.py b/test/test_data.py index 39d7966..dfa0889 100644 --- a/test/test_data.py +++ b/test/test_data.py @@ -1,3 +1,4 @@ +from __future__ import print_function from load_tests import ( np, sp, diff --git a/test/test_exact.py b/test/test_exact.py index 43216ac..80d84f0 100644 --- a/test/test_exact.py +++ b/test/test_exact.py @@ -1,3 +1,4 @@ +from __future__ import print_function from load_tests import ( graphtools, np, @@ -83,6 +84,15 @@ def test_duplicate_data(): thresh=0) +@warns(UserWarning) +def test_k_too_large(): + build_graph(data, + n_pca=20, + decay=10, + knn=len(data) + 1, + thresh=0) + + ##################################################### # Check kernel ##################################################### diff --git a/test/test_knn.py b/test/test_knn.py index c359ede..7d15b0d 100644 --- a/test/test_knn.py +++ b/test/test_knn.py @@ -1,3 +1,4 @@ +from __future__ import print_function from load_tests import ( graphtools, np, diff --git a/test/test_landmark.py b/test/test_landmark.py index 57903ce..da0fbbc 100644 --- a/test/test_landmark.py +++ b/test/test_landmark.py @@ -1,3 +1,4 @@ +from __future__ import print_function from load_tests import ( graphtools, np, @@ -43,7 +44,7 @@ def test_landmark_exact_graph(): assert(isinstance(G, graphtools.graphs.TraditionalGraph)) assert(isinstance(G, graphtools.graphs.LandmarkGraph)) assert(G.transitions.shape == (data.shape[0], n_landmark)) - assert(G.clusters.shape == data.shape[0]) + assert(G.clusters.shape == (data.shape[0],)) assert(len(np.unique(G.clusters)) <= n_landmark) signal = np.random.normal(0, 1, [n_landmark, 10]) interpolated_signal = G.interpolate(signal) @@ -72,7 +73,7 @@ def test_landmark_mnn_graph(): thresh=1e-5, n_pca=None, decay=10, knn=5, random_state=42, sample_idx=sample_idx) - assert(G.clusters.shape == data.shape[0]) + assert(G.clusters.shape == (X.shape[0],)) assert(G.landmark_op.shape == (n_landmark, n_landmark)) assert(isinstance(G, graphtools.graphs.MNNGraph)) assert(isinstance(G, graphtools.graphs.LandmarkGraph)) diff --git a/test/test_mnn.py b/test/test_mnn.py index 9bf284a..be78437 100644 --- a/test/test_mnn.py +++ b/test/test_mnn.py @@ -1,3 +1,4 @@ +from __future__ import print_function from load_tests import ( graphtools, np,