From a54cd4b5b1649b626b6ad991197b340010f6f9e3 Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Wed, 6 Feb 2019 10:52:49 -0500
Subject: [PATCH] deprecate adaptive_k and matrix theta, fix tests

---
 graphtools/base.py   |  28 +++++++++++
 graphtools/graphs.py | 112 +++++--------------------------------------
 test/test_mnn.py     |  83 +++++++++++++++-----------------
 3 files changed, 79 insertions(+), 144 deletions(-)

diff --git a/graphtools/base.py b/graphtools/base.py
index 55d72c5..2dc8736 100644
--- a/graphtools/base.py
+++ b/graphtools/base.py
@@ -65,6 +65,9 @@ def _get_param_names(cls):
         return parameters
 
     def set_params(self, **kwargs):
+        # for k in kwargs:
+        #     raise TypeError("set_params() got an unexpected "
+        #                     "keyword argument '{}'".format(k))
         return self
@@ -866,3 +869,28 @@ def interpolate(self, transform, transitions=None, Y=None):
             transitions = self.extend_to_data(Y)
         Y_transform = transitions.dot(transform)
         return Y_transform
+
+    def set_params(self, **params):
+        """Set parameters on this object
+
+        Safe setter method - attributes should not be modified directly as some
+        changes are not valid.
+        Valid parameters:
+        - n_jobs
+        - verbose
+
+        Parameters
+        ----------
+        params : key-value pairs of parameter name and new values
+
+        Returns
+        -------
+        self
+        """
+        if 'n_jobs' in params:
+            self.n_jobs = params['n_jobs']
+        if 'verbose' in params:
+            self.verbose = params['verbose']
+            tasklogger.set_level(self.verbose)
+        super().set_params(**params)
+        return self
diff --git a/graphtools/graphs.py b/graphtools/graphs.py
index 29653a3..298bc7e 100644
--- a/graphtools/graphs.py
+++ b/graphtools/graphs.py
@@ -2,12 +2,12 @@
 from builtins import super
 import numpy as np
 from sklearn.neighbors import NearestNeighbors
-from scipy.spatial.distance import pdist, cdist
-from scipy.spatial.distance import squareform
 from sklearn.utils.extmath import randomized_svd
 from sklearn.preprocessing import normalize
 from sklearn.cluster import MiniBatchKMeans
 from sklearn.utils.graph import graph_shortest_path
+from scipy.spatial.distance import pdist, cdist
+from scipy.spatial.distance import squareform
 from scipy import sparse
 import numbers
 import warnings
@@ -1038,8 +1038,8 @@ class MNNGraph(DataGraph):
 
     def __init__(self, data, sample_idx,
                  knn=5, beta=1, n_pca=None,
-                 adaptive_k=None,
                  decay=None,
+                 adaptive_k=None,
                  bandwidth=None,
                  distance='euclidean',
                  thresh=1e-4,
@@ -1049,14 +1049,12 @@ def __init__(self, data, sample_idx,
         self.sample_idx = sample_idx
         self.samples, self.n_cells = np.unique(
             self.sample_idx, return_counts=True)
-        self.adaptive_k = adaptive_k
         self.knn = knn
         self.decay = decay
         self.distance = distance
         self.bandwidth = bandwidth
         self.thresh = thresh
         self.n_jobs = n_jobs
-        self.weighted_knn = self._weight_knn()
 
         if sample_idx is None:
             raise ValueError("sample_idx must be given. For a graph without"
@@ -1068,78 +1066,25 @@ def __init__(self, data, sample_idx,
         elif len(self.samples) == 1:
             raise ValueError(
                 "sample_idx must contain more than one unique value")
+        if adaptive_k is not None:
+            warnings.warn("`adaptive_k` has been deprecated. Using fixed knn.",
+                          DeprecationWarning)
         super().__init__(data, n_pca=n_pca, **kwargs)
 
     def _check_symmetrization(self, kernel_symm, theta):
         if kernel_symm == 'theta' and theta is not None and \
                 not isinstance(theta, numbers.Number):
-            # matrix theta
-            try:
-                theta.shape
-            except AttributeError:
-                raise ValueError("theta {} not recognized. "
" - "Expected a float between 0 and 1 " - "or a [n_batch,n_batch] matrix of " - "floats between 0 and 1".format(theta)) - if not np.shape(theta) == (len(self.samples), - len(self.samples)): - raise ValueError( - "Matrix theta must be of shape " - "({}), got ({})".format( - (len(self.samples), - len(self.samples)), theta.shape)) - elif np.max(theta) > 1 or np.min(theta) < 0: - raise ValueError( - "Values in matrix theta must be between" - " 0 and 1, got values between {} and {}".format( - np.max(theta), np.min(theta))) - elif np.any(theta != theta.T): - raise ValueError("theta must be a symmetric matrix") + raise TypeError("Expected `theta` as a float. " + "Got {}.".format(type(theta))) else: super()._check_symmetrization(kernel_symm, theta) - def _weight_knn(self, sample_size=None): - """Select adaptive values of knn - - Parameters - ---------- - - sample_size : `int` or `None` - Number of cells in the sample in question. Used only for - out-of-sample extension. If `None`, calculates within-sample - knn values. - - Returns - ------- - - knn : array-like or `int`, weighted knn values - """ - if sample_size is None: - # calculate within sample knn values - sample_size = self.n_cells - if self.adaptive_k == 'min': - # the smallest sample has k - knn_weight = self.n_cells / np.min(self.n_cells) - elif self.adaptive_k == 'mean': - # the average sample has k - knn_weight = self.n_cells / np.mean(self.n_cells) - elif self.adaptive_k == 'sqrt': - # the samples are sqrt'd first, then smallest has k - knn_weight = np.sqrt(self.n_cells / np.min(self.n_cells)) - elif self.adaptive_k is None: - knn_weight = np.repeat(1, len(self.n_cells)) - weighted_knn = np.round(self.knn * knn_weight).astype(np.int32) - if len(weighted_knn) == 1: - weighted_knn = weighted_knn[0] - return weighted_knn - def get_params(self): """Get parameters from this object """ params = super().get_params() params.update({'beta': self.beta, - 'adaptive_k': self.adaptive_k, 'knn': self.knn, 'decay': self.decay, 'bandwidth': self.bandwidth, @@ -1176,9 +1121,6 @@ def set_params(self, **params): # mnn specific arguments if 'beta' in params and params['beta'] != self.beta: raise ValueError("Cannot update beta. Please create a new graph") - if 'adaptive_k' in params and params['adaptive_k'] != self.adaptive_k: - raise ValueError( - "Cannot update adaptive_k. 
Please create a new graph") # knn arguments knn_kernel_args = ['knn', 'decay', 'distance', 'thresh', 'bandwidth'] @@ -1216,12 +1158,12 @@ def build_kernel(self): tasklogger.log_debug("subgraph {}: sample {}, " "n = {}, knn = {}".format( i, idx, np.sum(self.sample_idx == idx), - self.weighted_knn[i])) + self.knn)) # select data for sample data = self.data_nu[self.sample_idx == idx] # build a kNN graph for cells within sample graph = Graph(data, n_pca=None, - knn=self.weighted_knn[i], + knn=self.knn, decay=self.decay, bandwidth=self.bandwidth, distance=self.distance, @@ -1229,6 +1171,7 @@ def build_kernel(self): verbose=self.verbose, random_state=self.random_state, n_jobs=self.n_jobs, + kernel_symm='+', initialize=True) self.subgraphs.append(graph) # append to list of subgraphs tasklogger.log_complete("subgraphs") @@ -1251,7 +1194,7 @@ def build_kernel(self): self.samples[j])) Kij = Y.build_kernel_to_data( X.data_nu, - knn=self.weighted_knn[i]) + knn=self.knn) between_batch_norm = np.array(np.sum(Kij, 1)).flatten() scale = np.minimum(1, within_batch_norm / between_batch_norm) * self.beta @@ -1267,37 +1210,6 @@ def build_kernel(self): tasklogger.log_complete("MNN kernel") return K - def symmetrize_kernel(self, K): - if self.kernel_symm == 'theta' and self.theta is not None and \ - not isinstance(self.theta, numbers.Number): - # matrix theta - # Theta can be a matrix with specific values transitions for - # each batch. This allows for technical replicates and - # experimental samples to be corrected simultaneously - tasklogger.log_debug("Using theta symmetrization. " - "Theta:\n{}".format(self.theta)) - for i, sample_i in enumerate(self.samples): - for j, sample_j in enumerate(self.samples): - if j < i: - continue - Kij = K[np.ix_(self.sample_idx == sample_i, - self.sample_idx == sample_j)] - Kji = K[np.ix_(self.sample_idx == sample_j, - self.sample_idx == sample_i)] - Kij_symm = self.theta[i, j] * \ - elementwise_minimum(Kij, Kji.T) + \ - (1 - self.theta[i, j]) * \ - elementwise_maximum(Kij, Kji.T) - K = set_submatrix(K, self.sample_idx == sample_i, - self.sample_idx == sample_j, Kij_symm) - if not i == j: - K = set_submatrix(K, self.sample_idx == sample_j, - self.sample_idx == sample_i, - Kij_symm.T) - else: - K = super().symmetrize_kernel(K) - return K - def build_kernel_to_data(self, Y, theta=None): """Build transition matrix from new data to the graph diff --git a/test/test_mnn.py b/test/test_mnn.py index 5344e25..3cbdb2c 100644 --- a/test/test_mnn.py +++ b/test/test_mnn.py @@ -51,8 +51,8 @@ def test_build_mnn_with_precomputed(): build_graph(data, n_pca=None, graphtype='mnn', precomputed='distance') -@raises(ValueError) -def test_mnn_with_square_theta_wrong_length(): +@raises(TypeError) +def test_mnn_with_matrix_theta(): n_sample = len(np.unique(digits['target'])) # square matrix theta of the wrong size build_graph( @@ -60,11 +60,11 @@ def test_mnn_with_square_theta_wrong_length(): decay=10, knn=5, random_state=42, sample_idx=digits['target'], kernel_symm='theta', - theta=np.tile(np.linspace(0, 1, n_sample - 1), - n_sample).reshape(n_sample - 1, n_sample)) + theta=np.tile(np.linspace(0, 1, n_sample), + n_sample).reshape(n_sample, n_sample)) -@raises(ValueError) +@raises(TypeError) def test_mnn_with_vector_theta(): n_sample = len(np.unique(digits['target'])) # vector theta @@ -86,7 +86,7 @@ def test_mnn_with_unbounded_theta(): theta=2) -@raises(ValueError) +@raises(TypeError) def test_mnn_with_string_theta(): build_graph( data, thresh=0, n_pca=20, @@ -135,6 +135,16 @@ def 
         kernel_symm='theta')
 
 
+@warns(DeprecationWarning)
+def test_mnn_adaptive_k():
+    build_graph(
+        data, thresh=0, n_pca=20,
+        decay=10, knn=5, random_state=42,
+        sample_idx=digits['target'],
+        kernel_symm='theta',
+        theta=0.9, adaptive_k='sqrt')
+
+
 def test_mnn_with_non_zero_indexed_sample_idx():
     X, sample_idx = generate_swiss_roll()
     G = build_graph(X, sample_idx=sample_idx,
@@ -171,14 +181,13 @@ def test_mnn_with_string_sample_idx():
 
 #####################################################
 # Check kernel
 #####################################################
 
-
-def test_mnn_graph_float_theta():
+def test_mnn_graph_no_decay():
     X, sample_idx = generate_swiss_roll()
     theta = 0.9
     k = 10
-    a = 20
+    a = None
     metric = 'euclidean'
-    beta = 0.5
+    beta = 0.2
     samples = np.unique(sample_idx)
 
     K = np.zeros((len(X), len(X)))
     K = pd.DataFrame(K)
     for si in samples:
         X_i = X[sample_idx == si]  # get observations in sample i
         for sj in samples:
+            batch_k = k + 1 if si == sj else k
             X_j = X[sample_idx == sj]  # get observation in sample j
             pdx_ij = cdist(X_i, X_j, metric=metric)  # pairwise distances
             kdx_ij = np.sort(pdx_ij, axis=1)  # get kNN
-            e_ij = kdx_ij[:, k]  # dist to kNN
-            pdxe_ij = pdx_ij / e_ij[:, np.newaxis]  # normalize
-            k_ij = np.exp(-1 * (pdxe_ij ** a))  # apply alpha-decaying kernel
+            e_ij = kdx_ij[:, batch_k - 1]  # dist to kNN
+            k_ij = np.where(pdx_ij <= e_ij[:, None], 1, 0)  # apply knn kernel
             if si == sj:
                 K.iloc[sample_idx == si, sample_idx == sj] = (
                     k_ij + k_ij.T) / 2
             else:
                 # fill out values in K for NN on diagonal
                 K.iloc[sample_idx == si, sample_idx == sj] = k_ij
+
     Kn = K.copy()
     for i in samples:
         curr_K = K.iloc[sample_idx == i, sample_idx == i]
         i_norm = norm(curr_K, 1, axis=1)
         for j in samples:
             if i == j:
                 continue
             else:
                 curr_K = K.iloc[sample_idx == i, sample_idx == j]
                 curr_norm = norm(curr_K, 1, axis=1)
-                scale = np.minimum(
-                    np.ones(len(curr_norm)), i_norm / curr_norm) * beta
-                Kn.iloc[sample_idx == i, sample_idx == j] = (
-                    curr_K.T * scale).T
+                scale = np.minimum(1, i_norm / curr_norm) * beta
+                Kn.iloc[sample_idx == i,
+                        sample_idx == j] = curr_K.values * scale[:, None]
     K = Kn
 
     W = np.array((theta * np.minimum(K, K.T)) +
                  ((1 - theta) * np.maximum(K, K.T)))
     np.fill_diagonal(W, 0)
     G = pygsp.graphs.Graph(W)
     G2 = graphtools.Graph(X, knn=k, decay=a, beta=beta,
                           kernel_symm='theta', theta=theta,
                           distance=metric, sample_idx=sample_idx,
                           use_pygsp=True)
     assert G.N == G2.N
     np.testing.assert_array_equal(G.dw, G2.dw)
     assert isinstance(G2, graphtools.graphs.MNNGraph)
 
 
-def test_mnn_graph_matrix_theta():
+def test_mnn_graph_decay():
     X, sample_idx = generate_swiss_roll()
-    bs = 0.8
-    theta = np.array([[1, bs],  # 0
-                      [bs,  1]])  # 3
+    theta = 0.9
     k = 10
     a = 20
     metric = 'euclidean'
-    beta = 0
+    beta = 0.2
     samples = np.unique(sample_idx)
 
     K = np.zeros((len(X), len(X)))
     K = pd.DataFrame(K)
     for si in samples:
         X_i = X[sample_idx == si]  # get observations in sample i
         for sj in samples:
+            batch_k = k if si == sj else k - 1
             X_j = X[sample_idx == sj]  # get observation in sample j
             pdx_ij = cdist(X_i, X_j, metric=metric)  # pairwise distances
             kdx_ij = np.sort(pdx_ij, axis=1)  # get kNN
-            e_ij = kdx_ij[:, k]  # dist to kNN
+            e_ij = kdx_ij[:, batch_k]  # dist to kNN
             pdxe_ij = pdx_ij / e_ij[:, np.newaxis]  # normalize
             k_ij = np.exp(-1 * (pdxe_ij ** a))  # apply alpha-decaying kernel
             if si == sj:
                 K.iloc[sample_idx == si, sample_idx == sj] = (
                     k_ij + k_ij.T) / 2
             else:
                 # fill out values in K for NN on diagonal
                 K.iloc[sample_idx == si, sample_idx == sj] = k_ij
+
     Kn = K.copy()
     for i in samples:
         curr_K = K.iloc[sample_idx == i, sample_idx == i]
         i_norm = norm(curr_K, 1, axis=1)
         for j in samples:
             if i == j:
                 continue
             else:
                 curr_K = K.iloc[sample_idx == i, sample_idx == j]
                 curr_norm = norm(curr_K, 1, axis=1)
-                scale = np.minimum(
-                    np.ones(len(curr_norm)), i_norm / curr_norm) * beta
-                Kn.iloc[sample_idx == i, sample_idx == j] = (
-                    curr_K.T * scale).T
+                scale = np.minimum(1, i_norm / curr_norm) * beta
+                Kn.iloc[sample_idx == i,
+                        sample_idx == j] = curr_K.values * scale[:, None]
     K = Kn
-
-    K = np.array(K)
-
-    matrix_theta = pd.DataFrame(np.zeros((len(sample_idx), len(sample_idx))))
-    for ix, si in enumerate(set(sample_idx)):
-        for jx, sj in enumerate(set(sample_idx)):
-            matrix_theta.iloc[sample_idx == si,
-                              sample_idx == sj] = theta[ix, jx]
-
-    W = np.array((matrix_theta * np.minimum(K, K.T)) +
-                 ((1 - matrix_theta) * np.maximum(K, K.T)))
+
+    W = np.array((theta * np.minimum(K, K.T)) +
+                 ((1 - theta) * np.maximum(K, K.T)))
     np.fill_diagonal(W, 0)
     G = pygsp.graphs.Graph(W)
     G2 = graphtools.Graph(X, knn=k, decay=a, beta=beta,
                           kernel_symm='theta', theta=theta,
                           distance=metric, sample_idx=sample_idx,
                           use_pygsp=True)
     assert G.N == G2.N
     np.testing.assert_array_equal(G.dw, G2.dw)
-    assert (G.W != G2.W).nnz == 0
-    assert (G2.W != G.W).sum() == 0
+    np.testing.assert_array_equal((G.W - G2.W).data, 0)
     assert isinstance(G2, graphtools.graphs.MNNGraph)
@@ -329,7 +327,6 @@ def test_set_params():
         'theta': 0.5,
         'anisotropy': 0,
         'beta': 1,
-        'adaptive_k': None,
         'knn': 3,
         'decay': 10,
         'bandwidth': None,
@@ -356,10 +353,8 @@ def test_set_params():
     assert_raises(ValueError, G.set_params, distance='manhattan')
     assert_raises(ValueError, G.set_params, thresh=1e-3)
     assert_raises(ValueError, G.set_params, beta=0.2)
-    assert_raises(ValueError, G.set_params, adaptive_k='min')
     G.set_params(knn=G.knn,
                  decay=G.decay,
                  thresh=G.thresh,
                  distance=G.distance,
-                 beta=G.beta,
-                 adaptive_k=G.adaptive_k)
+                 beta=G.beta)
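
Usage note (illustrative sketch, not part of the patch): after this change,
`theta` must be a single float shared by all batches (a matrix or any other
non-float raises TypeError), `adaptive_k` is accepted only to emit a
DeprecationWarning before falling back to the fixed `knn`, and `n_jobs` and
`verbose` can be updated in place through the `set_params` method added to the
base class in graphtools/base.py. The sketch below uses the public
`graphtools.Graph` factory as exercised in the tests; the data and parameter
values are made up.

    import numpy as np
    import graphtools

    # two synthetic batches of 3-D observations (illustrative data)
    X = np.random.normal(size=(200, 3))
    sample_idx = np.repeat([0, 1], 100)

    # scalar theta is now the only accepted MNN symmetrization weight
    G = graphtools.Graph(X, sample_idx=sample_idx, graphtype='mnn',
                         knn=5, decay=10, kernel_symm='theta', theta=0.9)

    # adaptive_k is still accepted but only warns:
    # DeprecationWarning: `adaptive_k` has been deprecated. Using fixed knn.
    G = graphtools.Graph(X, sample_idx=sample_idx, graphtype='mnn',
                         knn=5, decay=10, kernel_symm='theta', theta=0.9,
                         adaptive_k='sqrt')

    # the new base-class setter updates n_jobs and verbose in place
    G.set_params(n_jobs=2, verbose=0)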