From ba74c5f4e2963359bcab1f4374a69457b5456c94 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 21 Jun 2018 12:10:51 -0400 Subject: [PATCH 1/7] bump version --- graphtools/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphtools/version.py b/graphtools/version.py index 0a8da88..02a5c48 100644 --- a/graphtools/version.py +++ b/graphtools/version.py @@ -1 +1 @@ -__version__ = "0.1.6" +__version__ = "0.1.7rc" From e9e57c6b1de8e149b96f6b9d537478bd51ad8230 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 21 Jun 2018 15:10:51 -0400 Subject: [PATCH 2/7] replace len with shape[0] for sparse matrix compatibility --- graphtools/graphs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/graphtools/graphs.py b/graphtools/graphs.py index 818d552..a2d0268 100644 --- a/graphtools/graphs.py +++ b/graphtools/graphs.py @@ -244,7 +244,7 @@ def build_kernel_to_data(self, Y, knn=None): if len(update_idx) > 0: distances = [d for d in distances] indices = [i for i in indices] - while len(update_idx) > len(Y) // 10 and \ + while len(update_idx) > Y.shape[0] // 10 and \ search_knn < self.data_nu.shape[0] / 2: # increase the knn search search_knn = min(search_knn * 20, self.data_nu.shape[0]) @@ -829,9 +829,9 @@ def __init__(self, data, sample_idx, if sample_idx is None: raise ValueError("sample_idx must be given. For a graph without" " batch correction, use kNNGraph.") - elif len(sample_idx) != len(data): + elif len(sample_idx) != data.shape[0]: raise ValueError("sample_idx ({}) must be the same length as " - "data ({})".format(len(sample_idx), len(data))) + "data ({})".format(len(sample_idx), data.shape[0])) elif len(self.samples) == 1: raise ValueError( "sample_idx must contain more than one unique value") @@ -1092,7 +1092,7 @@ def build_kernel_to_data(self, Y, gamma=None): kernel_yx = [] # don't really need within Y kernel Y_graph = kNNGraph(Y, n_pca=None, knn=0, **(self.knn_args)) - y_knn = self._weight_knn(sample_size=len(Y)) + y_knn = self._weight_knn(sample_size=Y.shape[0]) for i, X in enumerate(self.subgraphs): kernel_xy.append(X.build_kernel_to_data( Y, knn=self.weighted_knn[i])) # kernel X -> Y From 5932b749931ceed3a92b0ac8d7eb2c1fb4d01000 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 21 Jun 2018 15:11:17 -0400 Subject: [PATCH 3/7] allow data to be sparse --- graphtools/base.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/graphtools/base.py b/graphtools/base.py index 33c532d..869435d 100644 --- a/graphtools/base.py +++ b/graphtools/base.py @@ -148,10 +148,13 @@ def _reduce_data(self): log_complete("PCA") return data_nu else: - data = self.data - if sparse.issparse(data): - data = data.toarray() - return data + data_nu = self.data + if sparse.issparse(data_nu) and not isinstance( + data_nu, (sparse.csr_matrix, + sparse.csc_matrix, + sparse.bsr_matrix)): + data_nu = data_nu.tocsr() + return data_nu def get_params(self): """Get parameters from this object From 7a1248c38c4847c0590ee689fc1e1bc4cd52809b Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 21 Jun 2018 15:11:56 -0400 Subject: [PATCH 4/7] make landmark mnn test fast --- test/test_landmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_landmark.py b/test/test_landmark.py index 18c5b0e..93926b0 100644 --- a/test/test_landmark.py +++ b/test/test_landmark.py @@ -54,7 +54,7 @@ def test_landmark_knn_graph(): def test_landmark_mnn_graph(): n_landmark = 500 # mnn graph - select_idx = np.random.choice([True, False], len(data), replace=True) + select_idx = np.random.choice(len(data), len(data) // 5, replace=False) G = build_graph(data[select_idx], n_landmark=n_landmark, thresh=1e-5, n_pca=20, decay=10, knn=5, random_state=42, From 357a235ca6732b0bda2b6c43985e08a107d6fa18 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 21 Jun 2018 15:12:05 -0400 Subject: [PATCH 5/7] test sparse input --- test/load_tests/__init__.py | 2 +- test/test_exact.py | 95 +++++++++++++++++++++++++++++++++++++ test/test_knn.py | 35 ++++++++++++++ 3 files changed, 131 insertions(+), 1 deletion(-) diff --git a/test/load_tests/__init__.py b/test/load_tests/__init__.py index 4ae5e10..cbeb8e1 100644 --- a/test/load_tests/__init__.py +++ b/test/load_tests/__init__.py @@ -1,4 +1,4 @@ -from sklearn.decomposition import PCA +from sklearn.decomposition import PCA, TruncatedSVD from sklearn import datasets from scipy.spatial.distance import pdist, cdist, squareform import pygsp diff --git a/test/test_exact.py b/test/test_exact.py index 6f9ed3b..5f86fbc 100644 --- a/test/test_exact.py +++ b/test/test_exact.py @@ -12,6 +12,7 @@ squareform, pdist, PCA, + TruncatedSVD ) ##################################################### @@ -186,6 +187,100 @@ def test_truncated_exact_graph(): assert(isinstance(G2, graphtools.graphs.TraditionalGraph)) +def test_truncated_exact_graph_sparse(): + k = 3 + a = 13 + n_pca = 20 + thresh = 1e-4 + data_small = data[np.random.choice( + len(data), len(data) // 2, replace=False)] + pca = TruncatedSVD(n_pca, + random_state=42).fit(data_small) + data_small_nu = pca.transform(data_small) + pdx = squareform(pdist(data_small_nu, metric='euclidean')) + knn_dist = np.partition(pdx, k, axis=1)[:, :k] + epsilon = np.max(knn_dist, axis=1) + weighted_pdx = (pdx.T / epsilon).T + K = np.exp(-1 * weighted_pdx**a) + K[K < thresh] = 0 + W = K + K.T + W = np.divide(W, 2) + np.fill_diagonal(W, 0) + G = pygsp.graphs.Graph(W) + G2 = build_graph(sp.coo_matrix(data_small), thresh=thresh, + graphtype='exact', + n_pca=n_pca, + decay=a, knn=k, random_state=42, + use_pygsp=True) + assert(G.N == G2.N) + np.testing.assert_allclose(G2.W.toarray(), G.W.toarray()) + assert(isinstance(G2, graphtools.graphs.TraditionalGraph)) + G2 = build_graph(sp.bsr_matrix(pdx), n_pca=None, precomputed='distance', + thresh=thresh, + decay=a, knn=k, random_state=42, use_pygsp=True) + assert(G.N == G2.N) + assert(np.all(G.d == G2.d)) + assert((G.W != G2.W).nnz == 0) + assert((G2.W != G.W).sum() == 0) + assert(isinstance(G2, graphtools.graphs.TraditionalGraph)) + G2 = build_graph(sp.lil_matrix(K), n_pca=None, + precomputed='affinity', + thresh=thresh, + random_state=42, use_pygsp=True) + assert(G.N == G2.N) + assert(np.all(G.d == G2.d)) + assert((G.W != G2.W).nnz == 0) + assert((G2.W != G.W).sum() == 0) + assert(isinstance(G2, graphtools.graphs.TraditionalGraph)) + G2 = build_graph(sp.dok_matrix(W), n_pca=None, + precomputed='adjacency', + random_state=42, use_pygsp=True) + assert(G.N == G2.N) + assert(np.all(G.d == G2.d)) + assert((G.W != G2.W).nnz == 0) + assert((G2.W != G.W).sum() == 0) + assert(isinstance(G2, graphtools.graphs.TraditionalGraph)) + + +def test_truncated_exact_graph_no_pca(): + k = 3 + a = 13 + n_pca = None + thresh = 1e-4 + data_small = data[np.random.choice( + len(data), len(data) // 10, replace=False)] + pdx = squareform(pdist(data_small, metric='euclidean')) + knn_dist = np.partition(pdx, k, axis=1)[:, :k] + epsilon = np.max(knn_dist, axis=1) + weighted_pdx = (pdx.T / epsilon).T + K = np.exp(-1 * weighted_pdx**a) + K[K < thresh] = 0 + W = K + K.T + W = np.divide(W, 2) + np.fill_diagonal(W, 0) + G = pygsp.graphs.Graph(W) + G2 = build_graph(data_small, thresh=thresh, + graphtype='exact', + n_pca=n_pca, + decay=a, knn=k, random_state=42, + use_pygsp=True) + assert(G.N == G2.N) + assert(np.all(G.d == G2.d)) + assert((G.W != G2.W).nnz == 0) + assert((G2.W != G.W).sum() == 0) + assert(isinstance(G2, graphtools.graphs.TraditionalGraph)) + G2 = build_graph(sp.csr_matrix(data_small), thresh=thresh, + graphtype='exact', + n_pca=n_pca, + decay=a, knn=k, random_state=42, + use_pygsp=True) + assert(G.N == G2.N) + assert(np.all(G.d == G2.d)) + assert((G.W != G2.W).nnz == 0) + assert((G2.W != G.W).sum() == 0) + assert(isinstance(G2, graphtools.graphs.TraditionalGraph)) + + ##################################################### # Check interpolation ##################################################### diff --git a/test/test_knn.py b/test/test_knn.py index d221934..09b90a4 100644 --- a/test/test_knn.py +++ b/test/test_knn.py @@ -1,16 +1,19 @@ from load_tests import ( graphtools, np, + sp, pygsp, nose2, data, datasets, build_graph, assert_raises, + warns, raises, squareform, pdist, PCA, + TruncatedSVD, ) @@ -66,6 +69,31 @@ def test_knn_graph(): assert(isinstance(G2, graphtools.graphs.kNNGraph)) +def test_knn_graph_sparse(): + k = 3 + n_pca = 20 + pca = TruncatedSVD(n_pca, random_state=42).fit(data) + data_nu = pca.transform(data) + pdx = squareform(pdist(data_nu, metric='euclidean')) + knn_dist = np.partition(pdx, k, axis=1)[:, :k] + epsilon = np.max(knn_dist, axis=1) + K = np.empty_like(pdx) + for i in range(len(pdx)): + K[i, pdx[i, :] <= epsilon[i]] = 1 + K[i, pdx[i, :] > epsilon[i]] = 0 + + K = K + K.T + W = np.divide(K, 2) + np.fill_diagonal(W, 0) + G = pygsp.graphs.Graph(W) + G2 = build_graph(sp.coo_matrix(data), n_pca=n_pca, + decay=None, knn=k, random_state=42, + use_pygsp=True) + assert(G.N == G2.N) + np.testing.assert_allclose(G2.W.toarray(), G.W.toarray()) + assert(isinstance(G2, graphtools.graphs.kNNGraph)) + + def test_sparse_alpha_knn_graph(): data = datasets.make_swiss_roll()[0] k = 5 @@ -88,6 +116,13 @@ def test_sparse_alpha_knn_graph(): assert(isinstance(G2, graphtools.graphs.kNNGraph)) +@warns(UserWarning) +def test_knn_graph_sparse_no_pca(): + build_graph(sp.coo_matrix(data), n_pca=None, # n_pca, + decay=10, knn=3, thresh=1e-4, + random_state=42, use_pygsp=True) + + ##################################################### # Check interpolation ##################################################### From a86af43be2c4b33f813cab2f18f506851eb7ac98 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 21 Jun 2018 15:12:37 -0400 Subject: [PATCH 6/7] bump version --- graphtools/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphtools/version.py b/graphtools/version.py index 02a5c48..f1380ee 100644 --- a/graphtools/version.py +++ b/graphtools/version.py @@ -1 +1 @@ -__version__ = "0.1.7rc" +__version__ = "0.1.7" From d23c72f05adf7b4c4e0c5bb3882e5e7c3b33a91d Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 21 Jun 2018 15:18:05 -0400 Subject: [PATCH 7/7] make faster mnn landmark test pass --- test/test_landmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_landmark.py b/test/test_landmark.py index 93926b0..896388d 100644 --- a/test/test_landmark.py +++ b/test/test_landmark.py @@ -52,7 +52,7 @@ def test_landmark_knn_graph(): def test_landmark_mnn_graph(): - n_landmark = 500 + n_landmark = 150 # mnn graph select_idx = np.random.choice(len(data), len(data) // 5, replace=False) G = build_graph(data[select_idx], n_landmark=n_landmark,