diff --git a/graphtools/api.py b/graphtools/api.py
index b9a4b1b..27b9a79 100644
--- a/graphtools/api.py
+++ b/graphtools/api.py
@@ -255,7 +255,7 @@ def Graph(
         else:
             msg = msg + " and PyGSP inheritance"
 
-    _logger.debug(msg)
+    _logger.log_debug(msg)
 
     class_names = [p.__name__.replace("Graph", "") for p in parent_classes]
     try:
@@ -273,7 +273,7 @@ def Graph(
         pass
 
     # build graph and return
-    _logger.debug(
+    _logger.log_debug(
         "Initializing {} with arguments {}".format(
             parent_classes,
             ", ".join(
diff --git a/graphtools/base.py b/graphtools/base.py
index 4789b8a..59f166a 100644
--- a/graphtools/base.py
+++ b/graphtools/base.py
@@ -1,13 +1,17 @@
 from future.utils import with_metaclass
 from builtins import super
 from copy import copy as shallow_copy
+from dataclasses import dataclass
+from mock import patch
+from functools import partial
 import numpy as np
 import abc
 import pygsp
 from inspect import signature
+import sklearn
 from sklearn.decomposition import PCA, TruncatedSVD
 from sklearn.preprocessing import normalize
-from sklearn.utils.graph import graph_shortest_path
+from scipy.sparse.csgraph import shortest_path as graph_shortest_path
 from scipy import sparse
 import warnings
 import numbers
@@ -20,6 +24,115 @@ _logger = tasklogger.get_tasklogger("graphtools")
 
 
+@dataclass
+class PCAParameters(object):
+    """Data class that stores PCA parameters.
+    Parameters
+    ----------
+    n_oversamples : int, default=10
+        Additional number of random vectors to sample the range of M so as
+        to ensure proper conditioning. The total number of random vectors
+        used to find the range of M is n_components + n_oversamples. Smaller
+        number can improve speed but can negatively impact the quality of
+        approximation of singular vectors and singular values. Users might wish
+        to increase this parameter up to `2*k - n_components` where k is the
+        effective rank, for large matrices, noisy problems, matrices with
+        slowly decaying spectrums, or to increase precision accuracy.
+    n_iter : int or 'auto', default='auto'
+        Number of power iterations. It can be used to deal with very noisy
+        problems. When 'auto', it is set to 4, unless `n_components` is small
+        (< .1 * min(X.shape)) in which case `n_iter` is set to 7.
+        This improves precision with few components. Note that in general
+        users should rather increase `n_oversamples` before increasing `n_iter`
+        as the principle of the randomized method is to avoid usage of these
+        more costly power iterations steps. When `n_components` is equal
+        or greater to the effective matrix rank and the spectrum does not
+        present a slow decay, `n_iter=0` or `1` should even work fine in theory
+    power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto'
+        Whether the power iterations are normalized with step-by-step
+        QR factorization (the slowest but most accurate), 'none'
+        (the fastest but numerically unstable when `n_iter` is large, e.g.
+        typically 5 or larger), or 'LU' factorization (numerically stable
+        but can lose slightly in accuracy). The 'auto' mode applies no
+        normalization if `n_iter` <= 2 and switches to LU otherwise.
+    See documentation for sklearn.utils.extmath.randomized_svd
+    """
+
+    _valid = {}
+    _valid["n_oversamples"] = {int: lambda x: x > 0}
+    _valid["n_iter"] = {str: lambda x: x in ["auto"], int: lambda x: x >= 0}
+    _valid["power_iteration_normalizer"] = {
+        str: lambda x: x.lower() in ["auto", "qr", "lu", "none"]
+    }
+    _valid_str = {}
+    _valid_str["n_oversamples"] = ["int > 0"]
+    _valid_str["n_iter"] = ["auto", "int >= 0"]
+    _valid_str["power_iteration_normalizer"] = ["auto", "QR", "LU", "none"]
+
+    n_oversamples: int = 10
+    n_iter: int = "auto"
+    power_iteration_normalizer: str = "auto"
+
+    def validate(self):
+        validated = []
+        errs = []
+        valids = []
+        fields = list(self.__dataclass_fields__.items())
+        fields.sort(key=lambda x: x[0])
+        for field_name, field_def in fields:
+            attr = getattr(self, field_name)
+            validated.append(False)
+            for typ, typfun in self._valid[field_name].items():
+                if isinstance(attr, typ):
+                    validated[-1] = typfun(attr)
+            if not validated[-1]:
+                errs.append(field_name)
+        return all(validated), errs
+
+    def __post_init__(self):
+        validated, errs = self.validate()
+        errs = errs
+        if not validated:
+            errorstring = f"{errs} were invalid type or value. " f"Valid values are "
+            for err in errs:
+                errorstring += f"{self._valid_str[err]}, "
+            errorstring += "respectively."
+            raise ValueError(errorstring)
+
+
+##some monkey patching of randomized_svd...
+def randomized_svd_monkey(
+    M,
+    n_components,
+    *,
+    pca_params=PCAParameters(),
+    n_oversamples=10,
+    n_iter="auto",
+    power_iteration_normalizer="auto",
+    transpose="auto",
+    flip_sign=True,
+    random_state="warn",
+):
+    if sklearn.__version__ > "1.0.1":
+        warnings.warn(
+            "Graphtools is using a patched version of randomized_svd "
+            "designed for sklearn version 1.0.1. The current version "
+            "of sklearn is {}. Please alert the graphtools authors to "
+            "update the patch.".format(sklearn.__version__),
+            RuntimeWarning,
+        )
+    return sklearn.utils.extmath.randomized_svd(
+        M,
+        n_components=n_components,
+        n_oversamples=pca_params.n_oversamples,
+        n_iter=pca_params.n_iter,
+        power_iteration_normalizer=pca_params.power_iteration_normalizer,
+        transpose=transpose,
+        flip_sign=flip_sign,
+        random_state=random_state,
+    )
+
+
 class Base(object):
     """Class that deals with key-word arguments but is otherwise
     just an object.
@@ -90,7 +203,9 @@ class Data(Base):
         s_max * eps * max(n_samples, n_features)
         where s_max is the maximum singular value of the data matrix
         and eps is numerical precision. [press2007]_.
-
+    pca_params : `PCAParameters`, optional (default: `PCAParameters()`)
+        Parameters to use for randomized SVD and PCA. See documentation
+        for graphtools.base.PCAParameters.
     random_state : `int` or `None`, optional (default: `None`)
         Random state for random PCA
 
@@ -109,11 +224,19 @@ class Data(Base):
     """
 
     def __init__(
-        self, data, n_pca=None, rank_threshold=None, random_state=None, **kwargs
+        self,
+        data,
+        n_pca=None,
+        rank_threshold=None,
+        pca_params=PCAParameters(),
+        random_state=None,
+        **kwargs,
     ):
         self._check_data(data)
-        n_pca, rank_threshold = self._parse_n_pca_threshold(data, n_pca, rank_threshold)
+        n_pca, rank_threshold, pca_params = self._parse_pca_parameters(
+            data, n_pca, rank_threshold, pca_params
+        )
 
         if utils.is_SparseDataFrame(data):
             data = data.to_coo()
@@ -130,11 +253,12 @@ def __init__(
         self.data = data
         self.n_pca = n_pca
         self.rank_threshold = rank_threshold
+        self.pca_params = pca_params
         self.random_state = random_state
         self.data_nu = self._reduce_data()
         super().__init__(**kwargs)
 
-    def _parse_n_pca_threshold(self, data, n_pca, rank_threshold):
+    def _parse_pca_parameters(self, data, n_pca, rank_threshold, pca_params):
         if isinstance(n_pca, str):
             n_pca = n_pca.lower()
             if n_pca != "auto":
@@ -173,7 +297,7 @@ def _parse_n_pca_threshold(self, data, n_pca, rank_threshold):
             n_pca = None
         elif n_pca is True:  # notify that we're going to estimate rank.
             n_pca = "auto"
-            _logger.info(
+            _logger.log_info(
                 "Estimating n_pca from matrix rank. "
                 "Supply an integer n_pca "
                 "for fixed amount."
             )
@@ -207,7 +331,12 @@ def _parse_n_pca_threshold(self, data, n_pca, rank_threshold):
                     raise ValueError(
                         "rank_threshold must be positive float or 'auto'. "
                     )
-        return n_pca, rank_threshold
+        if pca_params is None:
+            pca_params = PCAParameters()
+        else:
+            if not isinstance(pca_params, PCAParameters):
+                raise ValueError("pca_params must be an instance of PCAParameters.")
+        return n_pca, rank_threshold, pca_params
 
     def _check_data(self, data):
         if len(data.shape) != 2:
@@ -237,7 +366,10 @@ def _reduce_data(self):
         if self.n_pca is not None and (
             self.n_pca == "auto" or self.n_pca < self.data.shape[1]
         ):
-            with _logger.task("PCA"):
+            with _logger.log_task("PCA"):
+                randomized_pca = partial(
+                    randomized_svd_monkey, pca_params=self.pca_params
+                )
                 n_pca = self.data.shape[1] - 1 if self.n_pca == "auto" else self.n_pca
                 if sparse.issparse(self.data):
                     if (
@@ -251,7 +383,14 @@ def _reduce_data(self):
                     self.data_pca = PCA(
                         n_pca, svd_solver="randomized", random_state=self.random_state
                     )
-                self.data_pca.fit(self.data)
+                with patch(
+                    "sklearn.decomposition._pca.randomized_svd", new=randomized_pca
+                ) as foo, patch(
+                    "sklearn.decomposition._truncated_svd.randomized_svd",
+                    new=randomized_pca,
+                ) as bar:
+                    self.data_pca.fit(self.data)
+
                 if self.n_pca == "auto":
                     s = self.data_pca.singular_values_
                     smax = s.max()
@@ -269,7 +408,7 @@ def _reduce_data(self):
                             "maximum singular value {} "
                             "for the data matrix".format(threshold, smax)
                         )
-                    _logger.info(
+                    _logger.log_info(
                         "Using rank estimate of {} as n_pca".format(self.n_pca)
                     )
                     # reset the sklearn operator
@@ -292,8 +431,7 @@ def _reduce_data(self):
         return data_nu
 
     def get_params(self):
-        """Get parameters from this object
-        """
+        """Get parameters from this object"""
         return {"n_pca": self.n_pca, "random_state": self.random_state}
 
     def set_params(self, **params):
@@ -469,7 +607,7 @@ def __init__(
         anisotropy=0,
         gamma=None,
         initialize=True,
-        **kwargs
+        **kwargs,
     ):
         if gamma is not None:
             warnings.warn(
@@ -498,10 +636,10 @@ def __init__(
         self.anisotropy = anisotropy
 
         if initialize:
-            _logger.debug("Initializing kernel...")
+            _logger.log_debug("Initializing kernel...")
             self.K
         else:
-            _logger.debug("Not initializing kernel.")
+            _logger.log_debug("Not initializing kernel.")
         super().__init__(**kwargs)
 
     def _check_symmetrization(self, kernel_symm, theta):
@@ -556,18 +694,20 @@ def _build_kernel(self):
     def symmetrize_kernel(self, K):
         # symmetrize
         if self.kernel_symm == "+":
-            _logger.debug("Using addition symmetrization.")
+            _logger.log_debug("Using addition symmetrization.")
             K = (K + K.T) / 2
         elif self.kernel_symm == "*":
-            _logger.debug("Using multiplication symmetrization.")
+            _logger.log_debug("Using multiplication symmetrization.")
             K = K.multiply(K.T)
         elif self.kernel_symm == "mnn":
-            _logger.debug("Using mnn symmetrization (theta = {}).".format(self.theta))
+            _logger.log_debug(
+                "Using mnn symmetrization (theta = {}).".format(self.theta)
+            )
             K = self.theta * matrix.elementwise_minimum(K, K.T) + (
                 1 - self.theta
             ) * matrix.elementwise_maximum(K, K.T)
         elif self.kernel_symm is None:
-            _logger.debug("Using no symmetrization.")
+            _logger.log_debug("Using no symmetrization.")
             pass
         else:
             raise NotImplementedError
@@ -589,8 +729,7 @@ def apply_anisotropy(self, K):
         return K
 
     def get_params(self):
-        """Get parameters from this object
-        """
+        """Get parameters from this object"""
        return {
            "kernel_symm": self.kernel_symm,
            "theta": self.theta,
@@ -697,8 +836,7 @@ def diff_aff(self):
 
     @property
     def diff_op(self):
-        """Synonym for P
-        """
+        """Synonym for P"""
         return self.P
 
     @property
@@ -719,8 +857,7 @@ def K(self):
 
     @property
     def kernel(self):
-        """Synonym for K
-        """
+        """Synonym for K"""
         return self.K
 
     @property
@@ -850,10 +987,10 @@ def _check_shortest_path_distance(self, distance):
     def _default_shortest_path_distance(self):
         if not self.weighted:
             distance = "data"
-            _logger.info("Using ambient data distances.")
+            _logger.log_info("Using ambient data distances.")
         else:
             distance = "affinity"
-            _logger.info("Using negative log affinity distances.")
+            _logger.log_info("Using negative log affinity distances.")
         return distance
 
     def shortest_path(self, method="auto", distance=None):
@@ -895,8 +1032,12 @@ def shortest_path(self, method="auto", distance=None):
                 np.sum((self.data_nu[D.row] - self.data_nu[D.col]) ** 2, axis=1)
             )
         elif distance == "affinity":
-            D = sparse.csr_matrix(self.K)
-            D.data = -1 * np.log(D.data)
+            # D = sparse.csr_matrix(self.K)
+            # D.data = -1 * np.log(D.data)
+            D = -1 * np.where(
+                self.K != 0, np.log(np.where(self.K != 0, self.K, np.nan)), 0
+            )
+            # D = sparse.csr_matrix(D)
         else:
             raise ValueError(
                 "Expected `distance` in ['constant', 'data', 'affinity']. "
@@ -1019,8 +1160,7 @@ def __init__(self, data, verbose=True, n_jobs=1, **kwargs):
         super().__init__(data, **kwargs)
 
     def get_params(self):
-        """Get parameters from this object
-        """
+        """Get parameters from this object"""
         params = Data.get_params(self)
         params.update(BaseGraph.get_params(self))
         return params
diff --git a/graphtools/estimator.py b/graphtools/estimator.py
index 5cb130f..3244fbb 100644
--- a/graphtools/estimator.py
+++ b/graphtools/estimator.py
@@ -81,18 +81,18 @@ class GraphEstimator(object, metaclass=abc.ABCMeta):
     verbose : `int` or `boolean`, optional (default: 1)
         If `True` or `> 0`, print status messages
-    
+
     n_svd : int, optional (default: 100)
         number of singular vectors to compute for landmarking
-    
+
     thresh : float, optional (default: 1e-4)
         threshold below which to truncate kernel
-    
+
     kwargs : additional arguments for graphtools.Graph
-    
+
     Attributes
     ----------
-    
+
     graph : graphtools.Graph
     """
@@ -248,13 +248,13 @@ def _set_graph_params(self, **params):
                 )
                 self.graph.set_params(**params)
             except ValueError as e:
-                _logger.debug("Reset graph due to {}".format(str(e)))
+                _logger.log_debug("Reset graph due to {}".format(str(e)))
                 self.graph = None
 
     @abc.abstractmethod
     def _reset_graph(self):
         """Trigger a reset of self.graph
-        
+
         Any downstream effects of resetting the graph should override this function
         """
         raise NotImplementedError
@@ -361,7 +361,7 @@ def _update_graph(self, X, precomputed, n_pca, n_landmark, **kwargs):
                 **(self.kwargs)
             )
             if self.graph is not None:
-                _logger.info("Using precomputed graph and diffusion operator...")
+                _logger.log_info("Using precomputed graph and diffusion operator...")
 
     def fit(self, X, **kwargs):
         """Computes the graph
@@ -384,13 +384,13 @@ def fit(self, X, **kwargs):
         X, n_pca, n_landmark, precomputed, update_graph = self._parse_input(X)
 
         if precomputed is None:
-            _logger.info(
+            _logger.log_info(
                 "Building graph on {} samples and {} features.".format(
                     X.shape[0], X.shape[1]
                 )
             )
         else:
-            _logger.info(
+            _logger.log_info(
                 "Building graph on precomputed {} matrix with {} samples.".format(
                     precomputed, X.shape[0]
                 )
             )
@@ -402,7 +402,7 @@ def fit(self, X, **kwargs):
         self.X = X
 
         if self.graph is None:
-            with _logger.task("graph and diffusion operator"):
+            with _logger.log_task("graph and diffusion operator"):
                 self.graph = api.Graph(
                     X,
                     n_pca=n_pca,
diff --git a/graphtools/graphs.py b/graphtools/graphs.py
index 2caa431..e6eda49 100644
--- a/graphtools/graphs.py
+++ b/graphtools/graphs.py
@@ -132,8 +132,7 @@ def __init__(
         super().__init__(data, n_pca=n_pca, **kwargs)
 
     def get_params(self):
-        """Get parameters from this object
-        """
+        """Get parameters from this object"""
         params = super().get_params()
         params.update(
             {
@@ -347,19 +346,19 @@ def build_kernel_to_data(
         Y = self._check_extension_shape(Y)
 
         if self.decay is None or self.thresh == 1:
-            with _logger.task("KNN search"):
+            with _logger.log_task("KNN search"):
                 # binary connectivity matrix
                 K = self.knn_tree.kneighbors_graph(
                     Y, n_neighbors=knn, mode="connectivity"
                 )
         else:
-            with _logger.task("KNN search"):
+            with _logger.log_task("KNN search"):
                 # sparse fast alpha decay
                 knn_tree = self.knn_tree
                 search_knn = min(knn * self.search_multiplier, knn_max)
                 distances, indices = knn_tree.kneighbors(Y, n_neighbors=search_knn)
                 self._check_duplicates(distances, indices)
-            with _logger.task("affinities"):
+            with _logger.log_task("affinities"):
                 if bandwidth is None:
                     bandwidth = distances[:, knn - 1]
@@ -370,7 +369,7 @@ def build_kernel_to_data(
                 radius = bandwidth * np.power(-1 * np.log(self.thresh), 1 / self.decay)
                 update_idx = np.argwhere(np.max(distances, axis=1) < radius).reshape(-1)
-                _logger.debug(
+                _logger.log_debug(
                     "search_knn = {}; {} remaining".format(search_knn, len(update_idx))
                 )
                 if len(update_idx) > 0:
@@ -399,7 +398,7 @@ def build_kernel_to_data(
                         else radius[i]
                     )
                 ]
-                _logger.debug(
+                _logger.log_debug(
                     "search_knn = {}; {} remaining".format(
                         search_knn, len(update_idx)
                     )
                 )
@@ -412,7 +411,7 @@ def build_kernel_to_data(
                 ).fit(self.data_nu)
                 if len(update_idx) > 0:
                     if search_knn == knn_max:
-                        _logger.debug(
+                        _logger.log_debug(
                             "knn search to knn_max ({}) on {}".format(
                                 knn_max, len(update_idx)
                             )
@@ -425,7 +424,7 @@ def build_kernel_to_data(
                             distances[idx] = dist_new[i]
                             indices[idx] = ind_new[i]
                     else:
-                        _logger.debug("radius search on {}".format(len(update_idx)))
+                        _logger.log_debug("radius search on {}".format(len(update_idx)))
                         # give up - radius search
                         dist_new, ind_new = knn_tree.radius_neighbors(
                             Y[update_idx, :],
@@ -524,8 +523,7 @@ def __init__(self, data, n_landmark=2000, n_svd=100, **kwargs):
         super().__init__(data, **kwargs)
 
     def get_params(self):
-        """Get parameters from this object
-        """
+        """Get parameters from this object"""
         params = super().get_params()
         params.update({"n_landmark": self.n_landmark, "n_pca": self.n_pca})
         return params
@@ -653,16 +651,16 @@ def build_landmark_op(self):
         probabilities between cluster centers by using transition probabilities
         between samples assigned to each cluster.
         """
-        with _logger.task("landmark operator"):
+        with _logger.log_task("landmark operator"):
             is_sparse = sparse.issparse(self.kernel)
             # spectral clustering
-            with _logger.task("SVD"):
+            with _logger.log_task("SVD"):
                 _, _, VT = randomized_svd(
                     self.diff_aff,
                     n_components=self.n_svd,
                     random_state=self.random_state,
                 )
-            with _logger.task("KMeans"):
+            with _logger.log_task("KMeans"):
                 kmeans = MiniBatchKMeans(
                     self.n_landmark,
                     init_size=3 * self.n_landmark,
                     random_state=self.random_state,
                 )
                 self._clusters = kmeans.fit_predict(self.diff_op.dot(VT.T))
-
             # transition matrices
             pmn = self._landmarks_to_data()
@@ -886,8 +883,7 @@ def __init__(
         super().__init__(data, n_pca=n_pca, **kwargs)
 
     def get_params(self):
-        """Get parameters from this object
-        """
+        """Get parameters from this object"""
         params = super().get_params()
         params.update(
             {
@@ -985,7 +981,7 @@ def build_kernel(self):
             K = K.tolil()
             K = matrix.set_diagonal(K, 1)
         else:
-            with _logger.task("affinities"):
+            with _logger.log_task("affinities"):
                 if sparse.issparse(self.data_nu):
                     self.data_nu = self.data_nu.toarray()
                 if self.precomputed == "distance":
@@ -1091,7 +1087,7 @@ def build_kernel_to_data(self, Y, knn=None, bandwidth=None, bandwidth_scale=None
         if self.precomputed is not None:
             raise ValueError("Cannot extend kernel on precomputed graph")
         else:
-            with _logger.task("affinities"):
+            with _logger.log_task("affinities"):
                 Y = self._check_extension_shape(Y)
                 pdx = cdist(Y, self.data_nu, metric=self.distance)
                 if bandwidth is None:
@@ -1128,7 +1124,7 @@ def _check_shortest_path_distance(self, distance):
     def _default_shortest_path_distance(self):
         if self.precomputed is not None and not self.weighted:
             distance = "constant"
-            _logger.info("Using constant distances.")
+            _logger.log_info("Using constant distances.")
         else:
             distance = super()._default_shortest_path_distance()
         return distance
@@ -1222,8 +1218,7 @@ def _check_symmetrization(self, kernel_symm, theta):
             super()._check_symmetrization(kernel_symm, theta)
 
     def get_params(self):
-        """Get parameters from this object
-        """
+        """Get parameters from this object"""
         params = super().get_params()
         params.update(
             {
@@ -1296,13 +1291,13 @@ def build_kernel(self):
         symmetric matrix with ones down the diagonal with
         no non-negative entries.
         """
-        with _logger.task("subgraphs"):
+        with _logger.log_task("subgraphs"):
             self.subgraphs = []
             from .api import Graph
 
             # iterate through sample ids
             for i, idx in enumerate(self.samples):
-                _logger.debug(
+                _logger.log_debug(
                     "subgraph {}: sample {}, "
                     "n = {}, knn = {}".format(
                         i, idx, np.sum(self.sample_idx == idx), self.knn
@@ -1327,7 +1322,7 @@ def build_kernel(self):
                 )
                 self.subgraphs.append(graph)  # append to list of subgraphs
 
-        with _logger.task("MNN kernel"):
+        with _logger.log_task("MNN kernel"):
             if self.thresh > 0 or self.decay is None:
                 K = sparse.lil_matrix((self.data_nu.shape[0], self.data_nu.shape[0]))
             else:
@@ -1343,7 +1338,7 @@ def build_kernel(self):
                 for j, Y in enumerate(self.subgraphs):
                     if i == j:
                         continue
-                    with _logger.task(
+                    with _logger.log_task(
                         "kernel from sample {} to {}".format(
                             self.samples[i], self.samples[j]
                         )
diff --git a/requirements.txt b/requirements.txt
index c31163d..96f623d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ scikit-learn>=0.20.0
 future
 tasklogger>=1.0
 Deprecated
+mock
diff --git a/setup.py b/setup.py
index 0586273..792709a 100644
--- a/setup.py
+++ b/setup.py
@@ -10,6 +10,7 @@
     "future",
     "tasklogger>=1.0",
     "Deprecated",
+    "mock",
 ]
 
 test_requires = [
@@ -44,7 +45,9 @@
     description="graphtools",
     author="Scott Gigante, Daniel Burkhardt, and Jay Stanley, Yale University",
     author_email="scott.gigante@yale.edu",
-    packages=["graphtools",],
+    packages=[
+        "graphtools",
+    ],
     license="GNU General Public License Version 2",
     install_requires=install_requires,
     extras_require={"test": test_requires, "doc": doc_requires},
@@ -54,7 +57,12 @@
     download_url="https://github.com/KrishnaswamyLab/graphtools/archive/v{}.tar.gz".format(
         version
     ),
-    keywords=["graphs", "big-data", "signal processing", "manifold-learning",],
+    keywords=[
+        "graphs",
+        "big-data",
+        "signal processing",
+        "manifold-learning",
+    ],
     classifiers=[
         "Development Status :: 4 - Beta",
         "Environment :: Console",
diff --git a/test/load_tests/__init__.py b/test/load_tests/__init__.py
index 1c6213f..8b0d25b 100644
--- a/test/load_tests/__init__.py
+++ b/test/load_tests/__init__.py
@@ -1,4 +1,6 @@
 from sklearn.decomposition import PCA, TruncatedSVD
+from sklearn.utils.extmath import randomized_svd
+from graphtools.base import PCAParameters, Data
 from sklearn import datasets
 from scipy.spatial.distance import pdist, cdist, squareform
 import pygsp
@@ -7,7 +9,7 @@
 import scipy.sparse as sp
 import warnings
 import pandas as pd
-
+import sklearn
 import nose2
 from nose.tools import assert_raises_regex, assert_warns_regex
 import re
diff --git a/test/test_data.py b/test/test_data.py
index 24f6dd2..3b24a03 100644
--- a/test/test_data.py
+++ b/test/test_data.py
@@ -3,7 +3,10 @@
     np,
     sp,
     pd,
+    sklearn,
     graphtools,
+    PCAParameters,
+    Data,
     nose2,
     data,
     build_graph,
@@ -528,6 +531,84 @@ def test_transform_sparse_adaptive_pca():
     assert np.allclose(G3.data_nu, G2.transform(G2.data))
 
 
+#####################################################
+# Check PCAParameters
+#####################################################
+
+
+def test_pca_parameters():
+    params = PCAParameters()
+    assert params.n_oversamples == 10
+    assert params.n_iter == "auto"
+    assert params.power_iteration_normalizer == "auto"
+
+    with assert_raises_message(
+        ValueError,
+        "['n_oversamples'] were invalid type or value. Valid values are ['int > 0'], respectively.",
+    ):
+        params = PCAParameters(n_oversamples=0)
+    try:
+        params = PCAParameters(
+            n_oversamples=0, n_iter="foo", power_iteration_normalizer="bar"
+        )
+    except ValueError as e:
+        assert (
+            str(e)
+            == "['n_iter', 'n_oversamples', 'power_iteration_normalizer'] were invalid type or value. Valid values are ['auto', 'int >= 0'], ['int > 0'], ['auto', 'QR', 'LU', 'none'], respectively."
+        )
+    params = PCAParameters(11, 2, "QR")
+
+
+#####################################################
+# Check randomized_svd monkey patch
+#####################################################
+
+
+def test_warns_sklearn_version():
+    import sklearn
+
+    sklbak = sklearn.__version__
+    sklearn.__version__ = "1.0.2"
+    x = np.random.randn(100, 100)
+    with assert_warns_message(
+        RuntimeWarning,
+        "Graphtools is using a patched version of randomized_svd designed for sklearn version 1.0.1. The current version of sklearn is 1.0.2. Please alert the graphtools authors to update the patch.",
+    ):
+        Data(x, n_pca=2)
+    sklearn.__version__ = sklbak
+
+
+def test_gets_good_svs():
+    x = np.random.randn(1000, 500)
+    u, s, vt = np.linalg.svd(x, full_matrices=False)
+    sy = np.r_[
+        np.arange(50),
+        np.zeros(
+            450,
+        ),
+    ]
+    y = (u * sy) @ vt
+    # test the sparse case (truncated SVD, no mean centering)
+    y = sp.csr_matrix(y)
+    obj = Data(y, n_pca=25)
+    assert np.any(
+        np.logical_not(obj.data_pca.singular_values_ == np.arange(50)[::-1][:25])
+    )
+    params = PCAParameters(n_oversamples=100)
+    obj = Data(y, n_pca=25, pca_params=params)
+    assert np.allclose(obj.data_pca.singular_values_, np.arange(50)[::-1][:25])
+    # test the dense case, has mean centering
+    y = y.toarray()
+    y = y - np.mean(y, axis=0)
+    u, s, vt = np.linalg.svd(y, full_matrices=False)
+    params = PCAParameters(n_oversamples=1)
+    obj = Data(y, n_pca=25, pca_params=params)
+    assert not (np.allclose(obj.data_pca.singular_values_, s[:25]))
+    params = PCAParameters(n_oversamples=1000)
+    obj = Data(y, n_pca=25, pca_params=params)
+    assert np.allclose(obj.data_pca.singular_values_, s[:25])
+
+
 #############
 # Test API
 #############
diff --git a/test/test_exact.py b/test/test_exact.py
index 07faab0..684807a 100644
--- a/test/test_exact.py
+++ b/test/test_exact.py
@@ -1,5 +1,5 @@
 from __future__ import print_function
-from sklearn.utils.graph import graph_shortest_path
+from scipy.sparse.csgraph import shortest_path as graph_shortest_path
 from load_tests import (
     graphtools,
     np,
@@ -12,6 +12,7 @@
     pdist,
     PCA,
     TruncatedSVD,
+    Data,
     assert_raises_message,
     assert_warns_message,
 )
@@ -212,8 +213,8 @@ def test_truncated_exact_graph():
     n_pca = 20
     thresh = 1e-4
     data_small = data[np.random.choice(len(data), len(data) // 2, replace=False)]
-    pca = PCA(n_pca, svd_solver="randomized", random_state=42).fit(data_small)
-    data_small_nu = pca.transform(data_small)
+    pca = Data(data_small, n_pca, random_state=42)
+    data_small_nu = pca.data_pca.transform(data_small)
     pdx = squareform(pdist(data_small_nu, metric="euclidean"))
     knn_dist = np.partition(pdx, k, axis=1)[:, :k]
     epsilon = np.max(knn_dist, axis=1)
@@ -283,8 +284,8 @@ def test_truncated_exact_graph_sparse():
     n_pca = 20
     thresh = 1e-4
     data_small = data[np.random.choice(len(data), len(data) // 2, replace=False)]
-    pca = TruncatedSVD(n_pca, random_state=42).fit(data_small)
-    data_small_nu = pca.transform(data_small)
+    pca = Data(sp.coo_matrix(data_small), n_pca, random_state=42)
+    data_small_nu = pca.data_pca.transform(data_small)
     pdx = squareform(pdist(data_small_nu, metric="euclidean"))
     knn_dist = np.partition(pdx, k, axis=1)[:, :k]
     epsilon = np.max(knn_dist, axis=1)
diff --git a/test/test_knn.py b/test/test_knn.py
index fe47c07..9706240 100644
--- a/test/test_knn.py
+++ b/test/test_knn.py
@@ -1,5 +1,5 @@
 from __future__ import print_function, division
-from sklearn.utils.graph import graph_shortest_path
+from scipy.sparse.csgraph import shortest_path as graph_shortest_path
 from scipy.spatial.distance import pdist, squareform
 from load_tests import assert_raises_message, assert_warns_message
 from nose.tools import assert_raises_regex, assert_warns_regex
@@ -14,6 +14,7 @@
     build_graph,
     PCA,
     TruncatedSVD,
+    Data,
 )
 
 
@@ -156,7 +157,8 @@ def test_knn_graph():
         ),
     ):
         G2.build_kernel_to_data(
-            Y=G2.data_nu, knn=data.shape[0] + 1,
+            Y=G2.data_nu,
+            knn=data.shape[0] + 1,
         )
 
 
@@ -195,8 +197,8 @@ def test_knn_graph_multiplication_symm():
 def test_knn_graph_sparse():
     k = 3
     n_pca = 20
-    pca = TruncatedSVD(n_pca, random_state=42).fit(data)
-    data_nu = pca.transform(data)
+    pca = Data(sp.coo_matrix(data), n_pca, random_state=42)
+    data_nu = pca.data_pca.transform(data)
     pdx = squareform(pdist(data_nu, metric="euclidean"))
     knn_dist = np.partition(pdx, k, axis=1)[:, :k]
     epsilon = np.max(knn_dist, axis=1)
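Usage sketch (illustrative only, not part of the patch): the new `pca_params` argument is passed to `graphtools.base.Data` exactly as the added tests exercise it; the snippet below assumes graphtools is installed with this change applied and uses made-up input data.

    import numpy as np
    from graphtools.base import Data, PCAParameters

    X = np.random.normal(size=(500, 100))
    # Larger n_oversamples / n_iter trade speed for a more accurate randomized PCA;
    # defaults mirror sklearn.utils.extmath.randomized_svd.
    params = PCAParameters(n_oversamples=100, n_iter=7, power_iteration_normalizer="QR")
    reduced = Data(X, n_pca=20, pca_params=params).data_nu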