Review notes for this patch (free text ahead of the first "diff --git" header):
  - Adds ML::UMAP::init_and_refine (C++ declaration, runner template, definition and Cython
    binding), which runs InitEmbed::run and then SimplSetEmbed::run on the same buffer.
  - simplicial_set_embedding: `gamma` replaces `repulsion_strength` (kept as a deprecated
    keyword), `init` may be an array-like, default categorical target weight becomes 0.5.
  - test_simplicial_set_embedding compares trustworthiness scores instead of correctness_dense.
  - NOTE(review): the patch text looks like it was recovered from a whitespace-collapsed
    extraction; text between angle brackets (template parameter lists, Cython casts) appears
    to be missing, e.g. bare `template` lines and identical -/+ lines. Indentation and blank
    context lines below are reconstructed -- confirm against the upstream commit before applying.

diff --git a/cpp/include/cuml/manifold/umap.hpp b/cpp/include/cuml/manifold/umap.hpp
index 62a875e685..7de08c5488 100644
--- a/cpp/include/cuml/manifold/umap.hpp
+++ b/cpp/include/cuml/manifold/umap.hpp
@@ -84,6 +84,27 @@ void refine(const raft::handle_t& handle,
             UMAPParams* params,
             float* embeddings);
 
+/**
+ * Initializes embeddings and performs a UMAP fit on them, which enables
+ * iterative fitting without callbacks.
+ *
+ * @param[in] handle: raft::handle_t
+ * @param[in] X: pointer to input array
+ * @param[in] n: n_samples of input array
+ * @param[in] d: n_features of input array
+ * @param[in] graph: pointer to raft::sparse::COO object computed using ML::UMAP::get_graph
+ * @param[in] params: pointer to ML::UMAPParams object
+ * @param[out] embeddings: pointer to an output buffer of shape n * n_components; initialized by
+ * this call, then overwritten with the refined embedding
+ */
+void init_and_refine(const raft::handle_t& handle,
+                     float* X,
+                     int n,
+                     int d,
+                     raft::sparse::COO* graph,
+                     UMAPParams* params,
+                     float* embeddings);
+
 /**
  * Dense fit
  *
diff --git a/cpp/src/umap/runner.cuh b/cpp/src/umap/runner.cuh
index 41bac31678..0ceeb3acaa 100644
--- a/cpp/src/umap/runner.cuh
+++ b/cpp/src/umap/runner.cuh
@@ -247,12 +247,31 @@ void _refine(const raft::handle_t& handle,
              value_t* embeddings)
 {
   cudaStream_t stream = handle.get_stream();
+  ML::Logger::get().setLevel(params->verbosity);
+
   /**
    * Run simplicial set embedding to approximate low-dimensional representation
    */
   SimplSetEmbed::run(inputs.n, inputs.d, graph, params, embeddings, stream);
 }
 
+template
+void _init_and_refine(const raft::handle_t& handle,
+                      const umap_inputs& inputs,
+                      UMAPParams* params,
+                      raft::sparse::COO* graph,
+                      value_t* embeddings)
+{
+  cudaStream_t stream = handle.get_stream();
+  ML::Logger::get().setLevel(params->verbosity);
+
+  // Initialize embeddings
+  InitEmbed::run(handle, inputs.n, inputs.d, graph, params, embeddings, stream, params->init);
+
+  // Run simplicial set embedding
+  SimplSetEmbed::run(inputs.n, inputs.d, graph, params, embeddings, stream);
+}
+
 template
 void _fit(const raft::handle_t& handle,
           const umap_inputs& inputs,
diff --git a/cpp/src/umap/umap.cu b/cpp/src/umap/umap.cu
index 86799ae6bc..899051f8de 100644
--- a/cpp/src/umap/umap.cu
+++ b/cpp/src/umap/umap.cu
@@ -92,6 +92,20 @@ void refine(const raft::handle_t& handle,
     handle, inputs, params, graph, embeddings);
 }
 
+void init_and_refine(const raft::handle_t& handle,
+                     float* X,
+                     int n,
+                     int d,
+                     raft::sparse::COO* graph,
+                     UMAPParams* params,
+                     float* embeddings)
+{
+  CUML_LOG_DEBUG("Calling UMAP::init_and_refine() with precomputed KNN");
+  manifold_dense_inputs_t inputs(X, nullptr, n, d);
+  UMAPAlgo::_init_and_refine, TPB_X>(
+    handle, inputs, params, graph, embeddings);
+}
+
 void fit(const raft::handle_t& handle,
          float* X,
          float* y,
diff --git a/python/cuml/cuml/manifold/simpl_set.pyx b/python/cuml/cuml/manifold/simpl_set.pyx
index f22f524bf7..b0be2d5de7 100644
--- a/python/cuml/cuml/manifold/simpl_set.pyx
+++ b/python/cuml/cuml/manifold/simpl_set.pyx
@@ -16,6 +16,7 @@
 
 # distutils: language = c++
 
+import warnings
 from cuml.internals.safe_imports import cpu_only_import
 np = cpu_only_import('numpy')
 from cuml.internals.safe_imports import gpu_only_import
@@ -26,7 +27,7 @@ from cuml.manifold.umap_utils cimport *
 from cuml.manifold.umap_utils import GraphHolder, find_ab_params, \
     metric_parsing
 
-from cuml.internals.input_utils import input_to_cuml_array
+from cuml.internals.input_utils import input_to_cuml_array, is_array_like
 from cuml.internals.array import CumlArray
 
 from pylibraft.common.handle cimport handle_t
@@ -56,6 +57,14 @@ cdef extern from "cuml/manifold/umap.hpp" namespace "ML::UMAP":
                 UMAPParams* params,
                 float* embeddings)
 
+    void init_and_refine(handle_t &handle,
+                         float* X,
+                         int n,
+                         int d,
+                         COO* cgraph_coo,
+                         UMAPParams* params,
+                         float* embeddings)
+
 
 def fuzzy_simplicial_set(X,
                          n_neighbors,
@@ -73,6 +82,7 @@ def fuzzy_simplicial_set(X,
     locally approximating geodesic distance at each point, creating a fuzzy
     simplicial set for each such point, and then combining all the local
     fuzzy simplicial sets into a global one via a fuzzy union.
+
     Parameters
     ----------
     X: array of shape (n_samples, n_features)
@@ -212,7 +222,7 @@ def simplicial_set_embedding(
     initial_alpha=1.0,
     a=None,
     b=None,
-    repulsion_strength=1.0,
+    gamma=1.0,
     negative_sample_rate=5,
     n_epochs=None,
     init="spectral",
@@ -221,6 +231,7 @@ def simplicial_set_embedding(
     metric_kwds=None,
     output_metric="euclidean",
     output_metric_kwds=None,
+    repulsion_strength=None,
     convert_dtype=True,
     verbose=False,
 ):
@@ -228,6 +239,7 @@ def simplicial_set_embedding(
     initialisation method and then minimizing the fuzzy set cross entropy
     between the 1-skeletons of the high and low dimensional fuzzy simplicial
     sets.
+
     Parameters
     ----------
     data: array of shape (n_samples, n_features)
@@ -244,7 +256,7 @@ def simplicial_set_embedding(
         Parameter of differentiable approximation of right adjoint functor
     b: float
         Parameter of differentiable approximation of right adjoint functor
-    repulsion_strength: float
+    gamma: float
         Weight to apply to negative samples.
     negative_sample_rate: int (optional, default 5)
         The number of negative samples to select per positive sample
@@ -260,7 +272,7 @@ def simplicial_set_embedding(
         How to initialize the low dimensional embedding. Options are:
             * 'spectral': use a spectral embedding of the fuzzy 1-skeleton
             * 'random': assign initial embedding positions at random.
-            * A numpy array of initial embedding positions.
+            * An array-like with initial embedding positions.
     random_state: numpy RandomState or equivalent
         A state capable being used as a numpy random state.
     metric: string (default='euclidean').
@@ -294,9 +306,6 @@ def simplicial_set_embedding(
     if output_metric_kwds is None:
         output_metric_kwds = {}
 
-    if init not in ['spectral', 'random']:
-        raise Exception("Initialization strategy not supported: %d" % init)
-
     if output_metric not in ['euclidean', 'categorical']:
         raise Exception("Invalid output metric: {}" % output_metric)
 
@@ -320,17 +329,29 @@ def simplicial_set_embedding(
     cdef UMAPParams* umap_params = new UMAPParams()
     umap_params.n_components = n_components
     umap_params.initial_alpha = initial_alpha
-    umap_params.a = a
-    umap_params.b = b
-    umap_params.repulsion_strength = repulsion_strength
+    umap_params.a = a
+    umap_params.b = b
+
+    if repulsion_strength is not None:
+        gamma = repulsion_strength
+        # FutureWarning is shown by default: do not alter global filters
+        warnings.warn('Parameter "repulsion_strength" has been'
+                      ' deprecated. It will be removed in version 24.12.'
+                      ' Please use the "gamma" parameter instead.',
+                      FutureWarning)
+
+    umap_params.repulsion_strength = gamma
     umap_params.negative_sample_rate = negative_sample_rate
     umap_params.n_epochs = n_epochs
-    if init == 'spectral':
-        umap_params.init = 1
-    else:  # init == 'random'
-        umap_params.init = 0
     umap_params.random_state = random_state
     umap_params.deterministic = deterministic
+    if isinstance(init, str):
+        if init == "random":
+            umap_params.init = 0
+        elif init == 'spectral':
+            umap_params.init = 1
+        else:
+            raise ValueError("Invalid initialization strategy")
     try:
         umap_params.metric = metric_parsing[metric.lower()]
     except KeyError:
@@ -344,7 +365,7 @@ def simplicial_set_embedding(
     else:  # output_metric == 'categorical'
         umap_params.target_metric = MetricType.CATEGORICAL
     umap_params.target_weight = output_metric_kwds['p'] \
-        if 'p' in output_metric_kwds else 0
+        if 'p' in output_metric_kwds else 0.5
     umap_params.verbosity = verbose
 
     X_m, _, _, _ = \
@@ -365,17 +386,40 @@ def simplicial_set_embedding(
         handle,
         graph)
 
-    embedding = CumlArray.zeros((X_m.shape[0], n_components),
-                                order="C", dtype=np.float32,
-                                index=X_m.index)
-
-    refine(handle_[0],
-           X_m.ptr,
-           X_m.shape[0],
-           X_m.shape[1],
-           fss_graph.get(),
-           umap_params,
-           embedding.ptr)
+    if isinstance(init, str):
+        if init in ['spectral', 'random']:
+            embedding = CumlArray.zeros((X_m.shape[0], n_components),
+                                        order="C", dtype=np.float32,
+                                        index=X_m.index)
+            init_and_refine(handle_[0],
+                            X_m.ptr,
+                            X_m.shape[0],
+                            X_m.shape[1],
+                            fss_graph.get(),
+                            umap_params,
+                            embedding.ptr)
+        else:
+            raise ValueError("Invalid initialization strategy")
+    elif is_array_like(init):
+        embedding, _, _, _ = \
+            input_to_cuml_array(init, order='C',
+                                deep_copy=True,  # refine() overwrites it
+                                convert_to_dtype=(np.float32 if convert_dtype
+                                                  else None),
+                                check_dtype=np.float32,
+                                check_rows=X_m.shape[0],
+                                check_cols=n_components)
+        refine(handle_[0],
+               X_m.ptr,
+               X_m.shape[0],
+               X_m.shape[1],
+               fss_graph.get(),
+               umap_params,
+               embedding.ptr)
+    else:
+        raise ValueError(
+            "Initialization not supported. Please provide a valid "
+            "initialization strategy or a pre-initialized embedding.")
 
     free(umap_params)
 
diff --git a/python/cuml/cuml/tests/test_simpl_set.py b/python/cuml/cuml/tests/test_simpl_set.py
index cbc5ebc635..7f55155a9f 100644
--- a/python/cuml/cuml/tests/test_simpl_set.py
+++ b/python/cuml/cuml/tests/test_simpl_set.py
@@ -24,6 +24,7 @@
 import pytest
 from cuml.datasets import make_blobs
 from cuml.internals.safe_imports import cpu_only_import
+from cuml.metrics import trustworthiness
 
 np = cpu_only_import("numpy")
 cp = gpu_only_import("cupy")
@@ -133,7 +134,7 @@ def test_simplicial_set_embedding(
     metric = "euclidean"
     initial_alpha = 1.0
     a, b = UMAP.find_ab_params(1.0, 0.1)
-    gamma = 0
+    gamma = 1.0
     negative_sample_rate = 5
     n_epochs = 500
     init = "random"
@@ -180,7 +181,6 @@ def test_simplicial_set_embedding(
     cu_fss_graph = cu_fuzzy_simplicial_set(
         X, n_neighbors, random_state, metric
     )
-
     cu_embedding = cu_simplicial_set_embedding(
         X,
         cu_fss_graph,
@@ -199,7 +199,7 @@ def test_simplicial_set_embedding(
         output_metric_kwds=output_metric_kwds,
     )
 
-    ref_embedding = cp.array(ref_embedding)
-    assert correctness_dense(
-        ref_embedding, cu_embedding, rtol=0.1, threshold=0.95
-    )
+    ref_t_score = trustworthiness(X, ref_embedding, n_neighbors=n_neighbors)
+    t_score = trustworthiness(X, cu_embedding, n_neighbors=n_neighbors)
+    abs_tol = 0.05
+    assert t_score >= ref_t_score - abs_tol