From 4587f6a618e2f21a458796aa6515411ea2eedd27 Mon Sep 17 00:00:00 2001 From: Jinsol Park Date: Fri, 23 Aug 2024 02:52:17 -0700 Subject: [PATCH] [FEA] UMAP API for building with batched NN Descent (#6022) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the following parameter as part of `build_kwds`: - `nnd_n_clusters`: number of clusters to use when batching. A larger number of clusters reduces GPU memory usage. Defaults to 1 (no batching). Results show consistent trustworthiness scores with and without batching. Also note below that UMAP can now run with datasets that don't fit on the GPU. Putting the dataset on host and enabling the batching method allows UMAP to run with a dataset that is 50M x 768 (153GB). Screenshot 2024-08-13 at 5 55 27 PM ### Notes [This PR in raft](https://github.com/rapidsai/raft/pull/2403) needs to be merged before this PR. Authors: - Jinsol Park (https://github.com/jinsolp) Approvers: - Corey J. Nolet (https://github.com/cjnolet) - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/cuml/pull/6022 --- cpp/src/umap/knn_graph/algo.cuh | 2 +- python/cuml/cuml/manifold/umap.pyx | 12 ++++++++++-- python/cuml/cuml/manifold/umap_utils.pxd | 9 +++++---- python/cuml/cuml/tests/test_umap.py | 22 ++++++++++++++++++++++ 4 files changed, 38 insertions(+), 7 deletions(-) diff --git a/cpp/src/umap/knn_graph/algo.cuh b/cpp/src/umap/knn_graph/algo.cuh index f6284e6a91..92c717afcd 100644 --- a/cpp/src/umap/knn_graph/algo.cuh +++ b/cpp/src/umap/knn_graph/algo.cuh @@ -59,7 +59,7 @@ void launcher(const raft::handle_t& handle, // Functor to post-process distances as L2Sqrt* template -struct DistancePostProcessSqrt { +struct DistancePostProcessSqrt : NNDescent::DistEpilogue { DI value_t operator()(value_t value, value_idx row, value_idx col) const { return sqrtf(value); } }; diff --git a/python/cuml/cuml/manifold/umap.pyx b/python/cuml/cuml/manifold/umap.pyx index 2b1b11c597..86933ab31b
100644 --- a/python/cuml/cuml/manifold/umap.pyx +++ b/python/cuml/cuml/manifold/umap.pyx @@ -297,7 +297,9 @@ class UMAP(UniversalBase, smaller than or equal to 50K. Otherwise, runs with nn descent. build_kwds: dict (optional, default=None) Build algorithm argument {'nnd_graph_degree': 64, 'nnd_intermediate_graph_degree': 128, - 'nnd_max_iterations': 20, 'nnd_termination_threshold': 0.0001, 'nnd_return_distances': True} + 'nnd_max_iterations': 20, 'nnd_termination_threshold': 0.0001, 'nnd_return_distances': True, + 'nnd_n_clusters': 1} + Note that nnd_n_clusters > 1 will result in batch-building with NN Descent. Notes ----- @@ -440,7 +442,8 @@ class UMAP(UniversalBase, # https://github.com/rapidsai/cuml/issues/5985 logger.info("build_algo set to brute_force_knn because random_state is given") self.build_algo ="brute_force_knn" - self.build_algo = build_algo + else: + self.build_algo = build_algo else: raise Exception("Invalid build algo: {}. Only support auto, brute_force_knn and nn_descent" % build_algo) @@ -488,12 +491,17 @@ class UMAP(UniversalBase, umap_params.nn_descent_params.max_iterations = 20 umap_params.nn_descent_params.termination_threshold = 0.0001 umap_params.nn_descent_params.return_distances = True + umap_params.nn_descent_params.n_clusters = 1 else: umap_params.nn_descent_params.graph_degree = cls.build_kwds.get("nnd_graph_degree", 64) umap_params.nn_descent_params.intermediate_graph_degree = cls.build_kwds.get("nnd_intermediate_graph_degree", 128) umap_params.nn_descent_params.max_iterations = cls.build_kwds.get("nnd_max_iterations", 20) umap_params.nn_descent_params.termination_threshold = cls.build_kwds.get("nnd_termination_threshold", 0.0001) umap_params.nn_descent_params.return_distances = cls.build_kwds.get("nnd_return_distances", True) + if cls.build_kwds.get("nnd_n_clusters", 1) < 1: + logger.info("Negative number of nnd_n_clusters not allowed. 
Changing nnd_n_clusters to 1") + umap_params.nn_descent_params.n_clusters = cls.build_kwds.get("nnd_n_clusters", 1) + umap_params.target_weight = cls.target_weight umap_params.random_state = cls.random_state umap_params.deterministic = cls.deterministic diff --git a/python/cuml/cuml/manifold/umap_utils.pxd b/python/cuml/cuml/manifold/umap_utils.pxd index b57b25a25a..54d5adf21f 100644 --- a/python/cuml/cuml/manifold/umap_utils.pxd +++ b/python/cuml/cuml/manifold/umap_utils.pxd @@ -40,11 +40,12 @@ cdef extern from "cuml/common/callback.hpp" namespace "ML::Internals": cdef extern from "raft/neighbors/nn_descent_types.hpp" namespace "raft::neighbors::experimental::nn_descent": cdef struct index_params: - int64_t graph_degree, - int64_t intermediate_graph_degree, - int64_t max_iterations, + uint64_t graph_degree, + uint64_t intermediate_graph_degree, + uint64_t max_iterations, float termination_threshold, - bool return_distances + bool return_distances, + uint64_t n_clusters, cdef extern from "cuml/manifold/umapparams.h" namespace "ML": diff --git a/python/cuml/cuml/tests/test_umap.py b/python/cuml/cuml/tests/test_umap.py index 76e44a2786..219810ba6d 100644 --- a/python/cuml/cuml/tests/test_umap.py +++ b/python/cuml/cuml/tests/test_umap.py @@ -838,3 +838,25 @@ def test_umap_distance_metrics_fit_transform_trust_on_sparse_input( if umap_learn_supported: assert array_equal(umap_trust, cuml_trust, 0.05, with_sign=True) + + +@pytest.mark.parametrize("data_on_host", [True, False]) +@pytest.mark.parametrize("num_clusters", [0, 3, 5]) +def test_umap_trustworthiness_on_batch_nnd(data_on_host, num_clusters): + + digits = datasets.load_digits() + + cuml_model = cuUMAP( + n_neighbors=10, + min_dist=0.01, + build_algo="nn_descent", + build_kwds={"nnd_n_clusters": num_clusters}, + ) + + cuml_embedding = cuml_model.fit_transform( + digits.data, convert_dtype=True, data_on_host=data_on_host + ) + + cuml_trust = trustworthiness(digits.data, cuml_embedding, n_neighbors=10) + + assert 
cuml_trust > 0.9