From f0b1ac70e4e3607ef8c6cc42b9af97148e7c6fec Mon Sep 17 00:00:00 2001 From: jinsolp Date: Fri, 14 Jun 2024 23:16:36 +0000 Subject: [PATCH 01/25] enable nn descent in hdbscan --- cpp/include/cuml/cluster/hdbscan.hpp | 9 +- cpp/src/hdbscan/detail/reachability.cuh | 305 ++++++++++++++++++------ cpp/src/hdbscan/hdbscan.cu | 18 +- cpp/src/hdbscan/runner.h | 4 +- python/cuml/cluster/hdbscan/hdbscan.pyx | 55 ++++- python/cuml/tests/test_hdbscan.py | 70 +++++- 6 files changed, 370 insertions(+), 91 deletions(-) diff --git a/cpp/include/cuml/cluster/hdbscan.hpp b/cpp/include/cuml/cluster/hdbscan.hpp index eb1223fd88..3d98ec1faa 100644 --- a/cpp/include/cuml/cluster/hdbscan.hpp +++ b/cpp/include/cuml/cluster/hdbscan.hpp @@ -18,6 +18,7 @@ #include #include +#include #include @@ -27,6 +28,8 @@ namespace ML { namespace HDBSCAN { namespace Common { +using nn_index_params = raft::neighbors::experimental::nn_descent::index_params; + /** * The Condensed hierarchicy is represented by an edge list with * parents as the source vertices, children as the destination, @@ -134,6 +137,7 @@ class CondensedHierarchy { }; enum CLUSTER_SELECTION_METHOD { EOM = 0, LEAF = 1 }; +enum GRAPH_BUILD_ALGO { BRUTE_FORCE_KNN = 0, NN_DESCENT = 1 }; class RobustSingleLinkageParams { public: @@ -151,6 +155,8 @@ class RobustSingleLinkageParams { class HDBSCANParams : public RobustSingleLinkageParams { public: CLUSTER_SELECTION_METHOD cluster_selection_method = CLUSTER_SELECTION_METHOD::EOM; + GRAPH_BUILD_ALGO build_algo = GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN; + nn_index_params nn_descent_params = {}; }; /** @@ -502,7 +508,8 @@ void compute_core_dists(const raft::handle_t& handle, size_t m, size_t n, raft::distance::DistanceType metric, - int min_samples); + int min_samples, + HDBSCAN::Common::GRAPH_BUILD_ALGO build_algo); /** * @brief Compute the map from final, normalize labels to the labels in the CondensedHierarchy diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh index 03a7f7c0ad..e381f52222 100644 --- a/cpp/src/hdbscan/detail/reachability.cuh +++ b/cpp/src/hdbscan/detail/reachability.cuh @@ -16,11 +16,14 @@ #pragma once +#include #include #include #include #include +#include +#include #include #include #include @@ -34,6 +37,9 @@ #include #include +using align32 = raft::Pow2<32>; +namespace NNDescent = raft::neighbors::experimental::nn_descent; + namespace ML { namespace HDBSCAN { namespace detail { @@ -68,6 +74,12 @@ void core_distances( }); } +// Functor to post-process distances into reachability space +template +struct DistancePostProcessSqrt { + DI value_t operator()(value_t value, value_idx row, value_idx col) const { return sqrtf(value); } +}; + /** * Wraps the brute force knn API, to be used for both training and prediction * @tparam value_idx data type for integrals @@ -93,33 +105,93 @@ void compute_knn(const raft::handle_t& handle, const value_t* search_items, size_t n_search_items, int k, - raft::distance::DistanceType metric) + raft::distance::DistanceType metric, + Common::GRAPH_BUILD_ALGO build_algo = Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN, + Common::nn_index_params build_params = Common::nn_index_params{}) { auto stream = handle.get_stream(); auto exec_policy = handle.get_thrust_policy(); - std::vector inputs; - inputs.push_back(const_cast(X)); - - std::vector sizes; - sizes.push_back(m); - // This is temporary. Once faiss is updated, we should be able to // pass value_idx through to knn. rmm::device_uvector int64_indices(k * n_search_items, stream); - // perform knn - brute_force_knn(handle, - inputs, - sizes, - n, - const_cast(search_items), - n_search_items, - int64_indices.data(), - dists, - k, - true, - true, - metric); + if (build_algo == Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN) { + std::vector inputs; + inputs.push_back(const_cast(X)); + + std::vector sizes; + sizes.push_back(m); + // // This is temporary. Once faiss is updated, we should be able to + // // pass value_idx through to knn. + // rmm::device_uvector int64_indices(k * n_search_items, stream); + + // perform knn + brute_force_knn(handle, + inputs, + sizes, + n, + const_cast(search_items), + n_search_items, + int64_indices.data(), + dists, + k, + true, + true, + metric); + } else { // NN_DESCENT + // [JS] TODO: add check for graph degree + // [JS] TODO: pass params + auto epilogue = DistancePostProcessSqrt{}; + build_params.return_distances = true; + RAFT_EXPECTS(static_cast(k) <= build_params.graph_degree, + "n_neighbors should be smaller than the graph degree computed by nn descent"); + + auto dataset = raft::make_host_matrix_view(X, m, n); + auto graph = NNDescent::detail::build(handle, build_params, dataset, epilogue); + + for (int i = 0; i < n_search_items; i++) { + if (graph.distances().has_value()) { + raft::copy(dists + i * k + 1, + graph.distances().value().data_handle() + i * build_params.graph_degree, + k - 1, + handle.get_stream()); + thrust::fill(thrust::device.on(stream), dists + i * k, dists + i * k + 1, 0.0); + } + raft::copy(int64_indices.data() + i * k + 1, + graph.graph().data_handle() + i * build_params.graph_degree, + k - 1, + handle.get_stream()); + thrust::fill(thrust::device.on(stream), + int64_indices.data() + i * k, + int64_indices.data() + i * k + 1, + i); + } + // NNDescent::index_params params = {}; + // params.return_distances = true; + // size_t graph_degree = align32::roundUp(static_cast(k * 3.0)); + // params.graph_degree = graph_degree; + // params.intermediate_graph_degree = align32::roundUp(static_cast(graph_degree * 1.3)); + // params.max_iterations = 50; + + // RAFT_EXPECTS(static_cast(k) <= build_params.graph_degree, "n_neighbors should be + // smaller than the graph degree computed by nn descent"); + + // auto dataset = + // raft::make_host_matrix_view(X, m, n); + // auto graph = + // NNDescent::detail::build(handle, build_params, dataset, epilogue); + + // for (int i = 0; i < n_search_items; i++) { + // raft::copy(dists + i * k, + // graph.distances().data_handle() + i * build_params.graph_degree, + // k, + // stream); + // raft::copy(int64_indices.data() + i * k, + // graph.graph().data_handle() + i * build_params.graph_degree, + // k, + // stream); + // } + } // convert from current knn's 64-bit to 32-bit. thrust::transform(exec_policy, @@ -134,13 +206,15 @@ void compute_knn(const raft::handle_t& handle, to compute core_dists */ template -void _compute_core_dists(const raft::handle_t& handle, - const value_t* X, - value_t* core_dists, - size_t m, - size_t n, - raft::distance::DistanceType metric, - int min_samples) +void _compute_core_dists( + const raft::handle_t& handle, + const value_t* X, + value_t* core_dists, + size_t m, + size_t n, + raft::distance::DistanceType metric, + int min_samples, + Common::GRAPH_BUILD_ALGO build_algo = Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN) { RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded, "Currently only L2 expanded distance is supported"); @@ -151,7 +225,7 @@ void _compute_core_dists(const raft::handle_t& handle, rmm::device_uvector dists(min_samples * m, stream); // perform knn - compute_knn(handle, X, inds.data(), dists.data(), m, n, X, m, min_samples, metric); + compute_knn(handle, X, inds.data(), dists.data(), m, n, X, m, min_samples, metric, build_algo); // Slice core distances (distances to kth nearest neighbor) core_distances(dists.data(), min_samples, min_samples, m, core_dists, stream); @@ -169,6 +243,18 @@ struct ReachabilityPostProcess { value_t alpha; }; +// Functor to post-process distances into reachability space +template +struct ReachabilityPostProcessSqrt { + DI value_t operator()(value_t value, value_idx row, value_idx col) const + { + return max(core_dists[col], max(core_dists[row], sqrtf(alpha * value))); + } + + const value_t* core_dists; + value_t alpha; +}; + /** * Given core distances, Fuses computations of L2 distances between all * points, projection into mutual reachability space, and k-selection. @@ -184,38 +270,84 @@ struct ReachabilityPostProcess { * @param[in] core_dists array of core distances (size m) */ template -void mutual_reachability_knn_l2(const raft::handle_t& handle, - value_idx* out_inds, - value_t* out_dists, - const value_t* X, - size_t m, - size_t n, - int k, - value_t* core_dists, - value_t alpha) +void mutual_reachability_knn_l2( + const raft::handle_t& handle, + value_idx* out_inds, + value_t* out_dists, + const value_t* X, + size_t m, + size_t n, + int k, + value_t* core_dists, + value_t alpha, + Common::GRAPH_BUILD_ALGO build_algo = Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN, + Common::nn_index_params build_params = Common::nn_index_params{}) { // Create a functor to postprocess distances into mutual reachability space // Note that we can't use a lambda for this here, since we get errors like: // `A type local to a function cannot be used in the template argument of the // enclosing parent function (and any parent classes) of an extended __device__ // or __host__ __device__ lambda` - auto epilogue = ReachabilityPostProcess{core_dists, alpha}; - - auto X_view = raft::make_device_matrix_view(X, m, n); - std::vector> index = {X_view}; - - raft::neighbors::brute_force::knn( - handle, - index, - X_view, - raft::make_device_matrix_view(out_inds, m, static_cast(k)), - raft::make_device_matrix_view(out_dists, m, static_cast(k)), - // TODO: expand distance metrics to support more than just L2 distance - // https://github.com/rapidsai/cuml/issues/5301 - raft::distance::DistanceType::L2SqrtExpanded, - std::make_optional(2.0f), - std::nullopt, - epilogue); + + if (build_algo == Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN) { + auto epilogue = ReachabilityPostProcess{core_dists, alpha}; + auto X_view = raft::make_device_matrix_view(X, m, n); + std::vector> index = {X_view}; + + raft::neighbors::brute_force::knn( + handle, + index, + X_view, + raft::make_device_matrix_view(out_inds, m, static_cast(k)), + raft::make_device_matrix_view(out_dists, m, static_cast(k)), + // TODO: expand distance metrics to support more than just L2 distance + // https://github.com/rapidsai/cuml/issues/5301 + raft::distance::DistanceType::L2SqrtExpanded, + std::make_optional(2.0f), + std::nullopt, + epilogue); + } else { + // [JS] TODO: add check for graph degree + auto epilogue = ReachabilityPostProcessSqrt{core_dists, alpha}; + // NNDescent::index_params params = {}; + build_params.return_distances = true; + // size_t graph_degree = align32::roundUp(static_cast(k * 3.0)); + // params.graph_degree = graph_degree; + // params.intermediate_graph_degree = align32::roundUp(static_cast(graph_degree * 1.3)); + // params.max_iterations = 50; + RAFT_EXPECTS(static_cast(k) <= build_params.graph_degree, + "n_neighbors should be smaller than the graph degree computed by nn descent"); + + auto dataset = raft::make_host_matrix_view(X, m, n); + // [JS] TODO: add distance epilogue here + auto graph = + NNDescent::detail::build(handle, build_params, dataset, epilogue); + + for (size_t i = 0; i < m; i++) { + if (graph.distances().has_value()) { + raft::copy(out_dists + i * k + 1, + graph.distances().value().data_handle() + i * build_params.graph_degree, + k - 1, + handle.get_stream()); + thrust::fill( + thrust::device.on(handle.get_stream()), out_dists + i * k, out_dists + i * k + 1, 0.0); + } + // raft::copy(out_dists + i * k, + // graph.distances().data_handle() + i * build_params.graph_degree, + // k, + // handle.get_stream()); + raft::copy(out_inds + i * k + 1, + graph.graph().data_handle() + i * build_params.graph_degree, + k - 1, + handle.get_stream()); + thrust::fill( + thrust::device.on(handle.get_stream()), out_inds + i * k, out_inds + i * k + 1, i); + // raft::copy(out_inds + i * k, + // graph.graph().data_handle() + i * build_params.graph_degree, + // k, + // handle.get_stream()); + } + } } /** @@ -260,16 +392,19 @@ void mutual_reachability_knn_l2(const raft::handle_t& handle, * neighbors. */ template -void mutual_reachability_graph(const raft::handle_t& handle, - const value_t* X, - size_t m, - size_t n, - raft::distance::DistanceType metric, - int min_samples, - value_t alpha, - value_idx* indptr, - value_t* core_dists, - raft::sparse::COO& out) +void mutual_reachability_graph( + const raft::handle_t& handle, + const value_t* X, + size_t m, + size_t n, + raft::distance::DistanceType metric, + int min_samples, + value_t alpha, + value_idx* indptr, + value_t* core_dists, + raft::sparse::COO& out, + Common::GRAPH_BUILD_ALGO build_algo = Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN, + Common::nn_index_params build_params = Common::nn_index_params{}) { RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded, "Currently only L2 expanded distance is supported"); @@ -281,18 +416,48 @@ void mutual_reachability_graph(const raft::handle_t& handle, rmm::device_uvector inds(min_samples * m, stream); rmm::device_uvector dists(min_samples * m, stream); + // printf("[JS] min samples: %d\n", min_samples); // perform knn - compute_knn(handle, X, inds.data(), dists.data(), m, n, X, m, min_samples, metric); - + compute_knn(handle, + X, + inds.data(), + dists.data(), + m, + n, + X, + m, + min_samples, + metric, + build_algo, + build_params); + raft::print_device_vector("indices", inds.data(), min_samples, std::cout); + raft::print_device_vector("distances", dists.data(), min_samples, std::cout); // Slice core distances (distances to kth nearest neighbor) core_distances(dists.data(), min_samples, min_samples, m, core_dists, stream); - + // raft::print_device_vector("core dists", core_dists, 20, std::cout); + + // raft::print_device_vector("dists for 4:", dists.data() + min_samples * 4, min_samples, + // std::cout); raft::print_device_vector("dists for 5:", dists.data() + min_samples * 5, + // min_samples, std::cout); raft::print_device_vector("dists for 14:", dists.data() + min_samples + // * 14, min_samples, std::cout); raft::print_device_vector("dists for 15:", dists.data() + + // min_samples * 15, min_samples, std::cout); raft::print_device_vector("dists for 16:", + // dists.data() + min_samples * 16, min_samples, std::cout); /** * Compute L2 norm */ - mutual_reachability_knn_l2( - handle, inds.data(), dists.data(), X, m, n, min_samples, core_dists, (value_t)1.0 / alpha); - + mutual_reachability_knn_l2(handle, + inds.data(), + dists.data(), + X, + m, + n, + min_samples, + core_dists, + (value_t)1.0 / alpha, + build_algo, + build_params); + raft::print_device_vector("indices after knnl2", inds.data(), min_samples, std::cout); + raft::print_device_vector("distances after knnl2", dists.data(), min_samples, std::cout); // self-loops get max distance auto coo_rows_counting_itr = thrust::make_counting_iterator(0); thrust::transform(exec_policy, diff --git a/cpp/src/hdbscan/hdbscan.cu b/cpp/src/hdbscan/hdbscan.cu index ea64d20f6b..019687b72c 100644 --- a/cpp/src/hdbscan/hdbscan.cu +++ b/cpp/src/hdbscan/hdbscan.cu @@ -152,16 +152,18 @@ void out_of_sample_predict(const raft::handle_t& handle, namespace HDBSCAN::HELPER { -void compute_core_dists(const raft::handle_t& handle, - const float* X, - float* core_dists, - size_t m, - size_t n, - raft::distance::DistanceType metric, - int min_samples) +void compute_core_dists( + const raft::handle_t& handle, + const float* X, + float* core_dists, + size_t m, + size_t n, + raft::distance::DistanceType metric, + int min_samples, + HDBSCAN::Common::GRAPH_BUILD_ALGO build_algo = HDBSCAN::Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN) { HDBSCAN::detail::Reachability::_compute_core_dists( - handle, X, core_dists, m, n, metric, min_samples); + handle, X, core_dists, m, n, metric, min_samples, build_algo); } void compute_inverse_label_map(const raft::handle_t& handle, diff --git a/cpp/src/hdbscan/runner.h b/cpp/src/hdbscan/runner.h index c79148eed2..d9591bf0f1 100644 --- a/cpp/src/hdbscan/runner.h +++ b/cpp/src/hdbscan/runner.h @@ -183,7 +183,9 @@ void build_linkage(const raft::handle_t& handle, params.alpha, mutual_reachability_indptr.data(), core_dists, - mutual_reachability_coo); + mutual_reachability_coo, + params.build_algo, + params.nn_descent_params); /** * Construct MST sorted by weights diff --git a/python/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cluster/hdbscan/hdbscan.pyx index f7691c1684..0edaf64752 100644 --- a/python/cuml/cluster/hdbscan/hdbscan.pyx +++ b/python/cuml/cluster/hdbscan/hdbscan.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -46,12 +46,24 @@ IF GPUBUILD == 1: from pylibraft.common.handle import Handle from pylibraft.common.handle cimport handle_t + cdef extern from "raft/neighbors/nn_descent_types.hpp" namespace "raft::neighbors::experimental::nn_descent": + cdef struct index_params: + size_t graph_degree, + size_t intermediate_graph_degree, + size_t max_iterations, + float termination_threshold, + bool return_distances + cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML::HDBSCAN::Common": ctypedef enum CLUSTER_SELECTION_METHOD: EOM "ML::HDBSCAN::Common::CLUSTER_SELECTION_METHOD::EOM" LEAF "ML::HDBSCAN::Common::CLUSTER_SELECTION_METHOD::LEAF" + ctypedef enum GRAPH_BUILD_ALGO: + BRUTE_FORCE_KNN "ML::HDBSCAN::Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN" + NN_DESCENT "ML::HDBSCAN::Common::GRAPH_BUILD_ALGO::NN_DESCENT" + cdef cppclass CondensedHierarchy[value_idx, value_t]: CondensedHierarchy( const handle_t &handle, size_t n_leaves) @@ -98,6 +110,8 @@ IF GPUBUILD == 1: bool allow_single_cluster, CLUSTER_SELECTION_METHOD cluster_selection_method, + GRAPH_BUILD_ALGO build_algo, + index_params nn_descent_params, cdef cppclass PredictionData[int, float]: PredictionData(const handle_t &handle, @@ -151,7 +165,8 @@ IF GPUBUILD == 1: size_t m, size_t n, DistanceType metric, - int min_samples) + int min_samples, + GRAPH_BUILD_ALGO build_algo) void compute_inverse_label_map(const handle_t& handle, CondensedHierarchy[int, float]& @@ -501,7 +516,9 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): verbose=False, connectivity='knn', output_type=None, - prediction_data=False): + prediction_data=False, + build_algo='brute_force_knn', + build_kwds=None): super().__init__(handle=handle, verbose=verbose, @@ -532,6 +549,9 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): self.fit_called_ = False self.prediction_data = prediction_data + self.build_algo = build_algo + self.build_kwds = build_kwds + self.n_clusters_ = None self.n_leaves_ = None @@ -831,6 +851,26 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): raise ValueError("Cluster selection method not supported. " "Must one of {'eom', 'leaf'}") + if self.build_algo == 'brute_force_knn': + params.build_algo = GRAPH_BUILD_ALGO.BRUTE_FORCE_KNN + elif self.build_algo == 'nn_descent': + params.build_algo = GRAPH_BUILD_ALGO.NN_DESCENT + if self.build_kwds is None: + params.nn_descent_params.graph_degree = 64 + params.nn_descent_params.intermediate_graph_degree = 128 + params.nn_descent_params.max_iterations = 20 + params.nn_descent_params.termination_threshold = 0.0001 + params.nn_descent_params.return_distances = True + else: + params.nn_descent_params.graph_degree = self.build_kwds.get("nnd_graph_degree", 64) + params.nn_descent_params.intermediate_graph_degree = self.build_kwds.get("nnd_intermediate_graph_degree", 128) + params.nn_descent_params.max_iterations = self.build_kwds.get("nnd_max_iterations", 20) + params.nn_descent_params.termination_threshold = self.build_kwds.get("nnd_termination_threshold", 0.0001) + params.nn_descent_params.return_distances = True + else: + raise ValueError("Build algo not supported. " + "Must one of {'brute_force_knn', 'nn_descent'}") + cdef DistanceType metric if self.metric in _metrics_mapping: metric = _metrics_mapping[self.metric] @@ -1071,13 +1111,20 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): cdef uintptr_t X_ptr = self.X_m.ptr cdef uintptr_t core_dists_ptr = self.core_dists.ptr + cdef GRAPH_BUILD_ALGO build_algo + if self.build_algo == 'brute_force_knn': + build_algo = GRAPH_BUILD_ALGO.BRUTE_FORCE_KNN + elif self.build_algo == 'nn_descent': + build_algo = GRAPH_BUILD_ALGO.NN_DESCENT + compute_core_dists(handle_[0], X_ptr, core_dists_ptr, self.n_rows, self.n_cols, metric, - self.min_samples) + self.min_samples, + build_algo) cdef device_uvector[int] *inverse_label_map = \ new device_uvector[int](0, handle_[0].get_stream()) diff --git a/python/cuml/tests/test_hdbscan.py b/python/cuml/tests/test_hdbscan.py index 0a9a3a6382..fab72780d8 100644 --- a/python/cuml/tests/test_hdbscan.py +++ b/python/cuml/tests/test_hdbscan.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -162,6 +162,7 @@ def assert_membership_vectors(cu_vecs, sk_vecs): @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_hdbscan_blobs( nrows, ncols, @@ -173,6 +174,7 @@ def test_hdbscan_blobs( min_cluster_size, max_cluster_size, min_samples, + build_algo, ): X, y = make_blobs( @@ -192,6 +194,7 @@ def test_hdbscan_blobs( min_cluster_size=min_cluster_size, cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, + build_algo=build_algo, ) cuml_agg.fit(X) @@ -233,6 +236,7 @@ def test_hdbscan_blobs( @pytest.mark.parametrize("allow_single_cluster", [True, False]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_hdbscan_sklearn_datasets( test_datasets, connectivity, @@ -240,6 +244,7 @@ def test_hdbscan_sklearn_datasets( cluster_selection_method, min_samples_cluster_size_bounds, allow_single_cluster, + build_algo, ): ( @@ -259,6 +264,7 @@ def test_hdbscan_sklearn_datasets( min_cluster_size=min_cluster_size, cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, + build_algo=build_algo, ) cuml_agg.fit(X) @@ -297,6 +303,7 @@ def test_hdbscan_sklearn_datasets( @pytest.mark.parametrize("allow_single_cluster", [True, False]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_hdbscan_sklearn_extract_clusters( test_datasets, connectivity, @@ -306,6 +313,7 @@ def test_hdbscan_sklearn_extract_clusters( min_cluster_size, max_cluster_size, allow_single_cluster, + build_algo, ): X = test_datasets.data cuml_agg = HDBSCAN( @@ -317,6 +325,7 @@ def test_hdbscan_sklearn_extract_clusters( min_cluster_size=min_cluster_size, cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, + build_algo=build_algo, ) sk_agg = hdbscan.HDBSCAN( @@ -349,6 +358,7 @@ def test_hdbscan_sklearn_extract_clusters( @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom"]) @pytest.mark.parametrize("connectivity", ["knn"]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_hdbscan_cluster_patterns( dataset, nrows, @@ -359,6 +369,7 @@ def test_hdbscan_cluster_patterns( allow_single_cluster, max_cluster_size, min_samples, + build_algo, ): # This also tests duplicate data points @@ -372,6 +383,8 @@ def test_hdbscan_cluster_patterns( min_cluster_size=min_cluster_size, cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, + build_algo=build_algo, + # build_kwds={"nnd_max_iterations":50}, ) cuml_agg.fit(X) @@ -412,6 +425,7 @@ def test_hdbscan_cluster_patterns( @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_hdbscan_cluster_patterns_extract_clusters( dataset, nrows, @@ -422,6 +436,7 @@ def test_hdbscan_cluster_patterns_extract_clusters( allow_single_cluster, max_cluster_size, min_samples, + build_algo, ): # This also tests duplicate data points @@ -435,6 +450,7 @@ def test_hdbscan_cluster_patterns_extract_clusters( min_cluster_size=min_cluster_size, cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, + build_algo=build_algo, ) sk_agg = hdbscan.HDBSCAN( @@ -494,7 +510,8 @@ def test_hdbscan_metric_parameter_input(metric, supported): clf.fit(X) -def test_hdbscan_empty_cluster_tree(): +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) +def test_hdbscan_empty_cluster_tree(build_algo): raw_tree = np.recarray( shape=(5,), @@ -510,7 +527,9 @@ def test_hdbscan_empty_cluster_tree(): condensed_tree = CondensedTree(raw_tree, 0.0, True) cuml_agg = HDBSCAN( - allow_single_cluster=True, cluster_selection_method="eom" + allow_single_cluster=True, + cluster_selection_method="eom", + build_algo=build_algo, ) cuml_agg._extract_clusters(condensed_tree) @@ -518,7 +537,8 @@ def test_hdbscan_empty_cluster_tree(): assert np.sum(cuml_agg.labels_test.to_output("numpy")) == 0 -def test_hdbscan_plots(): +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) +def test_hdbscan_plots(build_algo): X, y = make_blobs( n_samples=int(100), @@ -529,7 +549,7 @@ def test_hdbscan_plots(): random_state=42, ) - cuml_agg = HDBSCAN(gen_min_span_tree=True) + cuml_agg = HDBSCAN(gen_min_span_tree=True, build_algo=build_algo) cuml_agg.fit(X) assert cuml_agg.condensed_tree_ is not None @@ -551,6 +571,7 @@ def test_hdbscan_plots(): @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("batch_size", [128, 1000]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_all_points_membership_vectors_blobs( nrows, ncols, @@ -561,6 +582,7 @@ def test_all_points_membership_vectors_blobs( allow_single_cluster, max_cluster_size, batch_size, + build_algo, ): X, y = make_blobs( n_samples=nrows, @@ -579,6 +601,7 @@ def test_all_points_membership_vectors_blobs( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, prediction_data=True, + build_algo=build_algo, ) cuml_agg.fit(X) @@ -613,6 +636,7 @@ def test_all_points_membership_vectors_blobs( @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) @pytest.mark.parametrize("batch_size", [128, 1000]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_all_points_membership_vectors_moons( nrows, min_samples, @@ -623,6 +647,7 @@ def test_all_points_membership_vectors_moons( max_cluster_size, connectivity, batch_size, + build_algo, ): X, y = datasets.make_moons(n_samples=nrows, noise=0.05, random_state=42) @@ -636,6 +661,8 @@ def test_all_points_membership_vectors_moons( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, prediction_data=True, + build_algo=build_algo, + # build_kwds={"nnd_max_iterations":50}, ) cuml_agg.fit(X) @@ -670,6 +697,7 @@ def test_all_points_membership_vectors_moons( @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) @pytest.mark.parametrize("batch_size", [128, 1000]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_all_points_membership_vectors_circles( nrows, min_samples, @@ -680,6 +708,7 @@ def test_all_points_membership_vectors_circles( max_cluster_size, connectivity, batch_size, + build_algo, ): X, y = datasets.make_circles( n_samples=nrows, factor=0.5, noise=0.05, random_state=42 @@ -694,6 +723,8 @@ def test_all_points_membership_vectors_circles( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, prediction_data=True, + build_algo=build_algo, + # build_kwds={"nnd_max_iterations":50}, ) cuml_agg.fit(X) @@ -732,6 +763,7 @@ def test_all_points_membership_vectors_circles( @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("allow_single_cluster", [True, False]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_approximate_predict_blobs( nrows, n_points_to_predict, @@ -742,6 +774,7 @@ def test_approximate_predict_blobs( min_cluster_size, max_cluster_size, allow_single_cluster, + build_algo, ): X, y = make_blobs( n_samples=nrows, @@ -769,6 +802,7 @@ def test_approximate_predict_blobs( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, prediction_data=True, + build_algo=build_algo, ) cuml_agg.fit(X) @@ -789,7 +823,8 @@ def test_approximate_predict_blobs( sk_labels, sk_probs = hdbscan.approximate_predict( sk_agg, points_to_predict ) - + # print(f"cu labels: {cu_labels}\ncu probs: {cu_probs}") + # print(f"sk labels: {sk_labels}\ncu probs: {sk_probs}") assert adjusted_rand_score(cu_labels, sk_labels) >= 0.95 assert np.allclose(cu_probs, sk_probs, atol=0.05) @@ -803,6 +838,7 @@ def test_approximate_predict_blobs( @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_approximate_predict_moons( nrows, n_points_to_predict, @@ -813,6 +849,7 @@ def test_approximate_predict_moons( max_cluster_size, cluster_selection_method, connectivity, + build_algo, ): X, y = datasets.make_moons( @@ -831,6 +868,7 @@ def test_approximate_predict_moons( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, prediction_data=True, + build_algo=build_algo, ) cuml_agg.fit(X_train) @@ -868,6 +906,7 @@ def test_approximate_predict_moons( @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_approximate_predict_circles( nrows, n_points_to_predict, @@ -878,6 +917,7 @@ def test_approximate_predict_circles( max_cluster_size, cluster_selection_method, connectivity, + build_algo, ): X, y = datasets.make_circles( n_samples=nrows + n_points_to_predict, @@ -898,6 +938,7 @@ def test_approximate_predict_circles( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, prediction_data=True, + build_algo=build_algo, ) cuml_agg.fit(X_train) @@ -934,6 +975,7 @@ def test_approximate_predict_circles( @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom"]) @pytest.mark.parametrize("connectivity", ["knn"]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_approximate_predict_digits( n_points_to_predict, min_samples, @@ -943,6 +985,7 @@ def test_approximate_predict_digits( max_cluster_size, cluster_selection_method, connectivity, + build_algo, ): digits = datasets.load_digits() X, y = digits.data, digits.target @@ -966,6 +1009,7 @@ def test_approximate_predict_digits( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, prediction_data=True, + build_algo=build_algo, ) cuml_agg.fit(X_train) @@ -1001,6 +1045,7 @@ def test_approximate_predict_digits( @pytest.mark.parametrize("allow_single_cluster", [True, False]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("batch_size", [128]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_membership_vector_blobs( nrows, n_points_to_predict, @@ -1012,6 +1057,7 @@ def test_membership_vector_blobs( allow_single_cluster, max_cluster_size, batch_size, + build_algo, ): X, y = make_blobs( n_samples=nrows, @@ -1039,6 +1085,7 @@ def test_membership_vector_blobs( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, prediction_data=True, + build_algo=build_algo, ) cuml_agg.fit(X) @@ -1077,6 +1124,7 @@ def test_membership_vector_blobs( @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) @pytest.mark.parametrize("batch_size", [16]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_membership_vector_moons( nrows, n_points_to_predict, @@ -1088,6 +1136,7 @@ def test_membership_vector_moons( max_cluster_size, connectivity, batch_size, + build_algo, ): X, y = datasets.make_moons( @@ -1106,6 +1155,8 @@ def test_membership_vector_moons( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, prediction_data=True, + build_algo=build_algo, + # build_kwds={"nnd_max_iterations":50}, ) cuml_agg.fit(X_train) @@ -1141,6 +1192,7 @@ def test_membership_vector_moons( @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) @pytest.mark.parametrize("batch_size", [16]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_membership_vector_circles( nrows, n_points_to_predict, @@ -1152,6 +1204,7 @@ def test_membership_vector_circles( max_cluster_size, connectivity, batch_size, + build_algo, ): X, y = datasets.make_circles( n_samples=nrows + n_points_to_predict, @@ -1172,6 +1225,8 @@ def test_membership_vector_circles( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, prediction_data=True, + build_algo=build_algo, + # build_kwds={"nnd_max_iterations":50}, ) cuml_agg.fit(X_train) @@ -1193,5 +1248,6 @@ def test_membership_vector_circles( sk_membership_vectors = hdbscan.membership_vector(sk_agg, X_test).astype( "float32" ) - + print(f"cu memberhsip vec: {cu_membership_vectors}") + print(f"sk memberhsip vec: {sk_membership_vectors}") assert_membership_vectors(cu_membership_vectors, sk_membership_vectors) From 8f036a985aaa19420a5018acbb83a2be58f4ba5f Mon Sep 17 00:00:00 2001 From: jinsolp Date: Sun, 16 Jun 2024 22:05:09 +0000 Subject: [PATCH 02/25] change epilogue functor --- cpp/src/hdbscan/detail/reachability.cuh | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh index e381f52222..11afae068e 100644 --- a/cpp/src/hdbscan/detail/reachability.cuh +++ b/cpp/src/hdbscan/detail/reachability.cuh @@ -77,7 +77,10 @@ void core_distances( // Functor to post-process distances into reachability space template struct DistancePostProcessSqrt { - DI value_t operator()(value_t value, value_idx row, value_idx col) const { return sqrtf(value); } + DI value_t operator()(value_t value, value_idx row, value_idx col) const + { + return powf(fabsf(value), 0.5); + } }; /** @@ -139,8 +142,6 @@ void compute_knn(const raft::handle_t& handle, true, metric); } else { // NN_DESCENT - // [JS] TODO: add check for graph degree - // [JS] TODO: pass params auto epilogue = DistancePostProcessSqrt{}; build_params.return_distances = true; RAFT_EXPECTS(static_cast(k) <= build_params.graph_degree, @@ -149,7 +150,7 @@ void compute_knn(const raft::handle_t& handle, auto dataset = raft::make_host_matrix_view(X, m, n); auto graph = NNDescent::detail::build(handle, build_params, dataset, epilogue); - for (int i = 0; i < n_search_items; i++) { + for (size_t i = 0; i < n_search_items; i++) { if (graph.distances().has_value()) { raft::copy(dists + i * k + 1, graph.distances().value().data_handle() + i * build_params.graph_degree, @@ -248,9 +249,8 @@ template struct ReachabilityPostProcessSqrt { DI value_t operator()(value_t value, value_idx row, value_idx col) const { - return max(core_dists[col], max(core_dists[row], sqrtf(alpha * value))); + return max(core_dists[col], max(core_dists[row], powf(fabsf(alpha * value), 0.5))); } - const value_t* core_dists; value_t alpha; }; @@ -307,7 +307,6 @@ void mutual_reachability_knn_l2( std::nullopt, epilogue); } else { - // [JS] TODO: add check for graph degree auto epilogue = ReachabilityPostProcessSqrt{core_dists, alpha}; // NNDescent::index_params params = {}; build_params.return_distances = true; @@ -329,8 +328,7 @@ void mutual_reachability_knn_l2( graph.distances().value().data_handle() + i * build_params.graph_degree, k - 1, handle.get_stream()); - thrust::fill( - thrust::device.on(handle.get_stream()), out_dists + i * k, out_dists + i * k + 1, 0.0); + raft::copy(out_dists + i * k, core_dists + i, 1, handle.get_stream()); } // raft::copy(out_dists + i * k, // graph.distances().data_handle() + i * build_params.graph_degree, @@ -430,8 +428,8 @@ void mutual_reachability_graph( metric, build_algo, build_params); - raft::print_device_vector("indices", inds.data(), min_samples, std::cout); - raft::print_device_vector("distances", dists.data(), min_samples, std::cout); + // raft::print_device_vector("indices", inds.data(), 20, std::cout); + // raft::print_device_vector("distances", dists.data(), 20, std::cout); // Slice core distances (distances to kth nearest neighbor) core_distances(dists.data(), min_samples, min_samples, m, core_dists, stream); // raft::print_device_vector("core dists", core_dists, 20, std::cout); @@ -456,8 +454,8 @@ void mutual_reachability_graph( (value_t)1.0 / alpha, build_algo, build_params); - raft::print_device_vector("indices after knnl2", inds.data(), min_samples, std::cout); - raft::print_device_vector("distances after knnl2", dists.data(), min_samples, std::cout); + // raft::print_device_vector("indices after knnl2", inds.data(), 20, std::cout); + // raft::print_device_vector("distances after knnl2", dists.data(), 20, std::cout); // self-loops get max distance auto coo_rows_counting_itr = thrust::make_counting_iterator(0); thrust::transform(exec_policy, From e158a74cafe376e18c8891908be58938aa92e7b3 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Sun, 16 Jun 2024 22:07:12 +0000 Subject: [PATCH 03/25] cleanup --- cpp/src/hdbscan/detail/reachability.cuh | 39 ------------------------- 1 file changed, 39 deletions(-) diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh index 11afae068e..b31f5dabf3 100644 --- a/cpp/src/hdbscan/detail/reachability.cuh +++ b/cpp/src/hdbscan/detail/reachability.cuh @@ -167,31 +167,6 @@ void compute_knn(const raft::handle_t& handle, int64_indices.data() + i * k + 1, i); } - // NNDescent::index_params params = {}; - // params.return_distances = true; - // size_t graph_degree = align32::roundUp(static_cast(k * 3.0)); - // params.graph_degree = graph_degree; - // params.intermediate_graph_degree = align32::roundUp(static_cast(graph_degree * 1.3)); - // params.max_iterations = 50; - - // RAFT_EXPECTS(static_cast(k) <= build_params.graph_degree, "n_neighbors should be - // smaller than the graph degree computed by nn descent"); - - // auto dataset = - // raft::make_host_matrix_view(X, m, n); - // auto graph = - // NNDescent::detail::build(handle, build_params, dataset, epilogue); - - // for (int i = 0; i < n_search_items; i++) { - // raft::copy(dists + i * k, - // graph.distances().data_handle() + i * build_params.graph_degree, - // k, - // stream); - // raft::copy(int64_indices.data() + i * k, - // graph.graph().data_handle() + i * build_params.graph_degree, - // k, - // stream); - // } } // convert from current knn's 64-bit to 32-bit. @@ -308,17 +283,11 @@ void mutual_reachability_knn_l2( epilogue); } else { auto epilogue = ReachabilityPostProcessSqrt{core_dists, alpha}; - // NNDescent::index_params params = {}; build_params.return_distances = true; - // size_t graph_degree = align32::roundUp(static_cast(k * 3.0)); - // params.graph_degree = graph_degree; - // params.intermediate_graph_degree = align32::roundUp(static_cast(graph_degree * 1.3)); - // params.max_iterations = 50; RAFT_EXPECTS(static_cast(k) <= build_params.graph_degree, "n_neighbors should be smaller than the graph degree computed by nn descent"); auto dataset = raft::make_host_matrix_view(X, m, n); - // [JS] TODO: add distance epilogue here auto graph = NNDescent::detail::build(handle, build_params, dataset, epilogue); @@ -330,20 +299,12 @@ void mutual_reachability_knn_l2( handle.get_stream()); raft::copy(out_dists + i * k, core_dists + i, 1, handle.get_stream()); } - // raft::copy(out_dists + i * k, - // graph.distances().data_handle() + i * build_params.graph_degree, - // k, - // handle.get_stream()); raft::copy(out_inds + i * k + 1, graph.graph().data_handle() + i * build_params.graph_degree, k - 1, handle.get_stream()); thrust::fill( thrust::device.on(handle.get_stream()), out_inds + i * k, out_inds + i * k + 1, i); - // raft::copy(out_inds + i * k, - // graph.graph().data_handle() + i * build_params.graph_degree, - // k, - // handle.get_stream()); } } } From ac03040961a8083894f510549825a3c463a5acd0 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Mon, 17 Jun 2024 02:12:31 +0000 Subject: [PATCH 04/25] fix test + add param for compute_core_dist --- cpp/include/cuml/cluster/hdbscan.hpp | 18 +++--- cpp/src/hdbscan/detail/reachability.cuh | 16 ++++- cpp/src/hdbscan/hdbscan.cu | 20 +++---- python/cuml/cluster/hdbscan/hdbscan.pyx | 19 +++++- python/cuml/tests/test_hdbscan.py | 78 +++++++++++++++++++------ 5 files changed, 111 insertions(+), 40 deletions(-) diff --git a/cpp/include/cuml/cluster/hdbscan.hpp b/cpp/include/cuml/cluster/hdbscan.hpp index 3d98ec1faa..98f19e901c 100644 --- a/cpp/include/cuml/cluster/hdbscan.hpp +++ b/cpp/include/cuml/cluster/hdbscan.hpp @@ -502,14 +502,16 @@ namespace HDBSCAN::HELPER { * @param metric distance metric to use * @param min_samples minimum number of samples to use for computing core distances */ -void compute_core_dists(const raft::handle_t& handle, - const float* X, - float* core_dists, - size_t m, - size_t n, - raft::distance::DistanceType metric, - int min_samples, - HDBSCAN::Common::GRAPH_BUILD_ALGO build_algo); +void compute_core_dists( + const raft::handle_t& handle, + const float* X, + float* core_dists, + size_t m, + size_t n, + raft::distance::DistanceType metric, + int min_samples, + HDBSCAN::Common::GRAPH_BUILD_ALGO build_algo = HDBSCAN::Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN, + HDBSCAN::Common::nn_index_params build_params = Common::nn_index_params{}); /** * @brief Compute the map from final, normalize labels to the labels in the CondensedHierarchy diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh index b31f5dabf3..4225d57e1a 100644 --- a/cpp/src/hdbscan/detail/reachability.cuh +++ b/cpp/src/hdbscan/detail/reachability.cuh @@ -190,7 +190,8 @@ void _compute_core_dists( size_t n, raft::distance::DistanceType metric, int min_samples, - Common::GRAPH_BUILD_ALGO build_algo = Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN) + Common::GRAPH_BUILD_ALGO build_algo = Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN, + Common::nn_index_params build_params = Common::nn_index_params{}) { RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded, "Currently only L2 expanded distance is supported"); @@ -201,7 +202,18 @@ void _compute_core_dists( rmm::device_uvector dists(min_samples * m, stream); // perform knn - compute_knn(handle, X, inds.data(), dists.data(), m, n, X, m, min_samples, metric, build_algo); + compute_knn(handle, + X, + inds.data(), + dists.data(), + m, + n, + X, + m, + min_samples, + metric, + build_algo, + build_params); // Slice core distances (distances to kth nearest neighbor) core_distances(dists.data(), min_samples, min_samples, m, core_dists, stream); diff --git a/cpp/src/hdbscan/hdbscan.cu b/cpp/src/hdbscan/hdbscan.cu index 019687b72c..32ef78b470 100644 --- a/cpp/src/hdbscan/hdbscan.cu +++ b/cpp/src/hdbscan/hdbscan.cu @@ -152,18 +152,18 @@ void out_of_sample_predict(const raft::handle_t& handle, namespace HDBSCAN::HELPER { -void compute_core_dists( - const raft::handle_t& handle, - const float* X, - float* core_dists, - size_t m, - size_t n, - raft::distance::DistanceType metric, - int min_samples, - HDBSCAN::Common::GRAPH_BUILD_ALGO build_algo = HDBSCAN::Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN) +void compute_core_dists(const raft::handle_t& handle, + const float* X, + float* core_dists, + size_t m, + size_t n, + raft::distance::DistanceType metric, + int min_samples, + HDBSCAN::Common::GRAPH_BUILD_ALGO build_algo, + HDBSCAN::Common::nn_index_params build_params) { HDBSCAN::detail::Reachability::_compute_core_dists( - handle, X, core_dists, m, n, metric, min_samples, build_algo); + handle, X, core_dists, m, n, metric, min_samples, build_algo, build_params); } void compute_inverse_label_map(const raft::handle_t& handle, diff --git a/python/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cluster/hdbscan/hdbscan.pyx index 0edaf64752..d6c6d670d3 100644 --- a/python/cuml/cluster/hdbscan/hdbscan.pyx +++ b/python/cuml/cluster/hdbscan/hdbscan.pyx @@ -166,7 +166,8 @@ IF GPUBUILD == 1: size_t n, DistanceType metric, int min_samples, - GRAPH_BUILD_ALGO build_algo) + GRAPH_BUILD_ALGO build_algo, + index_params build_params) void compute_inverse_label_map(const handle_t& handle, CondensedHierarchy[int, float]& @@ -1112,10 +1113,23 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): cdef uintptr_t core_dists_ptr = self.core_dists.ptr cdef GRAPH_BUILD_ALGO build_algo + cdef index_params build_params if self.build_algo == 'brute_force_knn': build_algo = GRAPH_BUILD_ALGO.BRUTE_FORCE_KNN elif self.build_algo == 'nn_descent': build_algo = GRAPH_BUILD_ALGO.NN_DESCENT + if self.build_kwds is None: + build_params.graph_degree = 64 + build_params.intermediate_graph_degree = 128 + build_params.max_iterations = 20 + build_params.termination_threshold = 0.0001 + build_params.return_distances = True + else: + build_params.graph_degree = self.build_kwds.get("nnd_graph_degree", 64) + build_params.intermediate_graph_degree = self.build_kwds.get("nnd_intermediate_graph_degree", 128) + build_params.max_iterations = self.build_kwds.get("nnd_max_iterations", 20) + build_params.termination_threshold = self.build_kwds.get("nnd_termination_threshold", 0.0001) + build_params.return_distances = True compute_core_dists(handle_[0], X_ptr, @@ -1124,7 +1138,8 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): self.n_cols, metric, self.min_samples, - build_algo) + build_algo, + build_params) cdef device_uvector[int] *inverse_label_map = \ new device_uvector[int](0, handle_[0].get_stream()) diff --git a/python/cuml/tests/test_hdbscan.py b/python/cuml/tests/test_hdbscan.py index fab72780d8..3e060e1647 100644 --- a/python/cuml/tests/test_hdbscan.py +++ b/python/cuml/tests/test_hdbscan.py @@ -44,6 +44,13 @@ dataset_names = ["noisy_circles", "noisy_moons", "varied"] +def get_graph_degree(n_samples): + graph_degree = max(int((1 + ((n_samples * 1.5) // 32)) * 32), 64) + intermediate_graph_degree = int(1 + ((graph_degree * 1.3) // 32) * 32) + max_iters = max(n_samples // 2, 20) + return graph_degree, intermediate_graph_degree, max_iters + + def assert_cluster_counts(sk_agg, cuml_agg, digits=25): sk_unique, sk_counts = np.unique(sk_agg.labels_, return_counts=True) sk_counts = np.sort(sk_counts) @@ -186,6 +193,9 @@ def test_hdbscan_blobs( random_state=42, ) + graph_degree, intermediate_graph_degree, max_iters = get_graph_degree( + min_samples + ) cuml_agg = HDBSCAN( verbose=logger.level_info, allow_single_cluster=allow_single_cluster, @@ -195,6 +205,11 @@ def test_hdbscan_blobs( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, build_algo=build_algo, + build_kwds={ + "nnd_graph_degree": graph_degree, + "nnd_intermediate_graph_degree": intermediate_graph_degree, + "nnd_max_iterations": max_iters, + }, ) cuml_agg.fit(X) @@ -236,7 +251,7 @@ def test_hdbscan_blobs( @pytest.mark.parametrize("allow_single_cluster", [True, False]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) -@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn"]) def test_hdbscan_sklearn_datasets( test_datasets, connectivity, @@ -255,6 +270,9 @@ def test_hdbscan_sklearn_datasets( X = test_datasets.data + graph_degree, intermediate_graph_degree, max_iters = get_graph_degree( + min_samples + ) cuml_agg = HDBSCAN( verbose=logger.level_info, allow_single_cluster=allow_single_cluster, @@ -265,6 +283,11 @@ def test_hdbscan_sklearn_datasets( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, build_algo=build_algo, + build_kwds={ + "nnd_graph_degree": graph_degree, + "nnd_intermediate_graph_degree": intermediate_graph_degree, + "nnd_max_iterations": max_iters, + }, ) cuml_agg.fit(X) @@ -316,6 +339,9 @@ def test_hdbscan_sklearn_extract_clusters( build_algo, ): X = test_datasets.data + graph_degree, intermediate_graph_degree, max_iters = get_graph_degree( + min_samples + ) cuml_agg = HDBSCAN( verbose=logger.level_info, allow_single_cluster=allow_single_cluster, @@ -326,6 +352,11 @@ def test_hdbscan_sklearn_extract_clusters( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, build_algo=build_algo, + build_kwds={ + "nnd_graph_degree": graph_degree, + "nnd_intermediate_graph_degree": intermediate_graph_degree, + "nnd_max_iterations": max_iters, + }, ) sk_agg = hdbscan.HDBSCAN( @@ -374,7 +405,6 @@ def test_hdbscan_cluster_patterns( # This also tests duplicate data points X, y = get_pattern(dataset, nrows)[0] - cuml_agg = HDBSCAN( verbose=logger.level_info, allow_single_cluster=allow_single_cluster, @@ -384,7 +414,6 @@ def test_hdbscan_cluster_patterns( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, build_algo=build_algo, - # build_kwds={"nnd_max_iterations":50}, ) cuml_agg.fit(X) @@ -441,7 +470,9 @@ def test_hdbscan_cluster_patterns_extract_clusters( # This also tests duplicate data points X, y = get_pattern(dataset, nrows)[0] - + graph_degree, intermediate_graph_degree, max_iters = get_graph_degree( + min_samples + ) cuml_agg = HDBSCAN( verbose=logger.level_info, allow_single_cluster=allow_single_cluster, @@ -451,6 +482,11 @@ def test_hdbscan_cluster_patterns_extract_clusters( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, build_algo=build_algo, + build_kwds={ + "nnd_graph_degree": graph_degree, + "nnd_intermediate_graph_degree": intermediate_graph_degree, + "nnd_max_iterations": max_iters, + }, ) sk_agg = hdbscan.HDBSCAN( @@ -592,7 +628,9 @@ def test_all_points_membership_vectors_blobs( shuffle=True, random_state=42, ) - + graph_degree, intermediate_graph_degree, max_iters = get_graph_degree( + min_cluster_size + ) cuml_agg = HDBSCAN( verbose=logger.level_info, allow_single_cluster=allow_single_cluster, @@ -602,6 +640,11 @@ def test_all_points_membership_vectors_blobs( cluster_selection_method=cluster_selection_method, prediction_data=True, build_algo=build_algo, + build_kwds={ + "nnd_graph_degree": graph_degree, + "nnd_intermediate_graph_degree": intermediate_graph_degree, + "nnd_max_iterations": max_iters, + }, ) cuml_agg.fit(X) @@ -662,7 +705,6 @@ def test_all_points_membership_vectors_moons( cluster_selection_method=cluster_selection_method, prediction_data=True, build_algo=build_algo, - # build_kwds={"nnd_max_iterations":50}, ) cuml_agg.fit(X) @@ -724,7 +766,6 @@ def test_all_points_membership_vectors_circles( cluster_selection_method=cluster_selection_method, prediction_data=True, build_algo=build_algo, - # build_kwds={"nnd_max_iterations":50}, ) cuml_agg.fit(X) @@ -763,7 +804,7 @@ def test_all_points_membership_vectors_circles( @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("allow_single_cluster", [True, False]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) -@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn"]) def test_approximate_predict_blobs( nrows, n_points_to_predict, @@ -838,7 +879,7 @@ def test_approximate_predict_blobs( @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) -@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn"]) def test_approximate_predict_moons( nrows, n_points_to_predict, @@ -906,7 +947,7 @@ def test_approximate_predict_moons( @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) -@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn"]) def test_approximate_predict_circles( nrows, n_points_to_predict, @@ -975,7 +1016,7 @@ def test_approximate_predict_circles( @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom"]) @pytest.mark.parametrize("connectivity", ["knn"]) -@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn"]) def test_approximate_predict_digits( n_points_to_predict, min_samples, @@ -1077,6 +1118,9 @@ def test_membership_vector_blobs( random_state=42, ) + graph_degree, intermediate_graph_degree, max_iters = get_graph_degree( + min_cluster_size + ) cuml_agg = HDBSCAN( verbose=logger.level_info, allow_single_cluster=allow_single_cluster, @@ -1086,6 +1130,11 @@ def test_membership_vector_blobs( cluster_selection_method=cluster_selection_method, prediction_data=True, build_algo=build_algo, + build_kwds={ + "nnd_graph_degree": graph_degree, + "nnd_intermediate_graph_degree": intermediate_graph_degree, + "nnd_max_iterations": max_iters, + }, ) cuml_agg.fit(X) @@ -1156,7 +1205,6 @@ def test_membership_vector_moons( cluster_selection_method=cluster_selection_method, prediction_data=True, build_algo=build_algo, - # build_kwds={"nnd_max_iterations":50}, ) cuml_agg.fit(X_train) @@ -1192,7 +1240,6 @@ def test_membership_vector_moons( @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) @pytest.mark.parametrize("batch_size", [16]) -@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_membership_vector_circles( nrows, n_points_to_predict, @@ -1204,7 +1251,6 @@ def test_membership_vector_circles( max_cluster_size, connectivity, batch_size, - build_algo, ): X, y = datasets.make_circles( n_samples=nrows + n_points_to_predict, @@ -1225,8 +1271,6 @@ def test_membership_vector_circles( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, prediction_data=True, - build_algo=build_algo, - # build_kwds={"nnd_max_iterations":50}, ) cuml_agg.fit(X_train) @@ -1248,6 +1292,4 @@ def test_membership_vector_circles( sk_membership_vectors = hdbscan.membership_vector(sk_agg, X_test).astype( "float32" ) - print(f"cu memberhsip vec: {cu_membership_vectors}") - print(f"sk memberhsip vec: {sk_membership_vectors}") assert_membership_vectors(cu_membership_vectors, sk_membership_vectors) From f2c3c920ddd546bfcd6ee13d26d95eb5fdbc6396 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Mon, 17 Jun 2024 16:15:32 +0000 Subject: [PATCH 05/25] remove and add comments --- cpp/src/hdbscan/detail/reachability.cuh | 26 +++++++------------------ 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh index 4225d57e1a..585593850e 100644 --- a/cpp/src/hdbscan/detail/reachability.cuh +++ b/cpp/src/hdbscan/detail/reachability.cuh @@ -37,7 +37,6 @@ #include #include -using align32 = raft::Pow2<32>; namespace NNDescent = raft::neighbors::experimental::nn_descent; namespace ML { @@ -74,7 +73,8 @@ void core_distances( }); } -// Functor to post-process distances into reachability space +// Functor to post-process distances by sqrt +// For usage with NN Descent which internally supports L2Expanded only template struct DistancePostProcessSqrt { DI value_t operator()(value_t value, value_idx row, value_idx col) const @@ -124,9 +124,6 @@ void compute_knn(const raft::handle_t& handle, std::vector sizes; sizes.push_back(m); - // // This is temporary. Once faiss is updated, we should be able to - // // pass value_idx through to knn. - // rmm::device_uvector int64_indices(k * n_search_items, stream); // perform knn brute_force_knn(handle, @@ -231,7 +228,8 @@ struct ReachabilityPostProcess { value_t alpha; }; -// Functor to post-process distances into reachability space +// Functor to post-process distances into reachability space (Sqrt) +// For usage with NN Descent which internally supports L2Expanded only template struct ReachabilityPostProcessSqrt { DI value_t operator()(value_t value, value_idx row, value_idx col) const @@ -387,7 +385,6 @@ void mutual_reachability_graph( rmm::device_uvector inds(min_samples * m, stream); rmm::device_uvector dists(min_samples * m, stream); - // printf("[JS] min samples: %d\n", min_samples); // perform knn compute_knn(handle, X, @@ -401,18 +398,10 @@ void mutual_reachability_graph( metric, build_algo, build_params); - // raft::print_device_vector("indices", inds.data(), 20, std::cout); - // raft::print_device_vector("distances", dists.data(), 20, std::cout); + // Slice core distances (distances to kth nearest neighbor) core_distances(dists.data(), min_samples, min_samples, m, core_dists, stream); - // raft::print_device_vector("core dists", core_dists, 20, std::cout); - - // raft::print_device_vector("dists for 4:", dists.data() + min_samples * 4, min_samples, - // std::cout); raft::print_device_vector("dists for 5:", dists.data() + min_samples * 5, - // min_samples, std::cout); raft::print_device_vector("dists for 14:", dists.data() + min_samples - // * 14, min_samples, std::cout); raft::print_device_vector("dists for 15:", dists.data() + - // min_samples * 15, min_samples, std::cout); raft::print_device_vector("dists for 16:", - // dists.data() + min_samples * 16, min_samples, std::cout); + /** * Compute L2 norm */ @@ -427,8 +416,7 @@ void mutual_reachability_graph( (value_t)1.0 / alpha, build_algo, build_params); - // raft::print_device_vector("indices after knnl2", inds.data(), 20, std::cout); - // raft::print_device_vector("distances after knnl2", dists.data(), 20, std::cout); + // self-loops get max distance auto coo_rows_counting_itr = thrust::make_counting_iterator(0); thrust::transform(exec_policy, From b461d31c5685eed390bd028a0cc07c935c42aece Mon Sep 17 00:00:00 2001 From: jinsolp Date: Tue, 18 Jun 2024 23:22:06 +0000 Subject: [PATCH 06/25] refine distances due to precision issues --- cpp/src/hdbscan/detail/reachability.cuh | 109 ++++++++++----------- python/cuml/cluster/hdbscan/hdbscan.pyx | 5 - python/cuml/tests/test_hdbscan.py | 120 +++++++++++++++++++++--- 3 files changed, 154 insertions(+), 80 deletions(-) diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh index 585593850e..99907897e5 100644 --- a/cpp/src/hdbscan/detail/reachability.cuh +++ b/cpp/src/hdbscan/detail/reachability.cuh @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -139,31 +140,46 @@ void compute_knn(const raft::handle_t& handle, true, metric); } else { // NN_DESCENT - auto epilogue = DistancePostProcessSqrt{}; - build_params.return_distances = true; RAFT_EXPECTS(static_cast(k) <= build_params.graph_degree, "n_neighbors should be smaller than the graph degree computed by nn descent"); auto dataset = raft::make_host_matrix_view(X, m, n); - auto graph = NNDescent::detail::build(handle, build_params, dataset, epilogue); + auto graph = NNDescent::detail::build(handle, build_params, dataset); + // NN Descent build does not include itself in nearest neighbors for (size_t i = 0; i < n_search_items; i++) { - if (graph.distances().has_value()) { - raft::copy(dists + i * k + 1, - graph.distances().value().data_handle() + i * build_params.graph_degree, - k - 1, - handle.get_stream()); - thrust::fill(thrust::device.on(stream), dists + i * k, dists + i * k + 1, 0.0); + for (size_t j = k - 1; j >= 1; j--) { + graph.graph().data_handle()[i * build_params.graph_degree + j] = + graph.graph().data_handle()[i * build_params.graph_degree + j - 1]; } - raft::copy(int64_indices.data() + i * k + 1, - graph.graph().data_handle() + i * build_params.graph_degree, - k - 1, - handle.get_stream()); - thrust::fill(thrust::device.on(stream), - int64_indices.data() + i * k, - int64_indices.data() + i * k + 1, - i); + graph.graph().data_handle()[i * build_params.graph_degree] = i; } + + auto dataset_dev = raft::make_device_matrix(handle, m, n); + raft::copy(dataset_dev.data_handle(), dataset.data_handle(), m * n, handle.get_stream()); + auto dataset_dev_view = raft::make_device_matrix_view( + dataset_dev.data_handle(), m, n); + + auto neighbor_candidates = raft::make_device_matrix( + handle, m, build_params.graph_degree); + raft::copy(neighbor_candidates.data_handle(), + graph.graph().data_handle(), + m * build_params.graph_degree, + handle.get_stream()); + auto neighbor_candidates_view = + raft::make_device_matrix_view( + neighbor_candidates.data_handle(), m, build_params.graph_degree); + + auto indices = + raft::make_device_matrix_view(int64_indices.data(), n_search_items, k); + auto distances = raft::make_device_matrix_view(dists, n_search_items, k); + raft::neighbors::refine(handle, + dataset_dev_view, + dataset_dev_view, + neighbor_candidates_view, + indices, + distances, + metric); } // convert from current knn's 64-bit to 32-bit. @@ -274,49 +290,22 @@ void mutual_reachability_knn_l2( // enclosing parent function (and any parent classes) of an extended __device__ // or __host__ __device__ lambda` - if (build_algo == Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN) { - auto epilogue = ReachabilityPostProcess{core_dists, alpha}; - auto X_view = raft::make_device_matrix_view(X, m, n); - std::vector> index = {X_view}; - - raft::neighbors::brute_force::knn( - handle, - index, - X_view, - raft::make_device_matrix_view(out_inds, m, static_cast(k)), - raft::make_device_matrix_view(out_dists, m, static_cast(k)), - // TODO: expand distance metrics to support more than just L2 distance - // https://github.com/rapidsai/cuml/issues/5301 - raft::distance::DistanceType::L2SqrtExpanded, - std::make_optional(2.0f), - std::nullopt, - epilogue); - } else { - auto epilogue = ReachabilityPostProcessSqrt{core_dists, alpha}; - build_params.return_distances = true; - RAFT_EXPECTS(static_cast(k) <= build_params.graph_degree, - "n_neighbors should be smaller than the graph degree computed by nn descent"); - - auto dataset = raft::make_host_matrix_view(X, m, n); - auto graph = - NNDescent::detail::build(handle, build_params, dataset, epilogue); - - for (size_t i = 0; i < m; i++) { - if (graph.distances().has_value()) { - raft::copy(out_dists + i * k + 1, - graph.distances().value().data_handle() + i * build_params.graph_degree, - k - 1, - handle.get_stream()); - raft::copy(out_dists + i * k, core_dists + i, 1, handle.get_stream()); - } - raft::copy(out_inds + i * k + 1, - graph.graph().data_handle() + i * build_params.graph_degree, - k - 1, - handle.get_stream()); - thrust::fill( - thrust::device.on(handle.get_stream()), out_inds + i * k, out_inds + i * k + 1, i); - } - } + auto epilogue = ReachabilityPostProcess{core_dists, alpha}; + auto X_view = raft::make_device_matrix_view(X, m, n); + std::vector> index = {X_view}; + + raft::neighbors::brute_force::knn( + handle, + index, + X_view, + raft::make_device_matrix_view(out_inds, m, static_cast(k)), + raft::make_device_matrix_view(out_dists, m, static_cast(k)), + // TODO: expand distance metrics to support more than just L2 distance + // https://github.com/rapidsai/cuml/issues/5301 + raft::distance::DistanceType::L2SqrtExpanded, + std::make_optional(2.0f), + std::nullopt, + epilogue); } /** diff --git a/python/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cluster/hdbscan/hdbscan.pyx index d6c6d670d3..23dc26e31b 100644 --- a/python/cuml/cluster/hdbscan/hdbscan.pyx +++ b/python/cuml/cluster/hdbscan/hdbscan.pyx @@ -52,7 +52,6 @@ IF GPUBUILD == 1: size_t intermediate_graph_degree, size_t max_iterations, float termination_threshold, - bool return_distances cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML::HDBSCAN::Common": @@ -861,13 +860,11 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): params.nn_descent_params.intermediate_graph_degree = 128 params.nn_descent_params.max_iterations = 20 params.nn_descent_params.termination_threshold = 0.0001 - params.nn_descent_params.return_distances = True else: params.nn_descent_params.graph_degree = self.build_kwds.get("nnd_graph_degree", 64) params.nn_descent_params.intermediate_graph_degree = self.build_kwds.get("nnd_intermediate_graph_degree", 128) params.nn_descent_params.max_iterations = self.build_kwds.get("nnd_max_iterations", 20) params.nn_descent_params.termination_threshold = self.build_kwds.get("nnd_termination_threshold", 0.0001) - params.nn_descent_params.return_distances = True else: raise ValueError("Build algo not supported. " "Must one of {'brute_force_knn', 'nn_descent'}") @@ -1123,13 +1120,11 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): build_params.intermediate_graph_degree = 128 build_params.max_iterations = 20 build_params.termination_threshold = 0.0001 - build_params.return_distances = True else: build_params.graph_degree = self.build_kwds.get("nnd_graph_degree", 64) build_params.intermediate_graph_degree = self.build_kwds.get("nnd_intermediate_graph_degree", 128) build_params.max_iterations = self.build_kwds.get("nnd_max_iterations", 20) build_params.termination_threshold = self.build_kwds.get("nnd_termination_threshold", 0.0001) - build_params.return_distances = True compute_core_dists(handle_[0], X_ptr, diff --git a/python/cuml/tests/test_hdbscan.py b/python/cuml/tests/test_hdbscan.py index 3e060e1647..d71fa88678 100644 --- a/python/cuml/tests/test_hdbscan.py +++ b/python/cuml/tests/test_hdbscan.py @@ -149,14 +149,19 @@ def assert_membership_vectors(cu_vecs, sk_vecs): cu_labels_sorted = np.argsort(cu_vecs)[::-1] sk_labels_sorted = np.argsort(sk_vecs)[::-1] - k = min(sk_vecs.shape[1], 10) - for i in range(k): + if len(sk_vecs.shape) == 1: assert ( - adjusted_rand_score( - cu_labels_sorted[:, i], sk_labels_sorted[:, i] - ) - >= 0.90 + adjusted_rand_score(cu_labels_sorted, sk_labels_sorted) >= 0.9 ) + else: + k = min(sk_vecs.shape[1], 10) + for i in range(k): + assert ( + adjusted_rand_score( + cu_labels_sorted[:, i], sk_labels_sorted[:, i] + ) + >= 0.9 + ) @pytest.mark.parametrize("nrows", [500]) @@ -804,7 +809,7 @@ def test_all_points_membership_vectors_circles( @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("allow_single_cluster", [True, False]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) -@pytest.mark.parametrize("build_algo", ["brute_force_knn"]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_approximate_predict_blobs( nrows, n_points_to_predict, @@ -844,6 +849,11 @@ def test_approximate_predict_blobs( cluster_selection_method=cluster_selection_method, prediction_data=True, build_algo=build_algo, + build_kwds={ + "nnd_max_iterations": 100, + "nnd_graph_degree": 96, + "nnd_intermediate_graph_degree": 128, + }, ) cuml_agg.fit(X) @@ -864,10 +874,13 @@ def test_approximate_predict_blobs( sk_labels, sk_probs = hdbscan.approximate_predict( sk_agg, points_to_predict ) - # print(f"cu labels: {cu_labels}\ncu probs: {cu_probs}") - # print(f"sk labels: {sk_labels}\ncu probs: {sk_probs}") - assert adjusted_rand_score(cu_labels, sk_labels) >= 0.95 - assert np.allclose(cu_probs, sk_probs, atol=0.05) + + if build_algo == "brute_force_knn": + assert adjusted_rand_score(cu_labels, sk_labels) >= 0.95 + assert np.allclose(cu_probs, sk_probs, atol=0.05) + else: + # this test case is not so stable for nn descent at this moment + assert adjusted_rand_score(cu_labels, sk_labels) >= 0.9 @pytest.mark.parametrize("nrows", [1000]) @@ -879,7 +892,7 @@ def test_approximate_predict_blobs( @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) -@pytest.mark.parametrize("build_algo", ["brute_force_knn"]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_approximate_predict_moons( nrows, n_points_to_predict, @@ -934,8 +947,15 @@ def test_approximate_predict_moons( sk_unique = np.unique(sk_labels) cu_unique = np.unique(cu_labels) if len(sk_unique) == len(cu_unique): - assert adjusted_rand_score(cu_labels, sk_labels) >= 0.99 - assert array_equal(cu_probs, sk_probs, unit_tol=0.05, total_tol=0.005) + if build_algo == "brute_force_knn": + assert adjusted_rand_score(cu_labels, sk_labels) >= 0.99 + assert array_equal( + cu_probs, sk_probs, unit_tol=0.05, total_tol=0.005 + ) + else: + # this test case is not so stable for nn descent at this moment + # a few configs result in scores around 0.85 + assert adjusted_rand_score(cu_labels, sk_labels) >= 0.8 @pytest.mark.parametrize("nrows", [1000]) @@ -947,7 +967,7 @@ def test_approximate_predict_moons( @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) -@pytest.mark.parametrize("build_algo", ["brute_force_knn"]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_approximate_predict_circles( nrows, n_points_to_predict, @@ -1293,3 +1313,73 @@ def test_membership_vector_circles( "float32" ) assert_membership_vectors(cu_membership_vectors, sk_membership_vectors) + + +@pytest.mark.parametrize("nrows", [1000]) +@pytest.mark.parametrize("n_points_to_predict", [1000]) +@pytest.mark.parametrize("min_samples", [20, 30]) +@pytest.mark.parametrize("min_cluster_size", [100, 150]) +@pytest.mark.parametrize("cluster_selection_epsilon", [0.0, 0.5]) +@pytest.mark.parametrize("allow_single_cluster", [True, False]) +@pytest.mark.parametrize("max_cluster_size", [0]) +@pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) +@pytest.mark.parametrize("connectivity", ["knn"]) +@pytest.mark.parametrize("batch_size", [16]) +@pytest.mark.parametrize("build_algo", ["nn_descent"]) +def test_membership_vector_circles_nnd( + nrows, + n_points_to_predict, + min_samples, + cluster_selection_epsilon, + cluster_selection_method, + min_cluster_size, + allow_single_cluster, + max_cluster_size, + connectivity, + batch_size, + build_algo, +): + X, y = datasets.make_circles( + n_samples=nrows + n_points_to_predict, + factor=0.8, + noise=0.05, + random_state=42, + ) + + X_train = X[:nrows] + X_test = X[nrows:] + + cuml_agg_nnd = HDBSCAN( + verbose=logger.level_info, + min_samples=min_samples, + allow_single_cluster=allow_single_cluster, + max_cluster_size=max_cluster_size, + min_cluster_size=min_cluster_size, + cluster_selection_epsilon=cluster_selection_epsilon, + cluster_selection_method=cluster_selection_method, + prediction_data=True, + build_algo=build_algo, + ) + cuml_agg_nnd.fit(X_train) + + cuml_agg_bf = HDBSCAN( + verbose=logger.level_info, + min_samples=min_samples, + allow_single_cluster=allow_single_cluster, + max_cluster_size=max_cluster_size, + min_cluster_size=min_cluster_size, + cluster_selection_epsilon=cluster_selection_epsilon, + cluster_selection_method=cluster_selection_method, + prediction_data=True, + ) + cuml_agg_bf.fit(X_train) + + cu_membership_vectors_nnd = membership_vector( + cuml_agg_nnd, X_test, batch_size + ) + cu_membership_vectors_bf = membership_vector( + cuml_agg_bf, X_test, batch_size + ) + assert_membership_vectors( + cu_membership_vectors_nnd, cu_membership_vectors_bf + ) From 5b15ce5b413413347208458c3434c7bdc9526235 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Tue, 18 Jun 2024 23:48:28 +0000 Subject: [PATCH 07/25] add return_distances for cdef --- python/cuml/cluster/hdbscan/hdbscan.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cluster/hdbscan/hdbscan.pyx index 23dc26e31b..08f76ab22c 100644 --- a/python/cuml/cluster/hdbscan/hdbscan.pyx +++ b/python/cuml/cluster/hdbscan/hdbscan.pyx @@ -52,6 +52,7 @@ IF GPUBUILD == 1: size_t intermediate_graph_degree, size_t max_iterations, float termination_threshold, + bool return_distances, cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML::HDBSCAN::Common": From 437406fe9523725b7e5e8cb84a655b395e66cae8 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Sat, 22 Jun 2024 23:02:14 +0000 Subject: [PATCH 08/25] Add to param names --- python/cuml/cluster/hdbscan/hdbscan.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cluster/hdbscan/hdbscan.pyx index 08f76ab22c..6377477afe 100644 --- a/python/cuml/cluster/hdbscan/hdbscan.pyx +++ b/python/cuml/cluster/hdbscan/hdbscan.pyx @@ -1183,7 +1183,9 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): "connectivity", "alpha", "gen_min_span_tree", - "prediction_data" + "prediction_data", + "build_algo", + "build_kwds" ] def get_attr_names(self): From 7683d36b6d43a0d6aa970f26b4564d241d2b7f06 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Sun, 23 Jun 2024 18:50:41 +0000 Subject: [PATCH 09/25] add documentation --- cpp/include/cuml/cluster/hdbscan.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/include/cuml/cluster/hdbscan.hpp b/cpp/include/cuml/cluster/hdbscan.hpp index 98f19e901c..0b98aeca86 100644 --- a/cpp/include/cuml/cluster/hdbscan.hpp +++ b/cpp/include/cuml/cluster/hdbscan.hpp @@ -501,6 +501,8 @@ namespace HDBSCAN::HELPER { * @param n number of columns in X * @param metric distance metric to use * @param min_samples minimum number of samples to use for computing core distances + * @param build_algo build algo for building the knn graph (default: brute_force_knn) + * @param build_params build parameters for build_algo */ void compute_core_dists( const raft::handle_t& handle, From 70b8835b93fff4327df3084d83f816afc381639e Mon Sep 17 00:00:00 2001 From: jinsolp Date: Fri, 12 Jul 2024 20:09:44 +0000 Subject: [PATCH 10/25] return distances param in python --- python/cuml/cluster/hdbscan/hdbscan.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cluster/hdbscan/hdbscan.pyx index 6377477afe..26c0eb7cd3 100644 --- a/python/cuml/cluster/hdbscan/hdbscan.pyx +++ b/python/cuml/cluster/hdbscan/hdbscan.pyx @@ -861,11 +861,13 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): params.nn_descent_params.intermediate_graph_degree = 128 params.nn_descent_params.max_iterations = 20 params.nn_descent_params.termination_threshold = 0.0001 + params.nn_descent_params.return_distances = True else: params.nn_descent_params.graph_degree = self.build_kwds.get("nnd_graph_degree", 64) params.nn_descent_params.intermediate_graph_degree = self.build_kwds.get("nnd_intermediate_graph_degree", 128) params.nn_descent_params.max_iterations = self.build_kwds.get("nnd_max_iterations", 20) params.nn_descent_params.termination_threshold = self.build_kwds.get("nnd_termination_threshold", 0.0001) + params.nn_descent_params.return_distances = self.build_kwds.get("nnd_return_distances", True) else: raise ValueError("Build algo not supported. " "Must one of {'brute_force_knn', 'nn_descent'}") From 152a8a3e6dbae6faf5446e0492a447a68136116e Mon Sep 17 00:00:00 2001 From: jinsolp Date: Fri, 12 Jul 2024 20:34:56 +0000 Subject: [PATCH 11/25] change test --- python/cuml/tests/test_hdbscan.py | 175 ++---------------------------- 1 file changed, 11 insertions(+), 164 deletions(-) diff --git a/python/cuml/tests/test_hdbscan.py b/python/cuml/tests/test_hdbscan.py index d71fa88678..94c2e0002b 100644 --- a/python/cuml/tests/test_hdbscan.py +++ b/python/cuml/tests/test_hdbscan.py @@ -47,8 +47,7 @@ def get_graph_degree(n_samples): graph_degree = max(int((1 + ((n_samples * 1.5) // 32)) * 32), 64) intermediate_graph_degree = int(1 + ((graph_degree * 1.3) // 32) * 32) - max_iters = max(n_samples // 2, 20) - return graph_degree, intermediate_graph_degree, max_iters + return graph_degree, intermediate_graph_degree def assert_cluster_counts(sk_agg, cuml_agg, digits=25): @@ -174,7 +173,6 @@ def assert_membership_vectors(cu_vecs, sk_vecs): @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) -@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_hdbscan_blobs( nrows, ncols, @@ -186,7 +184,6 @@ def test_hdbscan_blobs( min_cluster_size, max_cluster_size, min_samples, - build_algo, ): X, y = make_blobs( @@ -198,9 +195,6 @@ def test_hdbscan_blobs( random_state=42, ) - graph_degree, intermediate_graph_degree, max_iters = get_graph_degree( - min_samples - ) cuml_agg = HDBSCAN( verbose=logger.level_info, allow_single_cluster=allow_single_cluster, @@ -209,12 +203,6 @@ def test_hdbscan_blobs( min_cluster_size=min_cluster_size, cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, - build_algo=build_algo, - build_kwds={ - "nnd_graph_degree": graph_degree, - "nnd_intermediate_graph_degree": intermediate_graph_degree, - "nnd_max_iterations": max_iters, - }, ) cuml_agg.fit(X) @@ -256,7 +244,6 @@ def test_hdbscan_blobs( @pytest.mark.parametrize("allow_single_cluster", [True, False]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) -@pytest.mark.parametrize("build_algo", ["brute_force_knn"]) def test_hdbscan_sklearn_datasets( test_datasets, connectivity, @@ -275,9 +262,6 @@ def test_hdbscan_sklearn_datasets( X = test_datasets.data - graph_degree, intermediate_graph_degree, max_iters = get_graph_degree( - min_samples - ) cuml_agg = HDBSCAN( verbose=logger.level_info, allow_single_cluster=allow_single_cluster, @@ -287,12 +271,6 @@ def test_hdbscan_sklearn_datasets( min_cluster_size=min_cluster_size, cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, - build_algo=build_algo, - build_kwds={ - "nnd_graph_degree": graph_degree, - "nnd_intermediate_graph_degree": intermediate_graph_degree, - "nnd_max_iterations": max_iters, - }, ) cuml_agg.fit(X) @@ -331,7 +309,6 @@ def test_hdbscan_sklearn_datasets( @pytest.mark.parametrize("allow_single_cluster", [True, False]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) -@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_hdbscan_sklearn_extract_clusters( test_datasets, connectivity, @@ -341,12 +318,9 @@ def test_hdbscan_sklearn_extract_clusters( min_cluster_size, max_cluster_size, allow_single_cluster, - build_algo, ): X = test_datasets.data - graph_degree, intermediate_graph_degree, max_iters = get_graph_degree( - min_samples - ) + cuml_agg = HDBSCAN( verbose=logger.level_info, allow_single_cluster=allow_single_cluster, @@ -356,12 +330,6 @@ def test_hdbscan_sklearn_extract_clusters( min_cluster_size=min_cluster_size, cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, - build_algo=build_algo, - build_kwds={ - "nnd_graph_degree": graph_degree, - "nnd_intermediate_graph_degree": intermediate_graph_degree, - "nnd_max_iterations": max_iters, - }, ) sk_agg = hdbscan.HDBSCAN( @@ -475,9 +443,7 @@ def test_hdbscan_cluster_patterns_extract_clusters( # This also tests duplicate data points X, y = get_pattern(dataset, nrows)[0] - graph_degree, intermediate_graph_degree, max_iters = get_graph_degree( - min_samples - ) + graph_degree, intermediate_graph_degree = get_graph_degree(min_samples) cuml_agg = HDBSCAN( verbose=logger.level_info, allow_single_cluster=allow_single_cluster, @@ -490,7 +456,6 @@ def test_hdbscan_cluster_patterns_extract_clusters( build_kwds={ "nnd_graph_degree": graph_degree, "nnd_intermediate_graph_degree": intermediate_graph_degree, - "nnd_max_iterations": max_iters, }, ) @@ -578,8 +543,7 @@ def test_hdbscan_empty_cluster_tree(build_algo): assert np.sum(cuml_agg.labels_test.to_output("numpy")) == 0 -@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) -def test_hdbscan_plots(build_algo): +def test_hdbscan_plots(): X, y = make_blobs( n_samples=int(100), @@ -590,7 +554,7 @@ def test_hdbscan_plots(build_algo): random_state=42, ) - cuml_agg = HDBSCAN(gen_min_span_tree=True, build_algo=build_algo) + cuml_agg = HDBSCAN(gen_min_span_tree=True) cuml_agg.fit(X) assert cuml_agg.condensed_tree_ is not None @@ -612,7 +576,6 @@ def test_hdbscan_plots(build_algo): @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("batch_size", [128, 1000]) -@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_all_points_membership_vectors_blobs( nrows, ncols, @@ -623,7 +586,6 @@ def test_all_points_membership_vectors_blobs( allow_single_cluster, max_cluster_size, batch_size, - build_algo, ): X, y = make_blobs( n_samples=nrows, @@ -633,9 +595,6 @@ def test_all_points_membership_vectors_blobs( shuffle=True, random_state=42, ) - graph_degree, intermediate_graph_degree, max_iters = get_graph_degree( - min_cluster_size - ) cuml_agg = HDBSCAN( verbose=logger.level_info, allow_single_cluster=allow_single_cluster, @@ -644,12 +603,6 @@ def test_all_points_membership_vectors_blobs( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, prediction_data=True, - build_algo=build_algo, - build_kwds={ - "nnd_graph_degree": graph_degree, - "nnd_intermediate_graph_degree": intermediate_graph_degree, - "nnd_max_iterations": max_iters, - }, ) cuml_agg.fit(X) @@ -744,7 +697,6 @@ def test_all_points_membership_vectors_moons( @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) @pytest.mark.parametrize("batch_size", [128, 1000]) -@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_all_points_membership_vectors_circles( nrows, min_samples, @@ -755,7 +707,6 @@ def test_all_points_membership_vectors_circles( max_cluster_size, connectivity, batch_size, - build_algo, ): X, y = datasets.make_circles( n_samples=nrows, factor=0.5, noise=0.05, random_state=42 @@ -770,7 +721,6 @@ def test_all_points_membership_vectors_circles( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, prediction_data=True, - build_algo=build_algo, ) cuml_agg.fit(X) @@ -809,7 +759,6 @@ def test_all_points_membership_vectors_circles( @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("allow_single_cluster", [True, False]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) -@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_approximate_predict_blobs( nrows, n_points_to_predict, @@ -820,7 +769,6 @@ def test_approximate_predict_blobs( min_cluster_size, max_cluster_size, allow_single_cluster, - build_algo, ): X, y = make_blobs( n_samples=nrows, @@ -848,12 +796,6 @@ def test_approximate_predict_blobs( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, prediction_data=True, - build_algo=build_algo, - build_kwds={ - "nnd_max_iterations": 100, - "nnd_graph_degree": 96, - "nnd_intermediate_graph_degree": 128, - }, ) cuml_agg.fit(X) @@ -875,12 +817,8 @@ def test_approximate_predict_blobs( sk_agg, points_to_predict ) - if build_algo == "brute_force_knn": - assert adjusted_rand_score(cu_labels, sk_labels) >= 0.95 - assert np.allclose(cu_probs, sk_probs, atol=0.05) - else: - # this test case is not so stable for nn descent at this moment - assert adjusted_rand_score(cu_labels, sk_labels) >= 0.9 + assert adjusted_rand_score(cu_labels, sk_labels) >= 0.95 + assert np.allclose(cu_probs, sk_probs, atol=0.05) @pytest.mark.parametrize("nrows", [1000]) @@ -892,7 +830,6 @@ def test_approximate_predict_blobs( @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) -@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_approximate_predict_moons( nrows, n_points_to_predict, @@ -903,7 +840,6 @@ def test_approximate_predict_moons( max_cluster_size, cluster_selection_method, connectivity, - build_algo, ): X, y = datasets.make_moons( @@ -922,7 +858,6 @@ def test_approximate_predict_moons( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, prediction_data=True, - build_algo=build_algo, ) cuml_agg.fit(X_train) @@ -947,15 +882,8 @@ def test_approximate_predict_moons( sk_unique = np.unique(sk_labels) cu_unique = np.unique(cu_labels) if len(sk_unique) == len(cu_unique): - if build_algo == "brute_force_knn": - assert adjusted_rand_score(cu_labels, sk_labels) >= 0.99 - assert array_equal( - cu_probs, sk_probs, unit_tol=0.05, total_tol=0.005 - ) - else: - # this test case is not so stable for nn descent at this moment - # a few configs result in scores around 0.85 - assert adjusted_rand_score(cu_labels, sk_labels) >= 0.8 + assert adjusted_rand_score(cu_labels, sk_labels) >= 0.99 + assert array_equal(cu_probs, sk_probs, unit_tol=0.05, total_tol=0.005) @pytest.mark.parametrize("nrows", [1000]) @@ -967,7 +895,6 @@ def test_approximate_predict_moons( @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) -@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_approximate_predict_circles( nrows, n_points_to_predict, @@ -978,7 +905,6 @@ def test_approximate_predict_circles( max_cluster_size, cluster_selection_method, connectivity, - build_algo, ): X, y = datasets.make_circles( n_samples=nrows + n_points_to_predict, @@ -999,7 +925,6 @@ def test_approximate_predict_circles( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, prediction_data=True, - build_algo=build_algo, ) cuml_agg.fit(X_train) @@ -1036,7 +961,7 @@ def test_approximate_predict_circles( @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom"]) @pytest.mark.parametrize("connectivity", ["knn"]) -@pytest.mark.parametrize("build_algo", ["brute_force_knn"]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_approximate_predict_digits( n_points_to_predict, min_samples, @@ -1106,7 +1031,6 @@ def test_approximate_predict_digits( @pytest.mark.parametrize("allow_single_cluster", [True, False]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("batch_size", [128]) -@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_membership_vector_blobs( nrows, n_points_to_predict, @@ -1118,7 +1042,6 @@ def test_membership_vector_blobs( allow_single_cluster, max_cluster_size, batch_size, - build_algo, ): X, y = make_blobs( n_samples=nrows, @@ -1138,7 +1061,7 @@ def test_membership_vector_blobs( random_state=42, ) - graph_degree, intermediate_graph_degree, max_iters = get_graph_degree( + graph_degree, intermediate_graph_degree = get_graph_degree( min_cluster_size ) cuml_agg = HDBSCAN( @@ -1149,12 +1072,6 @@ def test_membership_vector_blobs( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, prediction_data=True, - build_algo=build_algo, - build_kwds={ - "nnd_graph_degree": graph_degree, - "nnd_intermediate_graph_degree": intermediate_graph_degree, - "nnd_max_iterations": max_iters, - }, ) cuml_agg.fit(X) @@ -1313,73 +1230,3 @@ def test_membership_vector_circles( "float32" ) assert_membership_vectors(cu_membership_vectors, sk_membership_vectors) - - -@pytest.mark.parametrize("nrows", [1000]) -@pytest.mark.parametrize("n_points_to_predict", [1000]) -@pytest.mark.parametrize("min_samples", [20, 30]) -@pytest.mark.parametrize("min_cluster_size", [100, 150]) -@pytest.mark.parametrize("cluster_selection_epsilon", [0.0, 0.5]) -@pytest.mark.parametrize("allow_single_cluster", [True, False]) -@pytest.mark.parametrize("max_cluster_size", [0]) -@pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) -@pytest.mark.parametrize("connectivity", ["knn"]) -@pytest.mark.parametrize("batch_size", [16]) -@pytest.mark.parametrize("build_algo", ["nn_descent"]) -def test_membership_vector_circles_nnd( - nrows, - n_points_to_predict, - min_samples, - cluster_selection_epsilon, - cluster_selection_method, - min_cluster_size, - allow_single_cluster, - max_cluster_size, - connectivity, - batch_size, - build_algo, -): - X, y = datasets.make_circles( - n_samples=nrows + n_points_to_predict, - factor=0.8, - noise=0.05, - random_state=42, - ) - - X_train = X[:nrows] - X_test = X[nrows:] - - cuml_agg_nnd = HDBSCAN( - verbose=logger.level_info, - min_samples=min_samples, - allow_single_cluster=allow_single_cluster, - max_cluster_size=max_cluster_size, - min_cluster_size=min_cluster_size, - cluster_selection_epsilon=cluster_selection_epsilon, - cluster_selection_method=cluster_selection_method, - prediction_data=True, - build_algo=build_algo, - ) - cuml_agg_nnd.fit(X_train) - - cuml_agg_bf = HDBSCAN( - verbose=logger.level_info, - min_samples=min_samples, - allow_single_cluster=allow_single_cluster, - max_cluster_size=max_cluster_size, - min_cluster_size=min_cluster_size, - cluster_selection_epsilon=cluster_selection_epsilon, - cluster_selection_method=cluster_selection_method, - prediction_data=True, - ) - cuml_agg_bf.fit(X_train) - - cu_membership_vectors_nnd = membership_vector( - cuml_agg_nnd, X_test, batch_size - ) - cu_membership_vectors_bf = membership_vector( - cuml_agg_bf, X_test, batch_size - ) - assert_membership_vectors( - cu_membership_vectors_nnd, cu_membership_vectors_bf - ) From f0cdd3c42fe69415ee890cf4eb024fff36c82d79 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Fri, 12 Jul 2024 21:13:18 +0000 Subject: [PATCH 12/25] add copy kernel --- cpp/src/hdbscan/detail/reachability.cuh | 167 ++++++++++++++++-------- 1 file changed, 116 insertions(+), 51 deletions(-) diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh index 99907897e5..c0af5dc292 100644 --- a/cpp/src/hdbscan/detail/reachability.cuh +++ b/cpp/src/hdbscan/detail/reachability.cuh @@ -17,14 +17,15 @@ #pragma once #include +#include #include +#include #include #include #include #include #include -#include #include #include #include @@ -84,6 +85,45 @@ struct DistancePostProcessSqrt { } }; +template +CUML_KERNEL void copy_first_k_cols_shift_self( + T* out, T* in, size_t out_k, size_t in_k, size_t nrows) +{ + size_t row = blockIdx.x * blockDim.x + threadIdx.x; + if (row < nrows) { + for (size_t i = 1; i < out_k; i++) { + out[row * out_k + i] = in[row * in_k + i - 1]; + } + out[row * out_k] = row; + } +} + +template +CUML_KERNEL void copy_first_k_cols_shift_zero( + T* out, T* in, size_t out_k, size_t in_k, size_t nrows) +{ + size_t row = blockIdx.x * blockDim.x + threadIdx.x; + if (row < nrows) { + for (size_t i = 1; i < out_k; i++) { + out[row * out_k + i] = in[row * in_k + i - 1]; + } + out[row * out_k] = static_cast(0); + } +} + +template +CUML_KERNEL void copy_first_k_cols_shift_core_dists( + T* out, T* in, T* core_dists, size_t out_k, size_t in_k, size_t nrows) +{ + size_t row = blockIdx.x * blockDim.x + threadIdx.x; + if (row < nrows) { + for (size_t i = 1; i < out_k; i++) { + out[row * out_k + i] = in[row * in_k + i - 1]; + } + out[row * out_k] = static_cast(core_dists[row]); + } +} + /** * Wraps the brute force knn API, to be used for both training and prediction * @tparam value_idx data type for integrals @@ -140,46 +180,37 @@ void compute_knn(const raft::handle_t& handle, true, metric); } else { // NN_DESCENT + auto epilogue = DistancePostProcessSqrt{}; + build_params.return_distances = true; RAFT_EXPECTS(static_cast(k) <= build_params.graph_degree, "n_neighbors should be smaller than the graph degree computed by nn descent"); auto dataset = raft::make_host_matrix_view(X, m, n); - auto graph = NNDescent::detail::build(handle, build_params, dataset); - - // NN Descent build does not include itself in nearest neighbors - for (size_t i = 0; i < n_search_items; i++) { - for (size_t j = k - 1; j >= 1; j--) { - graph.graph().data_handle()[i * build_params.graph_degree + j] = - graph.graph().data_handle()[i * build_params.graph_degree + j - 1]; - } - graph.graph().data_handle()[i * build_params.graph_degree] = i; - } - auto dataset_dev = raft::make_device_matrix(handle, m, n); - raft::copy(dataset_dev.data_handle(), dataset.data_handle(), m * n, handle.get_stream()); - auto dataset_dev_view = raft::make_device_matrix_view( - dataset_dev.data_handle(), m, n); + auto graph = NNDescent::detail::build(handle, build_params, dataset, epilogue); - auto neighbor_candidates = raft::make_device_matrix( - handle, m, build_params.graph_degree); - raft::copy(neighbor_candidates.data_handle(), - graph.graph().data_handle(), - m * build_params.graph_degree, - handle.get_stream()); - auto neighbor_candidates_view = - raft::make_device_matrix_view( - neighbor_candidates.data_handle(), m, build_params.graph_degree); - - auto indices = - raft::make_device_matrix_view(int64_indices.data(), n_search_items, k); - auto distances = raft::make_device_matrix_view(dists, n_search_items, k); - raft::neighbors::refine(handle, - dataset_dev_view, - dataset_dev_view, - neighbor_candidates_view, - indices, - distances, - metric); + size_t TPB = 256; + size_t num_blocks = static_cast((m + TPB) / TPB); + + auto indices_d = + raft::make_device_matrix(handle, m, build_params.graph_degree); + + raft::copy( + indices_d.data_handle(), graph.graph().data_handle(), m * build_params.graph_degree, stream); + + if (graph.distances().has_value()) { + copy_first_k_cols_shift_zero + <<>>(dists, + graph.distances().value().data_handle(), + static_cast(k), + build_params.graph_degree, + m); + } + copy_first_k_cols_shift_self<<>>(int64_indices.data(), + indices_d.data_handle(), + static_cast(k), + build_params.graph_degree, + m); } // convert from current knn's 64-bit to 32-bit. @@ -290,22 +321,56 @@ void mutual_reachability_knn_l2( // enclosing parent function (and any parent classes) of an extended __device__ // or __host__ __device__ lambda` - auto epilogue = ReachabilityPostProcess{core_dists, alpha}; - auto X_view = raft::make_device_matrix_view(X, m, n); - std::vector> index = {X_view}; - - raft::neighbors::brute_force::knn( - handle, - index, - X_view, - raft::make_device_matrix_view(out_inds, m, static_cast(k)), - raft::make_device_matrix_view(out_dists, m, static_cast(k)), - // TODO: expand distance metrics to support more than just L2 distance - // https://github.com/rapidsai/cuml/issues/5301 - raft::distance::DistanceType::L2SqrtExpanded, - std::make_optional(2.0f), - std::nullopt, - epilogue); + if (build_algo == Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN) { + auto epilogue = ReachabilityPostProcess{core_dists, alpha}; + auto X_view = raft::make_device_matrix_view(X, m, n); + std::vector> index = {X_view}; + + raft::neighbors::brute_force::knn( + handle, + index, + X_view, + raft::make_device_matrix_view(out_inds, m, static_cast(k)), + raft::make_device_matrix_view(out_dists, m, static_cast(k)), + // TODO: expand distance metrics to support more than just L2 distance + // https://github.com/rapidsai/cuml/issues/5301 + raft::distance::DistanceType::L2SqrtExpanded, + std::make_optional(2.0f), + std::nullopt, + epilogue); + } else { + auto epilogue = ReachabilityPostProcessSqrt{core_dists, alpha}; + build_params.return_distances = true; + RAFT_EXPECTS(static_cast(k) <= build_params.graph_degree, + "n_neighbors should be smaller than the graph degree computed by nn descent"); + + auto dataset = raft::make_host_matrix_view(X, m, n); + auto graph = + NNDescent::detail::build(handle, build_params, dataset, epilogue); + + size_t TPB = 256; + size_t num_blocks = static_cast((m + TPB) / TPB); + + auto indices_d = + raft::make_device_matrix(handle, m, build_params.graph_degree); + + raft::copy(indices_d.data_handle(), + graph.graph().data_handle(), + m * build_params.graph_degree, + handle.get_stream()); + + if (graph.distances().has_value()) { + copy_first_k_cols_shift_core_dists + <<>>(out_dists, + graph.distances().value().data_handle(), + core_dists, + static_cast(k), + build_params.graph_degree, + m); + } + copy_first_k_cols_shift_self<<>>( + out_inds, indices_d.data_handle(), static_cast(k), build_params.graph_degree, m); + } } /** From be5c167c413f9fdd30fdce935a4f9957f6453a5e Mon Sep 17 00:00:00 2001 From: jinsolp Date: Fri, 12 Jul 2024 23:57:24 +0000 Subject: [PATCH 13/25] remove build_algo in test --- python/cuml/tests/test_hdbscan.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cuml/tests/test_hdbscan.py b/python/cuml/tests/test_hdbscan.py index 94c2e0002b..ef60e964a8 100644 --- a/python/cuml/tests/test_hdbscan.py +++ b/python/cuml/tests/test_hdbscan.py @@ -251,7 +251,6 @@ def test_hdbscan_sklearn_datasets( cluster_selection_method, min_samples_cluster_size_bounds, allow_single_cluster, - build_algo, ): ( From e2735f552f2b986b4b25c063b2501291e5c098a0 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Wed, 24 Jul 2024 23:19:04 +0000 Subject: [PATCH 14/25] auto option as default for build_algo --- python/cuml/cuml/cluster/hdbscan/hdbscan.pyx | 22 +++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx index 26c0eb7cd3..c4934608aa 100644 --- a/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx +++ b/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx @@ -23,6 +23,7 @@ from cuml.internals.safe_imports import gpu_only_import cp = gpu_only_import('cupy') from warnings import warn +from cuml.internals import logger from cuml.internals.array import CumlArray from cuml.internals.base import UniversalBase from cuml.common.doc_utils import generate_docstring @@ -518,7 +519,7 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): connectivity='knn', output_type=None, prediction_data=False, - build_algo='brute_force_knn', + build_algo='auto', build_kwds=None): super().__init__(handle=handle, @@ -852,6 +853,15 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): raise ValueError("Cluster selection method not supported. " "Must one of {'eom', 'leaf'}") + if self.build_algo == "auto": + if self.n_rows <= 50000: + # brute force is faster for small datasets + logger.warn("Building knn graph using brute force") + self.build_algo = "brute_force_knn" + else: + logger.warn("Building knn graph using nn descent") + self.build_algo = "nn_descent" + if self.build_algo == 'brute_force_knn': params.build_algo = GRAPH_BUILD_ALGO.BRUTE_FORCE_KNN elif self.build_algo == 'nn_descent': @@ -1114,6 +1124,16 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): cdef GRAPH_BUILD_ALGO build_algo cdef index_params build_params + + if self.build_algo == "auto": + if self.n_rows <= 50000: + # brute force is faster for small datasets + logger.warn("Building knn graph using brute force") + self.build_algo = "brute_force_knn" + else: + logger.warn("Building knn graph using nn descent") + self.build_algo = "nn_descent" + if self.build_algo == 'brute_force_knn': build_algo = GRAPH_BUILD_ALGO.BRUTE_FORCE_KNN elif self.build_algo == 'nn_descent': From 746ac33786d59c85afda1e2d0563bf0968d24d9f Mon Sep 17 00:00:00 2001 From: jinsolp Date: Mon, 19 Aug 2024 20:49:36 +0000 Subject: [PATCH 15/25] use slice kernels --- cpp/src/hdbscan/detail/reachability.cuh | 52 +++++++++++-------------- 1 file changed, 22 insertions(+), 30 deletions(-) diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh index c0af5dc292..710da8ff6e 100644 --- a/cpp/src/hdbscan/detail/reachability.cuh +++ b/cpp/src/hdbscan/detail/reachability.cuh @@ -78,7 +78,7 @@ void core_distances( // Functor to post-process distances by sqrt // For usage with NN Descent which internally supports L2Expanded only template -struct DistancePostProcessSqrt { +struct DistancePostProcessSqrt : NNDescent::DistEpilogue { DI value_t operator()(value_t value, value_idx row, value_idx col) const { return powf(fabsf(value), 0.5); @@ -111,19 +111,6 @@ CUML_KERNEL void copy_first_k_cols_shift_zero( } } -template -CUML_KERNEL void copy_first_k_cols_shift_core_dists( - T* out, T* in, T* core_dists, size_t out_k, size_t in_k, size_t nrows) -{ - size_t row = blockIdx.x * blockDim.x + threadIdx.x; - if (row < nrows) { - for (size_t i = 1; i < out_k; i++) { - out[row * out_k + i] = in[row * in_k + i - 1]; - } - out[row * out_k] = static_cast(core_dists[row]); - } -} - /** * Wraps the brute force knn API, to be used for both training and prediction * @tparam value_idx data type for integrals @@ -278,11 +265,15 @@ struct ReachabilityPostProcess { // Functor to post-process distances into reachability space (Sqrt) // For usage with NN Descent which internally supports L2Expanded only template -struct ReachabilityPostProcessSqrt { +struct ReachabilityPostProcessSqrt : NNDescent::DistEpilogue { + ReachabilityPostProcessSqrt(value_t* core_dists_, value_t alpha_) + : NNDescent::DistEpilogue(), core_dists(core_dists_), alpha(alpha_){}; + DI value_t operator()(value_t value, value_idx row, value_idx col) const { return max(core_dists[col], max(core_dists[row], powf(fabsf(alpha * value), 0.5))); } + const value_t* core_dists; value_t alpha; }; @@ -339,7 +330,7 @@ void mutual_reachability_knn_l2( std::nullopt, epilogue); } else { - auto epilogue = ReachabilityPostProcessSqrt{core_dists, alpha}; + auto epilogue = ReachabilityPostProcessSqrt(core_dists, alpha); build_params.return_distances = true; RAFT_EXPECTS(static_cast(k) <= build_params.graph_degree, "n_neighbors should be smaller than the graph degree computed by nn descent"); @@ -348,9 +339,6 @@ void mutual_reachability_knn_l2( auto graph = NNDescent::detail::build(handle, build_params, dataset, epilogue); - size_t TPB = 256; - size_t num_blocks = static_cast((m + TPB) / TPB); - auto indices_d = raft::make_device_matrix(handle, m, build_params.graph_degree); @@ -359,17 +347,21 @@ void mutual_reachability_knn_l2( m * build_params.graph_degree, handle.get_stream()); - if (graph.distances().has_value()) { - copy_first_k_cols_shift_core_dists - <<>>(out_dists, - graph.distances().value().data_handle(), - core_dists, - static_cast(k), - build_params.graph_degree, - m); - } - copy_first_k_cols_shift_self<<>>( - out_inds, indices_d.data_handle(), static_cast(k), build_params.graph_degree, m); + RAFT_EXPECTS(graph.distances().has_value(), + "return_distances for nn descent should be set to true to be used for HDBSCAN"); + + raft::matrix::slice_coordinates coords{static_cast(0), + static_cast(0), + static_cast(m), + static_cast(k)}; + + auto out_knn_dists_view = raft::make_device_matrix_view(out_dists, m, (size_t)k); + raft::matrix::slice( + handle, raft::make_const_mdspan(graph.distances().value()), out_knn_dists_view, coords); + auto out_knn_indices_view = + raft::make_device_matrix_view(out_inds, m, (size_t)k); + raft::matrix::slice( + handle, raft::make_const_mdspan(indices_d.view()), out_knn_indices_view, coords); } } From 2b46746cbd37e6a196f7a1b69eed39864454146c Mon Sep 17 00:00:00 2001 From: jinsolp Date: Tue, 20 Aug 2024 20:02:31 +0000 Subject: [PATCH 16/25] tests --- python/cuml/cuml/tests/test_hdbscan.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/cuml/cuml/tests/test_hdbscan.py b/python/cuml/cuml/tests/test_hdbscan.py index ef60e964a8..a061f71d56 100644 --- a/python/cuml/cuml/tests/test_hdbscan.py +++ b/python/cuml/cuml/tests/test_hdbscan.py @@ -1060,9 +1060,6 @@ def test_membership_vector_blobs( random_state=42, ) - graph_degree, intermediate_graph_degree = get_graph_degree( - min_cluster_size - ) cuml_agg = HDBSCAN( verbose=logger.level_info, allow_single_cluster=allow_single_cluster, From 6bb63571e7481a10dcc81225d902870114cc726c Mon Sep 17 00:00:00 2001 From: jinsolp Date: Wed, 21 Aug 2024 16:42:26 +0000 Subject: [PATCH 17/25] make data view depending on host/dev --- cpp/src/hdbscan/detail/reachability.cuh | 38 ++++++++++++++++++------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh index 710da8ff6e..eae628f0a5 100644 --- a/cpp/src/hdbscan/detail/reachability.cuh +++ b/cpp/src/hdbscan/detail/reachability.cuh @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include @@ -111,6 +111,26 @@ CUML_KERNEL void copy_first_k_cols_shift_zero( } } +template +auto get_graph_nnd(const raft::handle_t& handle, + const value_t* X, + size_t m, + size_t n, + epilogue_op distance_epilogue, + Common::nn_index_params build_params) +{ + cudaPointerAttributes attr; + RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, X)); + float* ptr = reinterpret_cast(attr.devicePointer); + if (ptr != nullptr) { + auto dataset = raft::make_device_matrix_view(X, m, n); + return NNDescent::build(handle, build_params, dataset, distance_epilogue); + } else { + auto dataset = raft::make_host_matrix_view(X, m, n); + return NNDescent::build(handle, build_params, dataset, distance_epilogue); + } +} + /** * Wraps the brute force knn API, to be used for both training and prediction * @tparam value_idx data type for integrals @@ -167,14 +187,12 @@ void compute_knn(const raft::handle_t& handle, true, metric); } else { // NN_DESCENT - auto epilogue = DistancePostProcessSqrt{}; - build_params.return_distances = true; RAFT_EXPECTS(static_cast(k) <= build_params.graph_degree, "n_neighbors should be smaller than the graph degree computed by nn descent"); - auto dataset = raft::make_host_matrix_view(X, m, n); - - auto graph = NNDescent::detail::build(handle, build_params, dataset, epilogue); + auto epilogue = DistancePostProcessSqrt{}; + build_params.return_distances = true; + auto graph = get_graph_nnd(handle, X, m, n, epilogue, build_params); size_t TPB = 256; size_t num_blocks = static_cast((m + TPB) / TPB); @@ -330,14 +348,12 @@ void mutual_reachability_knn_l2( std::nullopt, epilogue); } else { - auto epilogue = ReachabilityPostProcessSqrt(core_dists, alpha); - build_params.return_distances = true; RAFT_EXPECTS(static_cast(k) <= build_params.graph_degree, "n_neighbors should be smaller than the graph degree computed by nn descent"); - auto dataset = raft::make_host_matrix_view(X, m, n); - auto graph = - NNDescent::detail::build(handle, build_params, dataset, epilogue); + auto epilogue = ReachabilityPostProcessSqrt(core_dists, alpha); + build_params.return_distances = true; + auto graph = get_graph_nnd(handle, X, m, n, epilogue, build_params); auto indices_d = raft::make_device_matrix(handle, m, build_params.graph_degree); From 3ec57c763647dab9635a33b26d6f63cb048170f9 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Wed, 21 Aug 2024 20:18:45 +0000 Subject: [PATCH 18/25] adding arg --- python/cuml/cuml/cluster/hdbscan/hdbscan.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx index c4934608aa..5a6e62f6d9 100644 --- a/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx +++ b/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx @@ -1143,11 +1143,13 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): build_params.intermediate_graph_degree = 128 build_params.max_iterations = 20 build_params.termination_threshold = 0.0001 + build_params.return_distances = True else: build_params.graph_degree = self.build_kwds.get("nnd_graph_degree", 64) build_params.intermediate_graph_degree = self.build_kwds.get("nnd_intermediate_graph_degree", 128) build_params.max_iterations = self.build_kwds.get("nnd_max_iterations", 20) build_params.termination_threshold = self.build_kwds.get("nnd_termination_threshold", 0.0001) + build_params.return_distances = self.build_kwds.get("nnd_return_distances", True) compute_core_dists(handle_[0], X_ptr, From d93aee6a77544a31ea29598eb55e8d49beac51a7 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Wed, 21 Aug 2024 20:19:43 +0000 Subject: [PATCH 19/25] for building + CI check --- cpp/cmake/thirdparty/get_raft.cmake | 4 ++-- cpp/src/umap/knn_graph/algo.cuh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake index 7bc860eed8..8ac1cee130 100644 --- a/cpp/cmake/thirdparty/get_raft.cmake +++ b/cpp/cmake/thirdparty/get_raft.cmake @@ -82,8 +82,8 @@ endfunction() # To use a different RAFT locally, set the CMake variable # CPM_raft_SOURCE=/path/to/local/raft find_and_configure_raft(VERSION ${CUML_MIN_VERSION_raft} - FORK rapidsai - PINNED_TAG branch-${CUML_BRANCH_VERSION_raft} + FORK jinsolp + PINNED_TAG batch-nnd EXCLUDE_FROM_ALL ${CUML_EXCLUDE_RAFT_FROM_ALL} # When PINNED_TAG above doesn't match cuml, # force local raft clone in build directory diff --git a/cpp/src/umap/knn_graph/algo.cuh b/cpp/src/umap/knn_graph/algo.cuh index f6284e6a91..92c717afcd 100644 --- a/cpp/src/umap/knn_graph/algo.cuh +++ b/cpp/src/umap/knn_graph/algo.cuh @@ -59,7 +59,7 @@ void launcher(const raft::handle_t& handle, // Functor to post-process distances as L2Sqrt* template -struct DistancePostProcessSqrt { +struct DistancePostProcessSqrt : NNDescent::DistEpilogue { DI value_t operator()(value_t value, value_idx row, value_idx col) const { return sqrtf(value); } }; From 7dbe38d2d93464668a9888ab91cea4b81a762209 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Thu, 22 Aug 2024 17:41:46 +0000 Subject: [PATCH 20/25] ann types fix --- python/cuml/cuml/cluster/hdbscan/hdbscan.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx index 5a6e62f6d9..afbfd34a6d 100644 --- a/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx +++ b/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx @@ -54,6 +54,7 @@ IF GPUBUILD == 1: size_t max_iterations, float termination_threshold, bool return_distances, + uint64_t n_clusters, cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML::HDBSCAN::Common": From 337aa8802dd4ca0be37405f87da920472e7ca9f5 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Thu, 22 Aug 2024 22:17:16 +0000 Subject: [PATCH 21/25] type fix --- python/cuml/cuml/cluster/hdbscan/hdbscan.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx index afbfd34a6d..37ef2ded3d 100644 --- a/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx +++ b/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx @@ -54,7 +54,7 @@ IF GPUBUILD == 1: size_t max_iterations, float termination_threshold, bool return_distances, - uint64_t n_clusters, + size_t n_clusters, cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML::HDBSCAN::Common": @@ -570,6 +570,8 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): self.prediction_data_ptr = None self._cpu_to_gpu_interop_prepped = False + logger.set_level(verbose) + @property def condensed_tree_(self): From 154bbda8a167859e9f4e86d306ffdb94022f9586 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Fri, 23 Aug 2024 00:18:37 +0000 Subject: [PATCH 22/25] change if to raft_expects --- cpp/src/hdbscan/detail/reachability.cuh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh index eae628f0a5..c5910a6213 100644 --- a/cpp/src/hdbscan/detail/reachability.cuh +++ b/cpp/src/hdbscan/detail/reachability.cuh @@ -203,14 +203,14 @@ void compute_knn(const raft::handle_t& handle, raft::copy( indices_d.data_handle(), graph.graph().data_handle(), m * build_params.graph_degree, stream); - if (graph.distances().has_value()) { - copy_first_k_cols_shift_zero - <<>>(dists, - graph.distances().value().data_handle(), - static_cast(k), - build_params.graph_degree, - m); - } + RAFT_EXPECTS(graph.distances().has_value(), + "return_distances for nn descent should be set to true to be used for HDBSCAN"); + copy_first_k_cols_shift_zero + <<>>(dists, + graph.distances().value().data_handle(), + static_cast(k), + build_params.graph_degree, + m); copy_first_k_cols_shift_self<<>>(int64_indices.data(), indices_d.data_handle(), static_cast(k), From 6083812707bc6d0975ce38ffdf7311b841ec5f14 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Fri, 23 Aug 2024 00:27:43 +0000 Subject: [PATCH 23/25] revert fork and pinned tag --- cpp/cmake/thirdparty/get_raft.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake index 8ac1cee130..7bc860eed8 100644 --- a/cpp/cmake/thirdparty/get_raft.cmake +++ b/cpp/cmake/thirdparty/get_raft.cmake @@ -82,8 +82,8 @@ endfunction() # To use a different RAFT locally, set the CMake variable # CPM_raft_SOURCE=/path/to/local/raft find_and_configure_raft(VERSION ${CUML_MIN_VERSION_raft} - FORK jinsolp - PINNED_TAG batch-nnd + FORK rapidsai + PINNED_TAG branch-${CUML_BRANCH_VERSION_raft} EXCLUDE_FROM_ALL ${CUML_EXCLUDE_RAFT_FROM_ALL} # When PINNED_TAG above doesn't match cuml, # force local raft clone in build directory From 732a06eb552f42d68303869512ae2a97d93ac1fd Mon Sep 17 00:00:00 2001 From: jinsolp Date: Fri, 23 Aug 2024 03:37:57 +0000 Subject: [PATCH 24/25] Trigger CI From 904ab1b79050869ca931d7c0c500c9c5f812c3de Mon Sep 17 00:00:00 2001 From: soleee99 Date: Sun, 22 Sep 2024 03:28:57 +0000 Subject: [PATCH 25/25] change to switch --- cpp/src/hdbscan/detail/reachability.cuh | 210 +++++++++++++----------- 1 file changed, 112 insertions(+), 98 deletions(-) diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh index c5910a6213..9538536723 100644 --- a/cpp/src/hdbscan/detail/reachability.cuh +++ b/cpp/src/hdbscan/detail/reachability.cuh @@ -166,56 +166,65 @@ void compute_knn(const raft::handle_t& handle, // pass value_idx through to knn. rmm::device_uvector int64_indices(k * n_search_items, stream); - if (build_algo == Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN) { - std::vector inputs; - inputs.push_back(const_cast(X)); - - std::vector sizes; - sizes.push_back(m); - - // perform knn - brute_force_knn(handle, - inputs, - sizes, - n, - const_cast(search_items), - n_search_items, - int64_indices.data(), - dists, - k, - true, - true, - metric); - } else { // NN_DESCENT - RAFT_EXPECTS(static_cast(k) <= build_params.graph_degree, - "n_neighbors should be smaller than the graph degree computed by nn descent"); - - auto epilogue = DistancePostProcessSqrt{}; - build_params.return_distances = true; - auto graph = get_graph_nnd(handle, X, m, n, epilogue, build_params); - - size_t TPB = 256; - size_t num_blocks = static_cast((m + TPB) / TPB); - - auto indices_d = - raft::make_device_matrix(handle, m, build_params.graph_degree); - - raft::copy( - indices_d.data_handle(), graph.graph().data_handle(), m * build_params.graph_degree, stream); - - RAFT_EXPECTS(graph.distances().has_value(), - "return_distances for nn descent should be set to true to be used for HDBSCAN"); - copy_first_k_cols_shift_zero - <<>>(dists, - graph.distances().value().data_handle(), - static_cast(k), - build_params.graph_degree, - m); - copy_first_k_cols_shift_self<<>>(int64_indices.data(), - indices_d.data_handle(), - static_cast(k), - build_params.graph_degree, - m); + switch (build_algo) { + case Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN: { + std::vector inputs; + inputs.push_back(const_cast(X)); + + std::vector sizes; + sizes.push_back(m); + + // perform knn + brute_force_knn(handle, + inputs, + sizes, + n, + const_cast(search_items), + n_search_items, + int64_indices.data(), + dists, + k, + true, + true, + metric); + break; + } + + case Common::GRAPH_BUILD_ALGO::NN_DESCENT: { + RAFT_EXPECTS(static_cast(k) <= build_params.graph_degree, + "n_neighbors should be smaller than the graph degree computed by nn descent"); + + auto epilogue = DistancePostProcessSqrt{}; + build_params.return_distances = true; + auto graph = get_graph_nnd(handle, X, m, n, epilogue, build_params); + + size_t TPB = 256; + size_t num_blocks = static_cast((m + TPB) / TPB); + + auto indices_d = + raft::make_device_matrix(handle, m, build_params.graph_degree); + + raft::copy(indices_d.data_handle(), + graph.graph().data_handle(), + m * build_params.graph_degree, + stream); + + RAFT_EXPECTS(graph.distances().has_value(), + "return_distances for nn descent should be set to true to be used for HDBSCAN"); + copy_first_k_cols_shift_zero + <<>>(dists, + graph.distances().value().data_handle(), + static_cast(k), + build_params.graph_degree, + m); + copy_first_k_cols_shift_self + <<>>(int64_indices.data(), + indices_d.data_handle(), + static_cast(k), + build_params.graph_degree, + m); + break; + } } // convert from current knn's 64-bit to 32-bit. @@ -329,55 +338,60 @@ void mutual_reachability_knn_l2( // `A type local to a function cannot be used in the template argument of the // enclosing parent function (and any parent classes) of an extended __device__ // or __host__ __device__ lambda` + switch (build_algo) { + case Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN: { + auto epilogue = ReachabilityPostProcess{core_dists, alpha}; + auto X_view = raft::make_device_matrix_view(X, m, n); + std::vector> index = {X_view}; + + raft::neighbors::brute_force::knn( + handle, + index, + X_view, + raft::make_device_matrix_view(out_inds, m, static_cast(k)), + raft::make_device_matrix_view(out_dists, m, static_cast(k)), + // TODO: expand distance metrics to support more than just L2 distance + // https://github.com/rapidsai/cuml/issues/5301 + raft::distance::DistanceType::L2SqrtExpanded, + std::make_optional(2.0f), + std::nullopt, + epilogue); + break; + } - if (build_algo == Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN) { - auto epilogue = ReachabilityPostProcess{core_dists, alpha}; - auto X_view = raft::make_device_matrix_view(X, m, n); - std::vector> index = {X_view}; - - raft::neighbors::brute_force::knn( - handle, - index, - X_view, - raft::make_device_matrix_view(out_inds, m, static_cast(k)), - raft::make_device_matrix_view(out_dists, m, static_cast(k)), - // TODO: expand distance metrics to support more than just L2 distance - // https://github.com/rapidsai/cuml/issues/5301 - raft::distance::DistanceType::L2SqrtExpanded, - std::make_optional(2.0f), - std::nullopt, - epilogue); - } else { - RAFT_EXPECTS(static_cast(k) <= build_params.graph_degree, - "n_neighbors should be smaller than the graph degree computed by nn descent"); - - auto epilogue = ReachabilityPostProcessSqrt(core_dists, alpha); - build_params.return_distances = true; - auto graph = get_graph_nnd(handle, X, m, n, epilogue, build_params); - - auto indices_d = - raft::make_device_matrix(handle, m, build_params.graph_degree); - - raft::copy(indices_d.data_handle(), - graph.graph().data_handle(), - m * build_params.graph_degree, - handle.get_stream()); - - RAFT_EXPECTS(graph.distances().has_value(), - "return_distances for nn descent should be set to true to be used for HDBSCAN"); - - raft::matrix::slice_coordinates coords{static_cast(0), - static_cast(0), - static_cast(m), - static_cast(k)}; - - auto out_knn_dists_view = raft::make_device_matrix_view(out_dists, m, (size_t)k); - raft::matrix::slice( - handle, raft::make_const_mdspan(graph.distances().value()), out_knn_dists_view, coords); - auto out_knn_indices_view = - raft::make_device_matrix_view(out_inds, m, (size_t)k); - raft::matrix::slice( - handle, raft::make_const_mdspan(indices_d.view()), out_knn_indices_view, coords); + case Common::GRAPH_BUILD_ALGO::NN_DESCENT: { + RAFT_EXPECTS(static_cast(k) <= build_params.graph_degree, + "n_neighbors should be smaller than the graph degree computed by nn descent"); + + auto epilogue = ReachabilityPostProcessSqrt(core_dists, alpha); + build_params.return_distances = true; + auto graph = get_graph_nnd(handle, X, m, n, epilogue, build_params); + + auto indices_d = + raft::make_device_matrix(handle, m, build_params.graph_degree); + + raft::copy(indices_d.data_handle(), + graph.graph().data_handle(), + m * build_params.graph_degree, + handle.get_stream()); + + RAFT_EXPECTS(graph.distances().has_value(), + "return_distances for nn descent should be set to true to be used for HDBSCAN"); + + raft::matrix::slice_coordinates coords{static_cast(0), + static_cast(0), + static_cast(m), + static_cast(k)}; + + auto out_knn_dists_view = raft::make_device_matrix_view(out_dists, m, (size_t)k); + raft::matrix::slice( + handle, raft::make_const_mdspan(graph.distances().value()), out_knn_dists_view, coords); + auto out_knn_indices_view = + raft::make_device_matrix_view(out_inds, m, (size_t)k); + raft::matrix::slice( + handle, raft::make_const_mdspan(indices_d.view()), out_knn_indices_view, coords); + break; + } } }