diff --git a/cpp/include/cuml/cluster/hdbscan.hpp b/cpp/include/cuml/cluster/hdbscan.hpp index eb1223fd88..0b98aeca86 100644 --- a/cpp/include/cuml/cluster/hdbscan.hpp +++ b/cpp/include/cuml/cluster/hdbscan.hpp @@ -18,6 +18,7 @@ #include #include +#include #include @@ -27,6 +28,8 @@ namespace ML { namespace HDBSCAN { namespace Common { +using nn_index_params = raft::neighbors::experimental::nn_descent::index_params; + /** * The Condensed hierarchicy is represented by an edge list with * parents as the source vertices, children as the destination, @@ -134,6 +137,7 @@ class CondensedHierarchy { }; enum CLUSTER_SELECTION_METHOD { EOM = 0, LEAF = 1 }; +enum GRAPH_BUILD_ALGO { BRUTE_FORCE_KNN = 0, NN_DESCENT = 1 }; class RobustSingleLinkageParams { public: @@ -151,6 +155,8 @@ class RobustSingleLinkageParams { class HDBSCANParams : public RobustSingleLinkageParams { public: CLUSTER_SELECTION_METHOD cluster_selection_method = CLUSTER_SELECTION_METHOD::EOM; + GRAPH_BUILD_ALGO build_algo = GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN; + nn_index_params nn_descent_params = {}; }; /** @@ -495,14 +501,19 @@ namespace HDBSCAN::HELPER { * @param n number of columns in X * @param metric distance metric to use * @param min_samples minimum number of samples to use for computing core distances + * @param build_algo build algo for building the knn graph (default: brute_force_knn) + * @param build_params build parameters for build_algo */ -void compute_core_dists(const raft::handle_t& handle, - const float* X, - float* core_dists, - size_t m, - size_t n, - raft::distance::DistanceType metric, - int min_samples); +void compute_core_dists( + const raft::handle_t& handle, + const float* X, + float* core_dists, + size_t m, + size_t n, + raft::distance::DistanceType metric, + int min_samples, + HDBSCAN::Common::GRAPH_BUILD_ALGO build_algo = HDBSCAN::Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN, + HDBSCAN::Common::nn_index_params build_params = Common::nn_index_params{}); /** * @brief Compute the map from final, normalize labels to the labels in the CondensedHierarchy diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh index 03a7f7c0ad..9538536723 100644 --- a/cpp/src/hdbscan/detail/reachability.cuh +++ b/cpp/src/hdbscan/detail/reachability.cuh @@ -16,11 +16,16 @@ #pragma once +#include +#include #include +#include #include #include #include +#include +#include #include #include #include @@ -34,6 +39,8 @@ #include #include +namespace NNDescent = raft::neighbors::experimental::nn_descent; + namespace ML { namespace HDBSCAN { namespace detail { @@ -68,6 +75,62 @@ void core_distances( }); } +// Functor to post-process distances by sqrt +// For usage with NN Descent which internally supports L2Expanded only +template +struct DistancePostProcessSqrt : NNDescent::DistEpilogue { + DI value_t operator()(value_t value, value_idx row, value_idx col) const + { + return powf(fabsf(value), 0.5); + } +}; + +template +CUML_KERNEL void copy_first_k_cols_shift_self( + T* out, T* in, size_t out_k, size_t in_k, size_t nrows) +{ + size_t row = blockIdx.x * blockDim.x + threadIdx.x; + if (row < nrows) { + for (size_t i = 1; i < out_k; i++) { + out[row * out_k + i] = in[row * in_k + i - 1]; + } + out[row * out_k] = row; + } +} + +template +CUML_KERNEL void copy_first_k_cols_shift_zero( + T* out, T* in, size_t out_k, size_t in_k, size_t nrows) +{ + size_t row = blockIdx.x * blockDim.x + threadIdx.x; + if (row < nrows) { + for (size_t i = 1; i < out_k; i++) { + out[row * out_k + i] = in[row * in_k + i - 1]; + } + out[row * out_k] = static_cast(0); + } +} + +template +auto get_graph_nnd(const raft::handle_t& handle, + const value_t* X, + size_t m, + size_t n, + epilogue_op distance_epilogue, + Common::nn_index_params build_params) +{ + cudaPointerAttributes attr; + RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, X)); + float* ptr = reinterpret_cast(attr.devicePointer); + if (ptr != nullptr) { + auto dataset = raft::make_device_matrix_view(X, m, n); + return NNDescent::build(handle, build_params, dataset, distance_epilogue); + } else { + auto dataset = raft::make_host_matrix_view(X, m, n); + return NNDescent::build(handle, build_params, dataset, distance_epilogue); + } +} + /** * Wraps the brute force knn API, to be used for both training and prediction * @tparam value_idx data type for integrals @@ -93,33 +156,76 @@ void compute_knn(const raft::handle_t& handle, const value_t* search_items, size_t n_search_items, int k, - raft::distance::DistanceType metric) + raft::distance::DistanceType metric, + Common::GRAPH_BUILD_ALGO build_algo = Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN, + Common::nn_index_params build_params = Common::nn_index_params{}) { auto stream = handle.get_stream(); auto exec_policy = handle.get_thrust_policy(); - std::vector inputs; - inputs.push_back(const_cast(X)); - - std::vector sizes; - sizes.push_back(m); - // This is temporary. Once faiss is updated, we should be able to // pass value_idx through to knn. rmm::device_uvector int64_indices(k * n_search_items, stream); - // perform knn - brute_force_knn(handle, - inputs, - sizes, - n, - const_cast(search_items), - n_search_items, - int64_indices.data(), - dists, - k, - true, - true, - metric); + switch (build_algo) { + case Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN: { + std::vector inputs; + inputs.push_back(const_cast(X)); + + std::vector sizes; + sizes.push_back(m); + + // perform knn + brute_force_knn(handle, + inputs, + sizes, + n, + const_cast(search_items), + n_search_items, + int64_indices.data(), + dists, + k, + true, + true, + metric); + break; + } + + case Common::GRAPH_BUILD_ALGO::NN_DESCENT: { + RAFT_EXPECTS(static_cast(k) <= build_params.graph_degree, + "n_neighbors should be smaller than the graph degree computed by nn descent"); + + auto epilogue = DistancePostProcessSqrt{}; + build_params.return_distances = true; + auto graph = get_graph_nnd(handle, X, m, n, epilogue, build_params); + + size_t TPB = 256; + size_t num_blocks = static_cast((m + TPB) / TPB); + + auto indices_d = + raft::make_device_matrix(handle, m, build_params.graph_degree); + + raft::copy(indices_d.data_handle(), + graph.graph().data_handle(), + m * build_params.graph_degree, + stream); + + RAFT_EXPECTS(graph.distances().has_value(), + "return_distances for nn descent should be set to true to be used for HDBSCAN"); + copy_first_k_cols_shift_zero + <<>>(dists, + graph.distances().value().data_handle(), + static_cast(k), + build_params.graph_degree, + m); + copy_first_k_cols_shift_self + <<>>(int64_indices.data(), + indices_d.data_handle(), + static_cast(k), + build_params.graph_degree, + m); + break; + } + } // convert from current knn's 64-bit to 32-bit. thrust::transform(exec_policy, @@ -134,13 +240,16 @@ void compute_knn(const raft::handle_t& handle, to compute core_dists */ template -void _compute_core_dists(const raft::handle_t& handle, - const value_t* X, - value_t* core_dists, - size_t m, - size_t n, - raft::distance::DistanceType metric, - int min_samples) +void _compute_core_dists( + const raft::handle_t& handle, + const value_t* X, + value_t* core_dists, + size_t m, + size_t n, + raft::distance::DistanceType metric, + int min_samples, + Common::GRAPH_BUILD_ALGO build_algo = Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN, + Common::nn_index_params build_params = Common::nn_index_params{}) { RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded, "Currently only L2 expanded distance is supported"); @@ -151,7 +260,18 @@ void _compute_core_dists(const raft::handle_t& handle, rmm::device_uvector dists(min_samples * m, stream); // perform knn - compute_knn(handle, X, inds.data(), dists.data(), m, n, X, m, min_samples, metric); + compute_knn(handle, + X, + inds.data(), + dists.data(), + m, + n, + X, + m, + min_samples, + metric, + build_algo, + build_params); // Slice core distances (distances to kth nearest neighbor) core_distances(dists.data(), min_samples, min_samples, m, core_dists, stream); @@ -169,6 +289,22 @@ struct ReachabilityPostProcess { value_t alpha; }; +// Functor to post-process distances into reachability space (Sqrt) +// For usage with NN Descent which internally supports L2Expanded only +template +struct ReachabilityPostProcessSqrt : NNDescent::DistEpilogue { + ReachabilityPostProcessSqrt(value_t* core_dists_, value_t alpha_) + : NNDescent::DistEpilogue(), core_dists(core_dists_), alpha(alpha_){}; + + DI value_t operator()(value_t value, value_idx row, value_idx col) const + { + return max(core_dists[col], max(core_dists[row], powf(fabsf(alpha * value), 0.5))); + } + + const value_t* core_dists; + value_t alpha; +}; + /** * Given core distances, Fuses computations of L2 distances between all * points, projection into mutual reachability space, and k-selection. @@ -184,38 +320,79 @@ struct ReachabilityPostProcess { * @param[in] core_dists array of core distances (size m) */ template -void mutual_reachability_knn_l2(const raft::handle_t& handle, - value_idx* out_inds, - value_t* out_dists, - const value_t* X, - size_t m, - size_t n, - int k, - value_t* core_dists, - value_t alpha) +void mutual_reachability_knn_l2( + const raft::handle_t& handle, + value_idx* out_inds, + value_t* out_dists, + const value_t* X, + size_t m, + size_t n, + int k, + value_t* core_dists, + value_t alpha, + Common::GRAPH_BUILD_ALGO build_algo = Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN, + Common::nn_index_params build_params = Common::nn_index_params{}) { // Create a functor to postprocess distances into mutual reachability space // Note that we can't use a lambda for this here, since we get errors like: // `A type local to a function cannot be used in the template argument of the // enclosing parent function (and any parent classes) of an extended __device__ // or __host__ __device__ lambda` - auto epilogue = ReachabilityPostProcess{core_dists, alpha}; - - auto X_view = raft::make_device_matrix_view(X, m, n); - std::vector> index = {X_view}; - - raft::neighbors::brute_force::knn( - handle, - index, - X_view, - raft::make_device_matrix_view(out_inds, m, static_cast(k)), - raft::make_device_matrix_view(out_dists, m, static_cast(k)), - // TODO: expand distance metrics to support more than just L2 distance - // https://github.com/rapidsai/cuml/issues/5301 - raft::distance::DistanceType::L2SqrtExpanded, - std::make_optional(2.0f), - std::nullopt, - epilogue); + switch (build_algo) { + case Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN: { + auto epilogue = ReachabilityPostProcess{core_dists, alpha}; + auto X_view = raft::make_device_matrix_view(X, m, n); + std::vector> index = {X_view}; + + raft::neighbors::brute_force::knn( + handle, + index, + X_view, + raft::make_device_matrix_view(out_inds, m, static_cast(k)), + raft::make_device_matrix_view(out_dists, m, static_cast(k)), + // TODO: expand distance metrics to support more than just L2 distance + // https://github.com/rapidsai/cuml/issues/5301 + raft::distance::DistanceType::L2SqrtExpanded, + std::make_optional(2.0f), + std::nullopt, + epilogue); + break; + } + + case Common::GRAPH_BUILD_ALGO::NN_DESCENT: { + RAFT_EXPECTS(static_cast(k) <= build_params.graph_degree, + "n_neighbors should be smaller than the graph degree computed by nn descent"); + + auto epilogue = ReachabilityPostProcessSqrt(core_dists, alpha); + build_params.return_distances = true; + auto graph = get_graph_nnd(handle, X, m, n, epilogue, build_params); + + auto indices_d = + raft::make_device_matrix(handle, m, build_params.graph_degree); + + raft::copy(indices_d.data_handle(), + graph.graph().data_handle(), + m * build_params.graph_degree, + handle.get_stream()); + + RAFT_EXPECTS(graph.distances().has_value(), + "return_distances for nn descent should be set to true to be used for HDBSCAN"); + + raft::matrix::slice_coordinates coords{static_cast(0), + static_cast(0), + static_cast(m), + static_cast(k)}; + + auto out_knn_dists_view = raft::make_device_matrix_view(out_dists, m, (size_t)k); + raft::matrix::slice( + handle, raft::make_const_mdspan(graph.distances().value()), out_knn_dists_view, coords); + auto out_knn_indices_view = + raft::make_device_matrix_view(out_inds, m, (size_t)k); + raft::matrix::slice( + handle, raft::make_const_mdspan(indices_d.view()), out_knn_indices_view, coords); + break; + } + } } /** @@ -260,16 +437,19 @@ void mutual_reachability_knn_l2(const raft::handle_t& handle, * neighbors. */ template -void mutual_reachability_graph(const raft::handle_t& handle, - const value_t* X, - size_t m, - size_t n, - raft::distance::DistanceType metric, - int min_samples, - value_t alpha, - value_idx* indptr, - value_t* core_dists, - raft::sparse::COO& out) +void mutual_reachability_graph( + const raft::handle_t& handle, + const value_t* X, + size_t m, + size_t n, + raft::distance::DistanceType metric, + int min_samples, + value_t alpha, + value_idx* indptr, + value_t* core_dists, + raft::sparse::COO& out, + Common::GRAPH_BUILD_ALGO build_algo = Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN, + Common::nn_index_params build_params = Common::nn_index_params{}) { RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded, "Currently only L2 expanded distance is supported"); @@ -282,7 +462,18 @@ void mutual_reachability_graph(const raft::handle_t& handle, rmm::device_uvector dists(min_samples * m, stream); // perform knn - compute_knn(handle, X, inds.data(), dists.data(), m, n, X, m, min_samples, metric); + compute_knn(handle, + X, + inds.data(), + dists.data(), + m, + n, + X, + m, + min_samples, + metric, + build_algo, + build_params); // Slice core distances (distances to kth nearest neighbor) core_distances(dists.data(), min_samples, min_samples, m, core_dists, stream); @@ -290,8 +481,17 @@ void mutual_reachability_graph(const raft::handle_t& handle, /** * Compute L2 norm */ - mutual_reachability_knn_l2( - handle, inds.data(), dists.data(), X, m, n, min_samples, core_dists, (value_t)1.0 / alpha); + mutual_reachability_knn_l2(handle, + inds.data(), + dists.data(), + X, + m, + n, + min_samples, + core_dists, + (value_t)1.0 / alpha, + build_algo, + build_params); // self-loops get max distance auto coo_rows_counting_itr = thrust::make_counting_iterator(0); diff --git a/cpp/src/hdbscan/hdbscan.cu b/cpp/src/hdbscan/hdbscan.cu index ea64d20f6b..32ef78b470 100644 --- a/cpp/src/hdbscan/hdbscan.cu +++ b/cpp/src/hdbscan/hdbscan.cu @@ -158,10 +158,12 @@ void compute_core_dists(const raft::handle_t& handle, size_t m, size_t n, raft::distance::DistanceType metric, - int min_samples) + int min_samples, + HDBSCAN::Common::GRAPH_BUILD_ALGO build_algo, + HDBSCAN::Common::nn_index_params build_params) { HDBSCAN::detail::Reachability::_compute_core_dists( - handle, X, core_dists, m, n, metric, min_samples); + handle, X, core_dists, m, n, metric, min_samples, build_algo, build_params); } void compute_inverse_label_map(const raft::handle_t& handle, diff --git a/cpp/src/hdbscan/runner.h b/cpp/src/hdbscan/runner.h index c79148eed2..d9591bf0f1 100644 --- a/cpp/src/hdbscan/runner.h +++ b/cpp/src/hdbscan/runner.h @@ -183,7 +183,9 @@ void build_linkage(const raft::handle_t& handle, params.alpha, mutual_reachability_indptr.data(), core_dists, - mutual_reachability_coo); + mutual_reachability_coo, + params.build_algo, + params.nn_descent_params); /** * Construct MST sorted by weights diff --git a/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx index f7691c1684..37ef2ded3d 100644 --- a/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx +++ b/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ from cuml.internals.safe_imports import gpu_only_import cp = gpu_only_import('cupy') from warnings import warn +from cuml.internals import logger from cuml.internals.array import CumlArray from cuml.internals.base import UniversalBase from cuml.common.doc_utils import generate_docstring @@ -46,12 +47,25 @@ IF GPUBUILD == 1: from pylibraft.common.handle import Handle from pylibraft.common.handle cimport handle_t + cdef extern from "raft/neighbors/nn_descent_types.hpp" namespace "raft::neighbors::experimental::nn_descent": + cdef struct index_params: + size_t graph_degree, + size_t intermediate_graph_degree, + size_t max_iterations, + float termination_threshold, + bool return_distances, + size_t n_clusters, + cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML::HDBSCAN::Common": ctypedef enum CLUSTER_SELECTION_METHOD: EOM "ML::HDBSCAN::Common::CLUSTER_SELECTION_METHOD::EOM" LEAF "ML::HDBSCAN::Common::CLUSTER_SELECTION_METHOD::LEAF" + ctypedef enum GRAPH_BUILD_ALGO: + BRUTE_FORCE_KNN "ML::HDBSCAN::Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN" + NN_DESCENT "ML::HDBSCAN::Common::GRAPH_BUILD_ALGO::NN_DESCENT" + cdef cppclass CondensedHierarchy[value_idx, value_t]: CondensedHierarchy( const handle_t &handle, size_t n_leaves) @@ -98,6 +112,8 @@ IF GPUBUILD == 1: bool allow_single_cluster, CLUSTER_SELECTION_METHOD cluster_selection_method, + GRAPH_BUILD_ALGO build_algo, + index_params nn_descent_params, cdef cppclass PredictionData[int, float]: PredictionData(const handle_t &handle, @@ -151,7 +167,9 @@ IF GPUBUILD == 1: size_t m, size_t n, DistanceType metric, - int min_samples) + int min_samples, + GRAPH_BUILD_ALGO build_algo, + index_params build_params) void compute_inverse_label_map(const handle_t& handle, CondensedHierarchy[int, float]& @@ -501,7 +519,9 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): verbose=False, connectivity='knn', output_type=None, - prediction_data=False): + prediction_data=False, + build_algo='auto', + build_kwds=None): super().__init__(handle=handle, verbose=verbose, @@ -532,6 +552,9 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): self.fit_called_ = False self.prediction_data = prediction_data + self.build_algo = build_algo + self.build_kwds = build_kwds + self.n_clusters_ = None self.n_leaves_ = None @@ -547,6 +570,8 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): self.prediction_data_ptr = None self._cpu_to_gpu_interop_prepped = False + logger.set_level(verbose) + @property def condensed_tree_(self): @@ -831,6 +856,35 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): raise ValueError("Cluster selection method not supported. " "Must one of {'eom', 'leaf'}") + if self.build_algo == "auto": + if self.n_rows <= 50000: + # brute force is faster for small datasets + logger.warn("Building knn graph using brute force") + self.build_algo = "brute_force_knn" + else: + logger.warn("Building knn graph using nn descent") + self.build_algo = "nn_descent" + + if self.build_algo == 'brute_force_knn': + params.build_algo = GRAPH_BUILD_ALGO.BRUTE_FORCE_KNN + elif self.build_algo == 'nn_descent': + params.build_algo = GRAPH_BUILD_ALGO.NN_DESCENT + if self.build_kwds is None: + params.nn_descent_params.graph_degree = 64 + params.nn_descent_params.intermediate_graph_degree = 128 + params.nn_descent_params.max_iterations = 20 + params.nn_descent_params.termination_threshold = 0.0001 + params.nn_descent_params.return_distances = True + else: + params.nn_descent_params.graph_degree = self.build_kwds.get("nnd_graph_degree", 64) + params.nn_descent_params.intermediate_graph_degree = self.build_kwds.get("nnd_intermediate_graph_degree", 128) + params.nn_descent_params.max_iterations = self.build_kwds.get("nnd_max_iterations", 20) + params.nn_descent_params.termination_threshold = self.build_kwds.get("nnd_termination_threshold", 0.0001) + params.nn_descent_params.return_distances = self.build_kwds.get("nnd_return_distances", True) + else: + raise ValueError("Build algo not supported. " + "Must one of {'brute_force_knn', 'nn_descent'}") + cdef DistanceType metric if self.metric in _metrics_mapping: metric = _metrics_mapping[self.metric] @@ -1071,13 +1125,44 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): cdef uintptr_t X_ptr = self.X_m.ptr cdef uintptr_t core_dists_ptr = self.core_dists.ptr + cdef GRAPH_BUILD_ALGO build_algo + cdef index_params build_params + + if self.build_algo == "auto": + if self.n_rows <= 50000: + # brute force is faster for small datasets + logger.warn("Building knn graph using brute force") + self.build_algo = "brute_force_knn" + else: + logger.warn("Building knn graph using nn descent") + self.build_algo = "nn_descent" + + if self.build_algo == 'brute_force_knn': + build_algo = GRAPH_BUILD_ALGO.BRUTE_FORCE_KNN + elif self.build_algo == 'nn_descent': + build_algo = GRAPH_BUILD_ALGO.NN_DESCENT + if self.build_kwds is None: + build_params.graph_degree = 64 + build_params.intermediate_graph_degree = 128 + build_params.max_iterations = 20 + build_params.termination_threshold = 0.0001 + build_params.return_distances = True + else: + build_params.graph_degree = self.build_kwds.get("nnd_graph_degree", 64) + build_params.intermediate_graph_degree = self.build_kwds.get("nnd_intermediate_graph_degree", 128) + build_params.max_iterations = self.build_kwds.get("nnd_max_iterations", 20) + build_params.termination_threshold = self.build_kwds.get("nnd_termination_threshold", 0.0001) + build_params.return_distances = self.build_kwds.get("nnd_return_distances", True) + compute_core_dists(handle_[0], X_ptr, core_dists_ptr, self.n_rows, self.n_cols, metric, - self.min_samples) + self.min_samples, + build_algo, + build_params) cdef device_uvector[int] *inverse_label_map = \ new device_uvector[int](0, handle_[0].get_stream()) @@ -1125,7 +1210,9 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): "connectivity", "alpha", "gen_min_span_tree", - "prediction_data" + "prediction_data", + "build_algo", + "build_kwds" ] def get_attr_names(self): diff --git a/python/cuml/cuml/tests/test_hdbscan.py b/python/cuml/cuml/tests/test_hdbscan.py index 0a9a3a6382..a061f71d56 100644 --- a/python/cuml/cuml/tests/test_hdbscan.py +++ b/python/cuml/cuml/tests/test_hdbscan.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -44,6 +44,12 @@ dataset_names = ["noisy_circles", "noisy_moons", "varied"] +def get_graph_degree(n_samples): + graph_degree = max(int((1 + ((n_samples * 1.5) // 32)) * 32), 64) + intermediate_graph_degree = int(1 + ((graph_degree * 1.3) // 32) * 32) + return graph_degree, intermediate_graph_degree + + def assert_cluster_counts(sk_agg, cuml_agg, digits=25): sk_unique, sk_counts = np.unique(sk_agg.labels_, return_counts=True) sk_counts = np.sort(sk_counts) @@ -142,14 +148,19 @@ def assert_membership_vectors(cu_vecs, sk_vecs): cu_labels_sorted = np.argsort(cu_vecs)[::-1] sk_labels_sorted = np.argsort(sk_vecs)[::-1] - k = min(sk_vecs.shape[1], 10) - for i in range(k): + if len(sk_vecs.shape) == 1: assert ( - adjusted_rand_score( - cu_labels_sorted[:, i], sk_labels_sorted[:, i] - ) - >= 0.90 + adjusted_rand_score(cu_labels_sorted, sk_labels_sorted) >= 0.9 ) + else: + k = min(sk_vecs.shape[1], 10) + for i in range(k): + assert ( + adjusted_rand_score( + cu_labels_sorted[:, i], sk_labels_sorted[:, i] + ) + >= 0.9 + ) @pytest.mark.parametrize("nrows", [500]) @@ -308,6 +319,7 @@ def test_hdbscan_sklearn_extract_clusters( allow_single_cluster, ): X = test_datasets.data + cuml_agg = HDBSCAN( verbose=logger.level_info, allow_single_cluster=allow_single_cluster, @@ -349,6 +361,7 @@ def test_hdbscan_sklearn_extract_clusters( @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom"]) @pytest.mark.parametrize("connectivity", ["knn"]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_hdbscan_cluster_patterns( dataset, nrows, @@ -359,11 +372,11 @@ def test_hdbscan_cluster_patterns( allow_single_cluster, max_cluster_size, min_samples, + build_algo, ): # This also tests duplicate data points X, y = get_pattern(dataset, nrows)[0] - cuml_agg = HDBSCAN( verbose=logger.level_info, allow_single_cluster=allow_single_cluster, @@ -372,6 +385,7 @@ def test_hdbscan_cluster_patterns( min_cluster_size=min_cluster_size, cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, + build_algo=build_algo, ) cuml_agg.fit(X) @@ -412,6 +426,7 @@ def test_hdbscan_cluster_patterns( @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_hdbscan_cluster_patterns_extract_clusters( dataset, nrows, @@ -422,11 +437,12 @@ def test_hdbscan_cluster_patterns_extract_clusters( allow_single_cluster, max_cluster_size, min_samples, + build_algo, ): # This also tests duplicate data points X, y = get_pattern(dataset, nrows)[0] - + graph_degree, intermediate_graph_degree = get_graph_degree(min_samples) cuml_agg = HDBSCAN( verbose=logger.level_info, allow_single_cluster=allow_single_cluster, @@ -435,6 +451,11 @@ def test_hdbscan_cluster_patterns_extract_clusters( min_cluster_size=min_cluster_size, cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, + build_algo=build_algo, + build_kwds={ + "nnd_graph_degree": graph_degree, + "nnd_intermediate_graph_degree": intermediate_graph_degree, + }, ) sk_agg = hdbscan.HDBSCAN( @@ -494,7 +515,8 @@ def test_hdbscan_metric_parameter_input(metric, supported): clf.fit(X) -def test_hdbscan_empty_cluster_tree(): +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) +def test_hdbscan_empty_cluster_tree(build_algo): raw_tree = np.recarray( shape=(5,), @@ -510,7 +532,9 @@ def test_hdbscan_empty_cluster_tree(): condensed_tree = CondensedTree(raw_tree, 0.0, True) cuml_agg = HDBSCAN( - allow_single_cluster=True, cluster_selection_method="eom" + allow_single_cluster=True, + cluster_selection_method="eom", + build_algo=build_algo, ) cuml_agg._extract_clusters(condensed_tree) @@ -570,7 +594,6 @@ def test_all_points_membership_vectors_blobs( shuffle=True, random_state=42, ) - cuml_agg = HDBSCAN( verbose=logger.level_info, allow_single_cluster=allow_single_cluster, @@ -613,6 +636,7 @@ def test_all_points_membership_vectors_blobs( @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) @pytest.mark.parametrize("batch_size", [128, 1000]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_all_points_membership_vectors_moons( nrows, min_samples, @@ -623,6 +647,7 @@ def test_all_points_membership_vectors_moons( max_cluster_size, connectivity, batch_size, + build_algo, ): X, y = datasets.make_moons(n_samples=nrows, noise=0.05, random_state=42) @@ -636,6 +661,7 @@ def test_all_points_membership_vectors_moons( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, prediction_data=True, + build_algo=build_algo, ) cuml_agg.fit(X) @@ -934,6 +960,7 @@ def test_approximate_predict_circles( @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom"]) @pytest.mark.parametrize("connectivity", ["knn"]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_approximate_predict_digits( n_points_to_predict, min_samples, @@ -943,6 +970,7 @@ def test_approximate_predict_digits( max_cluster_size, cluster_selection_method, connectivity, + build_algo, ): digits = datasets.load_digits() X, y = digits.data, digits.target @@ -966,6 +994,7 @@ def test_approximate_predict_digits( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, prediction_data=True, + build_algo=build_algo, ) cuml_agg.fit(X_train) @@ -1077,6 +1106,7 @@ def test_membership_vector_blobs( @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) @pytest.mark.parametrize("batch_size", [16]) +@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"]) def test_membership_vector_moons( nrows, n_points_to_predict, @@ -1088,6 +1118,7 @@ def test_membership_vector_moons( max_cluster_size, connectivity, batch_size, + build_algo, ): X, y = datasets.make_moons( @@ -1106,6 +1137,7 @@ def test_membership_vector_moons( cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_method=cluster_selection_method, prediction_data=True, + build_algo=build_algo, ) cuml_agg.fit(X_train) @@ -1193,5 +1225,4 @@ def test_membership_vector_circles( sk_membership_vectors = hdbscan.membership_vector(sk_agg, X_test).astype( "float32" ) - assert_membership_vectors(cu_membership_vectors, sk_membership_vectors)