From f0b1ac70e4e3607ef8c6cc42b9af97148e7c6fec Mon Sep 17 00:00:00 2001
From: jinsolp <soleeep99@gmail.com>
Date: Fri, 14 Jun 2024 23:16:36 +0000
Subject: [PATCH 01/25] enable nn descent in hdbscan

---
 cpp/include/cuml/cluster/hdbscan.hpp    |   9 +-
 cpp/src/hdbscan/detail/reachability.cuh | 305 ++++++++++++++++++------
 cpp/src/hdbscan/hdbscan.cu              |  18 +-
 cpp/src/hdbscan/runner.h                |   4 +-
 python/cuml/cluster/hdbscan/hdbscan.pyx |  55 ++++-
 python/cuml/tests/test_hdbscan.py       |  70 +++++-
 6 files changed, 370 insertions(+), 91 deletions(-)
diff --git a/cpp/include/cuml/cluster/hdbscan.hpp b/cpp/include/cuml/cluster/hdbscan.hpp
index eb1223fd88..3d98ec1faa 100644
--- a/cpp/include/cuml/cluster/hdbscan.hpp
+++ b/cpp/include/cuml/cluster/hdbscan.hpp
@@ -18,6 +18,7 @@
 
 #include <raft/core/handle.hpp>
 #include <raft/distance/distance_types.hpp>
+#include <raft/neighbors/nn_descent_types.hpp>
 
 #include <rmm/device_uvector.hpp>
 
@@ -27,6 +28,8 @@ namespace ML {
 namespace HDBSCAN {
 namespace Common {
 
+using nn_index_params = raft::neighbors::experimental::nn_descent::index_params;
+
 /**
  * The Condensed hierarchicy is represented by an edge list with
  * parents as the source vertices, children as the destination,
@@ -134,6 +137,7 @@ class CondensedHierarchy {
 };
 
 enum CLUSTER_SELECTION_METHOD { EOM = 0, LEAF = 1 };
+enum GRAPH_BUILD_ALGO { BRUTE_FORCE_KNN = 0, NN_DESCENT = 1 };
 
 class RobustSingleLinkageParams {
  public:
@@ -151,6 +155,8 @@ class RobustSingleLinkageParams {
 class HDBSCANParams : public RobustSingleLinkageParams {
  public:
   CLUSTER_SELECTION_METHOD cluster_selection_method = CLUSTER_SELECTION_METHOD::EOM;
+  GRAPH_BUILD_ALGO build_algo                       = GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN;
+  nn_index_params nn_descent_params                 = {};
 };
 
 /**
@@ -502,7 +508,8 @@ void compute_core_dists(const raft::handle_t& handle,
                         size_t m,
                         size_t n,
                         raft::distance::DistanceType metric,
-                        int min_samples);
+                        int min_samples,
+                        HDBSCAN::Common::GRAPH_BUILD_ALGO build_algo);
 
 /**
  * @brief Compute the map from final, normalize labels to the labels in the CondensedHierarchy
diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh
index 03a7f7c0ad..e381f52222 100644
--- a/cpp/src/hdbscan/detail/reachability.cuh
+++ b/cpp/src/hdbscan/detail/reachability.cuh
@@ -16,11 +16,14 @@
 
 #pragma once
 
+#include <cuml/cluster/hdbscan.hpp>
 #include <cuml/neighbors/knn.hpp>
 
 #include <raft/distance/distance.cuh>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/neighbors/brute_force.cuh>
+#include <raft/neighbors/detail/nn_descent.cuh>
+#include <raft/neighbors/nn_descent_types.hpp>
 #include <raft/sparse/convert/csr.cuh>
 #include <raft/sparse/linalg/symmetrize.cuh>
 #include <raft/util/cuda_utils.cuh>
@@ -34,6 +37,9 @@
 #include <thrust/transform.h>
 #include <thrust/tuple.h>
 
+using align32       = raft::Pow2<32>;
+namespace NNDescent = raft::neighbors::experimental::nn_descent;
+
 namespace ML {
 namespace HDBSCAN {
 namespace detail {
@@ -68,6 +74,12 @@ void core_distances(
   });
 }
 
+//  Functor that takes the square root of a raw distance value; row and col are unused.
+template <typename value_idx, typename value_t = float>
+struct DistancePostProcessSqrt {
+  DI value_t operator()(value_t value, value_idx row, value_idx col) const { return sqrtf(value); }
+};
+
 /**
  * Wraps the brute force knn API, to be used for both training and prediction
  * @tparam value_idx data type for integrals
@@ -93,33 +105,93 @@ void compute_knn(const raft::handle_t& handle,
                  const value_t* search_items,
                  size_t n_search_items,
                  int k,
-                 raft::distance::DistanceType metric)
+                 raft::distance::DistanceType metric,
+                 Common::GRAPH_BUILD_ALGO build_algo  = Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN,
+                 Common::nn_index_params build_params = Common::nn_index_params{})
 {
   auto stream      = handle.get_stream();
   auto exec_policy = handle.get_thrust_policy();
-  std::vector<value_t*> inputs;
-  inputs.push_back(const_cast<value_t*>(X));
-
-  std::vector<int> sizes;
-  sizes.push_back(m);
-
   // This is temporary. Once faiss is updated, we should be able to
   // pass value_idx through to knn.
   rmm::device_uvector<int64_t> int64_indices(k * n_search_items, stream);
 
-  // perform knn
-  brute_force_knn(handle,
-                  inputs,
-                  sizes,
-                  n,
-                  const_cast<value_t*>(search_items),
-                  n_search_items,
-                  int64_indices.data(),
-                  dists,
-                  k,
-                  true,
-                  true,
-                  metric);
+  if (build_algo == Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN) {
+    std::vector<value_t*> inputs;
+    inputs.push_back(const_cast<value_t*>(X));
+
+    std::vector<int> sizes;
+    sizes.push_back(m);
+    // // This is temporary. Once faiss is updated, we should be able to
+    // // pass value_idx through to knn.
+    // rmm::device_uvector<int64_t> int64_indices(k * n_search_items, stream);
+
+    // perform knn
+    brute_force_knn(handle,
+                    inputs,
+                    sizes,
+                    n,
+                    const_cast<value_t*>(search_items),
+                    n_search_items,
+                    int64_indices.data(),
+                    dists,
+                    k,
+                    true,
+                    true,
+                    metric);
+  } else {  // NN_DESCENT
+    // NOTE(review): search_items is ignored here; rows 0..n_search_items-1 of the graph built
+    // over X are used instead, so this presumably expects search_items == X -- confirm.
+    auto epilogue                 = DistancePostProcessSqrt<value_idx, float>{};
+    build_params.return_distances = true;
+    RAFT_EXPECTS(static_cast<size_t>(k) <= build_params.graph_degree,
+                 "n_neighbors should be smaller than the graph degree computed by nn descent");
+
+    auto dataset = raft::make_host_matrix_view<const float, int64_t>(X, m, n);
+    auto graph = NNDescent::detail::build<float, int64_t>(handle, build_params, dataset, epilogue);
+
+    for (size_t i = 0; i < n_search_items; i++) {
+      if (graph.distances().has_value()) {
+        raft::copy(dists + i * k + 1,
+                   graph.distances().value().data_handle() + i * build_params.graph_degree,
+                   k - 1,
+                   handle.get_stream());
+        thrust::fill(thrust::device.on(stream), dists + i * k, dists + i * k + 1, 0.0);
+      }
+      raft::copy(int64_indices.data() + i * k + 1,
+                 graph.graph().data_handle() + i * build_params.graph_degree,
+                 k - 1,
+                 handle.get_stream());
+      thrust::fill(thrust::device.on(stream),
+                   int64_indices.data() + i * k,
+                   int64_indices.data() + i * k + 1,
+                   i);
+    }
+    // NNDescent::index_params params = {};
+    // params.return_distances = true;
+    // size_t graph_degree = align32::roundUp(static_cast<size_t>(k * 3.0));
+    // params.graph_degree = graph_degree;
+    // params.intermediate_graph_degree = align32::roundUp(static_cast<size_t>(graph_degree * 1.3));
+    // params.max_iterations = 50;
+
+    // RAFT_EXPECTS(static_cast<size_t>(k) <= build_params.graph_degree, "n_neighbors should be
+    // smaller than the graph degree computed by nn descent");
+
+    // auto dataset =
+    //   raft::make_host_matrix_view<const float, int64_t>(X, m, n);
+    // auto graph =
+    //   NNDescent::detail::build<float, int64_t>(handle, build_params, dataset, epilogue);
+
+    // for (int i = 0; i < n_search_items; i++) {
+    //   raft::copy(dists + i * k,
+    //              graph.distances().data_handle() + i * build_params.graph_degree,
+    //              k,
+    //              stream);
+    //   raft::copy(int64_indices.data() + i * k,
+    //              graph.graph().data_handle() + i * build_params.graph_degree,
+    //              k,
+    //              stream);
+    // }
+  }
 
   // convert from current knn's 64-bit to 32-bit.
   thrust::transform(exec_policy,
@@ -134,13 +206,15 @@ void compute_knn(const raft::handle_t& handle,
          to compute core_dists
 */
 template <typename value_idx, typename value_t>
-void _compute_core_dists(const raft::handle_t& handle,
-                         const value_t* X,
-                         value_t* core_dists,
-                         size_t m,
-                         size_t n,
-                         raft::distance::DistanceType metric,
-                         int min_samples)
+void _compute_core_dists(
+  const raft::handle_t& handle,
+  const value_t* X,
+  value_t* core_dists,
+  size_t m,
+  size_t n,
+  raft::distance::DistanceType metric,
+  int min_samples,
+  Common::GRAPH_BUILD_ALGO build_algo = Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN)
 {
   RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded,
                "Currently only L2 expanded distance is supported");
@@ -151,7 +225,7 @@ void _compute_core_dists(const raft::handle_t& handle,
   rmm::device_uvector<value_t> dists(min_samples * m, stream);
 
   // perform knn
-  compute_knn(handle, X, inds.data(), dists.data(), m, n, X, m, min_samples, metric);
+  compute_knn(handle, X, inds.data(), dists.data(), m, n, X, m, min_samples, metric, build_algo);
 
   // Slice core distances (distances to kth nearest neighbor)
   core_distances<value_idx>(dists.data(), min_samples, min_samples, m, core_dists, stream);
@@ -169,6 +243,18 @@ struct ReachabilityPostProcess {
   value_t alpha;
 };
 
+//  Functor to post-process raw (presumably squared L2) distances into reachability space;
+//  alpha rescales the distance itself, so it is applied after the square root is taken.
+template <typename value_idx, typename value_t = float>
+struct ReachabilityPostProcessSqrt {
+  DI value_t operator()(value_t value, value_idx row, value_idx col) const
+  {
+    return max(core_dists[col], max(core_dists[row], alpha * sqrtf(value)));
+  }
+  const value_t* core_dists;
+  value_t alpha;
+};
+
 /**
  * Given core distances, Fuses computations of L2 distances between all
  * points, projection into mutual reachability space, and k-selection.
@@ -184,38 +270,84 @@ struct ReachabilityPostProcess {
  * @param[in] core_dists array of core distances (size m)
  */
 template <typename value_idx, typename value_t>
-void mutual_reachability_knn_l2(const raft::handle_t& handle,
-                                value_idx* out_inds,
-                                value_t* out_dists,
-                                const value_t* X,
-                                size_t m,
-                                size_t n,
-                                int k,
-                                value_t* core_dists,
-                                value_t alpha)
+void mutual_reachability_knn_l2(
+  const raft::handle_t& handle,
+  value_idx* out_inds,
+  value_t* out_dists,
+  const value_t* X,
+  size_t m,
+  size_t n,
+  int k,
+  value_t* core_dists,
+  value_t alpha,
+  Common::GRAPH_BUILD_ALGO build_algo  = Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN,
+  Common::nn_index_params build_params = Common::nn_index_params{})
 {
   // Create a functor to postprocess distances into mutual reachability space
   // Note that we can't use a lambda for this here, since we get errors like:
   // `A type local to a function cannot be used in the template argument of the
   // enclosing parent function (and any parent classes) of an extended __device__
   // or __host__ __device__ lambda`
-  auto epilogue = ReachabilityPostProcess<value_idx, value_t>{core_dists, alpha};
-
-  auto X_view = raft::make_device_matrix_view(X, m, n);
-  std::vector<raft::device_matrix_view<const value_t, size_t>> index = {X_view};
-
-  raft::neighbors::brute_force::knn<value_idx, value_t>(
-    handle,
-    index,
-    X_view,
-    raft::make_device_matrix_view(out_inds, m, static_cast<size_t>(k)),
-    raft::make_device_matrix_view(out_dists, m, static_cast<size_t>(k)),
-    // TODO: expand distance metrics to support more than just L2 distance
-    // https://github.com/rapidsai/cuml/issues/5301
-    raft::distance::DistanceType::L2SqrtExpanded,
-    std::make_optional<float>(2.0f),
-    std::nullopt,
-    epilogue);
+
+  if (build_algo == Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN) {
+    auto epilogue = ReachabilityPostProcess<value_idx, value_t>{core_dists, alpha};
+    auto X_view   = raft::make_device_matrix_view(X, m, n);
+    std::vector<raft::device_matrix_view<const value_t, size_t>> index = {X_view};
+
+    raft::neighbors::brute_force::knn<value_idx, value_t>(
+      handle,
+      index,
+      X_view,
+      raft::make_device_matrix_view(out_inds, m, static_cast<size_t>(k)),
+      raft::make_device_matrix_view(out_dists, m, static_cast<size_t>(k)),
+      // TODO: expand distance metrics to support more than just L2 distance
+      // https://github.com/rapidsai/cuml/issues/5301
+      raft::distance::DistanceType::L2SqrtExpanded,
+      std::make_optional<float>(2.0f),
+      std::nullopt,
+      epilogue);
+  } else {
+    // nn-descent path: the epilogue maps each raw graph distance into reachability space.
+    auto epilogue = ReachabilityPostProcessSqrt<value_idx, value_t>{core_dists, alpha};
+    // The copies below need the graph distances, so ask nn-descent to return them.
+    build_params.return_distances = true;
+    // All other nn-descent settings (graph_degree, intermediate_graph_degree, ...) are
+    // used exactly as supplied in build_params. Each output row takes k - 1 entries from
+    // a graph row that is graph_degree wide, so the graph must not be narrower than k;
+    // the check below enforces that.
+    RAFT_EXPECTS(static_cast<size_t>(k) <= build_params.graph_degree,
+                 "n_neighbors should be smaller than the graph degree computed by nn descent");
+
+    auto dataset = raft::make_host_matrix_view<const value_t, int64_t>(X, m, n);
+    // Build the graph, handing the reachability epilogue to nn-descent.
+    auto graph =
+      NNDescent::detail::build<value_t, value_idx>(handle, build_params, dataset, epilogue);
+
+    for (size_t i = 0; i < m; i++) {
+      if (graph.distances().has_value()) {
+        raft::copy(out_dists + i * k + 1,
+                   graph.distances().value().data_handle() + i * build_params.graph_degree,
+                   k - 1,
+                   handle.get_stream());
+        thrust::fill(
+          thrust::device.on(handle.get_stream()), out_dists + i * k, out_dists + i * k + 1, 0.0);
+      }
+      // Neighbor ids use the same layout as the distances: slot 0 of output row i is
+      // filled with i itself and slots 1..k-1 receive the first k - 1 ids of graph
+      // row i. Source rows are graph_degree wide while output rows are k wide, hence
+      // the different strides on the two sides of each copy.
+      raft::copy(out_inds + i * k + 1,
+                 graph.graph().data_handle() + i * build_params.graph_degree,
+                 k - 1,
+                 handle.get_stream());
+      thrust::fill(
+        thrust::device.on(handle.get_stream()), out_inds + i * k, out_inds + i * k + 1, i);
+      // NOTE(review): each iteration enqueues its own copies and fills, so the loop
+      // issues a number of stream operations proportional to m -- worth replacing
+      // with one strided copy if this path is hot. Also, when the graph carries no
+      // distances, out_dists is left untouched for every row.
+    }
+  }
 }
 
 /**
@@ -260,16 +392,19 @@ void mutual_reachability_knn_l2(const raft::handle_t& handle,
  *             neighbors.
  */
 template <typename value_idx, typename value_t>
-void mutual_reachability_graph(const raft::handle_t& handle,
-                               const value_t* X,
-                               size_t m,
-                               size_t n,
-                               raft::distance::DistanceType metric,
-                               int min_samples,
-                               value_t alpha,
-                               value_idx* indptr,
-                               value_t* core_dists,
-                               raft::sparse::COO<value_t, value_idx>& out)
+void mutual_reachability_graph(
+  const raft::handle_t& handle,
+  const value_t* X,
+  size_t m,
+  size_t n,
+  raft::distance::DistanceType metric,
+  int min_samples,
+  value_t alpha,
+  value_idx* indptr,
+  value_t* core_dists,
+  raft::sparse::COO<value_t, value_idx>& out,
+  Common::GRAPH_BUILD_ALGO build_algo  = Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN,
+  Common::nn_index_params build_params = Common::nn_index_params{})
 {
   RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded,
                "Currently only L2 expanded distance is supported");
@@ -281,18 +416,38 @@ void mutual_reachability_graph(const raft::handle_t& handle,
   rmm::device_uvector<value_idx> inds(min_samples * m, stream);
   rmm::device_uvector<value_t> dists(min_samples * m, stream);
 
   // perform knn
-  compute_knn(handle, X, inds.data(), dists.data(), m, n, X, m, min_samples, metric);
+  compute_knn(handle,
+              X,
+              inds.data(),
+              dists.data(),
+              m,
+              n,
+              X,
+              m,
+              min_samples,
+              metric,
+              build_algo,
+              build_params);
 
   // Slice core distances (distances to kth nearest neighbor)
   core_distances<value_idx>(dists.data(), min_samples, min_samples, m, core_dists, stream);
 
   /**
    * Compute L2 norm
    */
-  mutual_reachability_knn_l2(
-    handle, inds.data(), dists.data(), X, m, n, min_samples, core_dists, (value_t)1.0 / alpha);
+  mutual_reachability_knn_l2(handle,
+                             inds.data(),
+                             dists.data(),
+                             X,
+                             m,
+                             n,
+                             min_samples,
+                             core_dists,
+                             (value_t)1.0 / alpha,
+                             build_algo,
+                             build_params);
 
   // self-loops get max distance
   auto coo_rows_counting_itr = thrust::make_counting_iterator<value_idx>(0);
   thrust::transform(exec_policy,
diff --git a/cpp/src/hdbscan/hdbscan.cu b/cpp/src/hdbscan/hdbscan.cu
index ea64d20f6b..019687b72c 100644
--- a/cpp/src/hdbscan/hdbscan.cu
+++ b/cpp/src/hdbscan/hdbscan.cu
@@ -152,16 +152,18 @@ void out_of_sample_predict(const raft::handle_t& handle,
 
 namespace HDBSCAN::HELPER {
 
-void compute_core_dists(const raft::handle_t& handle,
-                        const float* X,
-                        float* core_dists,
-                        size_t m,
-                        size_t n,
-                        raft::distance::DistanceType metric,
-                        int min_samples)
+void compute_core_dists(
+  const raft::handle_t& handle,
+  const float* X,
+  float* core_dists,
+  size_t m,
+  size_t n,
+  raft::distance::DistanceType metric,
+  int min_samples,
+  HDBSCAN::Common::GRAPH_BUILD_ALGO build_algo = HDBSCAN::Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN)
 {
   HDBSCAN::detail::Reachability::_compute_core_dists<int, float>(
-    handle, X, core_dists, m, n, metric, min_samples);
+    handle, X, core_dists, m, n, metric, min_samples, build_algo);
 }
 
 void compute_inverse_label_map(const raft::handle_t& handle,
diff --git a/cpp/src/hdbscan/runner.h b/cpp/src/hdbscan/runner.h
index c79148eed2..d9591bf0f1 100644
--- a/cpp/src/hdbscan/runner.h
+++ b/cpp/src/hdbscan/runner.h
@@ -183,7 +183,9 @@ void build_linkage(const raft::handle_t& handle,
                                                   params.alpha,
                                                   mutual_reachability_indptr.data(),
                                                   core_dists,
-                                                  mutual_reachability_coo);
+                                                  mutual_reachability_coo,
+                                                  params.build_algo,
+                                                  params.nn_descent_params);
 
   /**
    * Construct MST sorted by weights
diff --git a/python/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cluster/hdbscan/hdbscan.pyx
index f7691c1684..0edaf64752 100644
--- a/python/cuml/cluster/hdbscan/hdbscan.pyx
+++ b/python/cuml/cluster/hdbscan/hdbscan.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -46,12 +46,24 @@ IF GPUBUILD == 1:
     from pylibraft.common.handle import Handle
     from pylibraft.common.handle cimport handle_t
 
+    cdef extern from "raft/neighbors/nn_descent_types.hpp" namespace "raft::neighbors::experimental::nn_descent":
+        cdef struct index_params:
+            size_t graph_degree,
+            size_t intermediate_graph_degree,
+            size_t max_iterations,
+            float termination_threshold,
+            bool return_distances
+
     cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML::HDBSCAN::Common":
 
         ctypedef enum CLUSTER_SELECTION_METHOD:
             EOM "ML::HDBSCAN::Common::CLUSTER_SELECTION_METHOD::EOM"
             LEAF "ML::HDBSCAN::Common::CLUSTER_SELECTION_METHOD::LEAF"
 
+        ctypedef enum GRAPH_BUILD_ALGO:
+            BRUTE_FORCE_KNN "ML::HDBSCAN::Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN"
+            NN_DESCENT "ML::HDBSCAN::Common::GRAPH_BUILD_ALGO::NN_DESCENT"
+
         cdef cppclass CondensedHierarchy[value_idx, value_t]:
             CondensedHierarchy(
                 const handle_t &handle, size_t n_leaves)
@@ -98,6 +110,8 @@ IF GPUBUILD == 1:
 
             bool allow_single_cluster,
             CLUSTER_SELECTION_METHOD cluster_selection_method,
+            GRAPH_BUILD_ALGO build_algo,
+            index_params nn_descent_params,
 
         cdef cppclass PredictionData[int, float]:
             PredictionData(const handle_t &handle,
@@ -151,7 +165,8 @@ IF GPUBUILD == 1:
                                 size_t m,
                                 size_t n,
                                 DistanceType metric,
-                                int min_samples)
+                                int min_samples,
+                                GRAPH_BUILD_ALGO build_algo)
 
         void compute_inverse_label_map(const handle_t& handle,
                                        CondensedHierarchy[int, float]&
@@ -501,7 +516,9 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
                  verbose=False,
                  connectivity='knn',
                  output_type=None,
-                 prediction_data=False):
+                 prediction_data=False,
+                 build_algo='brute_force_knn',
+                 build_kwds=None):
 
         super().__init__(handle=handle,
                          verbose=verbose,
@@ -532,6 +549,9 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
         self.fit_called_ = False
         self.prediction_data = prediction_data
 
+        self.build_algo = build_algo
+        self.build_kwds = build_kwds
+
         self.n_clusters_ = None
         self.n_leaves_ = None
 
@@ -831,6 +851,26 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
                 raise ValueError("Cluster selection method not supported. "
                                  "Must one of {'eom', 'leaf'}")
 
+            if self.build_algo == 'brute_force_knn':
+                params.build_algo = GRAPH_BUILD_ALGO.BRUTE_FORCE_KNN
+            elif self.build_algo == 'nn_descent':
+                params.build_algo = GRAPH_BUILD_ALGO.NN_DESCENT
+                if self.build_kwds is None:
+                    params.nn_descent_params.graph_degree = <size_t> 64
+                    params.nn_descent_params.intermediate_graph_degree = <size_t> 128
+                    params.nn_descent_params.max_iterations = <size_t> 20
+                    params.nn_descent_params.termination_threshold = <float> 0.0001
+                    params.nn_descent_params.return_distances = <bool> True
+                else:
+                    params.nn_descent_params.graph_degree = <size_t> self.build_kwds.get("nnd_graph_degree", 64)
+                    params.nn_descent_params.intermediate_graph_degree = <size_t> self.build_kwds.get("nnd_intermediate_graph_degree", 128)
+                    params.nn_descent_params.max_iterations = <size_t> self.build_kwds.get("nnd_max_iterations", 20)
+                    params.nn_descent_params.termination_threshold = <float> self.build_kwds.get("nnd_termination_threshold", 0.0001)
+                    params.nn_descent_params.return_distances = <bool> True
+            else:
+                raise ValueError("Build algo not supported. "
+                                 "Must one of {'brute_force_knn', 'nn_descent'}")
+
             cdef DistanceType metric
             if self.metric in _metrics_mapping:
                 metric = _metrics_mapping[self.metric]
@@ -1071,13 +1111,25 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
             cdef uintptr_t X_ptr = self.X_m.ptr
             cdef uintptr_t core_dists_ptr = self.core_dists.ptr
 
+            cdef GRAPH_BUILD_ALGO build_algo
+            if self.build_algo == 'brute_force_knn':
+                build_algo = GRAPH_BUILD_ALGO.BRUTE_FORCE_KNN
+            elif self.build_algo == 'nn_descent':
+                build_algo = GRAPH_BUILD_ALGO.NN_DESCENT
+            else:
+                # Never hand an unassigned enum value to the C++ layer.
+                raise ValueError(
+                    "Build algo not supported. Must be one of "
+                    "{'brute_force_knn', 'nn_descent'}")
+
             compute_core_dists(handle_[0],
                                <float*> X_ptr,
                                <float*> core_dists_ptr,
                                <size_t> self.n_rows,
                                <size_t> self.n_cols,
                                <DistanceType> metric,
-                               <int> self.min_samples)
+                               <int> self.min_samples,
+                               <GRAPH_BUILD_ALGO> build_algo)
 
             cdef device_uvector[int] *inverse_label_map = \
                 new device_uvector[int](0, handle_[0].get_stream())
diff --git a/python/cuml/tests/test_hdbscan.py b/python/cuml/tests/test_hdbscan.py
index 0a9a3a6382..fab72780d8 100644
--- a/python/cuml/tests/test_hdbscan.py
+++ b/python/cuml/tests/test_hdbscan.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -162,6 +162,7 @@ def assert_membership_vectors(cu_vecs, sk_vecs):
 @pytest.mark.parametrize("max_cluster_size", [0])
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("connectivity", ["knn"])
+@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_hdbscan_blobs(
     nrows,
     ncols,
@@ -173,6 +174,7 @@ def test_hdbscan_blobs(
     min_cluster_size,
     max_cluster_size,
     min_samples,
+    build_algo,
 ):
 
     X, y = make_blobs(
@@ -192,6 +194,7 @@ def test_hdbscan_blobs(
         min_cluster_size=min_cluster_size,
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
+        build_algo=build_algo,
     )
 
     cuml_agg.fit(X)
@@ -233,6 +236,7 @@ def test_hdbscan_blobs(
 @pytest.mark.parametrize("allow_single_cluster", [True, False])
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("connectivity", ["knn"])
+@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_hdbscan_sklearn_datasets(
     test_datasets,
     connectivity,
@@ -240,6 +244,7 @@ def test_hdbscan_sklearn_datasets(
     cluster_selection_method,
     min_samples_cluster_size_bounds,
     allow_single_cluster,
+    build_algo,
 ):
 
     (
@@ -259,6 +264,7 @@ def test_hdbscan_sklearn_datasets(
         min_cluster_size=min_cluster_size,
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
+        build_algo=build_algo,
     )
 
     cuml_agg.fit(X)
@@ -297,6 +303,7 @@ def test_hdbscan_sklearn_datasets(
 @pytest.mark.parametrize("allow_single_cluster", [True, False])
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("connectivity", ["knn"])
+@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_hdbscan_sklearn_extract_clusters(
     test_datasets,
     connectivity,
@@ -306,6 +313,7 @@ def test_hdbscan_sklearn_extract_clusters(
     min_cluster_size,
     max_cluster_size,
     allow_single_cluster,
+    build_algo,
 ):
     X = test_datasets.data
     cuml_agg = HDBSCAN(
@@ -317,6 +325,7 @@ def test_hdbscan_sklearn_extract_clusters(
         min_cluster_size=min_cluster_size,
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
+        build_algo=build_algo,
     )
 
     sk_agg = hdbscan.HDBSCAN(
@@ -349,6 +358,7 @@ def test_hdbscan_sklearn_extract_clusters(
 @pytest.mark.parametrize("max_cluster_size", [0])
 @pytest.mark.parametrize("cluster_selection_method", ["eom"])
 @pytest.mark.parametrize("connectivity", ["knn"])
+@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_hdbscan_cluster_patterns(
     dataset,
     nrows,
@@ -359,6 +369,7 @@ def test_hdbscan_cluster_patterns(
     allow_single_cluster,
     max_cluster_size,
     min_samples,
+    build_algo,
 ):
 
     # This also tests duplicate data points
@@ -372,6 +383,8 @@ def test_hdbscan_cluster_patterns(
         min_cluster_size=min_cluster_size,
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
+        build_algo=build_algo,
+        # build_kwds={"nnd_max_iterations":50},
     )
 
     cuml_agg.fit(X)
@@ -412,6 +425,7 @@ def test_hdbscan_cluster_patterns(
 @pytest.mark.parametrize("max_cluster_size", [0])
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("connectivity", ["knn"])
+@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_hdbscan_cluster_patterns_extract_clusters(
     dataset,
     nrows,
@@ -422,6 +436,7 @@ def test_hdbscan_cluster_patterns_extract_clusters(
     allow_single_cluster,
     max_cluster_size,
     min_samples,
+    build_algo,
 ):
 
     # This also tests duplicate data points
@@ -435,6 +450,7 @@ def test_hdbscan_cluster_patterns_extract_clusters(
         min_cluster_size=min_cluster_size,
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
+        build_algo=build_algo,
     )
 
     sk_agg = hdbscan.HDBSCAN(
@@ -494,7 +510,8 @@ def test_hdbscan_metric_parameter_input(metric, supported):
             clf.fit(X)
 
 
-def test_hdbscan_empty_cluster_tree():
+@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
+def test_hdbscan_empty_cluster_tree(build_algo):
 
     raw_tree = np.recarray(
         shape=(5,),
@@ -510,7 +527,9 @@ def test_hdbscan_empty_cluster_tree():
     condensed_tree = CondensedTree(raw_tree, 0.0, True)
 
     cuml_agg = HDBSCAN(
-        allow_single_cluster=True, cluster_selection_method="eom"
+        allow_single_cluster=True,
+        cluster_selection_method="eom",
+        build_algo=build_algo,
     )
     cuml_agg._extract_clusters(condensed_tree)
 
@@ -518,7 +537,8 @@ def test_hdbscan_empty_cluster_tree():
     assert np.sum(cuml_agg.labels_test.to_output("numpy")) == 0
 
 
-def test_hdbscan_plots():
+@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
+def test_hdbscan_plots(build_algo):
 
     X, y = make_blobs(
         n_samples=int(100),
@@ -529,7 +549,7 @@ def test_hdbscan_plots():
         random_state=42,
     )
 
-    cuml_agg = HDBSCAN(gen_min_span_tree=True)
+    cuml_agg = HDBSCAN(gen_min_span_tree=True, build_algo=build_algo)
     cuml_agg.fit(X)
 
     assert cuml_agg.condensed_tree_ is not None
@@ -551,6 +571,7 @@ def test_hdbscan_plots():
 @pytest.mark.parametrize("max_cluster_size", [0])
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("batch_size", [128, 1000])
+@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_all_points_membership_vectors_blobs(
     nrows,
     ncols,
@@ -561,6 +582,7 @@ def test_all_points_membership_vectors_blobs(
     allow_single_cluster,
     max_cluster_size,
     batch_size,
+    build_algo,
 ):
     X, y = make_blobs(
         n_samples=nrows,
@@ -579,6 +601,7 @@ def test_all_points_membership_vectors_blobs(
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
         prediction_data=True,
+        build_algo=build_algo,
     )
     cuml_agg.fit(X)
 
@@ -613,6 +636,7 @@ def test_all_points_membership_vectors_blobs(
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("connectivity", ["knn"])
 @pytest.mark.parametrize("batch_size", [128, 1000])
+@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_all_points_membership_vectors_moons(
     nrows,
     min_samples,
@@ -623,6 +647,7 @@ def test_all_points_membership_vectors_moons(
     max_cluster_size,
     connectivity,
     batch_size,
+    build_algo,
 ):
 
     X, y = datasets.make_moons(n_samples=nrows, noise=0.05, random_state=42)
@@ -636,6 +661,8 @@ def test_all_points_membership_vectors_moons(
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
         prediction_data=True,
+        build_algo=build_algo,
+        # build_kwds={"nnd_max_iterations":50},
     )
     cuml_agg.fit(X)
 
@@ -670,6 +697,7 @@ def test_all_points_membership_vectors_moons(
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("connectivity", ["knn"])
 @pytest.mark.parametrize("batch_size", [128, 1000])
+@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_all_points_membership_vectors_circles(
     nrows,
     min_samples,
@@ -680,6 +708,7 @@ def test_all_points_membership_vectors_circles(
     max_cluster_size,
     connectivity,
     batch_size,
+    build_algo,
 ):
     X, y = datasets.make_circles(
         n_samples=nrows, factor=0.5, noise=0.05, random_state=42
@@ -694,6 +723,8 @@ def test_all_points_membership_vectors_circles(
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
         prediction_data=True,
+        build_algo=build_algo,
+        # build_kwds={"nnd_max_iterations":50},
     )
     cuml_agg.fit(X)
 
@@ -732,6 +763,7 @@ def test_all_points_membership_vectors_circles(
 @pytest.mark.parametrize("max_cluster_size", [0])
 @pytest.mark.parametrize("allow_single_cluster", [True, False])
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
+@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_approximate_predict_blobs(
     nrows,
     n_points_to_predict,
@@ -742,6 +774,7 @@ def test_approximate_predict_blobs(
     min_cluster_size,
     max_cluster_size,
     allow_single_cluster,
+    build_algo,
 ):
     X, y = make_blobs(
         n_samples=nrows,
@@ -769,6 +802,7 @@ def test_approximate_predict_blobs(
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
         prediction_data=True,
+        build_algo=build_algo,
     )
     cuml_agg.fit(X)
 
@@ -789,7 +823,8 @@ def test_approximate_predict_blobs(
     sk_labels, sk_probs = hdbscan.approximate_predict(
         sk_agg, points_to_predict
     )
-
+    # print(f"cu labels: {cu_labels}\ncu probs: {cu_probs}")
+    # print(f"sk labels: {sk_labels}\nsk probs: {sk_probs}")
     assert adjusted_rand_score(cu_labels, sk_labels) >= 0.95
     assert np.allclose(cu_probs, sk_probs, atol=0.05)
 
@@ -803,6 +838,7 @@ def test_approximate_predict_blobs(
 @pytest.mark.parametrize("max_cluster_size", [0])
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("connectivity", ["knn"])
+@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_approximate_predict_moons(
     nrows,
     n_points_to_predict,
@@ -813,6 +849,7 @@ def test_approximate_predict_moons(
     max_cluster_size,
     cluster_selection_method,
     connectivity,
+    build_algo,
 ):
 
     X, y = datasets.make_moons(
@@ -831,6 +868,7 @@ def test_approximate_predict_moons(
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
         prediction_data=True,
+        build_algo=build_algo,
     )
 
     cuml_agg.fit(X_train)
@@ -868,6 +906,7 @@ def test_approximate_predict_moons(
 @pytest.mark.parametrize("max_cluster_size", [0])
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("connectivity", ["knn"])
+@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_approximate_predict_circles(
     nrows,
     n_points_to_predict,
@@ -878,6 +917,7 @@ def test_approximate_predict_circles(
     max_cluster_size,
     cluster_selection_method,
     connectivity,
+    build_algo,
 ):
     X, y = datasets.make_circles(
         n_samples=nrows + n_points_to_predict,
@@ -898,6 +938,7 @@ def test_approximate_predict_circles(
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
         prediction_data=True,
+        build_algo=build_algo,
     )
 
     cuml_agg.fit(X_train)
@@ -934,6 +975,7 @@ def test_approximate_predict_circles(
 @pytest.mark.parametrize("max_cluster_size", [0])
 @pytest.mark.parametrize("cluster_selection_method", ["eom"])
 @pytest.mark.parametrize("connectivity", ["knn"])
+@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_approximate_predict_digits(
     n_points_to_predict,
     min_samples,
@@ -943,6 +985,7 @@ def test_approximate_predict_digits(
     max_cluster_size,
     cluster_selection_method,
     connectivity,
+    build_algo,
 ):
     digits = datasets.load_digits()
     X, y = digits.data, digits.target
@@ -966,6 +1009,7 @@ def test_approximate_predict_digits(
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
         prediction_data=True,
+        build_algo=build_algo,
     )
 
     cuml_agg.fit(X_train)
@@ -1001,6 +1045,7 @@ def test_approximate_predict_digits(
 @pytest.mark.parametrize("allow_single_cluster", [True, False])
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("batch_size", [128])
+@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_membership_vector_blobs(
     nrows,
     n_points_to_predict,
@@ -1012,6 +1057,7 @@ def test_membership_vector_blobs(
     allow_single_cluster,
     max_cluster_size,
     batch_size,
+    build_algo,
 ):
     X, y = make_blobs(
         n_samples=nrows,
@@ -1039,6 +1085,7 @@ def test_membership_vector_blobs(
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
         prediction_data=True,
+        build_algo=build_algo,
     )
     cuml_agg.fit(X)
 
@@ -1077,6 +1124,7 @@ def test_membership_vector_blobs(
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("connectivity", ["knn"])
 @pytest.mark.parametrize("batch_size", [16])
+@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_membership_vector_moons(
     nrows,
     n_points_to_predict,
@@ -1088,6 +1136,7 @@ def test_membership_vector_moons(
     max_cluster_size,
     connectivity,
     batch_size,
+    build_algo,
 ):
 
     X, y = datasets.make_moons(
@@ -1106,6 +1155,8 @@ def test_membership_vector_moons(
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
         prediction_data=True,
+        build_algo=build_algo,
+        # build_kwds={"nnd_max_iterations":50},
     )
     cuml_agg.fit(X_train)
 
@@ -1141,6 +1192,7 @@ def test_membership_vector_moons(
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("connectivity", ["knn"])
 @pytest.mark.parametrize("batch_size", [16])
+@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_membership_vector_circles(
     nrows,
     n_points_to_predict,
@@ -1152,6 +1204,7 @@ def test_membership_vector_circles(
     max_cluster_size,
     connectivity,
     batch_size,
+    build_algo,
 ):
     X, y = datasets.make_circles(
         n_samples=nrows + n_points_to_predict,
@@ -1172,6 +1225,8 @@ def test_membership_vector_circles(
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
         prediction_data=True,
+        build_algo=build_algo,
+        # build_kwds={"nnd_max_iterations":50},
     )
     cuml_agg.fit(X_train)
 
@@ -1193,5 +1248,6 @@ def test_membership_vector_circles(
     sk_membership_vectors = hdbscan.membership_vector(sk_agg, X_test).astype(
         "float32"
     )
-
+    print(f"cu membership vec: {cu_membership_vectors}")
+    print(f"sk membership vec: {sk_membership_vectors}")
     assert_membership_vectors(cu_membership_vectors, sk_membership_vectors)

From 8f036a985aaa19420a5018acbb83a2be58f4ba5f Mon Sep 17 00:00:00 2001
From: jinsolp <soleeep99@gmail.com>
Date: Sun, 16 Jun 2024 22:05:09 +0000
Subject: [PATCH 02/25] change epilogue functor

---
 cpp/src/hdbscan/detail/reachability.cuh | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh
index e381f52222..11afae068e 100644
--- a/cpp/src/hdbscan/detail/reachability.cuh
+++ b/cpp/src/hdbscan/detail/reachability.cuh
@@ -77,7 +77,10 @@ void core_distances(
 //  Functor to post-process distances into reachability space
 template <typename value_idx, typename value_t = float>
 struct DistancePostProcessSqrt {
-  DI value_t operator()(value_t value, value_idx row, value_idx col) const { return sqrtf(value); }
+  DI value_t operator()(value_t value, value_idx row, value_idx col) const
+  {
+    return sqrtf(fabsf(value));  // sqrt(|value|); abs presumably absorbs small negative inputs
+  }
 };
 
 /**
@@ -139,8 +142,6 @@ void compute_knn(const raft::handle_t& handle,
                     true,
                     metric);
   } else {  // NN_DESCENT
-    // [JS] TODO: add check for graph degree
-    // [JS] TODO: pass params
     auto epilogue                 = DistancePostProcessSqrt<value_idx, float>{};
     build_params.return_distances = true;
     RAFT_EXPECTS(static_cast<size_t>(k) <= build_params.graph_degree,
@@ -149,7 +150,7 @@ void compute_knn(const raft::handle_t& handle,
     auto dataset = raft::make_host_matrix_view<const float, int64_t>(X, m, n);
     auto graph = NNDescent::detail::build<float, int64_t>(handle, build_params, dataset, epilogue);
 
-    for (int i = 0; i < n_search_items; i++) {
+    for (size_t i = 0; i < n_search_items; i++) {
       if (graph.distances().has_value()) {
         raft::copy(dists + i * k + 1,
                    graph.distances().value().data_handle() + i * build_params.graph_degree,
@@ -248,9 +249,8 @@ template <typename value_idx, typename value_t = float>
 struct ReachabilityPostProcessSqrt {
   DI value_t operator()(value_t value, value_idx row, value_idx col) const
   {
-    return max(core_dists[col], max(core_dists[row], sqrtf(alpha * value)));
+    return max(core_dists[col], max(core_dists[row], sqrtf(fabsf(alpha * value))));
   }
-
   const value_t* core_dists;
   value_t alpha;
 };
@@ -307,7 +307,6 @@ void mutual_reachability_knn_l2(
       std::nullopt,
       epilogue);
   } else {
-    // [JS] TODO: add check for graph degree
     auto epilogue = ReachabilityPostProcessSqrt<value_idx, value_t>{core_dists, alpha};
     // NNDescent::index_params params = {};
     build_params.return_distances = true;
@@ -329,8 +328,7 @@ void mutual_reachability_knn_l2(
                    graph.distances().value().data_handle() + i * build_params.graph_degree,
                    k - 1,
                    handle.get_stream());
-        thrust::fill(
-          thrust::device.on(handle.get_stream()), out_dists + i * k, out_dists + i * k + 1, 0.0);
+        raft::copy(out_dists + i * k, core_dists + i, 1, handle.get_stream());
       }
       // raft::copy(out_dists + i * k,
       //            graph.distances().data_handle() + i * build_params.graph_degree,
@@ -430,8 +428,8 @@ void mutual_reachability_graph(
               metric,
               build_algo,
               build_params);
-  raft::print_device_vector("indices", inds.data(), min_samples, std::cout);
-  raft::print_device_vector("distances", dists.data(), min_samples, std::cout);
+  // raft::print_device_vector("indices", inds.data(), 20, std::cout);
+  // raft::print_device_vector("distances", dists.data(), 20, std::cout);
   // Slice core distances (distances to kth nearest neighbor)
   core_distances<value_idx>(dists.data(), min_samples, min_samples, m, core_dists, stream);
   // raft::print_device_vector("core dists", core_dists, 20, std::cout);
@@ -456,8 +454,8 @@ void mutual_reachability_graph(
                              (value_t)1.0 / alpha,
                              build_algo,
                              build_params);
-  raft::print_device_vector("indices after knnl2", inds.data(), min_samples, std::cout);
-  raft::print_device_vector("distances after knnl2", dists.data(), min_samples, std::cout);
+  // raft::print_device_vector("indices after knnl2", inds.data(), 20, std::cout);
+  // raft::print_device_vector("distances after knnl2", dists.data(), 20, std::cout);
   // self-loops get max distance
   auto coo_rows_counting_itr = thrust::make_counting_iterator<value_idx>(0);
   thrust::transform(exec_policy,

From e158a74cafe376e18c8891908be58938aa92e7b3 Mon Sep 17 00:00:00 2001
From: jinsolp <soleeep99@gmail.com>
Date: Sun, 16 Jun 2024 22:07:12 +0000
Subject: [PATCH 03/25] cleanup

---
 cpp/src/hdbscan/detail/reachability.cuh | 39 -------------------------
 1 file changed, 39 deletions(-)

diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh
index 11afae068e..b31f5dabf3 100644
--- a/cpp/src/hdbscan/detail/reachability.cuh
+++ b/cpp/src/hdbscan/detail/reachability.cuh
@@ -167,31 +167,6 @@ void compute_knn(const raft::handle_t& handle,
                    int64_indices.data() + i * k + 1,
                    i);
     }
-    // NNDescent::index_params params = {};
-    // params.return_distances = true;
-    // size_t graph_degree = align32::roundUp(static_cast<size_t>(k * 3.0));
-    // params.graph_degree = graph_degree;
-    // params.intermediate_graph_degree = align32::roundUp(static_cast<size_t>(graph_degree * 1.3));
-    // params.max_iterations = 50;
-
-    // RAFT_EXPECTS(static_cast<size_t>(k) <= build_params.graph_degree, "n_neighbors should be
-    // smaller than the graph degree computed by nn descent");
-
-    // auto dataset =
-    //   raft::make_host_matrix_view<const float, int64_t>(X, m, n);
-    // auto graph =
-    //   NNDescent::detail::build<float, int64_t>(handle, build_params, dataset, epilogue);
-
-    // for (int i = 0; i < n_search_items; i++) {
-    //   raft::copy(dists + i * k,
-    //              graph.distances().data_handle() + i * build_params.graph_degree,
-    //              k,
-    //              stream);
-    //   raft::copy(int64_indices.data() + i * k,
-    //              graph.graph().data_handle() + i * build_params.graph_degree,
-    //              k,
-    //              stream);
-    // }
   }
 
   // convert from current knn's 64-bit to 32-bit.
@@ -308,17 +283,11 @@ void mutual_reachability_knn_l2(
       epilogue);
   } else {
     auto epilogue = ReachabilityPostProcessSqrt<value_idx, value_t>{core_dists, alpha};
-    // NNDescent::index_params params = {};
     build_params.return_distances = true;
-    // size_t graph_degree = align32::roundUp(static_cast<size_t>(k * 3.0));
-    // params.graph_degree = graph_degree;
-    // params.intermediate_graph_degree = align32::roundUp(static_cast<size_t>(graph_degree * 1.3));
-    // params.max_iterations = 50;
     RAFT_EXPECTS(static_cast<size_t>(k) <= build_params.graph_degree,
                  "n_neighbors should be smaller than the graph degree computed by nn descent");
 
     auto dataset = raft::make_host_matrix_view<const value_t, int64_t>(X, m, n);
-    // [JS] TODO: add distance epilogue here
     auto graph =
       NNDescent::detail::build<value_t, value_idx>(handle, build_params, dataset, epilogue);
 
@@ -330,20 +299,12 @@ void mutual_reachability_knn_l2(
                    handle.get_stream());
         raft::copy(out_dists + i * k, core_dists + i, 1, handle.get_stream());
       }
-      // raft::copy(out_dists + i * k,
-      //            graph.distances().data_handle() + i * build_params.graph_degree,
-      //            k,
-      //            handle.get_stream());
       raft::copy(out_inds + i * k + 1,
                  graph.graph().data_handle() + i * build_params.graph_degree,
                  k - 1,
                  handle.get_stream());
       thrust::fill(
         thrust::device.on(handle.get_stream()), out_inds + i * k, out_inds + i * k + 1, i);
-      // raft::copy(out_inds + i * k,
-      //            graph.graph().data_handle() + i * build_params.graph_degree,
-      //            k,
-      //            handle.get_stream());
     }
   }
 }

From ac03040961a8083894f510549825a3c463a5acd0 Mon Sep 17 00:00:00 2001
From: jinsolp <soleeep99@gmail.com>
Date: Mon, 17 Jun 2024 02:12:31 +0000
Subject: [PATCH 04/25] fix test + add param for compute_core_dist

---
 cpp/include/cuml/cluster/hdbscan.hpp    | 18 +++---
 cpp/src/hdbscan/detail/reachability.cuh | 16 ++++-
 cpp/src/hdbscan/hdbscan.cu              | 20 +++----
 python/cuml/cluster/hdbscan/hdbscan.pyx | 19 +++++-
 python/cuml/tests/test_hdbscan.py       | 78 +++++++++++++++++++------
 5 files changed, 111 insertions(+), 40 deletions(-)

diff --git a/cpp/include/cuml/cluster/hdbscan.hpp b/cpp/include/cuml/cluster/hdbscan.hpp
index 3d98ec1faa..98f19e901c 100644
--- a/cpp/include/cuml/cluster/hdbscan.hpp
+++ b/cpp/include/cuml/cluster/hdbscan.hpp
@@ -502,14 +502,16 @@ namespace HDBSCAN::HELPER {
  * @param metric distance metric to use
  * @param min_samples minimum number of samples to use for computing core distances
  */
-void compute_core_dists(const raft::handle_t& handle,
-                        const float* X,
-                        float* core_dists,
-                        size_t m,
-                        size_t n,
-                        raft::distance::DistanceType metric,
-                        int min_samples,
-                        HDBSCAN::Common::GRAPH_BUILD_ALGO build_algo);
+void compute_core_dists(
+  const raft::handle_t& handle,
+  const float* X,
+  float* core_dists,
+  size_t m,
+  size_t n,
+  raft::distance::DistanceType metric,
+  int min_samples,
+  HDBSCAN::Common::GRAPH_BUILD_ALGO build_algo = HDBSCAN::Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN,
+  HDBSCAN::Common::nn_index_params build_params = Common::nn_index_params{});
 
 /**
  * @brief Compute the map from final, normalize labels to the labels in the CondensedHierarchy
diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh
index b31f5dabf3..4225d57e1a 100644
--- a/cpp/src/hdbscan/detail/reachability.cuh
+++ b/cpp/src/hdbscan/detail/reachability.cuh
@@ -190,7 +190,8 @@ void _compute_core_dists(
   size_t n,
   raft::distance::DistanceType metric,
   int min_samples,
-  Common::GRAPH_BUILD_ALGO build_algo = Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN)
+  Common::GRAPH_BUILD_ALGO build_algo  = Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN,
+  Common::nn_index_params build_params = Common::nn_index_params{})
 {
   RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded,
                "Currently only L2 expanded distance is supported");
@@ -201,7 +202,18 @@ void _compute_core_dists(
   rmm::device_uvector<value_t> dists(min_samples * m, stream);
 
   // perform knn
-  compute_knn(handle, X, inds.data(), dists.data(), m, n, X, m, min_samples, metric, build_algo);
+  compute_knn(handle,
+              X,
+              inds.data(),
+              dists.data(),
+              m,
+              n,
+              X,
+              m,
+              min_samples,
+              metric,
+              build_algo,
+              build_params);
 
   // Slice core distances (distances to kth nearest neighbor)
   core_distances<value_idx>(dists.data(), min_samples, min_samples, m, core_dists, stream);
diff --git a/cpp/src/hdbscan/hdbscan.cu b/cpp/src/hdbscan/hdbscan.cu
index 019687b72c..32ef78b470 100644
--- a/cpp/src/hdbscan/hdbscan.cu
+++ b/cpp/src/hdbscan/hdbscan.cu
@@ -152,18 +152,18 @@ void out_of_sample_predict(const raft::handle_t& handle,
 
 namespace HDBSCAN::HELPER {
 
-void compute_core_dists(
-  const raft::handle_t& handle,
-  const float* X,
-  float* core_dists,
-  size_t m,
-  size_t n,
-  raft::distance::DistanceType metric,
-  int min_samples,
-  HDBSCAN::Common::GRAPH_BUILD_ALGO build_algo = HDBSCAN::Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN)
+void compute_core_dists(const raft::handle_t& handle,
+                        const float* X,
+                        float* core_dists,
+                        size_t m,
+                        size_t n,
+                        raft::distance::DistanceType metric,
+                        int min_samples,
+                        HDBSCAN::Common::GRAPH_BUILD_ALGO build_algo,
+                        HDBSCAN::Common::nn_index_params build_params)
 {
   HDBSCAN::detail::Reachability::_compute_core_dists<int, float>(
-    handle, X, core_dists, m, n, metric, min_samples, build_algo);
+    handle, X, core_dists, m, n, metric, min_samples, build_algo, build_params);
 }
 
 void compute_inverse_label_map(const raft::handle_t& handle,
diff --git a/python/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cluster/hdbscan/hdbscan.pyx
index 0edaf64752..d6c6d670d3 100644
--- a/python/cuml/cluster/hdbscan/hdbscan.pyx
+++ b/python/cuml/cluster/hdbscan/hdbscan.pyx
@@ -166,7 +166,8 @@ IF GPUBUILD == 1:
                                 size_t n,
                                 DistanceType metric,
                                 int min_samples,
-                                GRAPH_BUILD_ALGO build_algo)
+                                GRAPH_BUILD_ALGO build_algo,
+                                index_params build_params)
 
         void compute_inverse_label_map(const handle_t& handle,
                                        CondensedHierarchy[int, float]&
@@ -1112,10 +1113,23 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
             cdef uintptr_t core_dists_ptr = self.core_dists.ptr
 
             cdef GRAPH_BUILD_ALGO build_algo
+            cdef index_params build_params
             if self.build_algo == 'brute_force_knn':
                 build_algo = GRAPH_BUILD_ALGO.BRUTE_FORCE_KNN
             elif self.build_algo == 'nn_descent':
                 build_algo = GRAPH_BUILD_ALGO.NN_DESCENT
+                if self.build_kwds is None:
+                    build_params.graph_degree = <size_t> 64
+                    build_params.intermediate_graph_degree = <size_t> 128
+                    build_params.max_iterations = <size_t> 20
+                    build_params.termination_threshold = <float> 0.0001
+                    build_params.return_distances = <bool> True
+                else:
+                    build_params.graph_degree = <size_t> self.build_kwds.get("nnd_graph_degree", 64)
+                    build_params.intermediate_graph_degree = <size_t> self.build_kwds.get("nnd_intermediate_graph_degree", 128)
+                    build_params.max_iterations = <size_t> self.build_kwds.get("nnd_max_iterations", 20)
+                    build_params.termination_threshold = <float> self.build_kwds.get("nnd_termination_threshold", 0.0001)
+                    build_params.return_distances = <bool> True
 
             compute_core_dists(handle_[0],
                                <float*> X_ptr,
@@ -1124,7 +1138,8 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
                                <size_t> self.n_cols,
                                <DistanceType> metric,
                                <int> self.min_samples,
-                               <GRAPH_BUILD_ALGO> build_algo)
+                               <GRAPH_BUILD_ALGO> build_algo,
+                               build_params)
 
             cdef device_uvector[int] *inverse_label_map = \
                 new device_uvector[int](0, handle_[0].get_stream())
diff --git a/python/cuml/tests/test_hdbscan.py b/python/cuml/tests/test_hdbscan.py
index fab72780d8..3e060e1647 100644
--- a/python/cuml/tests/test_hdbscan.py
+++ b/python/cuml/tests/test_hdbscan.py
@@ -44,6 +44,13 @@
 dataset_names = ["noisy_circles", "noisy_moons", "varied"]
 
 
+def get_graph_degree(n_samples):
+    graph_degree = max(int((1 + ((n_samples * 1.5) // 32)) * 32), 64)
+    intermediate_graph_degree = int((1 + ((graph_degree * 1.3) // 32)) * 32)
+    max_iters = max(n_samples // 2, 20)
+    return graph_degree, intermediate_graph_degree, max_iters
+
+
 def assert_cluster_counts(sk_agg, cuml_agg, digits=25):
     sk_unique, sk_counts = np.unique(sk_agg.labels_, return_counts=True)
     sk_counts = np.sort(sk_counts)
@@ -186,6 +193,9 @@ def test_hdbscan_blobs(
         random_state=42,
     )
 
+    graph_degree, intermediate_graph_degree, max_iters = get_graph_degree(
+        min_samples
+    )
     cuml_agg = HDBSCAN(
         verbose=logger.level_info,
         allow_single_cluster=allow_single_cluster,
@@ -195,6 +205,11 @@ def test_hdbscan_blobs(
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
         build_algo=build_algo,
+        build_kwds={
+            "nnd_graph_degree": graph_degree,
+            "nnd_intermediate_graph_degree": intermediate_graph_degree,
+            "nnd_max_iterations": max_iters,
+        },
     )
 
     cuml_agg.fit(X)
@@ -236,7 +251,7 @@ def test_hdbscan_blobs(
 @pytest.mark.parametrize("allow_single_cluster", [True, False])
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("connectivity", ["knn"])
-@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
+@pytest.mark.parametrize("build_algo", ["brute_force_knn"])
 def test_hdbscan_sklearn_datasets(
     test_datasets,
     connectivity,
@@ -255,6 +270,9 @@ def test_hdbscan_sklearn_datasets(
 
     X = test_datasets.data
 
+    graph_degree, intermediate_graph_degree, max_iters = get_graph_degree(
+        min_samples
+    )
     cuml_agg = HDBSCAN(
         verbose=logger.level_info,
         allow_single_cluster=allow_single_cluster,
@@ -265,6 +283,11 @@ def test_hdbscan_sklearn_datasets(
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
         build_algo=build_algo,
+        build_kwds={
+            "nnd_graph_degree": graph_degree,
+            "nnd_intermediate_graph_degree": intermediate_graph_degree,
+            "nnd_max_iterations": max_iters,
+        },
     )
 
     cuml_agg.fit(X)
@@ -316,6 +339,9 @@ def test_hdbscan_sklearn_extract_clusters(
     build_algo,
 ):
     X = test_datasets.data
+    graph_degree, intermediate_graph_degree, max_iters = get_graph_degree(
+        min_samples
+    )
     cuml_agg = HDBSCAN(
         verbose=logger.level_info,
         allow_single_cluster=allow_single_cluster,
@@ -326,6 +352,11 @@ def test_hdbscan_sklearn_extract_clusters(
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
         build_algo=build_algo,
+        build_kwds={
+            "nnd_graph_degree": graph_degree,
+            "nnd_intermediate_graph_degree": intermediate_graph_degree,
+            "nnd_max_iterations": max_iters,
+        },
     )
 
     sk_agg = hdbscan.HDBSCAN(
@@ -374,7 +405,6 @@ def test_hdbscan_cluster_patterns(
 
     # This also tests duplicate data points
     X, y = get_pattern(dataset, nrows)[0]
-
     cuml_agg = HDBSCAN(
         verbose=logger.level_info,
         allow_single_cluster=allow_single_cluster,
@@ -384,7 +414,6 @@ def test_hdbscan_cluster_patterns(
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
         build_algo=build_algo,
-        # build_kwds={"nnd_max_iterations":50},
     )
 
     cuml_agg.fit(X)
@@ -441,7 +470,9 @@ def test_hdbscan_cluster_patterns_extract_clusters(
 
     # This also tests duplicate data points
     X, y = get_pattern(dataset, nrows)[0]
-
+    graph_degree, intermediate_graph_degree, max_iters = get_graph_degree(
+        min_samples
+    )
     cuml_agg = HDBSCAN(
         verbose=logger.level_info,
         allow_single_cluster=allow_single_cluster,
@@ -451,6 +482,11 @@ def test_hdbscan_cluster_patterns_extract_clusters(
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
         build_algo=build_algo,
+        build_kwds={
+            "nnd_graph_degree": graph_degree,
+            "nnd_intermediate_graph_degree": intermediate_graph_degree,
+            "nnd_max_iterations": max_iters,
+        },
     )
 
     sk_agg = hdbscan.HDBSCAN(
@@ -592,7 +628,9 @@ def test_all_points_membership_vectors_blobs(
         shuffle=True,
         random_state=42,
     )
-
+    graph_degree, intermediate_graph_degree, max_iters = get_graph_degree(
+        min_cluster_size
+    )
     cuml_agg = HDBSCAN(
         verbose=logger.level_info,
         allow_single_cluster=allow_single_cluster,
@@ -602,6 +640,11 @@ def test_all_points_membership_vectors_blobs(
         cluster_selection_method=cluster_selection_method,
         prediction_data=True,
         build_algo=build_algo,
+        build_kwds={
+            "nnd_graph_degree": graph_degree,
+            "nnd_intermediate_graph_degree": intermediate_graph_degree,
+            "nnd_max_iterations": max_iters,
+        },
     )
     cuml_agg.fit(X)
 
@@ -662,7 +705,6 @@ def test_all_points_membership_vectors_moons(
         cluster_selection_method=cluster_selection_method,
         prediction_data=True,
         build_algo=build_algo,
-        # build_kwds={"nnd_max_iterations":50},
     )
     cuml_agg.fit(X)
 
@@ -724,7 +766,6 @@ def test_all_points_membership_vectors_circles(
         cluster_selection_method=cluster_selection_method,
         prediction_data=True,
         build_algo=build_algo,
-        # build_kwds={"nnd_max_iterations":50},
     )
     cuml_agg.fit(X)
 
@@ -763,7 +804,7 @@ def test_all_points_membership_vectors_circles(
 @pytest.mark.parametrize("max_cluster_size", [0])
 @pytest.mark.parametrize("allow_single_cluster", [True, False])
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
-@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
+@pytest.mark.parametrize("build_algo", ["brute_force_knn"])
 def test_approximate_predict_blobs(
     nrows,
     n_points_to_predict,
@@ -838,7 +879,7 @@ def test_approximate_predict_blobs(
 @pytest.mark.parametrize("max_cluster_size", [0])
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("connectivity", ["knn"])
-@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
+@pytest.mark.parametrize("build_algo", ["brute_force_knn"])
 def test_approximate_predict_moons(
     nrows,
     n_points_to_predict,
@@ -906,7 +947,7 @@ def test_approximate_predict_moons(
 @pytest.mark.parametrize("max_cluster_size", [0])
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("connectivity", ["knn"])
-@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
+@pytest.mark.parametrize("build_algo", ["brute_force_knn"])
 def test_approximate_predict_circles(
     nrows,
     n_points_to_predict,
@@ -975,7 +1016,7 @@ def test_approximate_predict_circles(
 @pytest.mark.parametrize("max_cluster_size", [0])
 @pytest.mark.parametrize("cluster_selection_method", ["eom"])
 @pytest.mark.parametrize("connectivity", ["knn"])
-@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
+@pytest.mark.parametrize("build_algo", ["brute_force_knn"])
 def test_approximate_predict_digits(
     n_points_to_predict,
     min_samples,
@@ -1077,6 +1118,9 @@ def test_membership_vector_blobs(
         random_state=42,
     )
 
+    graph_degree, intermediate_graph_degree, max_iters = get_graph_degree(
+        min_cluster_size
+    )
     cuml_agg = HDBSCAN(
         verbose=logger.level_info,
         allow_single_cluster=allow_single_cluster,
@@ -1086,6 +1130,11 @@ def test_membership_vector_blobs(
         cluster_selection_method=cluster_selection_method,
         prediction_data=True,
         build_algo=build_algo,
+        build_kwds={
+            "nnd_graph_degree": graph_degree,
+            "nnd_intermediate_graph_degree": intermediate_graph_degree,
+            "nnd_max_iterations": max_iters,
+        },
     )
     cuml_agg.fit(X)
 
@@ -1156,7 +1205,6 @@ def test_membership_vector_moons(
         cluster_selection_method=cluster_selection_method,
         prediction_data=True,
         build_algo=build_algo,
-        # build_kwds={"nnd_max_iterations":50},
     )
     cuml_agg.fit(X_train)
 
@@ -1192,7 +1240,6 @@ def test_membership_vector_moons(
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("connectivity", ["knn"])
 @pytest.mark.parametrize("batch_size", [16])
-@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_membership_vector_circles(
     nrows,
     n_points_to_predict,
@@ -1204,7 +1251,6 @@ def test_membership_vector_circles(
     max_cluster_size,
     connectivity,
     batch_size,
-    build_algo,
 ):
     X, y = datasets.make_circles(
         n_samples=nrows + n_points_to_predict,
@@ -1225,8 +1271,6 @@ def test_membership_vector_circles(
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
         prediction_data=True,
-        build_algo=build_algo,
-        # build_kwds={"nnd_max_iterations":50},
     )
     cuml_agg.fit(X_train)
 
@@ -1248,6 +1292,4 @@ def test_membership_vector_circles(
     sk_membership_vectors = hdbscan.membership_vector(sk_agg, X_test).astype(
         "float32"
     )
-    print(f"cu memberhsip vec: {cu_membership_vectors}")
-    print(f"sk memberhsip vec: {sk_membership_vectors}")
     assert_membership_vectors(cu_membership_vectors, sk_membership_vectors)

From f2c3c920ddd546bfcd6ee13d26d95eb5fdbc6396 Mon Sep 17 00:00:00 2001
From: jinsolp <soleeep99@gmail.com>
Date: Mon, 17 Jun 2024 16:15:32 +0000
Subject: [PATCH 05/25] remove and add comments

---
 cpp/src/hdbscan/detail/reachability.cuh | 26 +++++++------------------
 1 file changed, 7 insertions(+), 19 deletions(-)

diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh
index 4225d57e1a..585593850e 100644
--- a/cpp/src/hdbscan/detail/reachability.cuh
+++ b/cpp/src/hdbscan/detail/reachability.cuh
@@ -37,7 +37,6 @@
 #include <thrust/transform.h>
 #include <thrust/tuple.h>
 
-using align32       = raft::Pow2<32>;
 namespace NNDescent = raft::neighbors::experimental::nn_descent;
 
 namespace ML {
@@ -74,7 +73,8 @@ void core_distances(
   });
 }
 
-//  Functor to post-process distances into reachability space
+// Functor to post-process distances by sqrt
+// For usage with NN Descent which internally supports L2Expanded only
 template <typename value_idx, typename value_t = float>
 struct DistancePostProcessSqrt {
   DI value_t operator()(value_t value, value_idx row, value_idx col) const
@@ -124,9 +124,6 @@ void compute_knn(const raft::handle_t& handle,
 
     std::vector<int> sizes;
     sizes.push_back(m);
-    // // This is temporary. Once faiss is updated, we should be able to
-    // // pass value_idx through to knn.
-    // rmm::device_uvector<int64_t> int64_indices(k * n_search_items, stream);
 
     // perform knn
     brute_force_knn(handle,
@@ -231,7 +228,8 @@ struct ReachabilityPostProcess {
   value_t alpha;
 };
 
-//  Functor to post-process distances into reachability space
+// Functor to post-process distances into reachability space (Sqrt)
+// For usage with NN Descent which internally supports L2Expanded only
 template <typename value_idx, typename value_t = float>
 struct ReachabilityPostProcessSqrt {
   DI value_t operator()(value_t value, value_idx row, value_idx col) const
@@ -387,7 +385,6 @@ void mutual_reachability_graph(
   rmm::device_uvector<value_idx> inds(min_samples * m, stream);
   rmm::device_uvector<value_t> dists(min_samples * m, stream);
 
-  // printf("[JS] min samples: %d\n", min_samples);
   // perform knn
   compute_knn(handle,
               X,
@@ -401,18 +398,10 @@ void mutual_reachability_graph(
               metric,
               build_algo,
               build_params);
-  // raft::print_device_vector("indices", inds.data(), 20, std::cout);
-  // raft::print_device_vector("distances", dists.data(), 20, std::cout);
+
   // Slice core distances (distances to kth nearest neighbor)
   core_distances<value_idx>(dists.data(), min_samples, min_samples, m, core_dists, stream);
-  // raft::print_device_vector("core dists", core_dists, 20, std::cout);
-
-  // raft::print_device_vector("dists for 4:", dists.data() + min_samples * 4, min_samples,
-  // std::cout); raft::print_device_vector("dists for 5:", dists.data() + min_samples * 5,
-  // min_samples, std::cout); raft::print_device_vector("dists for 14:", dists.data() + min_samples
-  // * 14, min_samples, std::cout); raft::print_device_vector("dists for 15:", dists.data() +
-  // min_samples * 15, min_samples, std::cout); raft::print_device_vector("dists for 16:",
-  // dists.data() + min_samples * 16, min_samples, std::cout);
+
   /**
    * Compute L2 norm
    */
@@ -427,8 +416,7 @@ void mutual_reachability_graph(
                              (value_t)1.0 / alpha,
                              build_algo,
                              build_params);
-  // raft::print_device_vector("indices after knnl2", inds.data(), 20, std::cout);
-  // raft::print_device_vector("distances after knnl2", dists.data(), 20, std::cout);
+
   // self-loops get max distance
   auto coo_rows_counting_itr = thrust::make_counting_iterator<value_idx>(0);
   thrust::transform(exec_policy,

From b461d31c5685eed390bd028a0cc07c935c42aece Mon Sep 17 00:00:00 2001
From: jinsolp <soleeep99@gmail.com>
Date: Tue, 18 Jun 2024 23:22:06 +0000
Subject: [PATCH 06/25] refine distances due to precision issues

---
 cpp/src/hdbscan/detail/reachability.cuh | 109 ++++++++++-----------
 python/cuml/cluster/hdbscan/hdbscan.pyx |   5 -
 python/cuml/tests/test_hdbscan.py       | 120 +++++++++++++++++++++---
 3 files changed, 154 insertions(+), 80 deletions(-)

diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh
index 585593850e..99907897e5 100644
--- a/cpp/src/hdbscan/detail/reachability.cuh
+++ b/cpp/src/hdbscan/detail/reachability.cuh
@@ -24,6 +24,7 @@
 #include <raft/neighbors/brute_force.cuh>
 #include <raft/neighbors/detail/nn_descent.cuh>
 #include <raft/neighbors/nn_descent_types.hpp>
+#include <raft/neighbors/refine-inl.cuh>
 #include <raft/sparse/convert/csr.cuh>
 #include <raft/sparse/linalg/symmetrize.cuh>
 #include <raft/util/cuda_utils.cuh>
@@ -139,31 +140,46 @@ void compute_knn(const raft::handle_t& handle,
                     true,
                     metric);
   } else {  // NN_DESCENT
-    auto epilogue                 = DistancePostProcessSqrt<value_idx, float>{};
-    build_params.return_distances = true;
     RAFT_EXPECTS(static_cast<size_t>(k) <= build_params.graph_degree,
                  "n_neighbors should be smaller than the graph degree computed by nn descent");
 
     auto dataset = raft::make_host_matrix_view<const float, int64_t>(X, m, n);
-    auto graph = NNDescent::detail::build<float, int64_t>(handle, build_params, dataset, epilogue);
+    auto graph   = NNDescent::detail::build<float, int64_t>(handle, build_params, dataset);
 
+    // NN Descent build does not include itself in nearest neighbors
     for (size_t i = 0; i < n_search_items; i++) {
-      if (graph.distances().has_value()) {
-        raft::copy(dists + i * k + 1,
-                   graph.distances().value().data_handle() + i * build_params.graph_degree,
-                   k - 1,
-                   handle.get_stream());
-        thrust::fill(thrust::device.on(stream), dists + i * k, dists + i * k + 1, 0.0);
+      for (size_t j = k - 1; j >= 1; j--) {
+        graph.graph().data_handle()[i * build_params.graph_degree + j] =
+          graph.graph().data_handle()[i * build_params.graph_degree + j - 1];
       }
-      raft::copy(int64_indices.data() + i * k + 1,
-                 graph.graph().data_handle() + i * build_params.graph_degree,
-                 k - 1,
-                 handle.get_stream());
-      thrust::fill(thrust::device.on(stream),
-                   int64_indices.data() + i * k,
-                   int64_indices.data() + i * k + 1,
-                   i);
+      graph.graph().data_handle()[i * build_params.graph_degree] = i;
     }
+
+    auto dataset_dev = raft::make_device_matrix<float, int64_t, raft::row_major>(handle, m, n);
+    raft::copy(dataset_dev.data_handle(), dataset.data_handle(), m * n, handle.get_stream());
+    auto dataset_dev_view = raft::make_device_matrix_view<const float, int64_t, raft::row_major>(
+      dataset_dev.data_handle(), m, n);
+
+    auto neighbor_candidates = raft::make_device_matrix<int64_t, int64_t, raft::row_major>(
+      handle, m, build_params.graph_degree);
+    raft::copy(neighbor_candidates.data_handle(),
+               graph.graph().data_handle(),
+               m * build_params.graph_degree,
+               handle.get_stream());
+    auto neighbor_candidates_view =
+      raft::make_device_matrix_view<const int64_t, int64_t, raft::row_major>(
+        neighbor_candidates.data_handle(), m, build_params.graph_degree);
+
+    auto indices =
+      raft::make_device_matrix_view<int64_t, int64_t>(int64_indices.data(), n_search_items, k);
+    auto distances = raft::make_device_matrix_view<float, int64_t>(dists, n_search_items, k);
+    raft::neighbors::refine(handle,
+                            dataset_dev_view,
+                            dataset_dev_view,
+                            neighbor_candidates_view,
+                            indices,
+                            distances,
+                            metric);
   }
 
   // convert from current knn's 64-bit to 32-bit.
@@ -274,49 +290,22 @@ void mutual_reachability_knn_l2(
   // enclosing parent function (and any parent classes) of an extended __device__
   // or __host__ __device__ lambda`
 
-  if (build_algo == Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN) {
-    auto epilogue = ReachabilityPostProcess<value_idx, value_t>{core_dists, alpha};
-    auto X_view   = raft::make_device_matrix_view(X, m, n);
-    std::vector<raft::device_matrix_view<const value_t, size_t>> index = {X_view};
-
-    raft::neighbors::brute_force::knn<value_idx, value_t>(
-      handle,
-      index,
-      X_view,
-      raft::make_device_matrix_view(out_inds, m, static_cast<size_t>(k)),
-      raft::make_device_matrix_view(out_dists, m, static_cast<size_t>(k)),
-      // TODO: expand distance metrics to support more than just L2 distance
-      // https://github.com/rapidsai/cuml/issues/5301
-      raft::distance::DistanceType::L2SqrtExpanded,
-      std::make_optional<float>(2.0f),
-      std::nullopt,
-      epilogue);
-  } else {
-    auto epilogue = ReachabilityPostProcessSqrt<value_idx, value_t>{core_dists, alpha};
-    build_params.return_distances = true;
-    RAFT_EXPECTS(static_cast<size_t>(k) <= build_params.graph_degree,
-                 "n_neighbors should be smaller than the graph degree computed by nn descent");
-
-    auto dataset = raft::make_host_matrix_view<const value_t, int64_t>(X, m, n);
-    auto graph =
-      NNDescent::detail::build<value_t, value_idx>(handle, build_params, dataset, epilogue);
-
-    for (size_t i = 0; i < m; i++) {
-      if (graph.distances().has_value()) {
-        raft::copy(out_dists + i * k + 1,
-                   graph.distances().value().data_handle() + i * build_params.graph_degree,
-                   k - 1,
-                   handle.get_stream());
-        raft::copy(out_dists + i * k, core_dists + i, 1, handle.get_stream());
-      }
-      raft::copy(out_inds + i * k + 1,
-                 graph.graph().data_handle() + i * build_params.graph_degree,
-                 k - 1,
-                 handle.get_stream());
-      thrust::fill(
-        thrust::device.on(handle.get_stream()), out_inds + i * k, out_inds + i * k + 1, i);
-    }
-  }
+  auto epilogue = ReachabilityPostProcess<value_idx, value_t>{core_dists, alpha};
+  auto X_view   = raft::make_device_matrix_view(X, m, n);
+  std::vector<raft::device_matrix_view<const value_t, size_t>> index = {X_view};
+
+  raft::neighbors::brute_force::knn<value_idx, value_t>(
+    handle,
+    index,
+    X_view,
+    raft::make_device_matrix_view(out_inds, m, static_cast<size_t>(k)),
+    raft::make_device_matrix_view(out_dists, m, static_cast<size_t>(k)),
+    // TODO: expand distance metrics to support more than just L2 distance
+    // https://github.com/rapidsai/cuml/issues/5301
+    raft::distance::DistanceType::L2SqrtExpanded,
+    std::make_optional<float>(2.0f),
+    std::nullopt,
+    epilogue);
 }
 
 /**
diff --git a/python/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cluster/hdbscan/hdbscan.pyx
index d6c6d670d3..23dc26e31b 100644
--- a/python/cuml/cluster/hdbscan/hdbscan.pyx
+++ b/python/cuml/cluster/hdbscan/hdbscan.pyx
@@ -52,7 +52,6 @@ IF GPUBUILD == 1:
             size_t intermediate_graph_degree,
             size_t max_iterations,
             float termination_threshold,
-            bool return_distances
 
     cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML::HDBSCAN::Common":
 
@@ -861,13 +860,11 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
                     params.nn_descent_params.intermediate_graph_degree = <size_t> 128
                     params.nn_descent_params.max_iterations = <size_t> 20
                     params.nn_descent_params.termination_threshold = <float> 0.0001
-                    params.nn_descent_params.return_distances = <bool> True
                 else:
                     params.nn_descent_params.graph_degree = <size_t> self.build_kwds.get("nnd_graph_degree", 64)
                     params.nn_descent_params.intermediate_graph_degree = <size_t> self.build_kwds.get("nnd_intermediate_graph_degree", 128)
                     params.nn_descent_params.max_iterations = <size_t> self.build_kwds.get("nnd_max_iterations", 20)
                     params.nn_descent_params.termination_threshold = <float> self.build_kwds.get("nnd_termination_threshold", 0.0001)
-                    params.nn_descent_params.return_distances = <bool> True
             else:
                 raise ValueError("Build algo not supported. "
                                  "Must one of {'brute_force_knn', 'nn_descent'}")
@@ -1123,13 +1120,11 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
                     build_params.intermediate_graph_degree = <size_t> 128
                     build_params.max_iterations = <size_t> 20
                     build_params.termination_threshold = <float> 0.0001
-                    build_params.return_distances = <bool> True
                 else:
                     build_params.graph_degree = <size_t> self.build_kwds.get("nnd_graph_degree", 64)
                     build_params.intermediate_graph_degree = <size_t> self.build_kwds.get("nnd_intermediate_graph_degree", 128)
                     build_params.max_iterations = <size_t> self.build_kwds.get("nnd_max_iterations", 20)
                     build_params.termination_threshold = <float> self.build_kwds.get("nnd_termination_threshold", 0.0001)
-                    build_params.return_distances = <bool> True
 
             compute_core_dists(handle_[0],
                                <float*> X_ptr,
diff --git a/python/cuml/tests/test_hdbscan.py b/python/cuml/tests/test_hdbscan.py
index 3e060e1647..d71fa88678 100644
--- a/python/cuml/tests/test_hdbscan.py
+++ b/python/cuml/tests/test_hdbscan.py
@@ -149,14 +149,19 @@ def assert_membership_vectors(cu_vecs, sk_vecs):
         cu_labels_sorted = np.argsort(cu_vecs)[::-1]
         sk_labels_sorted = np.argsort(sk_vecs)[::-1]
 
-        k = min(sk_vecs.shape[1], 10)
-        for i in range(k):
+        if len(sk_vecs.shape) == 1:
             assert (
-                adjusted_rand_score(
-                    cu_labels_sorted[:, i], sk_labels_sorted[:, i]
-                )
-                >= 0.90
+                adjusted_rand_score(cu_labels_sorted, sk_labels_sorted) >= 0.9
             )
+        else:
+            k = min(sk_vecs.shape[1], 10)
+            for i in range(k):
+                assert (
+                    adjusted_rand_score(
+                        cu_labels_sorted[:, i], sk_labels_sorted[:, i]
+                    )
+                    >= 0.9
+                )
 
 
 @pytest.mark.parametrize("nrows", [500])
@@ -804,7 +809,7 @@ def test_all_points_membership_vectors_circles(
 @pytest.mark.parametrize("max_cluster_size", [0])
 @pytest.mark.parametrize("allow_single_cluster", [True, False])
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
-@pytest.mark.parametrize("build_algo", ["brute_force_knn"])
+@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_approximate_predict_blobs(
     nrows,
     n_points_to_predict,
@@ -844,6 +849,11 @@ def test_approximate_predict_blobs(
         cluster_selection_method=cluster_selection_method,
         prediction_data=True,
         build_algo=build_algo,
+        build_kwds={
+            "nnd_max_iterations": 100,
+            "nnd_graph_degree": 96,
+            "nnd_intermediate_graph_degree": 128,
+        },
     )
     cuml_agg.fit(X)
 
@@ -864,10 +874,13 @@ def test_approximate_predict_blobs(
     sk_labels, sk_probs = hdbscan.approximate_predict(
         sk_agg, points_to_predict
     )
-    # print(f"cu labels: {cu_labels}\ncu probs: {cu_probs}")
-    # print(f"sk labels: {sk_labels}\ncu probs: {sk_probs}")
-    assert adjusted_rand_score(cu_labels, sk_labels) >= 0.95
-    assert np.allclose(cu_probs, sk_probs, atol=0.05)
+
+    if build_algo == "brute_force_knn":
+        assert adjusted_rand_score(cu_labels, sk_labels) >= 0.95
+        assert np.allclose(cu_probs, sk_probs, atol=0.05)
+    else:
+        # this test case is not so stable for nn descent at this moment
+        assert adjusted_rand_score(cu_labels, sk_labels) >= 0.9
 
 
 @pytest.mark.parametrize("nrows", [1000])
@@ -879,7 +892,7 @@ def test_approximate_predict_blobs(
 @pytest.mark.parametrize("max_cluster_size", [0])
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("connectivity", ["knn"])
-@pytest.mark.parametrize("build_algo", ["brute_force_knn"])
+@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_approximate_predict_moons(
     nrows,
     n_points_to_predict,
@@ -934,8 +947,15 @@ def test_approximate_predict_moons(
     sk_unique = np.unique(sk_labels)
     cu_unique = np.unique(cu_labels)
     if len(sk_unique) == len(cu_unique):
-        assert adjusted_rand_score(cu_labels, sk_labels) >= 0.99
-        assert array_equal(cu_probs, sk_probs, unit_tol=0.05, total_tol=0.005)
+        if build_algo == "brute_force_knn":
+            assert adjusted_rand_score(cu_labels, sk_labels) >= 0.99
+            assert array_equal(
+                cu_probs, sk_probs, unit_tol=0.05, total_tol=0.005
+            )
+        else:
+            # this test case is not so stable for nn descent at this moment
+            # a few configs result in scores around 0.85
+            assert adjusted_rand_score(cu_labels, sk_labels) >= 0.8
 
 
 @pytest.mark.parametrize("nrows", [1000])
@@ -947,7 +967,7 @@ def test_approximate_predict_moons(
 @pytest.mark.parametrize("max_cluster_size", [0])
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("connectivity", ["knn"])
-@pytest.mark.parametrize("build_algo", ["brute_force_knn"])
+@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_approximate_predict_circles(
     nrows,
     n_points_to_predict,
@@ -1293,3 +1313,73 @@ def test_membership_vector_circles(
         "float32"
     )
     assert_membership_vectors(cu_membership_vectors, sk_membership_vectors)
+
+
+@pytest.mark.parametrize("nrows", [1000])
+@pytest.mark.parametrize("n_points_to_predict", [1000])
+@pytest.mark.parametrize("min_samples", [20, 30])
+@pytest.mark.parametrize("min_cluster_size", [100, 150])
+@pytest.mark.parametrize("cluster_selection_epsilon", [0.0, 0.5])
+@pytest.mark.parametrize("allow_single_cluster", [True, False])
+@pytest.mark.parametrize("max_cluster_size", [0])
+@pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
+@pytest.mark.parametrize("connectivity", ["knn"])
+@pytest.mark.parametrize("batch_size", [16])
+@pytest.mark.parametrize("build_algo", ["nn_descent"])
+def test_membership_vector_circles_nnd(
+    nrows,
+    n_points_to_predict,
+    min_samples,
+    cluster_selection_epsilon,
+    cluster_selection_method,
+    min_cluster_size,
+    allow_single_cluster,
+    max_cluster_size,
+    connectivity,
+    batch_size,
+    build_algo,
+):
+    X, y = datasets.make_circles(
+        n_samples=nrows + n_points_to_predict,
+        factor=0.8,
+        noise=0.05,
+        random_state=42,
+    )
+
+    X_train = X[:nrows]
+    X_test = X[nrows:]
+
+    cuml_agg_nnd = HDBSCAN(
+        verbose=logger.level_info,
+        min_samples=min_samples,
+        allow_single_cluster=allow_single_cluster,
+        max_cluster_size=max_cluster_size,
+        min_cluster_size=min_cluster_size,
+        cluster_selection_epsilon=cluster_selection_epsilon,
+        cluster_selection_method=cluster_selection_method,
+        prediction_data=True,
+        build_algo=build_algo,
+    )
+    cuml_agg_nnd.fit(X_train)
+
+    cuml_agg_bf = HDBSCAN(
+        verbose=logger.level_info,
+        min_samples=min_samples,
+        allow_single_cluster=allow_single_cluster,
+        max_cluster_size=max_cluster_size,
+        min_cluster_size=min_cluster_size,
+        cluster_selection_epsilon=cluster_selection_epsilon,
+        cluster_selection_method=cluster_selection_method,
+        prediction_data=True,
+    )
+    cuml_agg_bf.fit(X_train)
+
+    cu_membership_vectors_nnd = membership_vector(
+        cuml_agg_nnd, X_test, batch_size
+    )
+    cu_membership_vectors_bf = membership_vector(
+        cuml_agg_bf, X_test, batch_size
+    )
+    assert_membership_vectors(
+        cu_membership_vectors_nnd, cu_membership_vectors_bf
+    )

From 5b15ce5b413413347208458c3434c7bdc9526235 Mon Sep 17 00:00:00 2001
From: jinsolp <soleeep99@gmail.com>
Date: Tue, 18 Jun 2024 23:48:28 +0000
Subject: [PATCH 07/25] add return_distances for cdef

---
 python/cuml/cluster/hdbscan/hdbscan.pyx | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cluster/hdbscan/hdbscan.pyx
index 23dc26e31b..08f76ab22c 100644
--- a/python/cuml/cluster/hdbscan/hdbscan.pyx
+++ b/python/cuml/cluster/hdbscan/hdbscan.pyx
@@ -52,6 +52,7 @@ IF GPUBUILD == 1:
             size_t intermediate_graph_degree,
             size_t max_iterations,
             float termination_threshold,
+            bool return_distances,
 
     cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML::HDBSCAN::Common":
 

From 437406fe9523725b7e5e8cb84a655b395e66cae8 Mon Sep 17 00:00:00 2001
From: jinsolp <soleeep99@gmail.com>
Date: Sat, 22 Jun 2024 23:02:14 +0000
Subject: [PATCH 08/25] Add build_algo and build_kwds to param names

---
 python/cuml/cluster/hdbscan/hdbscan.pyx | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cluster/hdbscan/hdbscan.pyx
index 08f76ab22c..6377477afe 100644
--- a/python/cuml/cluster/hdbscan/hdbscan.pyx
+++ b/python/cuml/cluster/hdbscan/hdbscan.pyx
@@ -1183,7 +1183,9 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
             "connectivity",
             "alpha",
             "gen_min_span_tree",
-            "prediction_data"
+            "prediction_data",
+            "build_algo",
+            "build_kwds"
         ]
 
     def get_attr_names(self):

From 7683d36b6d43a0d6aa970f26b4564d241d2b7f06 Mon Sep 17 00:00:00 2001
From: jinsolp <soleeep99@gmail.com>
Date: Sun, 23 Jun 2024 18:50:41 +0000
Subject: [PATCH 09/25] Document build_algo and build_params of compute_core_dists

---
 cpp/include/cuml/cluster/hdbscan.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cpp/include/cuml/cluster/hdbscan.hpp b/cpp/include/cuml/cluster/hdbscan.hpp
index 98f19e901c..0b98aeca86 100644
--- a/cpp/include/cuml/cluster/hdbscan.hpp
+++ b/cpp/include/cuml/cluster/hdbscan.hpp
@@ -501,6 +501,8 @@ namespace HDBSCAN::HELPER {
  * @param n number of columns in X
  * @param metric distance metric to use
  * @param min_samples minimum number of samples to use for computing core distances
+ * @param build_algo build algo for building the knn graph (default: brute_force_knn)
+ * @param build_params build parameters for build_algo
  */
 void compute_core_dists(
   const raft::handle_t& handle,

From 70b8835b93fff4327df3084d83f816afc381639e Mon Sep 17 00:00:00 2001
From: jinsolp <soleeep99@gmail.com>
Date: Fri, 12 Jul 2024 20:09:44 +0000
Subject: [PATCH 10/25] Set nn_descent return_distances param in Python

---
 python/cuml/cluster/hdbscan/hdbscan.pyx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cluster/hdbscan/hdbscan.pyx
index 6377477afe..26c0eb7cd3 100644
--- a/python/cuml/cluster/hdbscan/hdbscan.pyx
+++ b/python/cuml/cluster/hdbscan/hdbscan.pyx
@@ -861,11 +861,13 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
                     params.nn_descent_params.intermediate_graph_degree = <size_t> 128
                     params.nn_descent_params.max_iterations = <size_t> 20
                     params.nn_descent_params.termination_threshold = <float> 0.0001
+                    params.nn_descent_params.return_distances = <bool> True
                 else:
                     params.nn_descent_params.graph_degree = <size_t> self.build_kwds.get("nnd_graph_degree", 64)
                     params.nn_descent_params.intermediate_graph_degree = <size_t> self.build_kwds.get("nnd_intermediate_graph_degree", 128)
                     params.nn_descent_params.max_iterations = <size_t> self.build_kwds.get("nnd_max_iterations", 20)
                     params.nn_descent_params.termination_threshold = <float> self.build_kwds.get("nnd_termination_threshold", 0.0001)
+                    params.nn_descent_params.return_distances = <bool> self.build_kwds.get("nnd_return_distances", True)
             else:
                 raise ValueError("Build algo not supported. "
                                  "Must one of {'brute_force_knn', 'nn_descent'}")

From 152a8a3e6dbae6faf5446e0492a447a68136116e Mon Sep 17 00:00:00 2001
From: jinsolp <soleeep99@gmail.com>
Date: Fri, 12 Jul 2024 20:34:56 +0000
Subject: [PATCH 11/25] change test

---
 python/cuml/tests/test_hdbscan.py | 175 ++----------------------------
 1 file changed, 11 insertions(+), 164 deletions(-)

diff --git a/python/cuml/tests/test_hdbscan.py b/python/cuml/tests/test_hdbscan.py
index d71fa88678..94c2e0002b 100644
--- a/python/cuml/tests/test_hdbscan.py
+++ b/python/cuml/tests/test_hdbscan.py
@@ -47,8 +47,7 @@
 def get_graph_degree(n_samples):
     graph_degree = max(int((1 + ((n_samples * 1.5) // 32)) * 32), 64)
     intermediate_graph_degree = int(1 + ((graph_degree * 1.3) // 32) * 32)
-    max_iters = max(n_samples // 2, 20)
-    return graph_degree, intermediate_graph_degree, max_iters
+    return graph_degree, intermediate_graph_degree
 
 
 def assert_cluster_counts(sk_agg, cuml_agg, digits=25):
@@ -174,7 +173,6 @@ def assert_membership_vectors(cu_vecs, sk_vecs):
 @pytest.mark.parametrize("max_cluster_size", [0])
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("connectivity", ["knn"])
-@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_hdbscan_blobs(
     nrows,
     ncols,
@@ -186,7 +184,6 @@ def test_hdbscan_blobs(
     min_cluster_size,
     max_cluster_size,
     min_samples,
-    build_algo,
 ):
 
     X, y = make_blobs(
@@ -198,9 +195,6 @@ def test_hdbscan_blobs(
         random_state=42,
     )
 
-    graph_degree, intermediate_graph_degree, max_iters = get_graph_degree(
-        min_samples
-    )
     cuml_agg = HDBSCAN(
         verbose=logger.level_info,
         allow_single_cluster=allow_single_cluster,
@@ -209,12 +203,6 @@ def test_hdbscan_blobs(
         min_cluster_size=min_cluster_size,
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
-        build_algo=build_algo,
-        build_kwds={
-            "nnd_graph_degree": graph_degree,
-            "nnd_intermediate_graph_degree": intermediate_graph_degree,
-            "nnd_max_iterations": max_iters,
-        },
     )
 
     cuml_agg.fit(X)
@@ -256,7 +244,6 @@ def test_hdbscan_blobs(
 @pytest.mark.parametrize("allow_single_cluster", [True, False])
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("connectivity", ["knn"])
-@pytest.mark.parametrize("build_algo", ["brute_force_knn"])
 def test_hdbscan_sklearn_datasets(
     test_datasets,
     connectivity,
@@ -275,9 +262,6 @@ def test_hdbscan_sklearn_datasets(
 
     X = test_datasets.data
 
-    graph_degree, intermediate_graph_degree, max_iters = get_graph_degree(
-        min_samples
-    )
     cuml_agg = HDBSCAN(
         verbose=logger.level_info,
         allow_single_cluster=allow_single_cluster,
@@ -287,12 +271,6 @@ def test_hdbscan_sklearn_datasets(
         min_cluster_size=min_cluster_size,
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
-        build_algo=build_algo,
-        build_kwds={
-            "nnd_graph_degree": graph_degree,
-            "nnd_intermediate_graph_degree": intermediate_graph_degree,
-            "nnd_max_iterations": max_iters,
-        },
     )
 
     cuml_agg.fit(X)
@@ -331,7 +309,6 @@ def test_hdbscan_sklearn_datasets(
 @pytest.mark.parametrize("allow_single_cluster", [True, False])
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("connectivity", ["knn"])
-@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_hdbscan_sklearn_extract_clusters(
     test_datasets,
     connectivity,
@@ -341,12 +318,9 @@ def test_hdbscan_sklearn_extract_clusters(
     min_cluster_size,
     max_cluster_size,
     allow_single_cluster,
-    build_algo,
 ):
     X = test_datasets.data
-    graph_degree, intermediate_graph_degree, max_iters = get_graph_degree(
-        min_samples
-    )
+
     cuml_agg = HDBSCAN(
         verbose=logger.level_info,
         allow_single_cluster=allow_single_cluster,
@@ -356,12 +330,6 @@ def test_hdbscan_sklearn_extract_clusters(
         min_cluster_size=min_cluster_size,
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
-        build_algo=build_algo,
-        build_kwds={
-            "nnd_graph_degree": graph_degree,
-            "nnd_intermediate_graph_degree": intermediate_graph_degree,
-            "nnd_max_iterations": max_iters,
-        },
     )
 
     sk_agg = hdbscan.HDBSCAN(
@@ -475,9 +443,7 @@ def test_hdbscan_cluster_patterns_extract_clusters(
 
     # This also tests duplicate data points
     X, y = get_pattern(dataset, nrows)[0]
-    graph_degree, intermediate_graph_degree, max_iters = get_graph_degree(
-        min_samples
-    )
+    graph_degree, intermediate_graph_degree = get_graph_degree(min_samples)
     cuml_agg = HDBSCAN(
         verbose=logger.level_info,
         allow_single_cluster=allow_single_cluster,
@@ -490,7 +456,6 @@ def test_hdbscan_cluster_patterns_extract_clusters(
         build_kwds={
             "nnd_graph_degree": graph_degree,
             "nnd_intermediate_graph_degree": intermediate_graph_degree,
-            "nnd_max_iterations": max_iters,
         },
     )
 
@@ -578,8 +543,7 @@ def test_hdbscan_empty_cluster_tree(build_algo):
     assert np.sum(cuml_agg.labels_test.to_output("numpy")) == 0
 
 
-@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
-def test_hdbscan_plots(build_algo):
+def test_hdbscan_plots():
 
     X, y = make_blobs(
         n_samples=int(100),
@@ -590,7 +554,7 @@ def test_hdbscan_plots(build_algo):
         random_state=42,
     )
 
-    cuml_agg = HDBSCAN(gen_min_span_tree=True, build_algo=build_algo)
+    cuml_agg = HDBSCAN(gen_min_span_tree=True)
     cuml_agg.fit(X)
 
     assert cuml_agg.condensed_tree_ is not None
@@ -612,7 +576,6 @@ def test_hdbscan_plots(build_algo):
 @pytest.mark.parametrize("max_cluster_size", [0])
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("batch_size", [128, 1000])
-@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_all_points_membership_vectors_blobs(
     nrows,
     ncols,
@@ -623,7 +586,6 @@ def test_all_points_membership_vectors_blobs(
     allow_single_cluster,
     max_cluster_size,
     batch_size,
-    build_algo,
 ):
     X, y = make_blobs(
         n_samples=nrows,
@@ -633,9 +595,6 @@ def test_all_points_membership_vectors_blobs(
         shuffle=True,
         random_state=42,
     )
-    graph_degree, intermediate_graph_degree, max_iters = get_graph_degree(
-        min_cluster_size
-    )
     cuml_agg = HDBSCAN(
         verbose=logger.level_info,
         allow_single_cluster=allow_single_cluster,
@@ -644,12 +603,6 @@ def test_all_points_membership_vectors_blobs(
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
         prediction_data=True,
-        build_algo=build_algo,
-        build_kwds={
-            "nnd_graph_degree": graph_degree,
-            "nnd_intermediate_graph_degree": intermediate_graph_degree,
-            "nnd_max_iterations": max_iters,
-        },
     )
     cuml_agg.fit(X)
 
@@ -744,7 +697,6 @@ def test_all_points_membership_vectors_moons(
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("connectivity", ["knn"])
 @pytest.mark.parametrize("batch_size", [128, 1000])
-@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_all_points_membership_vectors_circles(
     nrows,
     min_samples,
@@ -755,7 +707,6 @@ def test_all_points_membership_vectors_circles(
     max_cluster_size,
     connectivity,
     batch_size,
-    build_algo,
 ):
     X, y = datasets.make_circles(
         n_samples=nrows, factor=0.5, noise=0.05, random_state=42
@@ -770,7 +721,6 @@ def test_all_points_membership_vectors_circles(
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
         prediction_data=True,
-        build_algo=build_algo,
     )
     cuml_agg.fit(X)
 
@@ -809,7 +759,6 @@ def test_all_points_membership_vectors_circles(
 @pytest.mark.parametrize("max_cluster_size", [0])
 @pytest.mark.parametrize("allow_single_cluster", [True, False])
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
-@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_approximate_predict_blobs(
     nrows,
     n_points_to_predict,
@@ -820,7 +769,6 @@ def test_approximate_predict_blobs(
     min_cluster_size,
     max_cluster_size,
     allow_single_cluster,
-    build_algo,
 ):
     X, y = make_blobs(
         n_samples=nrows,
@@ -848,12 +796,6 @@ def test_approximate_predict_blobs(
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
         prediction_data=True,
-        build_algo=build_algo,
-        build_kwds={
-            "nnd_max_iterations": 100,
-            "nnd_graph_degree": 96,
-            "nnd_intermediate_graph_degree": 128,
-        },
     )
     cuml_agg.fit(X)
 
@@ -875,12 +817,8 @@ def test_approximate_predict_blobs(
         sk_agg, points_to_predict
     )
 
-    if build_algo == "brute_force_knn":
-        assert adjusted_rand_score(cu_labels, sk_labels) >= 0.95
-        assert np.allclose(cu_probs, sk_probs, atol=0.05)
-    else:
-        # this test case is not so stable for nn descent at this moment
-        assert adjusted_rand_score(cu_labels, sk_labels) >= 0.9
+    assert adjusted_rand_score(cu_labels, sk_labels) >= 0.95
+    assert np.allclose(cu_probs, sk_probs, atol=0.05)
 
 
 @pytest.mark.parametrize("nrows", [1000])
@@ -892,7 +830,6 @@ def test_approximate_predict_blobs(
 @pytest.mark.parametrize("max_cluster_size", [0])
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("connectivity", ["knn"])
-@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_approximate_predict_moons(
     nrows,
     n_points_to_predict,
@@ -903,7 +840,6 @@ def test_approximate_predict_moons(
     max_cluster_size,
     cluster_selection_method,
     connectivity,
-    build_algo,
 ):
 
     X, y = datasets.make_moons(
@@ -922,7 +858,6 @@ def test_approximate_predict_moons(
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
         prediction_data=True,
-        build_algo=build_algo,
     )
 
     cuml_agg.fit(X_train)
@@ -947,15 +882,8 @@ def test_approximate_predict_moons(
     sk_unique = np.unique(sk_labels)
     cu_unique = np.unique(cu_labels)
     if len(sk_unique) == len(cu_unique):
-        if build_algo == "brute_force_knn":
-            assert adjusted_rand_score(cu_labels, sk_labels) >= 0.99
-            assert array_equal(
-                cu_probs, sk_probs, unit_tol=0.05, total_tol=0.005
-            )
-        else:
-            # this test case is not so stable for nn descent at this moment
-            # a few configs result in scores around 0.85
-            assert adjusted_rand_score(cu_labels, sk_labels) >= 0.8
+        assert adjusted_rand_score(cu_labels, sk_labels) >= 0.99
+        assert array_equal(cu_probs, sk_probs, unit_tol=0.05, total_tol=0.005)
 
 
 @pytest.mark.parametrize("nrows", [1000])
@@ -967,7 +895,6 @@ def test_approximate_predict_moons(
 @pytest.mark.parametrize("max_cluster_size", [0])
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("connectivity", ["knn"])
-@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_approximate_predict_circles(
     nrows,
     n_points_to_predict,
@@ -978,7 +905,6 @@ def test_approximate_predict_circles(
     max_cluster_size,
     cluster_selection_method,
     connectivity,
-    build_algo,
 ):
     X, y = datasets.make_circles(
         n_samples=nrows + n_points_to_predict,
@@ -999,7 +925,6 @@ def test_approximate_predict_circles(
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
         prediction_data=True,
-        build_algo=build_algo,
     )
 
     cuml_agg.fit(X_train)
@@ -1036,7 +961,7 @@ def test_approximate_predict_circles(
 @pytest.mark.parametrize("max_cluster_size", [0])
 @pytest.mark.parametrize("cluster_selection_method", ["eom"])
 @pytest.mark.parametrize("connectivity", ["knn"])
-@pytest.mark.parametrize("build_algo", ["brute_force_knn"])
+@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_approximate_predict_digits(
     n_points_to_predict,
     min_samples,
@@ -1106,7 +1031,6 @@ def test_approximate_predict_digits(
 @pytest.mark.parametrize("allow_single_cluster", [True, False])
 @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
 @pytest.mark.parametrize("batch_size", [128])
-@pytest.mark.parametrize("build_algo", ["brute_force_knn", "nn_descent"])
 def test_membership_vector_blobs(
     nrows,
     n_points_to_predict,
@@ -1118,7 +1042,6 @@ def test_membership_vector_blobs(
     allow_single_cluster,
     max_cluster_size,
     batch_size,
-    build_algo,
 ):
     X, y = make_blobs(
         n_samples=nrows,
@@ -1138,7 +1061,7 @@ def test_membership_vector_blobs(
         random_state=42,
     )
 
-    graph_degree, intermediate_graph_degree, max_iters = get_graph_degree(
+    graph_degree, intermediate_graph_degree = get_graph_degree(
         min_cluster_size
     )
     cuml_agg = HDBSCAN(
@@ -1149,12 +1072,6 @@ def test_membership_vector_blobs(
         cluster_selection_epsilon=cluster_selection_epsilon,
         cluster_selection_method=cluster_selection_method,
         prediction_data=True,
-        build_algo=build_algo,
-        build_kwds={
-            "nnd_graph_degree": graph_degree,
-            "nnd_intermediate_graph_degree": intermediate_graph_degree,
-            "nnd_max_iterations": max_iters,
-        },
     )
     cuml_agg.fit(X)
 
@@ -1313,73 +1230,3 @@ def test_membership_vector_circles(
         "float32"
     )
     assert_membership_vectors(cu_membership_vectors, sk_membership_vectors)
-
-
-@pytest.mark.parametrize("nrows", [1000])
-@pytest.mark.parametrize("n_points_to_predict", [1000])
-@pytest.mark.parametrize("min_samples", [20, 30])
-@pytest.mark.parametrize("min_cluster_size", [100, 150])
-@pytest.mark.parametrize("cluster_selection_epsilon", [0.0, 0.5])
-@pytest.mark.parametrize("allow_single_cluster", [True, False])
-@pytest.mark.parametrize("max_cluster_size", [0])
-@pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
-@pytest.mark.parametrize("connectivity", ["knn"])
-@pytest.mark.parametrize("batch_size", [16])
-@pytest.mark.parametrize("build_algo", ["nn_descent"])
-def test_membership_vector_circles_nnd(
-    nrows,
-    n_points_to_predict,
-    min_samples,
-    cluster_selection_epsilon,
-    cluster_selection_method,
-    min_cluster_size,
-    allow_single_cluster,
-    max_cluster_size,
-    connectivity,
-    batch_size,
-    build_algo,
-):
-    X, y = datasets.make_circles(
-        n_samples=nrows + n_points_to_predict,
-        factor=0.8,
-        noise=0.05,
-        random_state=42,
-    )
-
-    X_train = X[:nrows]
-    X_test = X[nrows:]
-
-    cuml_agg_nnd = HDBSCAN(
-        verbose=logger.level_info,
-        min_samples=min_samples,
-        allow_single_cluster=allow_single_cluster,
-        max_cluster_size=max_cluster_size,
-        min_cluster_size=min_cluster_size,
-        cluster_selection_epsilon=cluster_selection_epsilon,
-        cluster_selection_method=cluster_selection_method,
-        prediction_data=True,
-        build_algo=build_algo,
-    )
-    cuml_agg_nnd.fit(X_train)
-
-    cuml_agg_bf = HDBSCAN(
-        verbose=logger.level_info,
-        min_samples=min_samples,
-        allow_single_cluster=allow_single_cluster,
-        max_cluster_size=max_cluster_size,
-        min_cluster_size=min_cluster_size,
-        cluster_selection_epsilon=cluster_selection_epsilon,
-        cluster_selection_method=cluster_selection_method,
-        prediction_data=True,
-    )
-    cuml_agg_bf.fit(X_train)
-
-    cu_membership_vectors_nnd = membership_vector(
-        cuml_agg_nnd, X_test, batch_size
-    )
-    cu_membership_vectors_bf = membership_vector(
-        cuml_agg_bf, X_test, batch_size
-    )
-    assert_membership_vectors(
-        cu_membership_vectors_nnd, cu_membership_vectors_bf
-    )

From f0cdd3c42fe69415ee890cf4eb024fff36c82d79 Mon Sep 17 00:00:00 2001
From: jinsolp <soleeep99@gmail.com>
Date: Fri, 12 Jul 2024 21:13:18 +0000
Subject: [PATCH 12/25] add copy kernel

---
 cpp/src/hdbscan/detail/reachability.cuh | 167 ++++++++++++++++--------
 1 file changed, 116 insertions(+), 51 deletions(-)

diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh
index 99907897e5..c0af5dc292 100644
--- a/cpp/src/hdbscan/detail/reachability.cuh
+++ b/cpp/src/hdbscan/detail/reachability.cuh
@@ -17,14 +17,15 @@
 #pragma once
 
 #include <cuml/cluster/hdbscan.hpp>
+#include <cuml/common/utils.hpp>
 #include <cuml/neighbors/knn.hpp>
 
+#include <raft/core/resource/cuda_stream.hpp>
 #include <raft/distance/distance.cuh>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/neighbors/brute_force.cuh>
 #include <raft/neighbors/detail/nn_descent.cuh>
 #include <raft/neighbors/nn_descent_types.hpp>
-#include <raft/neighbors/refine-inl.cuh>
 #include <raft/sparse/convert/csr.cuh>
 #include <raft/sparse/linalg/symmetrize.cuh>
 #include <raft/util/cuda_utils.cuh>
@@ -84,6 +85,45 @@ struct DistancePostProcessSqrt {
   }
 };
 
+template <typename T>
+CUML_KERNEL void copy_first_k_cols_shift_self(
+  T* out, T* in, size_t out_k, size_t in_k, size_t nrows)
+{
+  size_t row = blockIdx.x * blockDim.x + threadIdx.x;
+  if (row < nrows) {
+    for (size_t i = 1; i < out_k; i++) {
+      out[row * out_k + i] = in[row * in_k + i - 1];
+    }
+    out[row * out_k] = row;
+  }
+}
+
+template <typename T>
+CUML_KERNEL void copy_first_k_cols_shift_zero(
+  T* out, T* in, size_t out_k, size_t in_k, size_t nrows)
+{
+  size_t row = blockIdx.x * blockDim.x + threadIdx.x;
+  if (row < nrows) {
+    for (size_t i = 1; i < out_k; i++) {
+      out[row * out_k + i] = in[row * in_k + i - 1];
+    }
+    out[row * out_k] = static_cast<T>(0);
+  }
+}
+
+template <typename T>
+CUML_KERNEL void copy_first_k_cols_shift_core_dists(
+  T* out, T* in, T* core_dists, size_t out_k, size_t in_k, size_t nrows)
+{
+  size_t row = blockIdx.x * blockDim.x + threadIdx.x;
+  if (row < nrows) {
+    for (size_t i = 1; i < out_k; i++) {
+      out[row * out_k + i] = in[row * in_k + i - 1];
+    }
+    out[row * out_k] = static_cast<T>(core_dists[row]);
+  }
+}
+
 /**
  * Wraps the brute force knn API, to be used for both training and prediction
  * @tparam value_idx data type for integrals
@@ -140,46 +180,37 @@ void compute_knn(const raft::handle_t& handle,
                     true,
                     metric);
   } else {  // NN_DESCENT
+    auto epilogue                 = DistancePostProcessSqrt<value_idx, float>{};
+    build_params.return_distances = true;
     RAFT_EXPECTS(static_cast<size_t>(k) <= build_params.graph_degree,
                  "n_neighbors should be smaller than the graph degree computed by nn descent");
 
     auto dataset = raft::make_host_matrix_view<const float, int64_t>(X, m, n);
-    auto graph   = NNDescent::detail::build<float, int64_t>(handle, build_params, dataset);
-
-    // NN Descent build does not include itself in nearest neighbors
-    for (size_t i = 0; i < n_search_items; i++) {
-      for (size_t j = k - 1; j >= 1; j--) {
-        graph.graph().data_handle()[i * build_params.graph_degree + j] =
-          graph.graph().data_handle()[i * build_params.graph_degree + j - 1];
-      }
-      graph.graph().data_handle()[i * build_params.graph_degree] = i;
-    }
 
-    auto dataset_dev = raft::make_device_matrix<float, int64_t, raft::row_major>(handle, m, n);
-    raft::copy(dataset_dev.data_handle(), dataset.data_handle(), m * n, handle.get_stream());
-    auto dataset_dev_view = raft::make_device_matrix_view<const float, int64_t, raft::row_major>(
-      dataset_dev.data_handle(), m, n);
+    auto graph = NNDescent::detail::build<float, int64_t>(handle, build_params, dataset, epilogue);
 
-    auto neighbor_candidates = raft::make_device_matrix<int64_t, int64_t, raft::row_major>(
-      handle, m, build_params.graph_degree);
-    raft::copy(neighbor_candidates.data_handle(),
-               graph.graph().data_handle(),
-               m * build_params.graph_degree,
-               handle.get_stream());
-    auto neighbor_candidates_view =
-      raft::make_device_matrix_view<const int64_t, int64_t, raft::row_major>(
-        neighbor_candidates.data_handle(), m, build_params.graph_degree);
-
-    auto indices =
-      raft::make_device_matrix_view<int64_t, int64_t>(int64_indices.data(), n_search_items, k);
-    auto distances = raft::make_device_matrix_view<float, int64_t>(dists, n_search_items, k);
-    raft::neighbors::refine(handle,
-                            dataset_dev_view,
-                            dataset_dev_view,
-                            neighbor_candidates_view,
-                            indices,
-                            distances,
-                            metric);
+    size_t TPB        = 256;
+    size_t num_blocks = static_cast<size_t>((m + TPB) / TPB);
+
+    auto indices_d =
+      raft::make_device_matrix<int64_t, int64_t>(handle, m, build_params.graph_degree);
+
+    raft::copy(
+      indices_d.data_handle(), graph.graph().data_handle(), m * build_params.graph_degree, stream);
+
+    if (graph.distances().has_value()) {
+      copy_first_k_cols_shift_zero<float>
+        <<<num_blocks, TPB, 0, stream>>>(dists,
+                                         graph.distances().value().data_handle(),
+                                         static_cast<size_t>(k),
+                                         build_params.graph_degree,
+                                         m);
+    }
+    copy_first_k_cols_shift_self<int64_t><<<num_blocks, TPB, 0, stream>>>(int64_indices.data(),
+                                                                          indices_d.data_handle(),
+                                                                          static_cast<size_t>(k),
+                                                                          build_params.graph_degree,
+                                                                          m);
   }
 
   // convert from current knn's 64-bit to 32-bit.
@@ -290,22 +321,56 @@ void mutual_reachability_knn_l2(
   // enclosing parent function (and any parent classes) of an extended __device__
   // or __host__ __device__ lambda`
 
-  auto epilogue = ReachabilityPostProcess<value_idx, value_t>{core_dists, alpha};
-  auto X_view   = raft::make_device_matrix_view(X, m, n);
-  std::vector<raft::device_matrix_view<const value_t, size_t>> index = {X_view};
-
-  raft::neighbors::brute_force::knn<value_idx, value_t>(
-    handle,
-    index,
-    X_view,
-    raft::make_device_matrix_view(out_inds, m, static_cast<size_t>(k)),
-    raft::make_device_matrix_view(out_dists, m, static_cast<size_t>(k)),
-    // TODO: expand distance metrics to support more than just L2 distance
-    // https://github.com/rapidsai/cuml/issues/5301
-    raft::distance::DistanceType::L2SqrtExpanded,
-    std::make_optional<float>(2.0f),
-    std::nullopt,
-    epilogue);
+  if (build_algo == Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN) {
+    auto epilogue = ReachabilityPostProcess<value_idx, value_t>{core_dists, alpha};
+    auto X_view   = raft::make_device_matrix_view(X, m, n);
+    std::vector<raft::device_matrix_view<const value_t, size_t>> index = {X_view};
+
+    raft::neighbors::brute_force::knn<value_idx, value_t>(
+      handle,
+      index,
+      X_view,
+      raft::make_device_matrix_view(out_inds, m, static_cast<size_t>(k)),
+      raft::make_device_matrix_view(out_dists, m, static_cast<size_t>(k)),
+      // TODO: expand distance metrics to support more than just L2 distance
+      // https://github.com/rapidsai/cuml/issues/5301
+      raft::distance::DistanceType::L2SqrtExpanded,
+      std::make_optional<float>(2.0f),
+      std::nullopt,
+      epilogue);
+  } else {
+    auto epilogue = ReachabilityPostProcessSqrt<value_idx, value_t>{core_dists, alpha};
+    build_params.return_distances = true;
+    RAFT_EXPECTS(static_cast<size_t>(k) <= build_params.graph_degree,
+                 "n_neighbors should be smaller than the graph degree computed by nn descent");
+
+    auto dataset = raft::make_host_matrix_view<const value_t, int64_t>(X, m, n);
+    auto graph =
+      NNDescent::detail::build<value_t, value_idx>(handle, build_params, dataset, epilogue);
+
+    size_t TPB        = 256;
+    size_t num_blocks = static_cast<size_t>((m + TPB) / TPB);
+
+    auto indices_d =
+      raft::make_device_matrix<value_idx, value_idx>(handle, m, build_params.graph_degree);
+
+    raft::copy(indices_d.data_handle(),
+               graph.graph().data_handle(),
+               m * build_params.graph_degree,
+               handle.get_stream());
+
+    if (graph.distances().has_value()) {
+      copy_first_k_cols_shift_core_dists<float>
+        <<<num_blocks, TPB, 0, handle.get_stream()>>>(out_dists,
+                                                      graph.distances().value().data_handle(),
+                                                      core_dists,
+                                                      static_cast<size_t>(k),
+                                                      build_params.graph_degree,
+                                                      m);
+    }
+    copy_first_k_cols_shift_self<value_idx><<<num_blocks, TPB, 0, handle.get_stream()>>>(
+      out_inds, indices_d.data_handle(), static_cast<size_t>(k), build_params.graph_degree, m);
+  }
 }
 
 /**

From be5c167c413f9fdd30fdce935a4f9957f6453a5e Mon Sep 17 00:00:00 2001
From: jinsolp <soleeep99@gmail.com>
Date: Fri, 12 Jul 2024 23:57:24 +0000
Subject: [PATCH 13/25] remove build_algo in test

---
 python/cuml/tests/test_hdbscan.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/cuml/tests/test_hdbscan.py b/python/cuml/tests/test_hdbscan.py
index 94c2e0002b..ef60e964a8 100644
--- a/python/cuml/tests/test_hdbscan.py
+++ b/python/cuml/tests/test_hdbscan.py
@@ -251,7 +251,6 @@ def test_hdbscan_sklearn_datasets(
     cluster_selection_method,
     min_samples_cluster_size_bounds,
     allow_single_cluster,
-    build_algo,
 ):
 
     (

From e2735f552f2b986b4b25c063b2501291e5c098a0 Mon Sep 17 00:00:00 2001
From: jinsolp <soleeep99@gmail.com>
Date: Wed, 24 Jul 2024 23:19:04 +0000
Subject: [PATCH 14/25] auto option as default for build_algo

---
 python/cuml/cuml/cluster/hdbscan/hdbscan.pyx | 22 +++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx
index 26c0eb7cd3..c4934608aa 100644
--- a/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx
+++ b/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx
@@ -23,6 +23,7 @@ from cuml.internals.safe_imports import gpu_only_import
 cp = gpu_only_import('cupy')
 from warnings import warn
 
+from cuml.internals import logger
 from cuml.internals.array import CumlArray
 from cuml.internals.base import UniversalBase
 from cuml.common.doc_utils import generate_docstring
@@ -518,7 +519,7 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
                  connectivity='knn',
                  output_type=None,
                  prediction_data=False,
-                 build_algo='brute_force_knn',
+                 build_algo='auto',
                  build_kwds=None):
 
         super().__init__(handle=handle,
@@ -852,6 +853,15 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
                 raise ValueError("Cluster selection method not supported. "
                                  "Must one of {'eom', 'leaf'}")
 
+            if self.build_algo == "auto":
+                if self.n_rows <= 50000:
+                    # brute force is faster for small datasets
+                    logger.warn("Building knn graph using brute force")
+                    self.build_algo = "brute_force_knn"
+                else:
+                    logger.warn("Building knn graph using nn descent")
+                    self.build_algo = "nn_descent"
+
             if self.build_algo == 'brute_force_knn':
                 params.build_algo = GRAPH_BUILD_ALGO.BRUTE_FORCE_KNN
             elif self.build_algo == 'nn_descent':
@@ -1114,6 +1124,16 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
 
             cdef GRAPH_BUILD_ALGO build_algo
             cdef index_params build_params
+
+            if self.build_algo == "auto":
+                if self.n_rows <= 50000:
+                    # brute force is faster for small datasets
+                    logger.warn("Building knn graph using brute force")
+                    self.build_algo = "brute_force_knn"
+                else:
+                    logger.warn("Building knn graph using nn descent")
+                    self.build_algo = "nn_descent"
+
             if self.build_algo == 'brute_force_knn':
                 build_algo = GRAPH_BUILD_ALGO.BRUTE_FORCE_KNN
             elif self.build_algo == 'nn_descent':

From 746ac33786d59c85afda1e2d0563bf0968d24d9f Mon Sep 17 00:00:00 2001
From: jinsolp <soleeep99@gmail.com>
Date: Mon, 19 Aug 2024 20:49:36 +0000
Subject: [PATCH 15/25] use slice kernels

---
 cpp/src/hdbscan/detail/reachability.cuh | 52 +++++++++++--------------
 1 file changed, 22 insertions(+), 30 deletions(-)

diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh
index c0af5dc292..710da8ff6e 100644
--- a/cpp/src/hdbscan/detail/reachability.cuh
+++ b/cpp/src/hdbscan/detail/reachability.cuh
@@ -78,7 +78,7 @@ void core_distances(
 // Functor to post-process distances by sqrt
 // For usage with NN Descent which internally supports L2Expanded only
 template <typename value_idx, typename value_t = float>
-struct DistancePostProcessSqrt {
+struct DistancePostProcessSqrt : NNDescent::DistEpilogue<value_idx, value_t> {
   DI value_t operator()(value_t value, value_idx row, value_idx col) const
   {
     return powf(fabsf(value), 0.5);
@@ -111,19 +111,6 @@ CUML_KERNEL void copy_first_k_cols_shift_zero(
   }
 }
 
-template <typename T>
-CUML_KERNEL void copy_first_k_cols_shift_core_dists(
-  T* out, T* in, T* core_dists, size_t out_k, size_t in_k, size_t nrows)
-{
-  size_t row = blockIdx.x * blockDim.x + threadIdx.x;
-  if (row < nrows) {
-    for (size_t i = 1; i < out_k; i++) {
-      out[row * out_k + i] = in[row * in_k + i - 1];
-    }
-    out[row * out_k] = static_cast<T>(core_dists[row]);
-  }
-}
-
 /**
  * Wraps the brute force knn API, to be used for both training and prediction
  * @tparam value_idx data type for integrals
@@ -278,11 +265,15 @@ struct ReachabilityPostProcess {
 // Functor to post-process distances into reachability space (Sqrt)
 // For usage with NN Descent which internally supports L2Expanded only
 template <typename value_idx, typename value_t = float>
-struct ReachabilityPostProcessSqrt {
+struct ReachabilityPostProcessSqrt : NNDescent::DistEpilogue<value_idx, value_t> {
+  ReachabilityPostProcessSqrt(value_t* core_dists_, value_t alpha_)
+    : NNDescent::DistEpilogue<value_idx, value_t>(), core_dists(core_dists_), alpha(alpha_){};
+
   DI value_t operator()(value_t value, value_idx row, value_idx col) const
   {
     return max(core_dists[col], max(core_dists[row], powf(fabsf(alpha * value), 0.5)));
   }
+
   const value_t* core_dists;
   value_t alpha;
 };
@@ -339,7 +330,7 @@ void mutual_reachability_knn_l2(
       std::nullopt,
       epilogue);
   } else {
-    auto epilogue = ReachabilityPostProcessSqrt<value_idx, value_t>{core_dists, alpha};
+    auto epilogue = ReachabilityPostProcessSqrt<value_idx, value_t>(core_dists, alpha);
     build_params.return_distances = true;
     RAFT_EXPECTS(static_cast<size_t>(k) <= build_params.graph_degree,
                  "n_neighbors should be smaller than the graph degree computed by nn descent");
@@ -348,9 +339,6 @@ void mutual_reachability_knn_l2(
     auto graph =
       NNDescent::detail::build<value_t, value_idx>(handle, build_params, dataset, epilogue);
 
-    size_t TPB        = 256;
-    size_t num_blocks = static_cast<size_t>((m + TPB) / TPB);
-
     auto indices_d =
       raft::make_device_matrix<value_idx, value_idx>(handle, m, build_params.graph_degree);
 
@@ -359,17 +347,21 @@ void mutual_reachability_knn_l2(
                m * build_params.graph_degree,
                handle.get_stream());
 
-    if (graph.distances().has_value()) {
-      copy_first_k_cols_shift_core_dists<float>
-        <<<num_blocks, TPB, 0, handle.get_stream()>>>(out_dists,
-                                                      graph.distances().value().data_handle(),
-                                                      core_dists,
-                                                      static_cast<size_t>(k),
-                                                      build_params.graph_degree,
-                                                      m);
-    }
-    copy_first_k_cols_shift_self<value_idx><<<num_blocks, TPB, 0, handle.get_stream()>>>(
-      out_inds, indices_d.data_handle(), static_cast<size_t>(k), build_params.graph_degree, m);
+    RAFT_EXPECTS(graph.distances().has_value(),
+                 "return_distances for nn descent should be set to true to be used for HDBSCAN");
+
+    raft::matrix::slice_coordinates coords{static_cast<int64_t>(0),
+                                           static_cast<int64_t>(0),
+                                           static_cast<int64_t>(m),
+                                           static_cast<int64_t>(k)};
+
+    auto out_knn_dists_view = raft::make_device_matrix_view(out_dists, m, (size_t)k);
+    raft::matrix::slice<float, int64_t, raft::row_major>(
+      handle, raft::make_const_mdspan(graph.distances().value()), out_knn_dists_view, coords);
+    auto out_knn_indices_view =
+      raft::make_device_matrix_view<value_idx, int64_t>(out_inds, m, (size_t)k);
+    raft::matrix::slice<value_idx, int64_t, raft::row_major>(
+      handle, raft::make_const_mdspan(indices_d.view()), out_knn_indices_view, coords);
   }
 }
 

From 2b46746cbd37e6a196f7a1b69eed39864454146c Mon Sep 17 00:00:00 2001
From: jinsolp <soleeep99@gmail.com>
Date: Tue, 20 Aug 2024 20:02:31 +0000
Subject: [PATCH 16/25] tests: remove unused get_graph_degree call

---
 python/cuml/cuml/tests/test_hdbscan.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/python/cuml/cuml/tests/test_hdbscan.py b/python/cuml/cuml/tests/test_hdbscan.py
index ef60e964a8..a061f71d56 100644
--- a/python/cuml/cuml/tests/test_hdbscan.py
+++ b/python/cuml/cuml/tests/test_hdbscan.py
@@ -1060,9 +1060,6 @@ def test_membership_vector_blobs(
         random_state=42,
     )
 
-    graph_degree, intermediate_graph_degree = get_graph_degree(
-        min_cluster_size
-    )
     cuml_agg = HDBSCAN(
         verbose=logger.level_info,
         allow_single_cluster=allow_single_cluster,

From 6bb63571e7481a10dcc81225d902870114cc726c Mon Sep 17 00:00:00 2001
From: jinsolp <soleeep99@gmail.com>
Date: Wed, 21 Aug 2024 16:42:26 +0000
Subject: [PATCH 17/25] make data view depending on host/dev

---
 cpp/src/hdbscan/detail/reachability.cuh | 38 ++++++++++++++++++-------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh
index 710da8ff6e..eae628f0a5 100644
--- a/cpp/src/hdbscan/detail/reachability.cuh
+++ b/cpp/src/hdbscan/detail/reachability.cuh
@@ -24,7 +24,7 @@
 #include <raft/distance/distance.cuh>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/neighbors/brute_force.cuh>
-#include <raft/neighbors/detail/nn_descent.cuh>
+#include <raft/neighbors/nn_descent.cuh>
 #include <raft/neighbors/nn_descent_types.hpp>
 #include <raft/sparse/convert/csr.cuh>
 #include <raft/sparse/linalg/symmetrize.cuh>
@@ -111,6 +111,26 @@ CUML_KERNEL void copy_first_k_cols_shift_zero(
   }
 }
 
+template <typename value_idx, typename value_t, typename epilogue_op>
+auto get_graph_nnd(const raft::handle_t& handle,
+                   const value_t* X,
+                   size_t m,
+                   size_t n,
+                   epilogue_op distance_epilogue,
+                   Common::nn_index_params build_params)
+{
+  cudaPointerAttributes attr;
+  RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, X));
+  float* ptr = reinterpret_cast<float*>(attr.devicePointer);
+  if (ptr != nullptr) {
+    auto dataset = raft::make_device_matrix_view<const value_t, int64_t>(X, m, n);
+    return NNDescent::build<value_t, value_idx>(handle, build_params, dataset, distance_epilogue);
+  } else {
+    auto dataset = raft::make_host_matrix_view<const value_t, int64_t>(X, m, n);
+    return NNDescent::build<value_t, value_idx>(handle, build_params, dataset, distance_epilogue);
+  }
+}
+
 /**
  * Wraps the brute force knn API, to be used for both training and prediction
  * @tparam value_idx data type for integrals
@@ -167,14 +187,12 @@ void compute_knn(const raft::handle_t& handle,
                     true,
                     metric);
   } else {  // NN_DESCENT
-    auto epilogue                 = DistancePostProcessSqrt<value_idx, float>{};
-    build_params.return_distances = true;
     RAFT_EXPECTS(static_cast<size_t>(k) <= build_params.graph_degree,
                  "n_neighbors should be smaller than the graph degree computed by nn descent");
 
-    auto dataset = raft::make_host_matrix_view<const float, int64_t>(X, m, n);
-
-    auto graph = NNDescent::detail::build<float, int64_t>(handle, build_params, dataset, epilogue);
+    auto epilogue                 = DistancePostProcessSqrt<int64_t, float>{};
+    build_params.return_distances = true;
+    auto graph = get_graph_nnd<int64_t, float>(handle, X, m, n, epilogue, build_params);
 
     size_t TPB        = 256;
     size_t num_blocks = static_cast<size_t>((m + TPB) / TPB);
@@ -330,14 +348,12 @@ void mutual_reachability_knn_l2(
       std::nullopt,
       epilogue);
   } else {
-    auto epilogue = ReachabilityPostProcessSqrt<value_idx, value_t>(core_dists, alpha);
-    build_params.return_distances = true;
     RAFT_EXPECTS(static_cast<size_t>(k) <= build_params.graph_degree,
                  "n_neighbors should be smaller than the graph degree computed by nn descent");
 
-    auto dataset = raft::make_host_matrix_view<const value_t, int64_t>(X, m, n);
-    auto graph =
-      NNDescent::detail::build<value_t, value_idx>(handle, build_params, dataset, epilogue);
+    auto epilogue = ReachabilityPostProcessSqrt<value_idx, value_t>(core_dists, alpha);
+    build_params.return_distances = true;
+    auto graph = get_graph_nnd<value_idx, value_t>(handle, X, m, n, epilogue, build_params);
 
     auto indices_d =
       raft::make_device_matrix<value_idx, value_idx>(handle, m, build_params.graph_degree);

From 3ec57c763647dab9635a33b26d6f63cb048170f9 Mon Sep 17 00:00:00 2001
From: jinsolp <soleeep99@gmail.com>
Date: Wed, 21 Aug 2024 20:18:45 +0000
Subject: [PATCH 18/25] adding arg

---
 python/cuml/cuml/cluster/hdbscan/hdbscan.pyx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx
index c4934608aa..5a6e62f6d9 100644
--- a/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx
+++ b/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx
@@ -1143,11 +1143,13 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
                     build_params.intermediate_graph_degree = <size_t> 128
                     build_params.max_iterations = <size_t> 20
                     build_params.termination_threshold = <float> 0.0001
+                    build_params.return_distances = <bool> True
                 else:
                     build_params.graph_degree = <size_t> self.build_kwds.get("nnd_graph_degree", 64)
                     build_params.intermediate_graph_degree = <size_t> self.build_kwds.get("nnd_intermediate_graph_degree", 128)
                     build_params.max_iterations = <size_t> self.build_kwds.get("nnd_max_iterations", 20)
                     build_params.termination_threshold = <float> self.build_kwds.get("nnd_termination_threshold", 0.0001)
+                    build_params.return_distances = <bool> self.build_kwds.get("nnd_return_distances", True)
 
             compute_core_dists(handle_[0],
                                <float*> X_ptr,

From d93aee6a77544a31ea29598eb55e8d49beac51a7 Mon Sep 17 00:00:00 2001
From: jinsolp <soleeep99@gmail.com>
Date: Wed, 21 Aug 2024 20:19:43 +0000
Subject: [PATCH 19/25] for building + CI check

---
 cpp/cmake/thirdparty/get_raft.cmake | 4 ++--
 cpp/src/umap/knn_graph/algo.cuh     | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake
index 7bc860eed8..8ac1cee130 100644
--- a/cpp/cmake/thirdparty/get_raft.cmake
+++ b/cpp/cmake/thirdparty/get_raft.cmake
@@ -82,8 +82,8 @@ endfunction()
 # To use a different RAFT locally, set the CMake variable
 # CPM_raft_SOURCE=/path/to/local/raft
 find_and_configure_raft(VERSION          ${CUML_MIN_VERSION_raft}
-      FORK             rapidsai
-      PINNED_TAG       branch-${CUML_BRANCH_VERSION_raft}
+      FORK             jinsolp
+      PINNED_TAG       batch-nnd
       EXCLUDE_FROM_ALL ${CUML_EXCLUDE_RAFT_FROM_ALL}
       # When PINNED_TAG above doesn't match cuml,
       # force local raft clone in build directory
diff --git a/cpp/src/umap/knn_graph/algo.cuh b/cpp/src/umap/knn_graph/algo.cuh
index f6284e6a91..92c717afcd 100644
--- a/cpp/src/umap/knn_graph/algo.cuh
+++ b/cpp/src/umap/knn_graph/algo.cuh
@@ -59,7 +59,7 @@ void launcher(const raft::handle_t& handle,
 
 //  Functor to post-process distances as L2Sqrt*
 template <typename value_idx, typename value_t = float>
-struct DistancePostProcessSqrt {
+struct DistancePostProcessSqrt : NNDescent::DistEpilogue<value_idx, value_t> {
   DI value_t operator()(value_t value, value_idx row, value_idx col) const { return sqrtf(value); }
 };
 

From 7dbe38d2d93464668a9888ab91cea4b81a762209 Mon Sep 17 00:00:00 2001
From: jinsolp <soleeep99@gmail.com>
Date: Thu, 22 Aug 2024 17:41:46 +0000
Subject: [PATCH 20/25] ann types fix

---
 python/cuml/cuml/cluster/hdbscan/hdbscan.pyx | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx
index 5a6e62f6d9..afbfd34a6d 100644
--- a/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx
+++ b/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx
@@ -54,6 +54,7 @@ IF GPUBUILD == 1:
             size_t max_iterations,
             float termination_threshold,
             bool return_distances,
+            uint64_t n_clusters,
 
     cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML::HDBSCAN::Common":
 

From 337aa8802dd4ca0be37405f87da920472e7ca9f5 Mon Sep 17 00:00:00 2001
From: jinsolp <soleeep99@gmail.com>
Date: Thu, 22 Aug 2024 22:17:16 +0000
Subject: [PATCH 21/25] type fix

---
 python/cuml/cuml/cluster/hdbscan/hdbscan.pyx | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx
index afbfd34a6d..37ef2ded3d 100644
--- a/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx
+++ b/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx
@@ -54,7 +54,7 @@ IF GPUBUILD == 1:
             size_t max_iterations,
             float termination_threshold,
             bool return_distances,
-            uint64_t n_clusters,
+            size_t n_clusters,
 
     cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML::HDBSCAN::Common":
 
@@ -570,6 +570,8 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
         self.prediction_data_ptr = None
         self._cpu_to_gpu_interop_prepped = False
 
+        logger.set_level(verbose)
+
     @property
     def condensed_tree_(self):
 

From 154bbda8a167859e9f4e86d306ffdb94022f9586 Mon Sep 17 00:00:00 2001
From: jinsolp <soleeep99@gmail.com>
Date: Fri, 23 Aug 2024 00:18:37 +0000
Subject: [PATCH 22/25] change if to raft_expects

---
 cpp/src/hdbscan/detail/reachability.cuh | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh
index eae628f0a5..c5910a6213 100644
--- a/cpp/src/hdbscan/detail/reachability.cuh
+++ b/cpp/src/hdbscan/detail/reachability.cuh
@@ -203,14 +203,14 @@ void compute_knn(const raft::handle_t& handle,
     raft::copy(
       indices_d.data_handle(), graph.graph().data_handle(), m * build_params.graph_degree, stream);
 
-    if (graph.distances().has_value()) {
-      copy_first_k_cols_shift_zero<float>
-        <<<num_blocks, TPB, 0, stream>>>(dists,
-                                         graph.distances().value().data_handle(),
-                                         static_cast<size_t>(k),
-                                         build_params.graph_degree,
-                                         m);
-    }
+    RAFT_EXPECTS(graph.distances().has_value(),
+                 "return_distances for nn descent should be set to true to be used for HDBSCAN");
+    copy_first_k_cols_shift_zero<float>
+      <<<num_blocks, TPB, 0, stream>>>(dists,
+                                       graph.distances().value().data_handle(),
+                                       static_cast<size_t>(k),
+                                       build_params.graph_degree,
+                                       m);
     copy_first_k_cols_shift_self<int64_t><<<num_blocks, TPB, 0, stream>>>(int64_indices.data(),
                                                                           indices_d.data_handle(),
                                                                           static_cast<size_t>(k),

From 6083812707bc6d0975ce38ffdf7311b841ec5f14 Mon Sep 17 00:00:00 2001
From: jinsolp <soleeep99@gmail.com>
Date: Fri, 23 Aug 2024 00:27:43 +0000
Subject: [PATCH 23/25] revert fork and pinned tag

---
 cpp/cmake/thirdparty/get_raft.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake
index 8ac1cee130..7bc860eed8 100644
--- a/cpp/cmake/thirdparty/get_raft.cmake
+++ b/cpp/cmake/thirdparty/get_raft.cmake
@@ -82,8 +82,8 @@ endfunction()
 # To use a different RAFT locally, set the CMake variable
 # CPM_raft_SOURCE=/path/to/local/raft
 find_and_configure_raft(VERSION          ${CUML_MIN_VERSION_raft}
-      FORK             jinsolp
-      PINNED_TAG       batch-nnd
+      FORK             rapidsai
+      PINNED_TAG       branch-${CUML_BRANCH_VERSION_raft}
       EXCLUDE_FROM_ALL ${CUML_EXCLUDE_RAFT_FROM_ALL}
       # When PINNED_TAG above doesn't match cuml,
       # force local raft clone in build directory

From 732a06eb552f42d68303869512ae2a97d93ac1fd Mon Sep 17 00:00:00 2001
From: jinsolp <soleeep99@gmail.com>
Date: Fri, 23 Aug 2024 03:37:57 +0000
Subject: [PATCH 24/25] Trigger CI


From 904ab1b79050869ca931d7c0c500c9c5f812c3de Mon Sep 17 00:00:00 2001
From: soleee99 <jinsolp@andrew.cmu.edu>
Date: Sun, 22 Sep 2024 03:28:57 +0000
Subject: [PATCH 25/25] change to switch

---
 cpp/src/hdbscan/detail/reachability.cuh | 210 +++++++++++++-----------
 1 file changed, 112 insertions(+), 98 deletions(-)

diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh
index c5910a6213..9538536723 100644
--- a/cpp/src/hdbscan/detail/reachability.cuh
+++ b/cpp/src/hdbscan/detail/reachability.cuh
@@ -166,56 +166,65 @@ void compute_knn(const raft::handle_t& handle,
   // pass value_idx through to knn.
   rmm::device_uvector<int64_t> int64_indices(k * n_search_items, stream);
 
-  if (build_algo == Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN) {
-    std::vector<value_t*> inputs;
-    inputs.push_back(const_cast<value_t*>(X));
-
-    std::vector<int> sizes;
-    sizes.push_back(m);
-
-    // perform knn
-    brute_force_knn(handle,
-                    inputs,
-                    sizes,
-                    n,
-                    const_cast<value_t*>(search_items),
-                    n_search_items,
-                    int64_indices.data(),
-                    dists,
-                    k,
-                    true,
-                    true,
-                    metric);
-  } else {  // NN_DESCENT
-    RAFT_EXPECTS(static_cast<size_t>(k) <= build_params.graph_degree,
-                 "n_neighbors should be smaller than the graph degree computed by nn descent");
-
-    auto epilogue                 = DistancePostProcessSqrt<int64_t, float>{};
-    build_params.return_distances = true;
-    auto graph = get_graph_nnd<int64_t, float>(handle, X, m, n, epilogue, build_params);
-
-    size_t TPB        = 256;
-    size_t num_blocks = static_cast<size_t>((m + TPB) / TPB);
-
-    auto indices_d =
-      raft::make_device_matrix<int64_t, int64_t>(handle, m, build_params.graph_degree);
-
-    raft::copy(
-      indices_d.data_handle(), graph.graph().data_handle(), m * build_params.graph_degree, stream);
-
-    RAFT_EXPECTS(graph.distances().has_value(),
-                 "return_distances for nn descent should be set to true to be used for HDBSCAN");
-    copy_first_k_cols_shift_zero<float>
-      <<<num_blocks, TPB, 0, stream>>>(dists,
-                                       graph.distances().value().data_handle(),
-                                       static_cast<size_t>(k),
-                                       build_params.graph_degree,
-                                       m);
-    copy_first_k_cols_shift_self<int64_t><<<num_blocks, TPB, 0, stream>>>(int64_indices.data(),
-                                                                          indices_d.data_handle(),
-                                                                          static_cast<size_t>(k),
-                                                                          build_params.graph_degree,
-                                                                          m);
+  switch (build_algo) {
+    case Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN: {
+      std::vector<value_t*> inputs;
+      inputs.push_back(const_cast<value_t*>(X));
+
+      std::vector<int> sizes;
+      sizes.push_back(m);
+
+      // perform knn
+      brute_force_knn(handle,
+                      inputs,
+                      sizes,
+                      n,
+                      const_cast<value_t*>(search_items),
+                      n_search_items,
+                      int64_indices.data(),
+                      dists,
+                      k,
+                      true,
+                      true,
+                      metric);
+      break;
+    }
+
+    case Common::GRAPH_BUILD_ALGO::NN_DESCENT: {
+      RAFT_EXPECTS(static_cast<size_t>(k) <= build_params.graph_degree,
+                   "n_neighbors should be smaller than the graph degree computed by nn descent");
+
+      auto epilogue                 = DistancePostProcessSqrt<int64_t, float>{};
+      build_params.return_distances = true;
+      auto graph = get_graph_nnd<int64_t, float>(handle, X, m, n, epilogue, build_params);
+
+      size_t TPB        = 256;
+      size_t num_blocks = static_cast<size_t>((m + TPB) / TPB);
+
+      auto indices_d =
+        raft::make_device_matrix<int64_t, int64_t>(handle, m, build_params.graph_degree);
+
+      raft::copy(indices_d.data_handle(),
+                 graph.graph().data_handle(),
+                 m * build_params.graph_degree,
+                 stream);
+
+      RAFT_EXPECTS(graph.distances().has_value(),
+                   "return_distances for nn descent should be set to true to be used for HDBSCAN");
+      copy_first_k_cols_shift_zero<float>
+        <<<num_blocks, TPB, 0, stream>>>(dists,
+                                         graph.distances().value().data_handle(),
+                                         static_cast<size_t>(k),
+                                         build_params.graph_degree,
+                                         m);
+      copy_first_k_cols_shift_self<int64_t>
+        <<<num_blocks, TPB, 0, stream>>>(int64_indices.data(),
+                                         indices_d.data_handle(),
+                                         static_cast<size_t>(k),
+                                         build_params.graph_degree,
+                                         m);
+      break;
+    }
   }
 
   // convert from current knn's 64-bit to 32-bit.
@@ -329,55 +338,60 @@ void mutual_reachability_knn_l2(
   // `A type local to a function cannot be used in the template argument of the
   // enclosing parent function (and any parent classes) of an extended __device__
   // or __host__ __device__ lambda`
+  switch (build_algo) {
+    case Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN: {
+      auto epilogue = ReachabilityPostProcess<value_idx, value_t>{core_dists, alpha};
+      auto X_view   = raft::make_device_matrix_view(X, m, n);
+      std::vector<raft::device_matrix_view<const value_t, size_t>> index = {X_view};
+
+      raft::neighbors::brute_force::knn<value_idx, value_t>(
+        handle,
+        index,
+        X_view,
+        raft::make_device_matrix_view(out_inds, m, static_cast<size_t>(k)),
+        raft::make_device_matrix_view(out_dists, m, static_cast<size_t>(k)),
+        // TODO: expand distance metrics to support more than just L2 distance
+        // https://github.com/rapidsai/cuml/issues/5301
+        raft::distance::DistanceType::L2SqrtExpanded,
+        std::make_optional<float>(2.0f),
+        std::nullopt,
+        epilogue);
+      break;
+    }
 
-  if (build_algo == Common::GRAPH_BUILD_ALGO::BRUTE_FORCE_KNN) {
-    auto epilogue = ReachabilityPostProcess<value_idx, value_t>{core_dists, alpha};
-    auto X_view   = raft::make_device_matrix_view(X, m, n);
-    std::vector<raft::device_matrix_view<const value_t, size_t>> index = {X_view};
-
-    raft::neighbors::brute_force::knn<value_idx, value_t>(
-      handle,
-      index,
-      X_view,
-      raft::make_device_matrix_view(out_inds, m, static_cast<size_t>(k)),
-      raft::make_device_matrix_view(out_dists, m, static_cast<size_t>(k)),
-      // TODO: expand distance metrics to support more than just L2 distance
-      // https://github.com/rapidsai/cuml/issues/5301
-      raft::distance::DistanceType::L2SqrtExpanded,
-      std::make_optional<float>(2.0f),
-      std::nullopt,
-      epilogue);
-  } else {
-    RAFT_EXPECTS(static_cast<size_t>(k) <= build_params.graph_degree,
-                 "n_neighbors should be smaller than the graph degree computed by nn descent");
-
-    auto epilogue = ReachabilityPostProcessSqrt<value_idx, value_t>(core_dists, alpha);
-    build_params.return_distances = true;
-    auto graph = get_graph_nnd<value_idx, value_t>(handle, X, m, n, epilogue, build_params);
-
-    auto indices_d =
-      raft::make_device_matrix<value_idx, value_idx>(handle, m, build_params.graph_degree);
-
-    raft::copy(indices_d.data_handle(),
-               graph.graph().data_handle(),
-               m * build_params.graph_degree,
-               handle.get_stream());
-
-    RAFT_EXPECTS(graph.distances().has_value(),
-                 "return_distances for nn descent should be set to true to be used for HDBSCAN");
-
-    raft::matrix::slice_coordinates coords{static_cast<int64_t>(0),
-                                           static_cast<int64_t>(0),
-                                           static_cast<int64_t>(m),
-                                           static_cast<int64_t>(k)};
-
-    auto out_knn_dists_view = raft::make_device_matrix_view(out_dists, m, (size_t)k);
-    raft::matrix::slice<float, int64_t, raft::row_major>(
-      handle, raft::make_const_mdspan(graph.distances().value()), out_knn_dists_view, coords);
-    auto out_knn_indices_view =
-      raft::make_device_matrix_view<value_idx, int64_t>(out_inds, m, (size_t)k);
-    raft::matrix::slice<value_idx, int64_t, raft::row_major>(
-      handle, raft::make_const_mdspan(indices_d.view()), out_knn_indices_view, coords);
+    case Common::GRAPH_BUILD_ALGO::NN_DESCENT: {
+      RAFT_EXPECTS(static_cast<size_t>(k) <= build_params.graph_degree,
+                   "n_neighbors should be smaller than the graph degree computed by nn descent");
+
+      auto epilogue = ReachabilityPostProcessSqrt<value_idx, value_t>(core_dists, alpha);
+      build_params.return_distances = true;
+      auto graph = get_graph_nnd<value_idx, value_t>(handle, X, m, n, epilogue, build_params);
+
+      auto indices_d =
+        raft::make_device_matrix<value_idx, value_idx>(handle, m, build_params.graph_degree);
+
+      raft::copy(indices_d.data_handle(),
+                 graph.graph().data_handle(),
+                 m * build_params.graph_degree,
+                 handle.get_stream());
+
+      RAFT_EXPECTS(graph.distances().has_value(),
+                   "return_distances for nn descent should be set to true to be used for HDBSCAN");
+
+      raft::matrix::slice_coordinates coords{static_cast<int64_t>(0),
+                                             static_cast<int64_t>(0),
+                                             static_cast<int64_t>(m),
+                                             static_cast<int64_t>(k)};
+
+      auto out_knn_dists_view = raft::make_device_matrix_view(out_dists, m, (size_t)k);
+      raft::matrix::slice<float, int64_t, raft::row_major>(
+        handle, raft::make_const_mdspan(graph.distances().value()), out_knn_dists_view, coords);
+      auto out_knn_indices_view =
+        raft::make_device_matrix_view<value_idx, int64_t>(out_inds, m, (size_t)k);
+      raft::matrix::slice<value_idx, int64_t, raft::row_major>(
+        handle, raft::make_const_mdspan(indices_d.view()), out_knn_indices_view, coords);
+      break;
+    }
   }
 }