From f8d25c153d0fa51ed74921a4abd5465a43f33498 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 11 Apr 2024 16:28:18 +0200 Subject: [PATCH 01/37] Sync via barriers --- cpp/bench/ann/src/common/util.hpp | 8 +- .../src/raft/raft_ann_bench_param_parser.h | 7 + .../neighbors/detail/cagra/cagra_search.cuh | 7 +- .../cagra/search_single_cta_kernel-inl.cuh | 712 +++++++++++++++--- 4 files changed, 645 insertions(+), 89 deletions(-) diff --git a/cpp/bench/ann/src/common/util.hpp b/cpp/bench/ann/src/common/util.hpp index 6cdff316e9..0995f0841e 100644 --- a/cpp/bench/ann/src/common/util.hpp +++ b/cpp/bench/ann/src/common/util.hpp @@ -67,8 +67,9 @@ struct buf { switch (memory_type) { #ifndef BUILD_CPU_ONLY case MemoryType::Device: { - cudaMalloc(reinterpret_cast(&data), size * sizeof(T)); - cudaMemset(data, 0, size * sizeof(T)); + cudaMallocAsync(reinterpret_cast(&data), size * sizeof(T), cudaStreamPerThread); + cudaMemsetAsync(data, 0, size * sizeof(T), cudaStreamPerThread); + cudaStreamSynchronize(cudaStreamPerThread); } break; #endif default: { @@ -98,7 +99,8 @@ struct buf { #ifndef BUILD_CPU_ONLY if ((memory_type == MemoryType::Device && target_memory_type != MemoryType::Device) || (memory_type != MemoryType::Device && target_memory_type == MemoryType::Device)) { - cudaMemcpy(r.data, data, size * sizeof(T), cudaMemcpyDefault); + cudaMemcpyAsync(r.data, data, size * sizeof(T), cudaMemcpyDefault, cudaStreamPerThread); + cudaStreamSynchronize(cudaStreamPerThread); return r; } #endif diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h index 48bf1d70d8..bda7ba8d03 100644 --- a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h +++ b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h @@ -271,5 +271,12 @@ void parse_search_param(const nlohmann::json& conf, } // Same ratio as in IVF-PQ param.refine_ratio = conf.value("refine_ratio", 1.0f); + + uint64_t pmask = 0x8000000000000000LL; + if (conf.contains("persistent") && conf.at("persistent")) { + param.p.rand_xor_mask |= pmask; + } else { + param.p.rand_xor_mask &= ~pmask; + } } #endif diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh index 67fad2e46a..a942323b4e 100644 --- a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh @@ -105,6 +105,11 @@ void search_main_core( params.max_queries = std::min(queries.extent(0), deviceProp.maxGridSize[1]); } + uint64_t pmask = 0x8000000000000000LL; + bool is_persistent = params.rand_xor_mask & pmask; + rmm::device_uvector completion_latch(is_persistent ? 16 : 0, + raft::resource::get_cuda_stream(res)); + common::nvtx::range fun_scope( "cagra::search(max_queries = %u, k = %u, dim = %zu)", params.max_queries, @@ -135,7 +140,7 @@ void search_main_core( ? 
reinterpret_cast(plan->dev_seed.data()) + (plan->num_seeds * qid) : nullptr; - uint32_t* _num_executed_iterations = nullptr; + uint32_t* _num_executed_iterations = completion_latch.data(); (*plan)(res, dataset_desc, diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index e8104bd6f6..48cfc0d1e8 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -35,14 +35,24 @@ #include #include // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp +#include #include +#include +#include + +#include +#include + +#include #include #include +#include #include #include #include #include +#include #include namespace raft::neighbors::cagra::detail { @@ -464,7 +474,7 @@ template -__launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( +__device__ void search_core( typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr, // [num_queries, top_k] typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr, // [num_queries, top_k] const std::uint32_t top_k, @@ -486,6 +496,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( const std::uint32_t hash_bitlen, const std::uint32_t small_hash_bitlen, const std::uint32_t small_hash_reset_interval, + const std::uint32_t query_id, SAMPLE_FILTER_T sample_filter, raft::distance::DistanceType metric) { @@ -496,8 +507,6 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; using QUERY_T = typename DATASET_DESCRIPTOR_T::QUERY_T; - const auto query_id = blockIdx.y; - #ifdef _CLK_BREAKDOWN std::uint64_t clk_init = 0; std::uint64_t clk_compute_1st_distance = 0; @@ -816,53 +825,272 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( } template +__launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( + typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr, // [num_queries, top_k] + typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr, // [num_queries, top_k] + const std::uint32_t top_k, + DATASET_DESCRIPTOR_T dataset_desc, + const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr, // [num_queries, dataset_dim] + const typename DATASET_DESCRIPTOR_T::INDEX_T* const knn_graph, // [dataset_size, graph_degree] + const std::uint32_t graph_degree, + const unsigned num_distilation, + const uint64_t rand_xor_mask, + const typename DATASET_DESCRIPTOR_T::INDEX_T* seed_ptr, // [num_queries, num_seeds] + const uint32_t num_seeds, + typename DATASET_DESCRIPTOR_T::INDEX_T* const + visited_hashmap_ptr, // [num_queries, 1 << hash_bitlen] + const std::uint32_t internal_topk, + const std::uint32_t search_width, + const std::uint32_t min_iteration, + const std::uint32_t max_iteration, + std::uint32_t* const num_executed_iterations, // [num_queries] + const std::uint32_t hash_bitlen, + const std::uint32_t small_hash_bitlen, + const std::uint32_t small_hash_reset_interval, + SAMPLE_FILTER_T sample_filter, + raft::distance::DistanceType metric) +{ + const auto query_id = blockIdx.y; + search_core(result_indices_ptr, + result_distances_ptr, + top_k, + dataset_desc, + queries_ptr, + knn_graph, + graph_degree, + num_distilation, + rand_xor_mask, + seed_ptr, + num_seeds, + visited_hashmap_ptr, + internal_topk, + search_width, + min_iteration, + max_iteration, + num_executed_iterations, + hash_bitlen, + small_hash_bitlen, + 
small_hash_reset_interval, + query_id, + sample_filter, + metric); +} + +template +struct work_desc_t { + using index_type = typename DATASET_DESCRIPTOR_T::INDEX_T; + using distance_type = typename DATASET_DESCRIPTOR_T::DISTANCE_T; + using data_type = typename DATASET_DESCRIPTOR_T::DATA_T; + cuda::barrier input_barrier; + cuda::latch* completion_latch; + index_type* result_indices_ptr; // [num_queries, top_k] + distance_type* result_distances_ptr; // [num_queries, top_k] + const data_type* queries_ptr; // [num_queries, dataset_dim] + uint32_t query_id; // [0...num_queries - 1] + uint32_t top_k; +}; + +template +__launch_bounds__(1024, 1) RAFT_KERNEL search_kernel_p( + DATASET_DESCRIPTOR_T dataset_desc, + work_desc_t* work_descriptors, + const typename DATASET_DESCRIPTOR_T::INDEX_T* const knn_graph, // [dataset_size, graph_degree] + const std::uint32_t graph_degree, + const unsigned num_distilation, + const uint64_t rand_xor_mask, + const typename DATASET_DESCRIPTOR_T::INDEX_T* seed_ptr, // [num_queries, num_seeds] + const uint32_t num_seeds, + typename DATASET_DESCRIPTOR_T::INDEX_T* const + visited_hashmap_ptr, // [num_queries, 1 << hash_bitlen] + const std::uint32_t internal_topk, + const std::uint32_t search_width, + const std::uint32_t min_iteration, + const std::uint32_t max_iteration, + std::uint32_t* const num_executed_iterations, // [num_queries] + const std::uint32_t hash_bitlen, + const std::uint32_t small_hash_bitlen, + const std::uint32_t small_hash_reset_interval, + SAMPLE_FILTER_T sample_filter, + raft::distance::DistanceType metric) +{ + auto& work_descriptor = work_descriptors[blockIdx.y]; + auto& input_barrier = work_descriptor.input_barrier; + + cuda::barrier::arrival_token ready_to_read; + if (threadIdx.x == 0) { ready_to_read = input_barrier.arrive(); } + + while (true) { + // wait the writing phase + if (threadIdx.x == 0) { input_barrier.wait(std::move(ready_to_read)); } + __syncthreads(); + cuda::atomic_thread_fence(cuda::memory_order_acquire, cuda::thread_scope_system); + + // reading phase + auto* completion_latch = work_descriptor.completion_latch; + // empty input means terminate the persistent kernel. 
+ if (completion_latch == nullptr) { + if (threadIdx.x == 0) { input_barrier.arrive_and_drop(); } + break; + } + auto* result_indices_ptr = work_descriptor.result_indices_ptr; + auto* result_distances_ptr = work_descriptor.result_distances_ptr; + auto* queries_ptr = work_descriptor.queries_ptr; + auto query_id = work_descriptor.query_id; + auto top_k = work_descriptor.top_k; + + // arrive to mark the end of the reading phase + __syncthreads(); + if (threadIdx.x == 0) { ready_to_read = input_barrier.arrive(); } + + // work phase + search_core(result_indices_ptr, + result_distances_ptr, + top_k, + dataset_desc, + queries_ptr, + knn_graph, + graph_degree, + num_distilation, + rand_xor_mask, + seed_ptr, + num_seeds, + visited_hashmap_ptr, + internal_topk, + search_width, + min_iteration, + max_iteration, + num_executed_iterations, + hash_bitlen, + small_hash_bitlen, + small_hash_reset_interval, + query_id, + sample_filter, + metric); + + // arrive to mark the end of the work phase + __syncthreads(); + if (threadIdx.x == 0) { completion_latch->count_down(); } + } +} + +RAFT_KERNEL +register_completion_kernel(cuda::barrier* bar, + cuda::latch* latch, + uint32_t num_queries) +{ + if (threadIdx.x != 0 || bar == nullptr || latch == nullptr) { return; } + new (latch) cuda::latch(num_queries); + cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_device); + [[maybe_unused]] auto ready_to_read = bar->arrive(); + latch->wait(); +} + +template +auto dispatch_kernel = []() { + if constexpr (Persistent) { + return search_kernel_p; + } else { + return search_kernel; + } +}(); + +template struct search_kernel_config { - using kernel_t = decltype(&search_kernel); + using kernel_t = decltype(dispatch_kernel); template static auto choose_search_kernel(unsigned itopk_size) -> kernel_t { if (itopk_size <= 64) { - return search_kernel; + return dispatch_kernel; } else if (itopk_size <= 128) { - return search_kernel; + return dispatch_kernel; } else if (itopk_size <= 256) { - return search_kernel; + return dispatch_kernel; } else if (itopk_size <= 512) { - return search_kernel; + return dispatch_kernel; } THROW("No kernel for parametels itopk_size %u, max_candidates %u", itopk_size, MAX_CANDIDATES); } @@ -882,21 +1110,23 @@ struct search_kernel_config { // Radix-based topk is used constexpr unsigned max_candidates = 32; // to avoid build failure if (itopk_size <= 256) { - return search_kernel; + return dispatch_kernel; } else if (itopk_size <= 512) { - return search_kernel; + return dispatch_kernel; } } THROW("No kernel for parametels itopk_size %u, num_itopk_candidates %u", @@ -905,6 +1135,255 @@ struct search_kernel_config { } }; +inline void run_zombie(std::atomic>* last_touch); + +struct persistent_runner_base_t { + using work_queue_type = atomic_queue::AtomicQueue; + rmm::mr::managed_memory_resource work_descriptor_mr; + cudaStream_t stream; + work_queue_type queue; + persistent_runner_base_t(cudaStream_t stream) : stream(stream), queue() {} + virtual ~persistent_runner_base_t() noexcept = default; +}; + +template +struct persistent_runner_t : public persistent_runner_base_t { + using index_type = typename DATASET_DESCRIPTOR_T::INDEX_T; + using distance_type = typename DATASET_DESCRIPTOR_T::DISTANCE_T; + using data_type = typename DATASET_DESCRIPTOR_T::DATA_T; + using kernel_config_type = + search_kernel_config; + using kernel_type = typename kernel_config_type::kernel_t; + using work_desc_type = work_desc_t; + kernel_type kernel; + uint32_t block_size; + rmm::device_uvector 
work_descriptors; + std::atomic> last_touch; + + persistent_runner_t(DATASET_DESCRIPTOR_T dataset_desc, + raft::device_matrix_view graph, + uint32_t num_itopk_candidates, + uint32_t block_size, // + uint32_t smem_size, + int64_t hash_bitlen, + index_type* hashmap_ptr, + size_t small_hash_bitlen, + size_t small_hash_reset_interval, + uint32_t num_random_samplings, + uint64_t rand_xor_mask, + uint32_t num_seeds, + size_t itopk_size, + size_t search_width, + size_t min_iterations, + size_t max_iterations, + SAMPLE_FILTER_T sample_filter, + raft::distance::DistanceType metric, + cudaStream_t stream) + : persistent_runner_base_t{stream}, + kernel{kernel_config_type::choose_itopk_and_mx_candidates( + itopk_size, num_itopk_candidates, block_size)}, + block_size{block_size}, + work_descriptors(0, stream, work_descriptor_mr) + { + // set kernel attributes same as in normal kernel + RAFT_CUDA_TRY( + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); + // do the same for subsequently launched completion kernel to make sure + // its config is the same and it is loaded by the driver before this kernel is launched + RAFT_CUDA_TRY(cudaFuncSetAttribute( + register_completion_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); + register_completion_kernel<<<1, 1, 0, stream>>>(nullptr, nullptr, 1); + + // set kernel launch parameters + dim3 gs = calc_coop_grid_size(block_size, smem_size); + dim3 bs(block_size, 1, 1); + RAFT_LOG_DEBUG( + "Launching persistent kernel with %u threads, %u block %u smem", bs.x, gs.y, smem_size); + + // initialize the work queue + work_descriptors.resize(gs.y, stream); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + for (uint32_t i = 0; i < gs.y; i++) { + auto& wd = work_descriptors.data()[i]; + init(&wd.input_barrier, 3); + wd.completion_latch = nullptr; + wd.result_indices_ptr = nullptr; + wd.result_distances_ptr = nullptr; + wd.queries_ptr = nullptr; + wd.query_id = 0; + wd.top_k = 0; + queue.push(i); + } + + // launch the kernel + auto* work_descriptors_ptr = work_descriptors.data(); + auto* graph_ptr = graph.data_handle(); + uint32_t graph_degree = graph.extent(1); + uint32_t* num_executed_iterations = nullptr; // optional arg [num_queries] + const index_type* dev_seed_ptr = nullptr; // optional arg [num_queries, num_seeds] + + void* args[] = // NOLINT + {&dataset_desc, + &work_descriptors_ptr, + &graph_ptr, // [dataset_size, graph_degree] + &graph_degree, + &num_random_samplings, + &rand_xor_mask, + &dev_seed_ptr, + &num_seeds, + &hashmap_ptr, // visited_hashmap_ptr: [num_queries, 1 << hash_bitlen] + &itopk_size, + &search_width, + &min_iterations, + &max_iterations, + &num_executed_iterations, + &hash_bitlen, + &small_hash_bitlen, + &small_hash_reset_interval, + &sample_filter, + &metric}; + RAFT_CUDA_TRY(cudaLaunchCooperativeKernel>( + kernel, gs, bs, args, smem_size, stream)); + RAFT_LOG_INFO("Initialized the kernel in stream %zd, queue size = %u", + int64_t((cudaStream_t)stream), + queue.was_size()); + + std::thread(run_zombie, &last_touch).detach(); + } + + ~persistent_runner_t() noexcept override + { + RAFT_LOG_INFO("Gonna destroy the persistent runner."); + auto wds = work_descriptors.data(); + auto wdl = work_descriptors.size(); + uint32_t worker_id = 0; + while (queue.try_pop(worker_id)) { + auto& wd = wds[worker_id]; + wd.completion_latch = nullptr; + [[maybe_unused]] auto done = wd.input_barrier.arrive(2); + } + for (uint32_t i = 0; i < wdl; i++) { + auto& wd = wds[i]; + if (wd.completion_latch != nullptr) { + 
wd.completion_latch = nullptr; + [[maybe_unused]] auto done = wd.input_barrier.arrive(2); + } + } + RAFT_LOG_INFO("Destroyed the persistent runner."); + } + + void launch(index_type* result_indices_ptr, // [num_queries, top_k] + distance_type* result_distances_ptr, // [num_queries, top_k] + const data_type* queries_ptr, // [num_queries, dataset_dim] + uint32_t num_queries, + uint32_t top_k, + cuda::latch* completion_latch, + cudaStream_t local_stream) + { + // RAFT_LOG_INFO("Launch! queue size = %u, num_queries = %u, top_k = %u", + // queue.was_size(), + // num_queries, + // top_k); + void* args[] = {nullptr, &completion_latch, &num_queries}; // NOLINT + // RAFT_CUDA_TRY(cudaLaunchKernel( + // ®ister_completion_kernel, dim3(1, 1, 1), dim3(1, 1, 1), args, 0ul, local_stream)); + // register_completion_kernel<<<1, 1, 0, local_stream>>>(nullptr, completion_latch, + // num_queries); + // RAFT_LOG_INFO("Launched completion kernel %p", completion_latch); + // using latch_t = cuda::latch; + // static thread_local std::unique_ptr> + // completion_latch_store( + // [local_stream]() { + // latch_t* x = nullptr; + // cudaMallocAsync(&x, sizeof(latch_t), local_stream); + // return x; + // }(), + // [](latch_t* x) { cudaFreeAsync(x, local_stream); }); + // wait for all workers to finish + // DANGER: initialization of the latch can happen too late! + // std::vector + // completion_tokens(num_queries); + // RAFT_EXPECTS(num_queries == 1, "Single-query for now"); + for (uint32_t i = 0; i < num_queries; i++) { + auto worker_id = queue.pop(); + // RAFT_LOG_INFO("Submitting query %u (worker id = %u)", i, worker_id); + auto& wd = work_descriptors.data()[worker_id]; + if (i == 0) { + auto* bar = &wd.input_barrier; + args[0] = &bar; + RAFT_CUDA_TRY(cudaLaunchKernel( + ®ister_completion_kernel, dim3(1, 1, 1), dim3(1, 1, 1), args, 0ul, local_stream)); + // register_completion_kernel<<<1, 1, 0, local_stream>>>( + // &wd.input_barrier, completion_latch, num_queries); + // RAFT_LOG_INFO("Launched completion kernel %p", completion_latch); + // RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + + wd.completion_latch = completion_latch; + wd.result_indices_ptr = result_indices_ptr; + wd.result_distances_ptr = result_distances_ptr; + wd.queries_ptr = queries_ptr; + wd.query_id = i; + wd.top_k = top_k; + // RAFT_LOG_INFO("Wrote input %p", completion_latch); + cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_system); + // RAFT_LOG_INFO("Issued a fence %p", completion_latch); + // RAFT_CUDA_TRY(cudaStreamSynchronize(local_stream)); + // RAFT_LOG_INFO("Finished the completion kernel!"); + // wait till the device threads finish the reading + // wd.input_barrier.arrive_and_wait(); + wd.input_barrier.wait(wd.input_barrier.arrive((i == 0) ? 1 : 2)); + // RAFT_LOG_INFO("Got the barrier"); + // then release the worker id + queue.push(worker_id); + // RAFT_LOG_INFO("Returned worker %u", worker_id); + } + // latch->arrive_and_wait(); + // RAFT_LOG_INFO("Launch done!"); + last_touch.store(std::chrono::system_clock::now()); + } + + auto calc_coop_grid_size(uint32_t block_size, uint32_t smem_size) -> dim3 + { + // We may need to run other kernels alongside this persistent kernel. + // Leave a few SMs idle. 
+ constexpr double kDeviceUsage = 0.8; + + // determine the grid size + int ctas_per_sm = 1; + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &ctas_per_sm, kernel, block_size, smem_size); + int num_sm = getMultiProcessorCount() - 1; + + return {1, uint32_t(kDeviceUsage * (ctas_per_sm * num_sm)), 1}; + // return {1, uint32_t(getMultiProcessorCount() - 8), 1}; + } +}; + +struct non_blocking_stream { + non_blocking_stream() { cudaStreamCreateWithFlags(&value, cudaStreamNonBlocking); } + ~non_blocking_stream() noexcept { cudaStreamDestroy(value); } + cudaStream_t value; +}; + +inline std::unique_ptr persistent_stream; +inline std::shared_ptr persistent_runner{nullptr}; +inline std::mutex persistent_lock; + +inline void run_zombie(std::atomic>* last_touch) +{ + constexpr auto kInterval = std::chrono::milliseconds(500); + last_touch->store(std::chrono::system_clock::now()); + while (last_touch->load() + kInterval >= std::chrono::system_clock::now()) { + std::this_thread::sleep_for(kInterval); + } + std::lock_guard guard(persistent_lock); + persistent_runner.reset(); +} + template :: - choose_itopk_and_mx_candidates(itopk_size, num_itopk_candidates, block_size); - RAFT_CUDA_TRY(cudaFuncSetAttribute(kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - smem_size + DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte)); - dim3 thread_dims(block_size, 1, 1); - dim3 block_dims(1, num_queries, 1); - RAFT_LOG_DEBUG( - "Launching kernel with %u threads, %u block %u smem", block_size, num_queries, smem_size); - kernel<<>>(topk_indices_ptr, - topk_distances_ptr, - topk, - dataset_desc, - queries_ptr, - graph.data_handle(), - graph.extent(1), - num_random_samplings, - rand_xor_mask, - dev_seed_ptr, - num_seeds, - hashmap_ptr, - itopk_size, - search_width, - min_iterations, - max_iterations, - num_executed_iterations, - hash_bitlen, - small_hash_bitlen, - small_hash_reset_interval, - sample_filter, - metric); - RAFT_CUDA_TRY(cudaPeekAtLastError()); + // hack: pass the 'is_persistent' flag in the highest bit of the `rand_xor_mask` + // to avoid changing the signature of `select_and_run` and updating all its + // instantiations... + uint64_t pmask = 0x8000000000000000LL; + bool is_persistent = rand_xor_mask & pmask; + rand_xor_mask &= ~pmask; + if (is_persistent) { + using runner_type = persistent_runner_t; + // This is used to keep the object alive if `persistent_runner` gets reset. 
+ std::shared_ptr runner_local_copy; + runner_type* runner = nullptr; + { + std::lock_guard guard(persistent_lock); + runner = dynamic_cast(persistent_runner.get()); + if (runner == nullptr) { + // Free the resources (if any) in advance + persistent_runner = std::shared_ptr(); + // Lazy-create a stream, which is going to be used by all runners till the program exists + if (!persistent_stream) { persistent_stream = std::make_unique(); } + // Create a new runner + runner = new runner_type(dataset_desc, + graph, + num_itopk_candidates, + block_size, + smem_size, + hash_bitlen, + hashmap_ptr, + small_hash_bitlen, + small_hash_reset_interval, + num_random_samplings, + rand_xor_mask, + num_seeds, + itopk_size, + search_width, + min_iterations, + max_iterations, + sample_filter, + metric, + persistent_stream->value); + persistent_runner.reset(runner); + } + runner_local_copy = persistent_runner; + } + auto* completion_latch = + reinterpret_cast*>(num_executed_iterations); + runner->launch(topk_indices_ptr, + topk_distances_ptr, + queries_ptr, + num_queries, + topk, + completion_latch, + stream); + } else { + auto kernel = + search_kernel_config::choose_itopk_and_mx_candidates(itopk_size, + num_itopk_candidates, + block_size); + RAFT_CUDA_TRY(cudaFuncSetAttribute(kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size + DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte)); + dim3 thread_dims(block_size, 1, 1); + dim3 block_dims(1, num_queries, 1); + RAFT_LOG_DEBUG( + "Launching kernel with %u threads, %u block %u smem", block_size, num_queries, smem_size); + kernel<<>>(topk_indices_ptr, + topk_distances_ptr, + topk, + dataset_desc, + queries_ptr, + graph.data_handle(), + graph.extent(1), + num_random_samplings, + rand_xor_mask, + dev_seed_ptr, + num_seeds, + hashmap_ptr, + itopk_size, + search_width, + min_iterations, + max_iterations, + num_executed_iterations, + hash_bitlen, + small_hash_bitlen, + small_hash_reset_interval, + sample_filter, + metric); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } } } // namespace single_cta_search } // namespace raft::neighbors::cagra::detail From cd012c4242b3f4a126b75cd183aaa791dadfa62a Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 16 Apr 2024 09:44:52 +0200 Subject: [PATCH 02/37] Waiting on CPU side. The host fills in the work_descriptors, which is in the pinned memory and then arrives at the input barriers (managed memory, on device) to mark that the descriptors are ready to read. Then it waits on the comnpletion latch (managed memory, on host). The device reads the descriptors when the readiness barriers allows that. The descriptors are read by multiple threads at the same time (hoping for a single coalesced read). 
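
A minimal, self-contained sketch of this handshake (not the actual RAFT code; it assumes a device with concurrentManagedAccess, and the names demo_work_t / worker_kernel are illustrative only): the host fills a descriptor in pinned memory, arrives at a system-scope barrier, and waits on a system-scope latch; the device waits on the barrier, reads the descriptor, and counts the latch down when done.

    #include <cuda/atomic>
    #include <cuda/barrier>
    #include <cuda/latch>
    #include <cuda_runtime.h>
    #include <new>

    using sys_barrier = cuda::barrier<cuda::thread_scope_system>;
    using sys_latch   = cuda::latch<cuda::thread_scope_system>;

    struct demo_work_t {    // stands in for work_desc_t; lives in pinned host memory
      float* result;
      sys_latch* done;
    };

    __global__ void worker_kernel(sys_barrier* ready, demo_work_t* desc)
    {
      // The block leader waits until the host has filled the descriptor.
      if (threadIdx.x == 0) { ready->arrive_and_wait(); }
      __syncthreads();
      demo_work_t work = *desc;     // every thread reads the same small struct
      if (threadIdx.x == 0) {
        *work.result = 42.0f;       // placeholder for the actual search work
        work.done->count_down();    // wake the CPU thread waiting on the latch
      }
    }

    int main()
    {
      sys_barrier* ready;  sys_latch* done;  demo_work_t* desc;  float* result;
      cudaMallocManaged(reinterpret_cast<void**>(&ready), sizeof(sys_barrier));
      cudaMallocManaged(reinterpret_cast<void**>(&done), sizeof(sys_latch));
      cudaMallocManaged(reinterpret_cast<void**>(&result), sizeof(float));
      cudaMallocHost(reinterpret_cast<void**>(&desc), sizeof(demo_work_t));  // pinned: CPU writes, GPU reads
      init(ready, 2);               // one host arrival + one device (leader) arrival per phase
      new (done) sys_latch(1);

      worker_kernel<<<1, 128>>>(ready, desc);   // the "persistent" worker (single pass here)

      desc->result = result;                    // 1. fill the work descriptor
      desc->done   = done;
      cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_system);
      ready->arrive_and_wait();                 // 2. mark it ready to read
      done->wait();                             // 3. block until the device counts down
      cudaDeviceSynchronize();
      return 0;
    }
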
--- .../cagra/search_single_cta_kernel-inl.cuh | 171 ++++++++++-------- 1 file changed, 91 insertions(+), 80 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 48cfc0d1e8..912bba5249 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -893,8 +893,7 @@ struct work_desc_t { using index_type = typename DATASET_DESCRIPTOR_T::INDEX_T; using distance_type = typename DATASET_DESCRIPTOR_T::DISTANCE_T; using data_type = typename DATASET_DESCRIPTOR_T::DATA_T; - cuda::barrier input_barrier; - cuda::latch* completion_latch; + cuda::latch* completion_latch; index_type* result_indices_ptr; // [num_queries, top_k] distance_type* result_distances_ptr; // [num_queries, top_k] const data_type* queries_ptr; // [num_queries, dataset_dim] @@ -911,6 +910,7 @@ template __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel_p( DATASET_DESCRIPTOR_T dataset_desc, + cuda::barrier* input_barriers, work_desc_t* work_descriptors, const typename DATASET_DESCRIPTOR_T::INDEX_T* const knn_graph, // [dataset_size, graph_degree] const std::uint32_t graph_degree, @@ -931,8 +931,16 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel_p( SAMPLE_FILTER_T sample_filter, raft::distance::DistanceType metric) { - auto& work_descriptor = work_descriptors[blockIdx.y]; - auto& input_barrier = work_descriptor.input_barrier; + auto& input_barrier = input_barriers[blockIdx.y]; + + using work_desc_type = work_desc_t; + using blob_elem = uint32_t; + constexpr auto kBlobSize = raft::div_rounding_up_safe(sizeof(work_desc_type), sizeof(blob_elem)); + static_assert(kBlobSize * sizeof(blob_elem) == sizeof(work_desc_type)); + __shared__ union { + work_desc_type value; + blob_elem blob[kBlobSize]; + } work_descriptor; cuda::barrier::arrival_token ready_to_read; if (threadIdx.x == 0) { ready_to_read = input_barrier.arrive(); } @@ -943,18 +951,23 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel_p( __syncthreads(); cuda::atomic_thread_fence(cuda::memory_order_acquire, cuda::thread_scope_system); + for (auto i = threadIdx.x; i < kBlobSize; i += blockDim.x) { + work_descriptor.blob[i] = reinterpret_cast(work_descriptors + blockIdx.y)[i]; + } + __syncthreads(); + // reading phase - auto* completion_latch = work_descriptor.completion_latch; + auto* completion_latch = work_descriptor.value.completion_latch; // empty input means terminate the persistent kernel. 
if (completion_latch == nullptr) { if (threadIdx.x == 0) { input_barrier.arrive_and_drop(); } break; } - auto* result_indices_ptr = work_descriptor.result_indices_ptr; - auto* result_distances_ptr = work_descriptor.result_distances_ptr; - auto* queries_ptr = work_descriptor.queries_ptr; - auto query_id = work_descriptor.query_id; - auto top_k = work_descriptor.top_k; + auto* result_indices_ptr = work_descriptor.value.result_indices_ptr; + auto* result_distances_ptr = work_descriptor.value.result_distances_ptr; + auto* queries_ptr = work_descriptor.value.queries_ptr; + auto query_id = work_descriptor.value.query_id; + auto top_k = work_descriptor.value.top_k; // arrive to mark the end of the reading phase __syncthreads(); @@ -997,17 +1010,17 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel_p( } } -RAFT_KERNEL -register_completion_kernel(cuda::barrier* bar, - cuda::latch* latch, - uint32_t num_queries) -{ - if (threadIdx.x != 0 || bar == nullptr || latch == nullptr) { return; } - new (latch) cuda::latch(num_queries); - cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_device); - [[maybe_unused]] auto ready_to_read = bar->arrive(); - latch->wait(); -} +// RAFT_KERNEL +// register_completion_kernel(cuda::barrier* bar, +// cuda::latch* latch, +// uint32_t num_queries) +// { +// if (threadIdx.x != 0 || bar == nullptr || latch == nullptr) { return; } +// new (latch) cuda::latch(num_queries); +// cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_device); +// [[maybe_unused]] auto ready_to_read = bar->arrive(); +// latch->wait(); +// } template ; - rmm::mr::managed_memory_resource work_descriptor_mr; + rmm::mr::managed_memory_resource input_barriers_mr; + rmm::mr::pinned_host_memory_resource work_descriptor_mr; cudaStream_t stream; work_queue_type queue; persistent_runner_base_t(cudaStream_t stream) : stream(stream), queue() {} @@ -1160,6 +1174,7 @@ struct persistent_runner_t : public persistent_runner_base_t { using work_desc_type = work_desc_t; kernel_type kernel; uint32_t block_size; + rmm::device_uvector> input_barriers; rmm::device_uvector work_descriptors; std::atomic> last_touch; @@ -1186,16 +1201,19 @@ struct persistent_runner_t : public persistent_runner_base_t { kernel{kernel_config_type::choose_itopk_and_mx_candidates( itopk_size, num_itopk_candidates, block_size)}, block_size{block_size}, + input_barriers(0, stream, input_barriers_mr), work_descriptors(0, stream, work_descriptor_mr) { + int gpu_dev; + RAFT_CUDA_TRY(cudaGetDevice(&gpu_dev)); // set kernel attributes same as in normal kernel RAFT_CUDA_TRY( cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); - // do the same for subsequently launched completion kernel to make sure - // its config is the same and it is loaded by the driver before this kernel is launched - RAFT_CUDA_TRY(cudaFuncSetAttribute( - register_completion_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); - register_completion_kernel<<<1, 1, 0, stream>>>(nullptr, nullptr, 1); + // // do the same for subsequently launched completion kernel to make sure + // // its config is the same and it is loaded by the driver before this kernel is launched + // RAFT_CUDA_TRY(cudaFuncSetAttribute( + // register_completion_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); + // register_completion_kernel<<<1, 1, 0, stream>>>(nullptr, nullptr, 1); // set kernel launch parameters dim3 gs = calc_coop_grid_size(block_size, smem_size); @@ -1205,10 +1223,31 @@ struct 
persistent_runner_t : public persistent_runner_base_t { // initialize the work queue work_descriptors.resize(gs.y, stream); + auto* work_descriptors_ptr = work_descriptors.data(); + // { + // auto allocation_size = work_descriptors.size() * sizeof(work_desc_type); + // RAFT_CUDA_TRY(cudaMemAdvise( + // work_descriptors_ptr, allocation_size, cudaMemAdviseSetPreferredLocation, + // cudaCpuDeviceId)); + // RAFT_CUDA_TRY( + // cudaMemAdvise(work_descriptors_ptr, allocation_size, cudaMemAdviseSetAccessedBy, + // gpu_dev)); + // } + + input_barriers.resize(gs.y, stream); + auto* input_barriers_ptr = input_barriers.data(); + { + auto allocation_size = + input_barriers.size() * sizeof(cuda::barrier); + RAFT_CUDA_TRY(cudaMemAdvise( + input_barriers_ptr, allocation_size, cudaMemAdviseSetPreferredLocation, gpu_dev)); + RAFT_CUDA_TRY(cudaMemAdvise( + input_barriers_ptr, allocation_size, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId)); + } RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); for (uint32_t i = 0; i < gs.y; i++) { - auto& wd = work_descriptors.data()[i]; - init(&wd.input_barrier, 3); + auto& wd = work_descriptors_ptr[i]; + init(input_barriers_ptr + i, 2); wd.completion_latch = nullptr; wd.result_indices_ptr = nullptr; wd.result_distances_ptr = nullptr; @@ -1219,7 +1258,6 @@ struct persistent_runner_t : public persistent_runner_base_t { } // launch the kernel - auto* work_descriptors_ptr = work_descriptors.data(); auto* graph_ptr = graph.data_handle(); uint32_t graph_degree = graph.extent(1); uint32_t* num_executed_iterations = nullptr; // optional arg [num_queries] @@ -1227,6 +1265,7 @@ struct persistent_runner_t : public persistent_runner_base_t { void* args[] = // NOLINT {&dataset_desc, + &input_barriers_ptr, &work_descriptors_ptr, &graph_ptr, // [dataset_size, graph_degree] &graph_degree, @@ -1256,20 +1295,22 @@ struct persistent_runner_t : public persistent_runner_base_t { ~persistent_runner_t() noexcept override { - RAFT_LOG_INFO("Gonna destroy the persistent runner."); + auto ibs = input_barriers.data(); auto wds = work_descriptors.data(); auto wdl = work_descriptors.size(); uint32_t worker_id = 0; + // wait for all the jobs to finish nicely while (queue.try_pop(worker_id)) { auto& wd = wds[worker_id]; wd.completion_latch = nullptr; - [[maybe_unused]] auto done = wd.input_barrier.arrive(2); + [[maybe_unused]] auto done = ibs[worker_id].arrive(); } + // try to kill stuck threads if any for (uint32_t i = 0; i < wdl; i++) { auto& wd = wds[i]; if (wd.completion_latch != nullptr) { wd.completion_latch = nullptr; - [[maybe_unused]] auto done = wd.input_barrier.arrive(2); + [[maybe_unused]] auto done = ibs[i].arrive(); } } RAFT_LOG_INFO("Destroyed the persistent runner."); @@ -1280,68 +1321,38 @@ struct persistent_runner_t : public persistent_runner_base_t { const data_type* queries_ptr, // [num_queries, dataset_dim] uint32_t num_queries, uint32_t top_k, - cuda::latch* completion_latch, + cuda::latch* _completion_latch, cudaStream_t local_stream) { - // RAFT_LOG_INFO("Launch! 
queue size = %u, num_queries = %u, top_k = %u", - // queue.was_size(), - // num_queries, - // top_k); - void* args[] = {nullptr, &completion_latch, &num_queries}; // NOLINT - // RAFT_CUDA_TRY(cudaLaunchKernel( - // ®ister_completion_kernel, dim3(1, 1, 1), dim3(1, 1, 1), args, 0ul, local_stream)); - // register_completion_kernel<<<1, 1, 0, local_stream>>>(nullptr, completion_latch, - // num_queries); - // RAFT_LOG_INFO("Launched completion kernel %p", completion_latch); - // using latch_t = cuda::latch; - // static thread_local std::unique_ptr> - // completion_latch_store( - // [local_stream]() { - // latch_t* x = nullptr; - // cudaMallocAsync(&x, sizeof(latch_t), local_stream); - // return x; - // }(), - // [](latch_t* x) { cudaFreeAsync(x, local_stream); }); + using latch_t = cuda::latch; + static thread_local std::unique_ptr> + completion_latch_store( + []() { + latch_t* x = nullptr; + cudaMallocManaged(&x, sizeof(latch_t)); + RAFT_CUDA_TRY( + cudaMemAdvise(x, sizeof(latch_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId)); + RAFT_CUDA_TRY(cudaMemAdvise(x, sizeof(latch_t), cudaMemAdviseSetAccessedBy, 0)); + return x; + }(), + [](latch_t* x) { cudaFree(x); }); // wait for all workers to finish - // DANGER: initialization of the latch can happen too late! - // std::vector - // completion_tokens(num_queries); - // RAFT_EXPECTS(num_queries == 1, "Single-query for now"); + auto* completion_latch = new (completion_latch_store.get()) latch_t{num_queries}; for (uint32_t i = 0; i < num_queries; i++) { auto worker_id = queue.pop(); // RAFT_LOG_INFO("Submitting query %u (worker id = %u)", i, worker_id); - auto& wd = work_descriptors.data()[worker_id]; - if (i == 0) { - auto* bar = &wd.input_barrier; - args[0] = &bar; - RAFT_CUDA_TRY(cudaLaunchKernel( - ®ister_completion_kernel, dim3(1, 1, 1), dim3(1, 1, 1), args, 0ul, local_stream)); - // register_completion_kernel<<<1, 1, 0, local_stream>>>( - // &wd.input_barrier, completion_latch, num_queries); - // RAFT_LOG_INFO("Launched completion kernel %p", completion_latch); - // RAFT_CUDA_TRY(cudaPeekAtLastError()); - } - + auto& wd = work_descriptors.data()[worker_id]; wd.completion_latch = completion_latch; wd.result_indices_ptr = result_indices_ptr; wd.result_distances_ptr = result_distances_ptr; wd.queries_ptr = queries_ptr; wd.query_id = i; wd.top_k = top_k; - // RAFT_LOG_INFO("Wrote input %p", completion_latch); cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_system); - // RAFT_LOG_INFO("Issued a fence %p", completion_latch); - // RAFT_CUDA_TRY(cudaStreamSynchronize(local_stream)); - // RAFT_LOG_INFO("Finished the completion kernel!"); - // wait till the device threads finish the reading - // wd.input_barrier.arrive_and_wait(); - wd.input_barrier.wait(wd.input_barrier.arrive((i == 0) ? 1 : 2)); - // RAFT_LOG_INFO("Got the barrier"); - // then release the worker id + input_barriers.data()[worker_id].arrive_and_wait(); queue.push(worker_id); - // RAFT_LOG_INFO("Returned worker %u", worker_id); } - // latch->arrive_and_wait(); + completion_latch->wait(); // RAFT_LOG_INFO("Launch done!"); last_touch.store(std::chrono::system_clock::now()); } From a1a091c860e1df23a72622ea45b846cc29496a08 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 16 Apr 2024 17:15:42 +0200 Subject: [PATCH 03/37] Use simple atomics for synchronization. 
Minimize the host<->device latencies by using host pinned memory and device memory for intra-device comm --- .../neighbors/detail/cagra/cagra_search.cuh | 7 +- .../cagra/search_single_cta_kernel-inl.cuh | 209 ++++++++---------- 2 files changed, 90 insertions(+), 126 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh index a942323b4e..67fad2e46a 100644 --- a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh @@ -105,11 +105,6 @@ void search_main_core( params.max_queries = std::min(queries.extent(0), deviceProp.maxGridSize[1]); } - uint64_t pmask = 0x8000000000000000LL; - bool is_persistent = params.rand_xor_mask & pmask; - rmm::device_uvector completion_latch(is_persistent ? 16 : 0, - raft::resource::get_cuda_stream(res)); - common::nvtx::range fun_scope( "cagra::search(max_queries = %u, k = %u, dim = %zu)", params.max_queries, @@ -140,7 +135,7 @@ void search_main_core( ? reinterpret_cast(plan->dev_seed.data()) + (plan->num_seeds * qid) : nullptr; - uint32_t* _num_executed_iterations = completion_latch.data(); + uint32_t* _num_executed_iterations = nullptr; (*plan)(res, dataset_desc, diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 912bba5249..7548963c0b 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -37,10 +37,10 @@ #include #include -#include +#include #include -#include +#include #include #include @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -893,14 +894,24 @@ struct work_desc_t { using index_type = typename DATASET_DESCRIPTOR_T::INDEX_T; using distance_type = typename DATASET_DESCRIPTOR_T::DISTANCE_T; using data_type = typename DATASET_DESCRIPTOR_T::DATA_T; - cuda::latch* completion_latch; index_type* result_indices_ptr; // [num_queries, top_k] distance_type* result_distances_ptr; // [num_queries, top_k] const data_type* queries_ptr; // [num_queries, dataset_dim] - uint32_t query_id; // [0...num_queries - 1] uint32_t top_k; + uint32_t n_queries; // also used as a completion indicator: last thread sets it to zero }; +using work_handle_t = cuda::atomic; +union work_handle_view_t { + uint64_t handle; + struct value_t { + uint32_t desc_id; + uint32_t query_id; + } value; +}; +constexpr uint64_t kWaitForWork = std::numeric_limits::max(); +constexpr uint64_t kNoMoreWork = kWaitForWork - 1; + template __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel_p( DATASET_DESCRIPTOR_T dataset_desc, - cuda::barrier* input_barriers, + work_handle_t* work_handles, work_desc_t* work_descriptors, + uint32_t* completion_counters, const typename DATASET_DESCRIPTOR_T::INDEX_T* const knn_graph, // [dataset_size, graph_degree] const std::uint32_t graph_degree, const unsigned num_distilation, @@ -931,7 +943,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel_p( SAMPLE_FILTER_T sample_filter, raft::distance::DistanceType metric) { - auto& input_barrier = input_barriers[blockIdx.y]; + auto& work_handle = work_handles[blockIdx.y]; using work_desc_type = work_desc_t; using blob_elem = uint32_t; @@ -942,36 +954,32 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel_p( blob_elem blob[kBlobSize]; } work_descriptor; - cuda::barrier::arrival_token ready_to_read; - if (threadIdx.x 
== 0) { ready_to_read = input_barrier.arrive(); } + __shared__ work_handle_view_t work_index; while (true) { // wait the writing phase - if (threadIdx.x == 0) { input_barrier.wait(std::move(ready_to_read)); } + if (threadIdx.x == 0) { + do { + work_index.handle = work_handle.load(cuda::memory_order_acquire); + } while (work_index.handle == kWaitForWork); + } __syncthreads(); - cuda::atomic_thread_fence(cuda::memory_order_acquire, cuda::thread_scope_system); + if (work_index.handle == kNoMoreWork) { break; } + auto work_ix = work_index.value.desc_id; for (auto i = threadIdx.x; i < kBlobSize; i += blockDim.x) { - work_descriptor.blob[i] = reinterpret_cast(work_descriptors + blockIdx.y)[i]; + work_descriptor.blob[i] = reinterpret_cast(work_descriptors + work_ix)[i]; } __syncthreads(); + if (threadIdx.x == 0) { work_handle.store(kWaitForWork, cuda::memory_order_relaxed); } // reading phase - auto* completion_latch = work_descriptor.value.completion_latch; - // empty input means terminate the persistent kernel. - if (completion_latch == nullptr) { - if (threadIdx.x == 0) { input_barrier.arrive_and_drop(); } - break; - } auto* result_indices_ptr = work_descriptor.value.result_indices_ptr; auto* result_distances_ptr = work_descriptor.value.result_distances_ptr; auto* queries_ptr = work_descriptor.value.queries_ptr; - auto query_id = work_descriptor.value.query_id; auto top_k = work_descriptor.value.top_k; - - // arrive to mark the end of the reading phase - __syncthreads(); - if (threadIdx.x == 0) { ready_to_read = input_barrier.arrive(); } + auto n_queries = work_descriptor.value.n_queries; + auto query_id = work_index.value.query_id; // work phase search_corecount_down(); } + if (threadIdx.x == 0) { + auto completed_count = atomicInc(completion_counters + work_ix, n_queries - 1) + 1; + if (completed_count >= n_queries) { + reinterpret_cast*>( + &work_descriptors[work_ix].n_queries) + ->store(0, cuda::memory_order_relaxed); + } + } } } -// RAFT_KERNEL -// register_completion_kernel(cuda::barrier* bar, -// cuda::latch* latch, -// uint32_t num_queries) -// { -// if (threadIdx.x != 0 || bar == nullptr || latch == nullptr) { return; } -// new (latch) cuda::latch(num_queries); -// cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_device); -// [[maybe_unused]] auto ready_to_read = bar->arrive(); -// latch->wait(); -// } - template ; - rmm::mr::managed_memory_resource input_barriers_mr; + rmm::mr::pinned_host_memory_resource work_handles_mr; rmm::mr::pinned_host_memory_resource work_descriptor_mr; + rmm::mr::cuda_memory_resource completion_counters_mr; cudaStream_t stream; work_queue_type queue; persistent_runner_base_t(cudaStream_t stream) : stream(stream), queue() {} @@ -1174,8 +1178,9 @@ struct persistent_runner_t : public persistent_runner_base_t { using work_desc_type = work_desc_t; kernel_type kernel; uint32_t block_size; - rmm::device_uvector> input_barriers; + rmm::device_uvector work_handles; rmm::device_uvector work_descriptors; + rmm::device_uvector completion_counters; std::atomic> last_touch; persistent_runner_t(DATASET_DESCRIPTOR_T dataset_desc, @@ -1201,19 +1206,13 @@ struct persistent_runner_t : public persistent_runner_base_t { kernel{kernel_config_type::choose_itopk_and_mx_candidates( itopk_size, num_itopk_candidates, block_size)}, block_size{block_size}, - input_barriers(0, stream, input_barriers_mr), - work_descriptors(0, stream, work_descriptor_mr) + work_handles(0, stream, work_handles_mr), + work_descriptors(0, stream, work_descriptor_mr), + 
completion_counters(0, stream, completion_counters_mr) { - int gpu_dev; - RAFT_CUDA_TRY(cudaGetDevice(&gpu_dev)); // set kernel attributes same as in normal kernel RAFT_CUDA_TRY( cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); - // // do the same for subsequently launched completion kernel to make sure - // // its config is the same and it is loaded by the driver before this kernel is launched - // RAFT_CUDA_TRY(cudaFuncSetAttribute( - // register_completion_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); - // register_completion_kernel<<<1, 1, 0, stream>>>(nullptr, nullptr, 1); // set kernel launch parameters dim3 gs = calc_coop_grid_size(block_size, smem_size); @@ -1222,38 +1221,22 @@ struct persistent_runner_t : public persistent_runner_base_t { "Launching persistent kernel with %u threads, %u block %u smem", bs.x, gs.y, smem_size); // initialize the work queue + completion_counters.resize(gs.y, stream); + auto* completion_counters_ptr = completion_counters.data(); work_descriptors.resize(gs.y, stream); auto* work_descriptors_ptr = work_descriptors.data(); - // { - // auto allocation_size = work_descriptors.size() * sizeof(work_desc_type); - // RAFT_CUDA_TRY(cudaMemAdvise( - // work_descriptors_ptr, allocation_size, cudaMemAdviseSetPreferredLocation, - // cudaCpuDeviceId)); - // RAFT_CUDA_TRY( - // cudaMemAdvise(work_descriptors_ptr, allocation_size, cudaMemAdviseSetAccessedBy, - // gpu_dev)); - // } - input_barriers.resize(gs.y, stream); - auto* input_barriers_ptr = input_barriers.data(); - { - auto allocation_size = - input_barriers.size() * sizeof(cuda::barrier); - RAFT_CUDA_TRY(cudaMemAdvise( - input_barriers_ptr, allocation_size, cudaMemAdviseSetPreferredLocation, gpu_dev)); - RAFT_CUDA_TRY(cudaMemAdvise( - input_barriers_ptr, allocation_size, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId)); - } + work_handles.resize(gs.y, stream); + auto* work_handles_ptr = work_handles.data(); RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); for (uint32_t i = 0; i < gs.y; i++) { - auto& wd = work_descriptors_ptr[i]; - init(input_barriers_ptr + i, 2); - wd.completion_latch = nullptr; + auto& wd = work_descriptors_ptr[i]; wd.result_indices_ptr = nullptr; wd.result_distances_ptr = nullptr; wd.queries_ptr = nullptr; - wd.query_id = 0; wd.top_k = 0; + wd.n_queries = 0; + work_handles_ptr[i].store(kWaitForWork); queue.push(i); } @@ -1265,8 +1248,9 @@ struct persistent_runner_t : public persistent_runner_base_t { void* args[] = // NOLINT {&dataset_desc, - &input_barriers_ptr, + &work_handles_ptr, &work_descriptors_ptr, + &completion_counters_ptr, &graph_ptr, // [dataset_size, graph_degree] &graph_degree, &num_random_samplings, @@ -1295,22 +1279,19 @@ struct persistent_runner_t : public persistent_runner_base_t { ~persistent_runner_t() noexcept override { - auto ibs = input_barriers.data(); - auto wds = work_descriptors.data(); - auto wdl = work_descriptors.size(); + auto whs = work_handles.data(); + auto whl = work_handles.size(); uint32_t worker_id = 0; + auto count = whl; // wait for all the jobs to finish nicely while (queue.try_pop(worker_id)) { - auto& wd = wds[worker_id]; - wd.completion_latch = nullptr; - [[maybe_unused]] auto done = ibs[worker_id].arrive(); + whs[worker_id].store(kNoMoreWork, cuda::memory_order_relaxed); + count--; } - // try to kill stuck threads if any - for (uint32_t i = 0; i < wdl; i++) { - auto& wd = wds[i]; - if (wd.completion_latch != nullptr) { - wd.completion_latch = nullptr; - [[maybe_unused]] auto done = 
ibs[i].arrive(); + if (count > 0) { + // try to kill stuck threads if any + for (uint32_t i = 0; i < whl; i++) { + whs[i].store(kNoMoreWork, cuda::memory_order_relaxed); } } RAFT_LOG_INFO("Destroyed the persistent runner."); @@ -1320,40 +1301,36 @@ struct persistent_runner_t : public persistent_runner_base_t { distance_type* result_distances_ptr, // [num_queries, top_k] const data_type* queries_ptr, // [num_queries, dataset_dim] uint32_t num_queries, - uint32_t top_k, - cuda::latch* _completion_latch, - cudaStream_t local_stream) + uint32_t top_k) { - using latch_t = cuda::latch; - static thread_local std::unique_ptr> - completion_latch_store( - []() { - latch_t* x = nullptr; - cudaMallocManaged(&x, sizeof(latch_t)); - RAFT_CUDA_TRY( - cudaMemAdvise(x, sizeof(latch_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId)); - RAFT_CUDA_TRY(cudaMemAdvise(x, sizeof(latch_t), cudaMemAdviseSetAccessedBy, 0)); - return x; - }(), - [](latch_t* x) { cudaFree(x); }); - // wait for all workers to finish - auto* completion_latch = new (completion_latch_store.get()) latch_t{num_queries}; + cuda::atomic* completion_latch; + uint32_t lead_worker_id; for (uint32_t i = 0; i < num_queries; i++) { auto worker_id = queue.pop(); - // RAFT_LOG_INFO("Submitting query %u (worker id = %u)", i, worker_id); - auto& wd = work_descriptors.data()[worker_id]; - wd.completion_latch = completion_latch; - wd.result_indices_ptr = result_indices_ptr; - wd.result_distances_ptr = result_distances_ptr; - wd.queries_ptr = queries_ptr; - wd.query_id = i; - wd.top_k = top_k; - cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_system); - input_barriers.data()[worker_id].arrive_and_wait(); + + if (i == 0) { + lead_worker_id = worker_id; + auto& wd = work_descriptors.data()[lead_worker_id]; + completion_latch = + reinterpret_cast*>(&wd.n_queries); + wd.result_indices_ptr = result_indices_ptr; + wd.result_distances_ptr = result_distances_ptr; + wd.queries_ptr = queries_ptr; + wd.top_k = top_k; + wd.n_queries = num_queries; + } + + work_handles.data()[worker_id].store( + (work_handle_view_t{.value = {lead_worker_id, i}}).handle, cuda::memory_order_release); + // danger: need to wait till the GPU finishes reading. queue.push(worker_id); } - completion_latch->wait(); - // RAFT_LOG_INFO("Launch done!"); + while (completion_latch->load() != 0) { + // Not sure if this improves the perf, but it does not seem to hurt it. 
+ // Let's hope this reduces cpu utilization + std::this_thread::yield(); + } + last_touch.store(std::chrono::system_clock::now()); } @@ -1474,15 +1451,7 @@ void select_and_run( } runner_local_copy = persistent_runner; } - auto* completion_latch = - reinterpret_cast*>(num_executed_iterations); - runner->launch(topk_indices_ptr, - topk_distances_ptr, - queries_ptr, - num_queries, - topk, - completion_latch, - stream); + runner->launch(topk_indices_ptr, topk_distances_ptr, queries_ptr, num_queries, topk); } else { auto kernel = search_kernel_config Date: Wed, 17 Apr 2024 13:08:17 +0200 Subject: [PATCH 04/37] Added launcher_t - a helper state machine struct to interleave work queue submitting with worker releasing --- .../cagra/search_single_cta_kernel-inl.cuh | 133 +++++++++++++++--- 1 file changed, 110 insertions(+), 23 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 7548963c0b..513d50e990 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -1164,6 +1164,107 @@ struct persistent_runner_base_t { virtual ~persistent_runner_base_t() noexcept = default; }; +struct launcher_t { + using work_queue_type = persistent_runner_base_t::work_queue_type; + using pending_reads_queue_type = + atomic_queue::AtomicQueue; + using completion_latch_type = cuda::atomic; + + pending_reads_queue_type pending_reads{}; + work_queue_type& worker_ids; + work_handle_t* work_handles; + uint32_t lead_worker_id; + completion_latch_type* completion_latch; + bool all_done = false; + + template + launcher_t(work_queue_type& worker_ids, + work_handle_t* work_handles, + uint32_t n_queries, + RecordWork record_work) + : worker_ids{worker_ids}, + work_handles{work_handles}, + lead_worker_id{worker_ids.pop()}, + completion_latch{record_work(lead_worker_id)} + { + // The first worker is special: one may associate a work_descriptor and the completion_latch + // with the same id. Hence it bypassed the `pending_reads` queue and is only released at the + // very end. + submit_query(lead_worker_id, 0, false); + // Submit all queries in the batch + for (uint32_t i = 1; i < n_queries; i++) { + uint32_t worker_id; + while (!try_get_worker(worker_id)) { + if (pending_reads.try_pop(worker_id)) { + if (!try_return_worker(worker_id)) { pending_reads.push(worker_id); } + } + } + submit_query(worker_id, i); + } + } + + void submit_query(uint32_t worker_id, uint32_t query_id, bool add_to_pending_reads = true) + { + work_handles[worker_id].store((work_handle_view_t{.value = {lead_worker_id, query_id}}).handle, + cuda::memory_order_release); + + if (!add_to_pending_reads) { return; } + while (!pending_reads.try_push(worker_id)) { + // The only reason pending_reads cannot push is that the queue is full. + // It's local, so we must pop and wait for the returned worker to finish its work. + auto pending_worker_id = pending_reads.pop(); + while (!try_return_worker(pending_worker_id)) { + std::this_thread::yield(); + } + } + } + + /** Check if the worker has finished the work; if so, return it to the shared pool. */ + auto try_return_worker(uint32_t worker_id) -> bool + { + // Use the cached `all_done` - makes sence when called from the `wait()` routine. 
+ if (all_done || work_handles[worker_id].load(cuda::memory_order_relaxed) == kWaitForWork) { + worker_ids.push(worker_id); + return true; + } else { + return false; + } + } + + /** Try get a free worker if any. */ + auto try_get_worker(uint32_t& worker_id) -> bool { return worker_ids.try_pop(worker_id); } + + /** Check if all workers finished their work. */ + auto is_all_done() + { + // Cache the result of the check to avoid doing unnecessary atomic loads. + if (all_done) { return true; } + all_done = completion_latch->load() == 0; + return all_done; + } + + /** Wait for all work to finish and don't forget to return the workers to the shared pool. */ + void wait() + { + uint32_t worker_id; + while (pending_reads.try_pop(worker_id)) { + while (!try_return_worker(worker_id)) { + if (!is_all_done()) { std::this_thread::yield(); } + } + } + // terminal state, should be engaged only after the `pending_reads` is empty + // and `queries_submitted == n_queries` + while (!is_all_done()) { + // Not sure if this improves the perf, but it does not seem to hurt it. + // Let's hope this reduces cpu utilization + std::this_thread::yield(); + } + + // lead_worker_id is reused for the handles, so we can only return it at the end + try_return_worker(lead_worker_id); + } +}; + template * completion_latch; - uint32_t lead_worker_id; - for (uint32_t i = 0; i < num_queries; i++) { - auto worker_id = queue.pop(); - - if (i == 0) { - lead_worker_id = worker_id; - auto& wd = work_descriptors.data()[lead_worker_id]; - completion_latch = - reinterpret_cast*>(&wd.n_queries); + launcher_t{ + queue, + work_handles.data(), + num_queries, + [=](uint32_t worker_id) { + auto& wd = work_descriptors.data()[worker_id]; wd.result_indices_ptr = result_indices_ptr; wd.result_distances_ptr = result_distances_ptr; wd.queries_ptr = queries_ptr; wd.top_k = top_k; wd.n_queries = num_queries; - } - - work_handles.data()[worker_id].store( - (work_handle_view_t{.value = {lead_worker_id, i}}).handle, cuda::memory_order_release); - // danger: need to wait till the GPU finishes reading. - queue.push(worker_id); - } - while (completion_latch->load() != 0) { - // Not sure if this improves the perf, but it does not seem to hurt it. - // Let's hope this reduces cpu utilization - std::this_thread::yield(); - } - + return reinterpret_cast*>(&wd.n_queries); + }} + .wait(); last_touch.store(std::chrono::system_clock::now()); } From 0137dd4f72bcb5b2e599b9a4f726a019d6c48fe4 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 17 Apr 2024 15:39:14 +0200 Subject: [PATCH 05/37] Initialize the kernel runner in a separate thread and improve thread safety related to the runner. 
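
For reference, the get-or-create pattern boils down to the following simplified plain-C++ sketch (no CUDA; resource_t / get_resource are illustrative names, and the duplicate-creation race is handled less carefully than in the real code): the first caller spawns a detached helper thread that constructs the resource under a mutex, publishes it through a global shared_ptr, and later drops that global reference once the resource has sat idle for a while.

    #include <atomic>
    #include <chrono>
    #include <future>
    #include <memory>
    #include <mutex>
    #include <thread>

    struct resource_t {  // stands in for persistent_runner_t
      std::atomic<std::chrono::time_point<std::chrono::system_clock>> last_touch{
        std::chrono::system_clock::now()};
      void use() { last_touch.store(std::chrono::system_clock::now()); }
    };

    inline std::shared_ptr<resource_t> global_resource;  // ~ persistent_runner
    inline std::mutex global_lock;                       // ~ persistent_lock

    inline std::shared_ptr<resource_t> get_resource()
    {
      {
        std::lock_guard<std::mutex> guard(global_lock);
        if (global_resource) { return global_resource; }  // fast path: already alive
      }
      std::promise<std::shared_ptr<resource_t>> created;
      auto created_future = created.get_future();
      std::thread([p = std::move(created)]() mutable {
        std::weak_ptr<resource_t> weak;
        {
          std::lock_guard<std::mutex> guard(global_lock);
          global_resource = std::make_shared<resource_t>();  // heavy init happens off the caller thread
          weak            = global_resource;
          p.set_value(global_resource);
        }
        constexpr auto kIdle = std::chrono::milliseconds(500);
        while (true) {  // janitor loop: drop the global reference once the resource sits idle
          std::this_thread::sleep_for(kIdle);
          std::lock_guard<std::mutex> guard(global_lock);
          auto alive = weak.lock();
          if (!alive) { return; }  // somebody else has already destroyed/replaced it
          if (alive->last_touch.load() + kIdle < std::chrono::system_clock::now()) {
            if (alive == global_resource) { global_resource.reset(); }
            return;  // callers holding a shared_ptr keep it alive until they are done
          }
        }
      }).detach();
      return created_future.get();  // block until the helper thread has built the resource
    }

    int main()
    {
      auto r = get_resource();  // first call constructs the resource on the helper thread
      r->use();                 // refresh last_touch so the janitor keeps it around
      std::this_thread::sleep_for(std::chrono::seconds(2));  // idle: let the janitor reclaim it
      return 0;
    }
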
--- .../cagra/search_single_cta_kernel-inl.cuh | 145 ++++++++++-------- 1 file changed, 79 insertions(+), 66 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 513d50e990..71454570c5 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -1151,17 +1151,18 @@ struct search_kernel_config { } }; -inline void run_zombie(std::atomic>* last_touch); - struct persistent_runner_base_t { using work_queue_type = atomic_queue::AtomicQueue; rmm::mr::pinned_host_memory_resource work_handles_mr; rmm::mr::pinned_host_memory_resource work_descriptor_mr; rmm::mr::cuda_memory_resource completion_counters_mr; - cudaStream_t stream; - work_queue_type queue; - persistent_runner_base_t(cudaStream_t stream) : stream(stream), queue() {} - virtual ~persistent_runner_base_t() noexcept = default; + cudaStream_t stream{}; + work_queue_type queue{}; + persistent_runner_base_t() : queue() + { + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + } + virtual ~persistent_runner_base_t() noexcept { cudaStreamDestroy(stream); }; }; struct launcher_t { @@ -1301,9 +1302,8 @@ struct persistent_runner_t : public persistent_runner_base_t { size_t min_iterations, size_t max_iterations, SAMPLE_FILTER_T sample_filter, - raft::distance::DistanceType metric, - cudaStream_t stream) - : persistent_runner_base_t{stream}, + raft::distance::DistanceType metric) + : persistent_runner_base_t{}, kernel{kernel_config_type::choose_itopk_and_mx_candidates( itopk_size, num_itopk_candidates, block_size)}, block_size{block_size}, @@ -1374,8 +1374,6 @@ struct persistent_runner_t : public persistent_runner_base_t { RAFT_LOG_INFO("Initialized the kernel in stream %zd, queue size = %u", int64_t((cudaStream_t)stream), queue.was_size()); - - std::thread(run_zombie, &last_touch).detach(); } ~persistent_runner_t() noexcept override @@ -1438,25 +1436,60 @@ struct persistent_runner_t : public persistent_runner_base_t { } }; -struct non_blocking_stream { - non_blocking_stream() { cudaStreamCreateWithFlags(&value, cudaStreamNonBlocking); } - ~non_blocking_stream() noexcept { cudaStreamDestroy(value); } - cudaStream_t value; -}; - -inline std::unique_ptr persistent_stream; inline std::shared_ptr persistent_runner{nullptr}; inline std::mutex persistent_lock; -inline void run_zombie(std::atomic>* last_touch) +template +auto create_runner(Args... args) -> std::shared_ptr // it's ok.. pass everything by values +{ + cuda::atomic*> runner_outer{nullptr}; + std::thread( + [&runner_outer](Args... 
thread_args) { // pass everything by values + std::shared_ptr runner_inner{nullptr}; + std::weak_ptr runner_weak; + { + std::lock_guard guard(persistent_lock); + persistent_runner.reset(); // Free the resources (if any) in advance + runner_inner = std::make_shared(thread_args...); + runner_weak = runner_inner; + persistent_runner = std::static_pointer_cast(runner_inner); + runner_outer.store(new std::shared_ptr{runner_inner}); + runner_outer.notify_one(); + runner_inner->last_touch.store(std::chrono::system_clock::now()); + runner_inner.reset(); + } + constexpr auto kInterval = std::chrono::milliseconds(500); + while (true) { + std::this_thread::sleep_for(kInterval); + std::lock_guard guard(persistent_lock); + auto runner = runner_weak.lock(); + if (!runner) { + return; // dead already + } + if (runner->last_touch.load() + kInterval < std::chrono::system_clock::now()) { + if (runner == persistent_runner) { persistent_runner.reset(); } + return; + } + } + }, + args...) + .detach(); + runner_outer.wait(nullptr); + auto* p = runner_outer.load(); + auto r = std::move(*p); + delete p; + return r; +} + +template +auto get_runner(Args&&... args) -> std::shared_ptr { - constexpr auto kInterval = std::chrono::milliseconds(500); - last_touch->store(std::chrono::system_clock::now()); - while (last_touch->load() + kInterval >= std::chrono::system_clock::now()) { - std::this_thread::sleep_for(kInterval); + { + std::lock_guard guard(persistent_lock); + auto runner = std::dynamic_pointer_cast(persistent_runner); + if (runner) { return runner; } } - std::lock_guard guard(persistent_lock); - persistent_runner.reset(); + return create_runner(args...); } template ; - // This is used to keep the object alive if `persistent_runner` gets reset. - std::shared_ptr runner_local_copy; - runner_type* runner = nullptr; - { - std::lock_guard guard(persistent_lock); - runner = dynamic_cast(persistent_runner.get()); - if (runner == nullptr) { - // Free the resources (if any) in advance - persistent_runner = std::shared_ptr(); - // Lazy-create a stream, which is going to be used by all runners till the program exists - if (!persistent_stream) { persistent_stream = std::make_unique(); } - // Create a new runner - runner = new runner_type(dataset_desc, - graph, - num_itopk_candidates, - block_size, - smem_size, - hash_bitlen, - hashmap_ptr, - small_hash_bitlen, - small_hash_reset_interval, - num_random_samplings, - rand_xor_mask, - num_seeds, - itopk_size, - search_width, - min_iterations, - max_iterations, - sample_filter, - metric, - persistent_stream->value); - persistent_runner.reset(runner); - } - runner_local_copy = persistent_runner; - } - runner->launch(topk_indices_ptr, topk_distances_ptr, queries_ptr, num_queries, topk); + using runner_type = + persistent_runner_t; + get_runner(dataset_desc, + graph, + num_itopk_candidates, + block_size, + smem_size, + hash_bitlen, + hashmap_ptr, + small_hash_bitlen, + small_hash_reset_interval, + num_random_samplings, + rand_xor_mask, + num_seeds, + itopk_size, + search_width, + min_iterations, + max_iterations, + sample_filter, + metric) + ->launch(topk_indices_ptr, topk_distances_ptr, queries_ptr, num_queries, topk); } else { auto kernel = search_kernel_config Date: Thu, 18 Apr 2024 16:52:15 +0200 Subject: [PATCH 06/37] Added small memory sync optimizations --- .../cagra/search_single_cta_kernel-inl.cuh | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh 
b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 71454570c5..f4792c6631 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -960,8 +961,9 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel_p( // wait the writing phase if (threadIdx.x == 0) { do { - work_index.handle = work_handle.load(cuda::memory_order_acquire); + work_index.handle = work_handle.load(cuda::memory_order_relaxed); } while (work_index.handle == kWaitForWork); + cuda::atomic_thread_fence(cuda::memory_order_acquire, cuda::thread_scope_system); } __syncthreads(); if (work_index.handle == kNoMoreWork) { break; } @@ -1017,6 +1019,9 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel_p( if (threadIdx.x == 0) { auto completed_count = atomicInc(completion_counters + work_ix, n_queries - 1) + 1; if (completed_count >= n_queries) { + // we may need a memory fence here: + // - device - if the queries are accessed by the device + // - system - e.g. if we put them into managed/pinned memory. reinterpret_cast*>( &work_descriptors[work_ix].n_queries) ->store(0, cuda::memory_order_relaxed); } } } @@ -1191,7 +1196,7 @@ struct launcher_t { // The first worker is special: one may associate a work_descriptor and the completion_latch // with the same id. Hence it bypassed the `pending_reads` queue and is only released at the // very end. - submit_query(lead_worker_id, 0, false); + submit_query(lead_worker_id, 0, true); // Submit all queries in the batch for (uint32_t i = 1; i < n_queries; i++) { uint32_t worker_id; @@ -1204,12 +1209,13 @@ } } - void submit_query(uint32_t worker_id, uint32_t query_id, bool add_to_pending_reads = true) + void submit_query(uint32_t worker_id, uint32_t query_id, bool first_to_submit = false) { - work_handles[worker_id].store((work_handle_view_t{.value = {lead_worker_id, query_id}}).handle, - cuda::memory_order_release); + work_handles[worker_id].store( + (work_handle_view_t{.value = {lead_worker_id, query_id}}).handle, + first_to_submit ? cuda::memory_order_release : cuda::memory_order_relaxed); - if (!add_to_pending_reads) { return; } + if (first_to_submit) { return; } while (!pending_reads.try_push(worker_id)) { // The only reason pending_reads cannot push is that the queue is full. // It's local, so we must pop and wait for the returned worker to finish its work. From 1d60ab540457e8ece830d4b9910b0c90c25afb45 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 25 Apr 2024 12:16:05 +0200 Subject: [PATCH 07/37] Change the benchmark neighbors type size_t -> uint32_t and make benchmark loop event sync optional When using the persistent kernel variant, the calling CPU thread has to synchronize with the GPU (wait on the completion flag) - i.e. there's no way to use events for this. As a result, the event recording and sync in the benchmark loop introduce significant latency overheads. To avoid this, I make the event optional (dependent on the search mode: persistent/original). Originally, the benchmark used size_t indices, whereas CAGRA operated with uint32_t. As a result, we had to do a linear mapping (on GPU), which adds an extra kernel to the benchmark loop and goes against the event optimization above. Hence, I changed the benchmark index type.
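For illustration, a minimal sketch of the benchmark-side decision described above, assuming a simplified algorithm interface; the names (Algo, uses_stream, get_sync_stream, search_batch, timed_iteration) are placeholders, not the actual benchmark API. The point is only that the event is recorded and awaited when the algorithm orders its output on a CUDA stream, and skipped when the algorithm already blocks the calling CPU thread until the results are ready.

#include <cuda_runtime_api.h>

struct Algo {
  virtual void search_batch() const                             = 0;
  virtual auto uses_stream() const noexcept -> bool             = 0;
  virtual auto get_sync_stream() const noexcept -> cudaStream_t = 0;
  virtual ~Algo()                                               = default;
};

inline void timed_iteration(const Algo& algo, cudaEvent_t event)
{
  algo.search_batch();
  if (algo.uses_stream()) {
    // Original path: the results are ordered on the algorithm's stream, so wait for it.
    cudaEventRecord(event, algo.get_sync_stream());
    cudaEventSynchronize(event);
  }
  // Persistent path: search_batch() returned only after the host observed the completion
  // flag, so there is nothing left to synchronize and the event would only add latency.
}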
--- cpp/bench/ann/CMakeLists.txt | 5 +- cpp/bench/ann/src/common/ann_types.hpp | 20 ++++- cpp/bench/ann/src/common/benchmark.hpp | 18 ++-- cpp/bench/ann/src/common/util.hpp | 4 +- cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h | 9 +- cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h | 9 +- cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh | 16 ++-- cpp/bench/ann/src/raft/raft_cagra_float.cu | 20 +++++ cpp/bench/ann/src/raft/raft_cagra_half.cu | 20 +++++ .../ann/src/raft/raft_cagra_hnswlib_wrapper.h | 9 +- cpp/bench/ann/src/raft/raft_cagra_int8_t.cu | 20 +++++ .../{raft_cagra.cu => raft_cagra_uint8_t.cu} | 3 - cpp/bench/ann/src/raft/raft_cagra_wrapper.h | 88 ++++++++++++------- .../ann/src/raft/raft_ivf_flat_wrapper.h | 22 ++--- cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h | 9 +- cpp/bench/ann/src/raft/raft_wrapper.h | 11 ++- .../cagra/search_single_cta_kernel-inl.cuh | 4 +- 17 files changed, 202 insertions(+), 85 deletions(-) create mode 100644 cpp/bench/ann/src/raft/raft_cagra_float.cu create mode 100644 cpp/bench/ann/src/raft/raft_cagra_half.cu create mode 100644 cpp/bench/ann/src/raft/raft_cagra_int8_t.cu rename cpp/bench/ann/src/raft/{raft_cagra.cu => raft_cagra_uint8_t.cu} (85%) diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index ee84f7515a..f29d32ccde 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -266,7 +266,10 @@ if(RAFT_ANN_BENCH_USE_RAFT_CAGRA) RAFT_CAGRA PATH bench/ann/src/raft/raft_benchmark.cu - $<$:bench/ann/src/raft/raft_cagra.cu> + $<$:bench/ann/src/raft/raft_cagra_float.cu> + $<$:bench/ann/src/raft/raft_cagra_half.cu> + $<$:bench/ann/src/raft/raft_cagra_int8_t.cu> + $<$:bench/ann/src/raft/raft_cagra_uint8_t.cu> LINKS raft::compiled ) diff --git a/cpp/bench/ann/src/common/ann_types.hpp b/cpp/bench/ann/src/common/ann_types.hpp index c6213059dc..19cc36b17e 100644 --- a/cpp/bench/ann/src/common/ann_types.hpp +++ b/cpp/bench/ann/src/common/ann_types.hpp @@ -73,6 +73,8 @@ struct AlgoProperty { class AnnBase { public: + using index_type = uint32_t; + inline AnnBase(Metric metric, int dim) : metric_(metric), dim_(dim) {} virtual ~AnnBase() noexcept = default; @@ -98,7 +100,16 @@ class AnnGPU { * end. */ [[nodiscard]] virtual auto get_sync_stream() const noexcept -> cudaStream_t = 0; - virtual ~AnnGPU() noexcept = default; + /** + * By default a GPU algorithm uses a fixed stream to order GPU operations. + * However, an algorithm may need to synchronize with the host at the end of its execution. + * In that case, also synchronizing with a benchmark event would put it at disadvantage. + * + * We can disable event sync by passing `false` here + * - ONLY IF THE ALGORITHM HAS PRODUCED ITS OUTPUT BY THE TIME IT SYNCHRONIZES WITH CPU. + */ + [[nodiscard]] virtual auto uses_stream() const noexcept -> bool { return true; } + virtual ~AnnGPU() noexcept = default; }; template @@ -118,8 +129,11 @@ class ANN : public AnnBase { virtual void set_search_param(const AnnSearchParam& param) = 0; // TODO: this assumes that an algorithm can always return k results. // This is not always possible. 
- virtual void search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const = 0; + virtual void search(const T* queries, + int batch_size, + int k, + AnnBase::index_type* neighbors, + float* distances) const = 0; virtual void save(const std::string& file) const = 0; virtual void load(const std::string& file) = 0; diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index d7bcd17a00..ad392d53a2 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -280,10 +280,10 @@ void bench_search(::benchmark::State& state, /** * Each thread will manage its own outputs */ - std::shared_ptr> distances = + auto distances = std::make_shared>(current_algo_props->query_memory_type, k * query_set_size); - std::shared_ptr> neighbors = - std::make_shared>(current_algo_props->query_memory_type, k * query_set_size); + auto neighbors = std::make_shared>(current_algo_props->query_memory_type, + k * query_set_size); { nvtx_case nvtx{state.name()}; @@ -338,12 +338,12 @@ void bench_search(::benchmark::State& state, // Each thread calculates recall on their partition of queries. // evaluate recall if (dataset->max_k() >= k) { - const std::int32_t* gt = dataset->gt_set(); - const std::uint32_t max_k = dataset->max_k(); - buf neighbors_host = neighbors->move(MemoryType::Host); - std::size_t rows = std::min(queries_processed, query_set_size); - std::size_t match_count = 0; - std::size_t total_count = rows * static_cast(k); + const std::int32_t* gt = dataset->gt_set(); + const std::uint32_t max_k = dataset->max_k(); + buf neighbors_host = neighbors->move(MemoryType::Host); + std::size_t rows = std::min(queries_processed, query_set_size); + std::size_t match_count = 0; + std::size_t total_count = rows * static_cast(k); // We go through the groundtruth with same stride as the benchmark loop. 
size_t out_offset = 0; diff --git a/cpp/bench/ann/src/common/util.hpp b/cpp/bench/ann/src/common/util.hpp index 0995f0841e..614b4ee3bd 100644 --- a/cpp/bench/ann/src/common/util.hpp +++ b/cpp/bench/ann/src/common/util.hpp @@ -120,7 +120,9 @@ struct cuda_timer { static inline auto extract_stream(AnnT* algo) -> std::optional { auto gpu_ann = dynamic_cast(algo); - if (gpu_ann != nullptr) { return std::make_optional(gpu_ann->get_sync_stream()); } + if (gpu_ann != nullptr && gpu_ann->uses_stream()) { + return std::make_optional(gpu_ann->get_sync_stream()); + } return std::nullopt; } diff --git a/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h b/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h index 407f7148df..3caca15b7f 100644 --- a/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h +++ b/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h @@ -88,8 +88,11 @@ class FaissCpu : public ANN { // TODO: if the number of results is less than k, the remaining elements of 'neighbors' // will be filled with (size_t)-1 - void search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const final; + void search(const T* queries, + int batch_size, + int k, + AnnBase::index_type* neighbors, + float* distances) const final; AlgoProperty get_preference() const override { @@ -169,7 +172,7 @@ void FaissCpu::set_search_param(const AnnSearchParam& param) template void FaissCpu::search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const + const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const { static_assert(sizeof(size_t) == sizeof(faiss::idx_t), "sizes of size_t and faiss::idx_t are different"); diff --git a/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h index 633098fd1d..2effe631e5 100644 --- a/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h +++ b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h @@ -111,8 +111,11 @@ class FaissGpu : public ANN, public AnnGPU { // TODO: if the number of results is less than k, the remaining elements of 'neighbors' // will be filled with (size_t)-1 - void search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const final; + void search(const T* queries, + int batch_size, + int k, + AnnBase::index_type* neighbors, + float* distances) const final; [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override { @@ -196,7 +199,7 @@ void FaissGpu::build(const T* dataset, size_t nrow) template void FaissGpu::search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const + const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const { static_assert(sizeof(size_t) == sizeof(faiss::idx_t), "sizes of size_t and faiss::idx_t are different"); diff --git a/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh index c89f02d974..59cf3df806 100644 --- a/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh +++ b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh @@ -58,8 +58,11 @@ class Ggnn : public ANN, public AnnGPU { void build(const T* dataset, size_t nrow) override { impl_->build(dataset, nrow); } void set_search_param(const AnnSearchParam& param) override { impl_->set_search_param(param); } - void search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const override + void search(const T* queries, + int batch_size, + int k, + AnnBase::index_type* neighbors, + float* distances) const override { impl_->search(queries, batch_size, 
k, neighbors, distances); } @@ -123,8 +126,11 @@ class GgnnImpl : public ANN, public AnnGPU { void build(const T* dataset, size_t nrow) override; void set_search_param(const AnnSearchParam& param) override; - void search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const override; + void search(const T* queries, + int batch_size, + int k, + AnnBase::index_type* neighbors, + float* distances) const override; [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override { return stream_; } void save(const std::string& file) const override; @@ -243,7 +249,7 @@ void GgnnImpl::set_search_param(const AnnSearc template void GgnnImpl::search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const + const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const { static_assert(sizeof(size_t) == sizeof(int64_t), "sizes of size_t and GGNN's KeyT are different"); if (k != KQuery) { diff --git a/cpp/bench/ann/src/raft/raft_cagra_float.cu b/cpp/bench/ann/src/raft/raft_cagra_float.cu new file mode 100644 index 0000000000..058f5bf34a --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_cagra_float.cu @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "raft_cagra_wrapper.h" + +namespace raft::bench::ann { +template class RaftCagra; +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_cagra_half.cu b/cpp/bench/ann/src/raft/raft_cagra_half.cu new file mode 100644 index 0000000000..a015819ec5 --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_cagra_half.cu @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "raft_cagra_wrapper.h" + +namespace raft::bench::ann { +template class RaftCagra; +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h index ed9c120ed4..9345045022 100644 --- a/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h @@ -43,8 +43,11 @@ class RaftCagraHnswlib : public ANN, public AnnGPU { // TODO: if the number of results is less than k, the remaining elements of 'neighbors' // will be filled with (size_t)-1 - void search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const override; + void search(const T* queries, + int batch_size, + int k, + AnnBase::index_type* neighbors, + float* distances) const override; [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override { @@ -99,7 +102,7 @@ void RaftCagraHnswlib::load(const std::string& file) template void RaftCagraHnswlib::search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const + const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const { hnswlib_search_.search(queries, batch_size, k, neighbors, distances); } diff --git a/cpp/bench/ann/src/raft/raft_cagra_int8_t.cu b/cpp/bench/ann/src/raft/raft_cagra_int8_t.cu new file mode 100644 index 0000000000..be3b83ee60 --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_cagra_int8_t.cu @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "raft_cagra_wrapper.h" + +namespace raft::bench::ann { +template class RaftCagra; +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_cagra.cu b/cpp/bench/ann/src/raft/raft_cagra_uint8_t.cu similarity index 85% rename from cpp/bench/ann/src/raft/raft_cagra.cu rename to cpp/bench/ann/src/raft/raft_cagra_uint8_t.cu index c0c1352a43..c9679e404d 100644 --- a/cpp/bench/ann/src/raft/raft_cagra.cu +++ b/cpp/bench/ann/src/raft/raft_cagra_uint8_t.cu @@ -17,7 +17,4 @@ namespace raft::bench::ann { template class RaftCagra; -template class RaftCagra; -template class RaftCagra; -template class RaftCagra; } // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h index 46da8c52e6..c9759ea32a 100644 --- a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h @@ -98,16 +98,34 @@ class RaftCagra : public ANN, public AnnGPU { // TODO: if the number of results is less than k, the remaining elements of 'neighbors' // will be filled with (size_t)-1 - void search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const override; - void search_base( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const; + void search(const T* queries, + int batch_size, + int k, + AnnBase::index_type* neighbors, + float* distances) const override; + void search_base(const T* queries, + int batch_size, + int k, + AnnBase::index_type* neighbors, + float* distances) const; [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override { return handle_.get_sync_stream(); } + [[nodiscard]] auto uses_stream() const noexcept -> bool override + { + // To avoid too much api changes in the prototype, I encode whether the algorithm runs + // persistent kernel using the highest bit in the `rand_xor_mask` parameter. + uint64_t pmask = 0x8000000000000000LL; + // If the algorithm uses persistent kernel, the CPU has to synchronize by the end of computing + // the result. Hence it guarantees the benchmark CUDA stream is empty by the end of the + // execution. Hence we notify the benchmark to not waste the time on recording & synchronizing + // the event. 
+ return !(search_params_.rand_xor_mask & pmask); + } + // to enable dataset access from GPU memory AlgoProperty get_preference() const override { @@ -272,12 +290,12 @@ std::unique_ptr> RaftCagra::copy() template void RaftCagra::search_base( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const + const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const { IdxT* neighbors_IdxT; rmm::device_uvector neighbors_storage(0, resource::get_cuda_stream(handle_)); - if constexpr (std::is_same_v) { - neighbors_IdxT = neighbors; + if constexpr (sizeof(IdxT) == sizeof(AnnBase::index_type)) { + neighbors_IdxT = reinterpret_cast(neighbors); } else { neighbors_storage.resize(batch_size * k, resource::get_cuda_stream(handle_)); neighbors_IdxT = neighbors_storage.data(); @@ -291,18 +309,18 @@ void RaftCagra::search_base( raft::neighbors::cagra::search( handle_, search_params_, *index_, queries_view, neighbors_view, distances_view); - if constexpr (!std::is_same_v) { + if constexpr (sizeof(IdxT) != sizeof(AnnBase::index_type)) { raft::linalg::unaryOp(neighbors, neighbors_IdxT, batch_size * k, - raft::cast_op(), + raft::cast_op(), raft::resource::get_cuda_stream(handle_)); } } template void RaftCagra::search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const + const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const { auto k0 = static_cast(refine_ratio_ * k); const bool disable_refinement = k0 <= static_cast(k); @@ -312,21 +330,24 @@ void RaftCagra::search( if (disable_refinement) { search_base(queries, batch_size, k, neighbors, distances); } else { - auto candidate_ixs = raft::make_device_matrix(res, batch_size, k0); - auto candidate_dists = raft::make_device_matrix(res, batch_size, k0); + auto candidate_ixs = + raft::make_device_matrix(res, batch_size, k0); + auto candidate_dists = + raft::make_device_matrix(res, batch_size, k0); search_base(queries, batch_size, k0, - reinterpret_cast(candidate_ixs.data_handle()), + reinterpret_cast(candidate_ixs.data_handle()), candidate_dists.data_handle()); if (raft::get_device_for_address(input_dataset_v_->data_handle()) >= 0) { - auto queries_v = - raft::make_device_matrix_view(queries, batch_size, dimension_); - auto neighours_v = raft::make_device_matrix_view( - reinterpret_cast(neighbors), batch_size, k); - auto distances_v = raft::make_device_matrix_view(distances, batch_size, k); - raft::neighbors::refine( + auto queries_v = raft::make_device_matrix_view( + queries, batch_size, dimension_); + auto neighours_v = raft::make_device_matrix_view( + reinterpret_cast(neighbors), batch_size, k); + auto distances_v = + raft::make_device_matrix_view(distances, batch_size, k); + raft::neighbors::refine( res, *input_dataset_v_, queries_v, @@ -335,28 +356,31 @@ void RaftCagra::search( distances_v, index_->metric()); } else { - auto dataset_host = raft::make_host_matrix_view( + auto dataset_host = raft::make_host_matrix_view( input_dataset_v_->data_handle(), input_dataset_v_->extent(0), input_dataset_v_->extent(1)); - auto queries_host = raft::make_host_matrix(batch_size, dimension_); - auto candidates_host = raft::make_host_matrix(batch_size, k0); - auto neighbors_host = raft::make_host_matrix(batch_size, k); - auto distances_host = raft::make_host_matrix(batch_size, k); + auto queries_host = raft::make_host_matrix(batch_size, dimension_); + auto candidates_host = + raft::make_host_matrix(batch_size, k0); + auto neighbors_host = + 
raft::make_host_matrix(batch_size, k); + auto distances_host = raft::make_host_matrix(batch_size, k); raft::copy(queries_host.data_handle(), queries, queries_host.size(), stream); raft::copy( candidates_host.data_handle(), candidate_ixs.data_handle(), candidates_host.size(), stream); raft::resource::sync_stream(res); // wait for the queries and candidates - raft::neighbors::refine(res, - dataset_host, - queries_host.view(), - candidates_host.view(), - neighbors_host.view(), - distances_host.view(), - index_->metric()); + raft::neighbors::refine( + res, + dataset_host, + queries_host.view(), + candidates_host.view(), + neighbors_host.view(), + distances_host.view(), + index_->metric()); raft::copy(neighbors, - reinterpret_cast(neighbors_host.data_handle()), + reinterpret_cast(neighbors_host.data_handle()), neighbors_host.size(), stream); raft::copy(distances, distances_host.data_handle(), distances_host.size(), stream); diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h index 48d2b9de80..9027b5fa86 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h @@ -63,8 +63,11 @@ class RaftIvfFlatGpu : public ANN, public AnnGPU { // TODO: if the number of results is less than k, the remaining elements of 'neighbors' // will be filled with (size_t)-1 - void search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const override; + void search(const T* queries, + int batch_size, + int k, + AnnBase::index_type* neighbors, + float* distances) const override; [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override { @@ -131,17 +134,10 @@ std::unique_ptr> RaftIvfFlatGpu::copy() template void RaftIvfFlatGpu::search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const + const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const { - static_assert(sizeof(size_t) == sizeof(IdxT), "IdxT is incompatible with size_t"); - raft::neighbors::ivf_flat::search(handle_, - search_params_, - *index_, - queries, - batch_size, - k, - (IdxT*)neighbors, - distances, - resource::get_workspace_resource(handle_)); + static_assert(sizeof(AnnBase::index_type) == sizeof(IdxT), "IdxT is incompatible with size_t"); + raft::neighbors::ivf_flat::search( + handle_, search_params_, *index_, queries, batch_size, k, (IdxT*)neighbors, distances); } } // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h index 1d73bd2e51..3178bfd3e8 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h @@ -63,8 +63,11 @@ class RaftIvfPQ : public ANN, public AnnGPU { // TODO: if the number of results is less than k, the remaining elements of 'neighbors' // will be filled with (size_t)-1 - void search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const override; + void search(const T* queries, + int batch_size, + int k, + AnnBase::index_type* neighbors, + float* distances) const override; [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override { @@ -139,7 +142,7 @@ void RaftIvfPQ::set_search_dataset(const T* dataset, size_t nrow) template void RaftIvfPQ::search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const + const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* 
distances) const { if (refine_ratio_ > 1.0f) { uint32_t k0 = static_cast(refine_ratio_ * k); diff --git a/cpp/bench/ann/src/raft/raft_wrapper.h b/cpp/bench/ann/src/raft/raft_wrapper.h index 586b81ae06..12742d42dd 100644 --- a/cpp/bench/ann/src/raft/raft_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_wrapper.h @@ -58,8 +58,11 @@ class RaftGpu : public ANN, public AnnGPU { // TODO: if the number of results is less than k, the remaining elements of 'neighbors' // will be filled with (size_t)-1 - void search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const final; + void search(const T* queries, + int batch_size, + int k, + AnnBase::index_type* neighbors, + float* distances) const final; // to enable dataset access from GPU memory AlgoProperty get_preference() const override @@ -133,7 +136,7 @@ void RaftGpu::load(const std::string& file) template void RaftGpu::search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const + const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const { auto queries_view = raft::make_device_matrix_view(queries, batch_size, this->dim_); @@ -141,7 +144,7 @@ void RaftGpu::search( auto neighbors_view = raft::make_device_matrix_view(neighbors, batch_size, k); auto distances_view = raft::make_device_matrix_view(distances, batch_size, k); - raft::neighbors::brute_force::search( + raft::neighbors::brute_force::search( handle_, *index_, queries_view, neighbors_view, distances_view); } diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index f4792c6631..58ce4f3be9 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -1173,7 +1173,7 @@ struct persistent_runner_base_t { struct launcher_t { using work_queue_type = persistent_runner_base_t::work_queue_type; using pending_reads_queue_type = - atomic_queue::AtomicQueue; + atomic_queue::AtomicQueue; using completion_latch_type = cuda::atomic; pending_reads_queue_type pending_reads{}; @@ -1229,7 +1229,7 @@ struct launcher_t { /** Check if the worker has finished the work; if so, return it to the shared pool. */ auto try_return_worker(uint32_t worker_id) -> bool { - // Use the cached `all_done` - makes sence when called from the `wait()` routine. + // Use the cached `all_done` - makes sense when called from the `wait()` routine. if (all_done || work_handles[worker_id].load(cuda::memory_order_relaxed) == kWaitForWork) { worker_ids.push(worker_id); return true; From efd7966533a54cee69662630a8d9981c4cb318a1 Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 19 Apr 2024 11:06:21 +0200 Subject: [PATCH 08/37] Slightly increase occupancy to improve QPS --- .../detail/cagra/search_single_cta_kernel-inl.cuh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 58ce4f3be9..95385fe0aa 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -1429,16 +1429,17 @@ struct persistent_runner_t : public persistent_runner_base_t { { // We may need to run other kernels alongside this persistent kernel. // Leave a few SMs idle. 
- constexpr double kDeviceUsage = 0.8; + // Note: even when we know there are no other kernels working at the same time, setting + // kDeviceUsage to 1.0 surprisingly hurts performance. + constexpr double kDeviceUsage = 0.9; // determine the grid size int ctas_per_sm = 1; cudaOccupancyMaxActiveBlocksPerMultiprocessor( &ctas_per_sm, kernel, block_size, smem_size); - int num_sm = getMultiProcessorCount() - 1; + int num_sm = getMultiProcessorCount(); - return {1, uint32_t(kDeviceUsage * (ctas_per_sm * num_sm)), 1}; - // return {1, uint32_t(getMultiProcessorCount() - 8), 1}; + return {1, static_cast(kDeviceUsage * (ctas_per_sm * num_sm)), 1}; } }; From e7c35dfd791081d9a25bc02c4f5ae43e0e9d234a Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 19 Apr 2024 14:21:40 +0200 Subject: [PATCH 09/37] Align input and sync variables with cache lines to avoid cache conflicts Restructure input/output a bit to pad the atomics to 128 bytes. This reduces the latency/single threaded time by 3x on a PCIe machine. --- .../cagra/search_single_cta_kernel-inl.cuh | 129 ++++++++++-------- 1 file changed, 74 insertions(+), 55 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 95385fe0aa..6792c9d231 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -890,28 +890,53 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( metric); } +// To make sure we avoid false sharing on both CPU and GPU, we enforce cache line size to the +// maximum of the two. +// This makes sync atomic significantly faster. +constexpr size_t kCacheLineBytes = 128; + template -struct work_desc_t { +struct alignas(kCacheLineBytes) work_desc_t { using index_type = typename DATASET_DESCRIPTOR_T::INDEX_T; using distance_type = typename DATASET_DESCRIPTOR_T::DISTANCE_T; using data_type = typename DATASET_DESCRIPTOR_T::DATA_T; - index_type* result_indices_ptr; // [num_queries, top_k] - distance_type* result_distances_ptr; // [num_queries, top_k] - const data_type* queries_ptr; // [num_queries, dataset_dim] - uint32_t top_k; - uint32_t n_queries; // also used as a completion indicator: last thread sets it to zero + // The algorithm input parameters + struct value_t { + index_type* result_indices_ptr; // [num_queries, top_k] + distance_type* result_distances_ptr; // [num_queries, top_k] + const data_type* queries_ptr; // [num_queries, dataset_dim] + uint32_t top_k; + uint32_t n_queries; + }; + using blob_elem_type = uint32_t; + constexpr static inline size_t kBlobSize = + raft::div_rounding_up_safe(sizeof(value_t), sizeof(blob_elem_type)); + // Union facilitates loading the input by a warp in a single request + union input_t { + blob_elem_type blob[kBlobSize]; // NOLINT + value_t value; + } input; + // Last thread triggers this flag. 
+ cuda::atomic completion_flag; }; -using work_handle_t = cuda::atomic; -union work_handle_view_t { - uint64_t handle; +struct alignas(kCacheLineBytes) work_handle_t { + using handle_t = uint64_t; struct value_t { uint32_t desc_id; uint32_t query_id; - } value; + }; + union data_t { + handle_t handle; + value_t value; + }; + cuda::atomic data; }; -constexpr uint64_t kWaitForWork = std::numeric_limits::max(); -constexpr uint64_t kNoMoreWork = kWaitForWork - 1; +static_assert(sizeof(work_handle_t::value_t) == sizeof(work_handle_t::handle_t)); +static_assert(cuda::atomic::is_always_lock_free); + +constexpr work_handle_t::handle_t kWaitForWork = std::numeric_limits::max(); +constexpr work_handle_t::handle_t kNoMoreWork = kWaitForWork - 1; template ; - using blob_elem = uint32_t; - constexpr auto kBlobSize = raft::div_rounding_up_safe(sizeof(work_desc_type), sizeof(blob_elem)); - static_assert(kBlobSize * sizeof(blob_elem) == sizeof(work_desc_type)); - __shared__ union { - work_desc_type value; - blob_elem blob[kBlobSize]; - } work_descriptor; + using work_desc_type = work_desc_t; + __shared__ typename work_desc_type::input_t work_descriptor; + __shared__ work_handle_t::data_t work_index; - __shared__ work_handle_view_t work_index; + auto& work_handle = work_handles[blockIdx.y].data; while (true) { // wait the writing phase if (threadIdx.x == 0) { do { - work_index.handle = work_handle.load(cuda::memory_order_relaxed); + work_index = work_handle.load(cuda::memory_order_relaxed); } while (work_index.handle == kWaitForWork); cuda::atomic_thread_fence(cuda::memory_order_acquire, cuda::thread_scope_system); } @@ -969,11 +987,11 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel_p( if (work_index.handle == kNoMoreWork) { break; } auto work_ix = work_index.value.desc_id; - for (auto i = threadIdx.x; i < kBlobSize; i += blockDim.x) { - work_descriptor.blob[i] = reinterpret_cast(work_descriptors + work_ix)[i]; + for (auto i = threadIdx.x; i < work_desc_type::kBlobSize; i += blockDim.x) { + work_descriptor.blob[i] = work_descriptors[work_ix].input.blob[i]; } __syncthreads(); - if (threadIdx.x == 0) { work_handle.store(kWaitForWork, cuda::memory_order_relaxed); } + if (threadIdx.x == 0) { work_handle.store({kWaitForWork}, cuda::memory_order_relaxed); } // reading phase auto* result_indices_ptr = work_descriptor.value.result_indices_ptr; @@ -1022,9 +1040,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel_p( // we may need a memory fence here: // - device - if the queries are accessed by the device // - system - e.g. if we put them into managed/pinned memory. 
- reinterpret_cast*>( - &work_descriptors[work_ix].n_queries) - ->store(0, cuda::memory_order_relaxed); + work_descriptors[work_ix].completion_flag.store(true, cuda::memory_order_relaxed); } } } @@ -1174,13 +1190,13 @@ struct launcher_t { using work_queue_type = persistent_runner_base_t::work_queue_type; using pending_reads_queue_type = atomic_queue::AtomicQueue; - using completion_latch_type = cuda::atomic; + using completion_flag_type = cuda::atomic; pending_reads_queue_type pending_reads{}; work_queue_type& worker_ids; work_handle_t* work_handles; uint32_t lead_worker_id; - completion_latch_type* completion_latch; + completion_flag_type* completion_flag; bool all_done = false; template @@ -1191,9 +1207,9 @@ struct launcher_t { : worker_ids{worker_ids}, work_handles{work_handles}, lead_worker_id{worker_ids.pop()}, - completion_latch{record_work(lead_worker_id)} + completion_flag{record_work(lead_worker_id)} { - // The first worker is special: one may associate a work_descriptor and the completion_latch + // The first worker is special: one may associate a work_descriptor and the completion_flag // with the same id. Hence it bypassed the `pending_reads` queue and is only released at the // very end. submit_query(lead_worker_id, 0, true); @@ -1211,8 +1227,8 @@ struct launcher_t { void submit_query(uint32_t worker_id, uint32_t query_id, bool first_to_submit = false) { - work_handles[worker_id].store( - (work_handle_view_t{.value = {lead_worker_id, query_id}}).handle, + work_handles[worker_id].data.store( + work_handle_t::data_t{.value = {lead_worker_id, query_id}}, first_to_submit ? cuda::memory_order_release : cuda::memory_order_relaxed); if (first_to_submit) { return; } @@ -1230,7 +1246,8 @@ struct launcher_t { auto try_return_worker(uint32_t worker_id) -> bool { // Use the cached `all_done` - makes sense when called from the `wait()` routine. - if (all_done || work_handles[worker_id].load(cuda::memory_order_relaxed) == kWaitForWork) { + if (all_done || + work_handles[worker_id].data.load(cuda::memory_order_relaxed).handle == kWaitForWork) { worker_ids.push(worker_id); return true; } else { @@ -1246,7 +1263,7 @@ struct launcher_t { { // Cache the result of the check to avoid doing unnecessary atomic loads. 
if (all_done) { return true; } - all_done = completion_latch->load() == 0; + all_done = completion_flag->load(cuda::memory_order_relaxed); return all_done; } @@ -1337,13 +1354,14 @@ struct persistent_runner_t : public persistent_runner_base_t { auto* work_handles_ptr = work_handles.data(); RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); for (uint32_t i = 0; i < gs.y; i++) { - auto& wd = work_descriptors_ptr[i]; + auto& wd = work_descriptors_ptr[i].input.value; wd.result_indices_ptr = nullptr; wd.result_distances_ptr = nullptr; wd.queries_ptr = nullptr; wd.top_k = 0; wd.n_queries = 0; - work_handles_ptr[i].store(kWaitForWork); + work_descriptors_ptr[i].completion_flag.store(false); + work_handles_ptr[i].data.store({kWaitForWork}); queue.push(i); } @@ -1390,13 +1408,13 @@ struct persistent_runner_t : public persistent_runner_base_t { auto count = whl; // wait for all the jobs to finish nicely while (queue.try_pop(worker_id)) { - whs[worker_id].store(kNoMoreWork, cuda::memory_order_relaxed); + whs[worker_id].data.store({kNoMoreWork}, cuda::memory_order_relaxed); count--; } if (count > 0) { // try to kill stuck threads if any for (uint32_t i = 0; i < whl; i++) { - whs[i].store(kNoMoreWork, cuda::memory_order_relaxed); + whs[i].data.store({kNoMoreWork}, cuda::memory_order_relaxed); } } RAFT_LOG_INFO("Destroyed the persistent runner."); @@ -1408,19 +1426,20 @@ struct persistent_runner_t : public persistent_runner_base_t { uint32_t num_queries, uint32_t top_k) { - launcher_t{ - queue, - work_handles.data(), - num_queries, - [=](uint32_t worker_id) { - auto& wd = work_descriptors.data()[worker_id]; - wd.result_indices_ptr = result_indices_ptr; - wd.result_distances_ptr = result_distances_ptr; - wd.queries_ptr = queries_ptr; - wd.top_k = top_k; - wd.n_queries = num_queries; - return reinterpret_cast*>(&wd.n_queries); - }} + launcher_t{queue, + work_handles.data(), + num_queries, + [=](uint32_t worker_id) { + auto& wd = work_descriptors.data()[worker_id].input.value; + auto cflag = &work_descriptors.data()[worker_id].completion_flag; + wd.result_indices_ptr = result_indices_ptr; + wd.result_distances_ptr = result_distances_ptr; + wd.queries_ptr = queries_ptr; + wd.top_k = top_k; + wd.n_queries = num_queries; + cflag->store(false, cuda::memory_order_relaxed); + return cflag; + }} .wait(); last_touch.store(std::chrono::system_clock::now()); } From 7089ed8d73e686e384ecd714320f60d3b92e54a1 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 23 Apr 2024 16:09:12 +0200 Subject: [PATCH 10/37] Optimize the waiting for the input inside the kernel. 
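The waiting is split in two steps here: a single thread spins on the work handle using relaxed loads plus an acquire fence, and then the first warp broadcasts the descriptor index with a shuffle and copies the whole job descriptor into shared memory as 16-byte words, one per lane. Below is a stand-alone sketch of that pattern with illustrative names (job_input_t, handles, descriptors, worker_kernel) and a hard-coded four-word descriptor; it is not the actual kernel.

#include <cuda/atomic>

#include <cstdint>

constexpr unsigned kWarpSize     = 32;
constexpr uint64_t kWaitSentinel = ~uint64_t{0};

struct job_input_t {
  uint4 blob[4];  // 64 bytes of input parameters; fits within one warp (4 <= kWarpSize)
};

__global__ void worker_kernel(cuda::atomic<uint64_t, cuda::thread_scope_system>* handles,
                              const job_input_t* descriptors)
{
  __shared__ job_input_t job;
  uint32_t job_ix = 0;
  if (threadIdx.x == 0) {
    uint64_t h;
    do {  // spin with cheap relaxed loads; the fence below provides the acquire ordering
      h = handles[blockIdx.x].load(cuda::memory_order_relaxed);
    } while (h == kWaitSentinel);
    cuda::atomic_thread_fence(cuda::memory_order_acquire, cuda::thread_scope_system);
    job_ix = static_cast<uint32_t>(h);
  }
  if (threadIdx.x < kWarpSize) {
    // Broadcast the index within the first warp and copy the descriptor, one uint4 per lane,
    // instead of looping over 32-bit words with the whole thread block.
    job_ix = __shfl_sync(0xffffffffu, job_ix, 0);
    if (threadIdx.x < 4) { job.blob[threadIdx.x] = descriptors[job_ix].blob[threadIdx.x]; }
  }
  __syncthreads();
  // ... run the search for this job using the data in `job` ...
}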
--- .../cagra/search_single_cta_kernel-inl.cuh | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 6792c9d231..be290c3b0a 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -42,7 +42,6 @@ #include #include -#include #include @@ -908,7 +907,7 @@ struct alignas(kCacheLineBytes) work_desc_t { uint32_t top_k; uint32_t n_queries; }; - using blob_elem_type = uint32_t; + using blob_elem_type = uint4; constexpr static inline size_t kBlobSize = raft::div_rounding_up_safe(sizeof(value_t), sizeof(blob_elem_type)); // Union facilitates loading the input by a warp in a single request @@ -974,23 +973,29 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel_p( __shared__ work_handle_t::data_t work_index; auto& work_handle = work_handles[blockIdx.y].data; + uint32_t work_ix; while (true) { // wait the writing phase if (threadIdx.x == 0) { + work_handle_t::data_t work_index_local; do { - work_index = work_handle.load(cuda::memory_order_relaxed); - } while (work_index.handle == kWaitForWork); + work_index_local = work_handle.load(cuda::memory_order_relaxed); + } while (work_index_local.handle == kWaitForWork); + work_ix = work_index_local.value.desc_id; cuda::atomic_thread_fence(cuda::memory_order_acquire, cuda::thread_scope_system); + work_index = work_index_local; } - __syncthreads(); - if (work_index.handle == kNoMoreWork) { break; } - auto work_ix = work_index.value.desc_id; - - for (auto i = threadIdx.x; i < work_desc_type::kBlobSize; i += blockDim.x) { - work_descriptor.blob[i] = work_descriptors[work_ix].input.blob[i]; + if (threadIdx.x < WarpSize) { + // Sync one warp and copy descriptor data + static_assert(work_desc_type::kBlobSize <= WarpSize); + work_ix = raft::shfl(work_ix, 0); + if (threadIdx.x < work_desc_type::kBlobSize) { + work_descriptor.blob[threadIdx.x] = work_descriptors[work_ix].input.blob[threadIdx.x]; + } } __syncthreads(); + if (work_index.handle == kNoMoreWork) { break; } if (threadIdx.x == 0) { work_handle.store({kWaitForWork}, cuda::memory_order_relaxed); } // reading phase From 259e5ec8a4dd958d28022012084a78452211af8c Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 24 Apr 2024 09:14:50 +0200 Subject: [PATCH 11/37] cagra wrapper: avoid constructing rmm uvectors when not needed --- cpp/bench/ann/src/raft/raft_cagra_wrapper.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h index c9759ea32a..386d21f616 100644 --- a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h @@ -293,12 +293,12 @@ void RaftCagra::search_base( const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const { IdxT* neighbors_IdxT; - rmm::device_uvector neighbors_storage(0, resource::get_cuda_stream(handle_)); + std::optional> neighbors_storage{std::nullopt}; if constexpr (sizeof(IdxT) == sizeof(AnnBase::index_type)) { neighbors_IdxT = reinterpret_cast(neighbors); } else { - neighbors_storage.resize(batch_size * k, resource::get_cuda_stream(handle_)); - neighbors_IdxT = neighbors_storage.data(); + neighbors_storage.emplace(batch_size * k, resource::get_cuda_stream(handle_)); + neighbors_IdxT = neighbors_storage->data(); 
} auto queries_view = From 63f996a66cee4d9bb1bff3dbe7dbe146dfb2b613 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 24 Apr 2024 09:25:53 +0200 Subject: [PATCH 12/37] Avoid any calls to RMM in IO threads. 1) Make the persistent kernel allocate the hashmap in advance. 2) Introduce lightweight_uvector, which does not call any CUDA functions when not needed. --- .../neighbors/detail/cagra/search_plan.cuh | 97 +++++++++++++++++-- .../detail/cagra/search_single_cta.cuh | 4 +- .../cagra/search_single_cta_kernel-inl.cuh | 13 ++- 3 files changed, 99 insertions(+), 15 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh b/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh index b35d96e9f5..41a07388c7 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh @@ -29,20 +29,99 @@ #include #include +#include +#include +#include + namespace raft::neighbors::cagra::detail { +/** + * A lightweight version of rmm::device_uvector. + * This version ignores the current device on allocations, thus avoids calling + * cudaSetDevice/cudaGetDevice. + * If the size stays at zero, this struct never calls any CUDA driver / RAFT resource functions. + */ +template +struct lightweight_uvector { + private: + using raft_res_type = const raft::resources*; + using rmm_res_type = std::tuple; + static constexpr size_t kAlign = 256; + + std::variant res_; + T* ptr_; + size_t size_; + + public: + explicit lightweight_uvector(const raft::resources& res) : res_(&res), ptr_{nullptr}, size_{0} {} + + [[nodiscard]] auto data() noexcept -> T* { return ptr_; } + [[nodiscard]] auto data() const noexcept -> const T* { return ptr_; } + [[nodiscard]] auto size() const noexcept -> size_t { return size_; } + + void resize(size_t new_size) + { + if (new_size == size_) { return; } + if (std::holds_alternative(res_)) { + auto& h = std::get(res_); + res_ = rmm_res_type{resource::get_workspace_resource(*h), resource::get_cuda_stream(*h)}; + } + auto& [r, s] = std::get(res_); + T* new_ptr = nullptr; + if (new_size > 0) { + new_ptr = reinterpret_cast(r.allocate_async(new_size * sizeof(T), kAlign, s)); + } + auto copy_size = std::min(size_, new_size); + if (copy_size > 0) { + cudaMemcpyAsync(new_ptr, ptr_, copy_size * sizeof(T), cudaMemcpyDefault, s); + } + if (size_ > 0) { r.deallocate_async(ptr_, size_ * sizeof(T), kAlign, s); } + ptr_ = new_ptr; + size_ = new_size; + } + + void resize(size_t new_size, rmm::cuda_stream_view stream) + { + if (new_size == size_) { return; } + if (std::holds_alternative(res_)) { + auto& h = std::get(res_); + res_ = rmm_res_type{resource::get_workspace_resource(*h), stream}; + } else { + std::get(std::get(res_)) = stream; + } + resize(new_size); + } + + ~lightweight_uvector() noexcept + { + if (size_ > 0) { + auto& [r, s] = std::get(res_); + r.deallocate_async(ptr_, size_ * sizeof(T), kAlign, s); + } + } +}; + struct search_plan_impl_base : public search_params { int64_t dataset_block_dim; int64_t dim; int64_t graph_degree; uint32_t topk; raft::distance::DistanceType metric; + bool is_persistent; + + static constexpr uint64_t kPMask = 0x8000000000000000LL; + search_plan_impl_base(search_params params, int64_t dim, int64_t graph_degree, uint32_t topk, raft::distance::DistanceType metric) - : search_params(params), dim(dim), graph_degree(graph_degree), topk(topk), metric(metric) + : search_params(params), + dim(dim), + graph_degree(graph_degree), + topk(topk), + metric(metric), + 
is_persistent(params.rand_xor_mask & kPMask) { set_dataset_block_and_team_size(dim); if (algo == search_algo::AUTO) { @@ -95,9 +174,9 @@ struct search_plan_impl : public search_plan_impl_base { uint32_t topk; uint32_t num_seeds; - rmm::device_uvector hashmap; - rmm::device_uvector num_executed_iterations; // device or managed? - rmm::device_uvector dev_seed; + lightweight_uvector hashmap; + lightweight_uvector num_executed_iterations; // device or managed? + lightweight_uvector dev_seed; search_plan_impl(raft::resources const& res, search_params params, @@ -106,16 +185,18 @@ struct search_plan_impl : public search_plan_impl_base { uint32_t topk, raft::distance::DistanceType metric) : search_plan_impl_base(params, dim, graph_degree, topk, metric), - hashmap(0, resource::get_cuda_stream(res)), - num_executed_iterations(0, resource::get_cuda_stream(res)), - dev_seed(0, resource::get_cuda_stream(res)), + hashmap(res), + num_executed_iterations(res), + dev_seed(res), num_seeds(0) { adjust_search_params(); check_params(); calc_hashmap_params(res); set_dataset_block_and_team_size(dim); - num_executed_iterations.resize(max_queries, resource::get_cuda_stream(res)); + if (!is_persistent) { // Persistent kernel does not provide this functionality + num_executed_iterations.resize(max_queries, resource::get_cuda_stream(res)); + } RAFT_LOG_DEBUG("# algo = %d", static_cast(algo)); } diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh index 0771652787..9e1215713f 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh @@ -201,8 +201,8 @@ struct search : search_plan_impl { } RAFT_LOG_DEBUG("# smem_size: %u", smem_size); hashmap_size = 0; - if (small_hash_bitlen == 0) { - hashmap_size = sizeof(INDEX_T) * max_queries * hashmap::get_size(hash_bitlen); + if (small_hash_bitlen == 0 && !this->is_persistent) { + hashmap_size = max_queries * hashmap::get_size(hash_bitlen); hashmap.resize(hashmap_size, resource::get_cuda_stream(res)); } RAFT_LOG_DEBUG("# hashmap_size: %lu", hashmap_size); diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index be290c3b0a..fd764e73a7 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -574,7 +574,7 @@ __device__ void search_core( if (small_hash_bitlen) { local_visited_hashmap_ptr = visited_hash_buffer; } else { - local_visited_hashmap_ptr = visited_hashmap_ptr + (hashmap::get_size(hash_bitlen) * query_id); + local_visited_hashmap_ptr = visited_hashmap_ptr + (hashmap::get_size(hash_bitlen) * gridDim.y); } hashmap::init(local_visited_hashmap_ptr, hash_bitlen, 0); __syncthreads(); @@ -1181,7 +1181,7 @@ struct persistent_runner_base_t { using work_queue_type = atomic_queue::AtomicQueue; rmm::mr::pinned_host_memory_resource work_handles_mr; rmm::mr::pinned_host_memory_resource work_descriptor_mr; - rmm::mr::cuda_memory_resource completion_counters_mr; + rmm::mr::cuda_memory_resource device_mr; cudaStream_t stream{}; work_queue_type queue{}; persistent_runner_base_t() : queue() @@ -1311,6 +1311,7 @@ struct persistent_runner_t : public persistent_runner_base_t { rmm::device_uvector work_handles; rmm::device_uvector work_descriptors; rmm::device_uvector completion_counters; + rmm::device_uvector 
hashmap; std::atomic> last_touch; persistent_runner_t(DATASET_DESCRIPTOR_T dataset_desc, @@ -1319,7 +1320,6 @@ struct persistent_runner_t : public persistent_runner_base_t { uint32_t block_size, // uint32_t smem_size, int64_t hash_bitlen, - index_type* hashmap_ptr, size_t small_hash_bitlen, size_t small_hash_reset_interval, uint32_t num_random_samplings, @@ -1337,7 +1337,8 @@ struct persistent_runner_t : public persistent_runner_base_t { block_size{block_size}, work_handles(0, stream, work_handles_mr), work_descriptors(0, stream, work_descriptor_mr), - completion_counters(0, stream, completion_counters_mr) + completion_counters(0, stream, device_mr), + hashmap(0, stream, device_mr) { // set kernel attributes same as in normal kernel RAFT_CUDA_TRY( @@ -1355,6 +1356,9 @@ struct persistent_runner_t : public persistent_runner_base_t { work_descriptors.resize(gs.y, stream); auto* work_descriptors_ptr = work_descriptors.data(); + index_type* hashmap_ptr = nullptr; + if (small_hash_bitlen == 0) { hashmap.resize(gs.y * hashmap::get_size(hash_bitlen), stream); } + work_handles.resize(gs.y, stream); auto* work_handles_ptr = work_handles.data(); RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); @@ -1570,7 +1574,6 @@ void select_and_run( block_size, smem_size, hash_bitlen, - hashmap_ptr, small_hash_bitlen, small_hash_reset_interval, num_random_samplings, From 3d1011d4e87fbf3bcf666d3f15340dd85710948f Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 24 Apr 2024 16:29:29 +0200 Subject: [PATCH 13/37] Use atomics on the persistent runner (shared_ptr) to reduce the number of mutex locks --- .../cagra/search_single_cta_kernel-inl.cuh | 121 +++++++++++------- 1 file changed, 73 insertions(+), 48 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index fd764e73a7..9d4cb7a933 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -42,6 +42,7 @@ #include #include +#include #include @@ -892,7 +893,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( // To make sure we avoid false sharing on both CPU and GPU, we enforce cache line size to the // maximum of the two. // This makes sync atomic significantly faster. 
-constexpr size_t kCacheLineBytes = 128; +constexpr size_t kCacheLineBytes = 64; template struct alignas(kCacheLineBytes) work_desc_t { @@ -1191,7 +1192,7 @@ struct persistent_runner_base_t { virtual ~persistent_runner_base_t() noexcept { cudaStreamDestroy(stream); }; }; -struct launcher_t { +struct alignas(kCacheLineBytes) launcher_t { using work_queue_type = persistent_runner_base_t::work_queue_type; using pending_reads_queue_type = atomic_queue::AtomicQueue; @@ -1298,7 +1299,7 @@ template -struct persistent_runner_t : public persistent_runner_base_t { +struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_base_t { using index_type = typename DATASET_DESCRIPTOR_T::INDEX_T; using distance_type = typename DATASET_DESCRIPTOR_T::DISTANCE_T; using data_type = typename DATASET_DESCRIPTOR_T::DATA_T; @@ -1435,22 +1436,22 @@ struct persistent_runner_t : public persistent_runner_base_t { uint32_t num_queries, uint32_t top_k) { - launcher_t{queue, - work_handles.data(), - num_queries, - [=](uint32_t worker_id) { - auto& wd = work_descriptors.data()[worker_id].input.value; - auto cflag = &work_descriptors.data()[worker_id].completion_flag; - wd.result_indices_ptr = result_indices_ptr; - wd.result_distances_ptr = result_distances_ptr; - wd.queries_ptr = queries_ptr; - wd.top_k = top_k; - wd.n_queries = num_queries; - cflag->store(false, cuda::memory_order_relaxed); - return cflag; - }} - .wait(); - last_touch.store(std::chrono::system_clock::now()); + // submit all queries + launcher_t launcher{queue, work_handles.data(), num_queries, [=](uint32_t worker_id) { + auto& wd = work_descriptors.data()[worker_id].input.value; + auto cflag = &work_descriptors.data()[worker_id].completion_flag; + wd.result_indices_ptr = result_indices_ptr; + wd.result_distances_ptr = result_distances_ptr; + wd.queries_ptr = queries_ptr; + wd.top_k = top_k; + wd.n_queries = num_queries; + cflag->store(false, cuda::memory_order_relaxed); + return cflag; + }}; + // update the keep-alive atomic in the meanwhile + last_touch.store(std::chrono::system_clock::now(), std::memory_order_relaxed); + // wait for the results to arrive + launcher.wait(); } auto calc_coop_grid_size(uint32_t block_size, uint32_t smem_size) -> dim3 @@ -1471,60 +1472,84 @@ struct persistent_runner_t : public persistent_runner_base_t { } }; -inline std::shared_ptr persistent_runner{nullptr}; -inline std::mutex persistent_lock; +struct alignas(kCacheLineBytes) persistent_state { + std::shared_ptr runner{nullptr}; + std::mutex lock; +}; + +inline persistent_state persistent{}; template auto create_runner(Args... args) -> std::shared_ptr // it's ok.. pass everything by values { - cuda::atomic*> runner_outer{nullptr}; + // NB: storing pointer-to-shared_ptr; otherwise, notify_one()/wait() do not seem to work. + std::shared_ptr runner_outer{nullptr}; + cuda::std::atomic_flag ready{}; + ready.clear(cuda::std::memory_order_relaxed); std::thread( - [&runner_outer](Args... thread_args) { // pass everything by values - std::shared_ptr runner_inner{nullptr}; + [&runner_outer, &ready](Args... 
thread_args) { // pass everything by values std::weak_ptr runner_weak; { - std::lock_guard guard(persistent_lock); - persistent_runner.reset(); // Free the resources (if any) in advance - runner_inner = std::make_shared(thread_args...); - runner_weak = runner_inner; - persistent_runner = std::static_pointer_cast(runner_inner); - runner_outer.store(new std::shared_ptr{runner_inner}); - runner_outer.notify_one(); - runner_inner->last_touch.store(std::chrono::system_clock::now()); - runner_inner.reset(); + std::lock_guard guard(persistent.lock); + // Try to check the runner again: + // it may have been created by another thread since the last check + runner_outer = std::dynamic_pointer_cast( + std::atomic_load_explicit(&persistent.runner, std::memory_order_relaxed)); + if (runner_outer) { + runner_outer->last_touch.store(std::chrono::system_clock::now(), + std::memory_order_relaxed); + ready.test_and_set(cuda::std::memory_order_release); + ready.notify_one(); + return; + } + // Free the resources (if any) in advance + std::atomic_store_explicit(&persistent.runner, + std::shared_ptr{nullptr}, + std::memory_order_relaxed); + runner_outer = std::make_shared(thread_args...); + runner_weak = runner_outer; + std::atomic_store_explicit(&persistent.runner, + std::static_pointer_cast(runner_outer), + std::memory_order_relaxed); + runner_outer->last_touch.store(std::chrono::system_clock::now(), std::memory_order_relaxed); + ready.test_and_set(cuda::std::memory_order_release); + ready.notify_one(); } constexpr auto kInterval = std::chrono::milliseconds(500); while (true) { std::this_thread::sleep_for(kInterval); - std::lock_guard guard(persistent_lock); - auto runner = runner_weak.lock(); + auto runner = runner_weak.lock(); // runner_weak is local - thread-safe if (!runner) { return; // dead already } - if (runner->last_touch.load() + kInterval < std::chrono::system_clock::now()) { - if (runner == persistent_runner) { persistent_runner.reset(); } + if (runner->last_touch.load(std::memory_order_relaxed) + kInterval < + std::chrono::system_clock::now()) { + if (runner == std::atomic_load_explicit( + &persistent.runner, + std::memory_order_relaxed)) { // compare pointers: this is thread-safe + std::lock_guard guard(persistent.lock); + std::atomic_store_explicit(&persistent.runner, + std::shared_ptr{nullptr}, + std::memory_order_relaxed); + } return; } } }, args...) .detach(); - runner_outer.wait(nullptr); - auto* p = runner_outer.load(); - auto r = std::move(*p); - delete p; - return r; + ready.wait(false, cuda::std::memory_order_acquire); + return runner_outer; } template auto get_runner(Args&&... args) -> std::shared_ptr { - { - std::lock_guard guard(persistent_lock); - auto runner = std::dynamic_pointer_cast(persistent_runner); - if (runner) { return runner; } - } - return create_runner(args...); + // We copy the shared pointer here, then using the copy is thread-safe. 
+ auto runner = std::dynamic_pointer_cast( + std::atomic_load_explicit(&persistent.runner, std::memory_order_relaxed)); + if (runner) { return runner; } + return create_runner(std::forward(args)...); } template Date: Wed, 24 Apr 2024 16:30:16 +0200 Subject: [PATCH 14/37] Remove the shared state and the mutex lock from NVTX helpers --- cpp/include/raft/core/detail/nvtx.hpp | 32 +++++++++------------------ 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/cpp/include/raft/core/detail/nvtx.hpp b/cpp/include/raft/core/detail/nvtx.hpp index 82db75de84..3fe70c7154 100644 --- a/cpp/include/raft/core/detail/nvtx.hpp +++ b/cpp/include/raft/core/detail/nvtx.hpp @@ -24,23 +24,19 @@ #include #include -#include +#include #include #include -#include #include namespace raft::common::nvtx::detail { /** - * @brief An internal struct to store associated state with the color - * generator + * @brief An internal struct to to initialize the color generator */ -struct color_gen_state { - /** collection of all tagged colors generated so far */ - static inline std::unordered_map all_colors_; - /** mutex for accessing the above map */ - static inline std::mutex map_mutex_; +struct color_gen { + /** This determines how many bits of the hash to use for the generator */ + using hash_type = uint16_t; /** saturation */ static inline constexpr float kS = 0.9f; /** value */ @@ -121,20 +117,12 @@ inline auto hsv2rgb(float h, float s, float v) -> uint32_t */ inline auto generate_next_color(const std::string& tag) -> uint32_t { - // std::unordered_map color_gen_state::all_colors_; - // std::mutex color_gen_state::map_mutex_; - - std::lock_guard guard(color_gen_state::map_mutex_); - if (!tag.empty()) { - auto itr = color_gen_state::all_colors_.find(tag); - if (itr != color_gen_state::all_colors_.end()) { return itr->second; } - } - auto h = static_cast(rand()) / static_cast(RAND_MAX); - h += color_gen_state::kInvPhi; + auto x = static_cast(std::hash{}(tag)); + auto u = std::numeric_limits::max(); + auto h = static_cast(x) / static_cast(u); + h += color_gen::kInvPhi; if (h >= 1.f) h -= 1.f; - auto rgb = hsv2rgb(h, color_gen_state::kS, color_gen_state::kV); - if (!tag.empty()) { color_gen_state::all_colors_[tag] = rgb; } - return rgb; + return hsv2rgb(h, color_gen::kS, color_gen::kV); } template From 4d2b8d5682f5db5fefd8bcf8a86134e963543552 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 24 Apr 2024 20:11:22 +0200 Subject: [PATCH 15/37] Split the sync queue in two: job descriptors and idle worker handles --- .../cagra/search_single_cta_kernel-inl.cuh | 242 ++++++++++-------- 1 file changed, 133 insertions(+), 109 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 9d4cb7a933..ae990d83e6 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -896,7 +896,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( constexpr size_t kCacheLineBytes = 64; template -struct alignas(kCacheLineBytes) work_desc_t { +struct alignas(kCacheLineBytes) job_desc_t { using index_type = typename DATASET_DESCRIPTOR_T::INDEX_T; using distance_type = typename DATASET_DESCRIPTOR_T::DISTANCE_T; using data_type = typename DATASET_DESCRIPTOR_T::DATA_T; @@ -920,7 +920,7 @@ struct alignas(kCacheLineBytes) work_desc_t { cuda::atomic completion_flag; }; -struct alignas(kCacheLineBytes) work_handle_t { 
+struct alignas(kCacheLineBytes) worker_handle_t { using handle_t = uint64_t; struct value_t { uint32_t desc_id; @@ -932,11 +932,17 @@ struct alignas(kCacheLineBytes) work_handle_t { }; cuda::atomic data; }; -static_assert(sizeof(work_handle_t::value_t) == sizeof(work_handle_t::handle_t)); -static_assert(cuda::atomic::is_always_lock_free); +static_assert(sizeof(worker_handle_t::value_t) == sizeof(worker_handle_t::handle_t)); +static_assert( + cuda::atomic::is_always_lock_free); -constexpr work_handle_t::handle_t kWaitForWork = std::numeric_limits::max(); -constexpr work_handle_t::handle_t kNoMoreWork = kWaitForWork - 1; +constexpr worker_handle_t::handle_t kWaitForWork = std::numeric_limits::max(); +constexpr worker_handle_t::handle_t kNoMoreWork = kWaitForWork - 1; + +constexpr auto is_worker_busy(worker_handle_t::handle_t h) -> bool +{ + return (h != kWaitForWork) && (h != kNoMoreWork); +} template __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel_p( DATASET_DESCRIPTOR_T dataset_desc, - work_handle_t* work_handles, - work_desc_t* work_descriptors, + worker_handle_t* worker_handles, + job_desc_t* job_descriptors, uint32_t* completion_counters, const typename DATASET_DESCRIPTOR_T::INDEX_T* const knn_graph, // [dataset_size, graph_degree] const std::uint32_t graph_degree, @@ -969,43 +975,43 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel_p( SAMPLE_FILTER_T sample_filter, raft::distance::DistanceType metric) { - using work_desc_type = work_desc_t; - __shared__ typename work_desc_type::input_t work_descriptor; - __shared__ work_handle_t::data_t work_index; + using job_desc_type = job_desc_t; + __shared__ typename job_desc_type::input_t job_descriptor; + __shared__ worker_handle_t::data_t worker_data; - auto& work_handle = work_handles[blockIdx.y].data; - uint32_t work_ix; + auto& worker_handle = worker_handles[blockIdx.y].data; + uint32_t job_ix; while (true) { // wait the writing phase if (threadIdx.x == 0) { - work_handle_t::data_t work_index_local; + worker_handle_t::data_t worker_data_local; do { - work_index_local = work_handle.load(cuda::memory_order_relaxed); - } while (work_index_local.handle == kWaitForWork); - work_ix = work_index_local.value.desc_id; + worker_data_local = worker_handle.load(cuda::memory_order_relaxed); + } while (worker_data_local.handle == kWaitForWork); + job_ix = worker_data_local.value.desc_id; cuda::atomic_thread_fence(cuda::memory_order_acquire, cuda::thread_scope_system); - work_index = work_index_local; + worker_data = worker_data_local; } if (threadIdx.x < WarpSize) { // Sync one warp and copy descriptor data - static_assert(work_desc_type::kBlobSize <= WarpSize); - work_ix = raft::shfl(work_ix, 0); - if (threadIdx.x < work_desc_type::kBlobSize) { - work_descriptor.blob[threadIdx.x] = work_descriptors[work_ix].input.blob[threadIdx.x]; + static_assert(job_desc_type::kBlobSize <= WarpSize); + job_ix = raft::shfl(job_ix, 0); + if (threadIdx.x < job_desc_type::kBlobSize) { + job_descriptor.blob[threadIdx.x] = job_descriptors[job_ix].input.blob[threadIdx.x]; } } __syncthreads(); - if (work_index.handle == kNoMoreWork) { break; } - if (threadIdx.x == 0) { work_handle.store({kWaitForWork}, cuda::memory_order_relaxed); } + if (worker_data.handle == kNoMoreWork) { break; } + if (threadIdx.x == 0) { worker_handle.store({kWaitForWork}, cuda::memory_order_relaxed); } // reading phase - auto* result_indices_ptr = work_descriptor.value.result_indices_ptr; - auto* result_distances_ptr = work_descriptor.value.result_distances_ptr; - auto* queries_ptr = 
work_descriptor.value.queries_ptr; - auto top_k = work_descriptor.value.top_k; - auto n_queries = work_descriptor.value.n_queries; - auto query_id = work_index.value.query_id; + auto* result_indices_ptr = job_descriptor.value.result_indices_ptr; + auto* result_distances_ptr = job_descriptor.value.result_distances_ptr; + auto* queries_ptr = job_descriptor.value.queries_ptr; + auto top_k = job_descriptor.value.top_k; + auto n_queries = job_descriptor.value.n_queries; + auto query_id = worker_data.value.query_id; // work phase search_core= n_queries) { // we may need a memory fence here: // - device - if the queries are accessed by the device // - system - e.g. if we put them into managed/pinned memory. - work_descriptors[work_ix].completion_flag.store(true, cuda::memory_order_relaxed); + job_descriptors[job_ix].completion_flag.store(true, cuda::memory_order_relaxed); } } } @@ -1178,14 +1184,21 @@ struct search_kernel_config { } }; +constexpr uint32_t kMaxJobsNum = 1024; +constexpr uint32_t kMaxWorkersNum = 1024; + struct persistent_runner_base_t { - using work_queue_type = atomic_queue::AtomicQueue; - rmm::mr::pinned_host_memory_resource work_handles_mr; - rmm::mr::pinned_host_memory_resource work_descriptor_mr; + using job_queue_type = + atomic_queue::AtomicQueue; + using worker_queue_type = + atomic_queue::AtomicQueue; + rmm::mr::pinned_host_memory_resource worker_handles_mr; + rmm::mr::pinned_host_memory_resource job_descriptor_mr; rmm::mr::cuda_memory_resource device_mr; cudaStream_t stream{}; - work_queue_type queue{}; - persistent_runner_base_t() : queue() + job_queue_type job_queue{}; + worker_queue_type worker_queue{}; + persistent_runner_base_t() : job_queue(), worker_queue() { cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); } @@ -1193,34 +1206,34 @@ struct persistent_runner_base_t { }; struct alignas(kCacheLineBytes) launcher_t { - using work_queue_type = persistent_runner_base_t::work_queue_type; + using job_queue_type = persistent_runner_base_t::job_queue_type; + using worker_queue_type = persistent_runner_base_t::worker_queue_type; using pending_reads_queue_type = atomic_queue::AtomicQueue; using completion_flag_type = cuda::atomic; pending_reads_queue_type pending_reads{}; - work_queue_type& worker_ids; - work_handle_t* work_handles; - uint32_t lead_worker_id; + job_queue_type& job_ids; + worker_queue_type& idle_worker_ids; + worker_handle_t* worker_handles; + uint32_t job_id; completion_flag_type* completion_flag; bool all_done = false; template - launcher_t(work_queue_type& worker_ids, - work_handle_t* work_handles, + launcher_t(job_queue_type& job_ids, + worker_queue_type& idle_worker_ids, + worker_handle_t* worker_handles, uint32_t n_queries, RecordWork record_work) - : worker_ids{worker_ids}, - work_handles{work_handles}, - lead_worker_id{worker_ids.pop()}, - completion_flag{record_work(lead_worker_id)} + : job_ids{job_ids}, + idle_worker_ids{idle_worker_ids}, + worker_handles{worker_handles}, + job_id{job_ids.pop()}, + completion_flag{record_work(job_id)} { - // The first worker is special: one may associate a work_descriptor and the completion_flag - // with the same id. Hence it bypassed the `pending_reads` queue and is only released at the - // very end. 
- submit_query(lead_worker_id, 0, true); // Submit all queries in the batch - for (uint32_t i = 1; i < n_queries; i++) { + for (uint32_t i = 0; i < n_queries; i++) { uint32_t worker_id; while (!try_get_worker(worker_id)) { if (pending_reads.try_pop(worker_id)) { @@ -1231,15 +1244,13 @@ struct alignas(kCacheLineBytes) launcher_t { } } - void submit_query(uint32_t worker_id, uint32_t query_id, bool first_to_submit = false) + void submit_query(uint32_t worker_id, uint32_t query_id) { - work_handles[worker_id].data.store( - work_handle_t::data_t{.value = {lead_worker_id, query_id}}, - first_to_submit ? cuda::memory_order_release : cuda::memory_order_relaxed); + worker_handles[worker_id].data.store(worker_handle_t::data_t{.value = {job_id, query_id}}, + cuda::memory_order_relaxed); - if (first_to_submit) { return; } while (!pending_reads.try_push(worker_id)) { - // The only reason pending_reads cannot push is that the queue is full. + // The only reason pending_reads cannot push is that the job_queue is full. // It's local, so we must pop and wait for the returned worker to finish its work. auto pending_worker_id = pending_reads.pop(); while (!try_return_worker(pending_worker_id)) { @@ -1253,8 +1264,8 @@ struct alignas(kCacheLineBytes) launcher_t { { // Use the cached `all_done` - makes sense when called from the `wait()` routine. if (all_done || - work_handles[worker_id].data.load(cuda::memory_order_relaxed).handle == kWaitForWork) { - worker_ids.push(worker_id); + !is_worker_busy(worker_handles[worker_id].data.load(cuda::memory_order_relaxed).handle)) { + idle_worker_ids.push(worker_id); return true; } else { return false; @@ -1262,7 +1273,7 @@ struct alignas(kCacheLineBytes) launcher_t { } /** Try get a free worker if any. */ - auto try_get_worker(uint32_t& worker_id) -> bool { return worker_ids.try_pop(worker_id); } + auto try_get_worker(uint32_t& worker_id) -> bool { return idle_worker_ids.try_pop(worker_id); } /** Check if all workers finished their work. 
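   * The completion flag for this job is reset by the launcher when the job is submitted and
   * is set on the GPU once the job's completion counter reaches n_queries.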
*/ auto is_all_done() @@ -1290,8 +1301,8 @@ struct alignas(kCacheLineBytes) launcher_t { std::this_thread::yield(); } - // lead_worker_id is reused for the handles, so we can only return it at the end - try_return_worker(lead_worker_id); + // Return the job descriptor + job_ids.push(job_id); } }; @@ -1305,12 +1316,12 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b using data_type = typename DATASET_DESCRIPTOR_T::DATA_T; using kernel_config_type = search_kernel_config; - using kernel_type = typename kernel_config_type::kernel_t; - using work_desc_type = work_desc_t; + using kernel_type = typename kernel_config_type::kernel_t; + using job_desc_type = job_desc_t; kernel_type kernel; uint32_t block_size; - rmm::device_uvector work_handles; - rmm::device_uvector work_descriptors; + rmm::device_uvector worker_handles; + rmm::device_uvector job_descriptors; rmm::device_uvector completion_counters; rmm::device_uvector hashmap; std::atomic> last_touch; @@ -1336,9 +1347,9 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b kernel{kernel_config_type::choose_itopk_and_mx_candidates( itopk_size, num_itopk_candidates, block_size)}, block_size{block_size}, - work_handles(0, stream, work_handles_mr), - work_descriptors(0, stream, work_descriptor_mr), - completion_counters(0, stream, device_mr), + worker_handles(0, stream, worker_handles_mr), + job_descriptors(kMaxJobsNum, stream, job_descriptor_mr), + completion_counters(kMaxJobsNum, stream, device_mr), hashmap(0, stream, device_mr) { // set kernel attributes same as in normal kernel @@ -1351,30 +1362,32 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b RAFT_LOG_DEBUG( "Launching persistent kernel with %u threads, %u block %u smem", bs.x, gs.y, smem_size); - // initialize the work queue - completion_counters.resize(gs.y, stream); + // initialize the job queue auto* completion_counters_ptr = completion_counters.data(); - work_descriptors.resize(gs.y, stream); - auto* work_descriptors_ptr = work_descriptors.data(); - - index_type* hashmap_ptr = nullptr; - if (small_hash_bitlen == 0) { hashmap.resize(gs.y * hashmap::get_size(hash_bitlen), stream); } + auto* job_descriptors_ptr = job_descriptors.data(); + for (uint32_t i = 0; i < kMaxJobsNum; i++) { + auto& jd = job_descriptors_ptr[i].input.value; + jd.result_indices_ptr = nullptr; + jd.result_distances_ptr = nullptr; + jd.queries_ptr = nullptr; + jd.top_k = 0; + jd.n_queries = 0; + job_descriptors_ptr[i].completion_flag.store(false); + job_queue.push(i); + } - work_handles.resize(gs.y, stream); - auto* work_handles_ptr = work_handles.data(); + // initialize the worker queue + worker_handles.resize(gs.y, stream); + auto* worker_handles_ptr = worker_handles.data(); RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); for (uint32_t i = 0; i < gs.y; i++) { - auto& wd = work_descriptors_ptr[i].input.value; - wd.result_indices_ptr = nullptr; - wd.result_distances_ptr = nullptr; - wd.queries_ptr = nullptr; - wd.top_k = 0; - wd.n_queries = 0; - work_descriptors_ptr[i].completion_flag.store(false); - work_handles_ptr[i].data.store({kWaitForWork}); - queue.push(i); + worker_handles_ptr[i].data.store({kWaitForWork}); + worker_queue.push(i); } + index_type* hashmap_ptr = nullptr; + if (small_hash_bitlen == 0) { hashmap.resize(gs.y * hashmap::get_size(hash_bitlen), stream); } + // launch the kernel auto* graph_ptr = graph.data_handle(); uint32_t graph_degree = graph.extent(1); @@ -1383,8 +1396,8 @@ struct 
alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b void* args[] = // NOLINT {&dataset_desc, - &work_handles_ptr, - &work_descriptors_ptr, + &worker_handles_ptr, + &job_descriptors_ptr, &completion_counters_ptr, &graph_ptr, // [dataset_size, graph_degree] &graph_degree, @@ -1405,19 +1418,21 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b &metric}; RAFT_CUDA_TRY(cudaLaunchCooperativeKernel>( kernel, gs, bs, args, smem_size, stream)); - RAFT_LOG_INFO("Initialized the kernel in stream %zd, queue size = %u", - int64_t((cudaStream_t)stream), - queue.was_size()); + RAFT_LOG_INFO( + "Initialized the kernel in stream %zd; job_queue size = %u; worker_queue size = %u", + int64_t((cudaStream_t)stream), + job_queue.was_size(), + worker_queue.was_size()); } ~persistent_runner_t() noexcept override { - auto whs = work_handles.data(); - auto whl = work_handles.size(); + auto whs = worker_handles.data(); + auto whl = worker_handles.size(); uint32_t worker_id = 0; auto count = whl; // wait for all the jobs to finish nicely - while (queue.try_pop(worker_id)) { + while (job_queue.try_pop(worker_id)) { whs[worker_id].data.store({kNoMoreWork}, cuda::memory_order_relaxed); count--; } @@ -1437,17 +1452,19 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b uint32_t top_k) { // submit all queries - launcher_t launcher{queue, work_handles.data(), num_queries, [=](uint32_t worker_id) { - auto& wd = work_descriptors.data()[worker_id].input.value; - auto cflag = &work_descriptors.data()[worker_id].completion_flag; - wd.result_indices_ptr = result_indices_ptr; - wd.result_distances_ptr = result_distances_ptr; - wd.queries_ptr = queries_ptr; - wd.top_k = top_k; - wd.n_queries = num_queries; - cflag->store(false, cuda::memory_order_relaxed); - return cflag; - }}; + launcher_t launcher{ + job_queue, worker_queue, worker_handles.data(), num_queries, [=](uint32_t job_ix) { + auto& jd = job_descriptors.data()[job_ix].input.value; + auto cflag = &job_descriptors.data()[job_ix].completion_flag; + jd.result_indices_ptr = result_indices_ptr; + jd.result_distances_ptr = result_distances_ptr; + jd.queries_ptr = queries_ptr; + jd.top_k = top_k; + jd.n_queries = num_queries; + cflag->store(false, cuda::memory_order_relaxed); + cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_system); + return cflag; + }}; // update the keep-alive atomic in the meanwhile last_touch.store(std::chrono::system_clock::now(), std::memory_order_relaxed); // wait for the results to arrive @@ -1466,9 +1483,16 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b int ctas_per_sm = 1; cudaOccupancyMaxActiveBlocksPerMultiprocessor( &ctas_per_sm, kernel, block_size, smem_size); - int num_sm = getMultiProcessorCount(); + int num_sm = getMultiProcessorCount(); + auto n_blocks = static_cast(kDeviceUsage * (ctas_per_sm * num_sm)); + if (n_blocks > kMaxWorkersNum) { + RAFT_LOG_WARN("Limiting the grid size limit due to the size of the queue: %u -> %u", + n_blocks, + kMaxWorkersNum); + n_blocks = kMaxWorkersNum; + } - return {1, static_cast(kDeviceUsage * (ctas_per_sm * num_sm)), 1}; + return {1, n_blocks, 1}; } }; From 83355ab578fd6b78e8aff958d881df0263af255f Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 24 Apr 2024 20:12:14 +0200 Subject: [PATCH 16/37] Add the third-party atomic_queue headers for easier testing --- cpp/include/atomic_queue/LICENSE | 21 ++ cpp/include/atomic_queue/atomic_queue.h | 457 
++++++++++++++++++++++++ cpp/include/atomic_queue/defs.h | 99 +++++ 3 files changed, 577 insertions(+) create mode 100644 cpp/include/atomic_queue/LICENSE create mode 100644 cpp/include/atomic_queue/atomic_queue.h create mode 100644 cpp/include/atomic_queue/defs.h diff --git a/cpp/include/atomic_queue/LICENSE b/cpp/include/atomic_queue/LICENSE new file mode 100644 index 0000000000..c1d3466926 --- /dev/null +++ b/cpp/include/atomic_queue/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Maxim Egorushkin + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/cpp/include/atomic_queue/atomic_queue.h b/cpp/include/atomic_queue/atomic_queue.h new file mode 100644 index 0000000000..93c237da4c --- /dev/null +++ b/cpp/include/atomic_queue/atomic_queue.h @@ -0,0 +1,457 @@ +/* -*- mode: c++; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 4 -*- */ +#ifndef ATOMIC_QUEUE_ATOMIC_QUEUE_H_INCLUDED +#define ATOMIC_QUEUE_ATOMIC_QUEUE_H_INCLUDED + +// Copyright (c) 2019 Maxim Egorushkin. MIT License. See the full licence in file LICENSE. 
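// [editor's note] This header is a copy of the third-party atomic_queue library
// (https://github.com/max0x7ba/atomic_queue). In this patch series its fixed-capacity MPMC
// queue backs the persistent runner's job-id and idle-worker-id queues. A minimal usage
// sketch with illustrative template parameters (not the ones used by the runner):
//
//   atomic_queue::AtomicQueue<uint32_t, 1024> ids;  // NIL defaults to 0 for uint32_t
//   ids.push(1u);                                   // blocking push: spins while the slot is busy
//   uint32_t id;
//   if (ids.try_pop(id)) { /* got an id */ }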
+ +#include "defs.h" + +#include +#include +#include +#include +#include +#include + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace atomic_queue { + +using std::uint32_t; +using std::uint64_t; +using std::uint8_t; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace details { + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct GetCacheLineIndexBits { + static int constexpr value = 0; +}; +template <> +struct GetCacheLineIndexBits<256> { + static int constexpr value = 8; +}; +template <> +struct GetCacheLineIndexBits<128> { + static int constexpr value = 7; +}; +template <> +struct GetCacheLineIndexBits<64> { + static int constexpr value = 6; +}; +template <> +struct GetCacheLineIndexBits<32> { + static int constexpr value = 5; +}; +template <> +struct GetCacheLineIndexBits<16> { + static int constexpr value = 4; +}; +template <> +struct GetCacheLineIndexBits<8> { + static int constexpr value = 3; +}; +template <> +struct GetCacheLineIndexBits<4> { + static int constexpr value = 2; +}; +template <> +struct GetCacheLineIndexBits<2> { + static int constexpr value = 1; +}; + +template +struct GetIndexShuffleBits { + static int constexpr bits = GetCacheLineIndexBits::value; + static unsigned constexpr min_size = 1u << (bits * 2); + static int constexpr value = array_size < min_size ? 0 : bits; +}; + +template +struct GetIndexShuffleBits { + static int constexpr value = 0; +}; + +// Multiple writers/readers contend on the same cache line when storing/loading elements at +// subsequent indexes, aka false sharing. For power of 2 ring buffer size it is possible to re-map +// the index in such a way that each subsequent element resides on another cache line, which +// minimizes contention. This is done by swapping the lowest order N bits (which are the index of +// the element within the cache line) with the next N bits (which are the index of the cache line) +// of the element index. +template +constexpr unsigned remap_index(unsigned index) noexcept +{ + unsigned constexpr mix_mask{(1u << BITS) - 1}; + unsigned const mix{(index ^ (index >> BITS)) & mix_mask}; + return index ^ mix ^ (mix << BITS); +} + +template <> +constexpr unsigned remap_index<0>(unsigned index) noexcept +{ + return index; +} + +template +constexpr T& map(T* elements, unsigned index) noexcept +{ + return elements[remap_index(index)]; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +// Implement a "bit-twiddling hack" for finding the next power of 2 in either 32 bits or 64 bits +// in C++11 compatible constexpr functions. The library no longer maintains C++11 compatibility. + +// "Runtime" version for 32 bits +// --a; +// a |= a >> 1; +// a |= a >> 2; +// a |= a >> 4; +// a |= a >> 8; +// a |= a >> 16; +// ++a; + +template +constexpr T decrement(T x) noexcept +{ + return x - 1; +} + +template +constexpr T increment(T x) noexcept +{ + return x + 1; +} + +template +constexpr T or_equal(T x, unsigned u) noexcept +{ + return x | x >> u; +} + +template +constexpr T or_equal(T x, unsigned u, Args... 
rest) noexcept +{ + return or_equal(or_equal(x, u), rest...); +} + +constexpr uint32_t round_up_to_power_of_2(uint32_t a) noexcept +{ + return increment(or_equal(decrement(a), 1, 2, 4, 8, 16)); +} + +constexpr uint64_t round_up_to_power_of_2(uint64_t a) noexcept +{ + return increment(or_equal(decrement(a), 1, 2, 4, 8, 16, 32)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +template +constexpr T nil() noexcept +{ +#if __cpp_lib_atomic_is_always_lock_free // Better compile-time error message requires C++17. + static_assert( + std::atomic::is_always_lock_free, + "Queue element type T is not atomic. Use AtomicQueue2/AtomicQueueB2 for such element types."); +#endif + return {}; +} + +template +inline void destroy_n(T* p, unsigned n) noexcept +{ + for (auto q = p + n; p != q;) + (p++)->~T(); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace details + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +template +class AtomicQueueCommon { + protected: + // Put these on different cache lines to avoid false sharing between readers and writers. + alignas(CACHE_LINE_SIZE) std::atomic head_ = {}; + alignas(CACHE_LINE_SIZE) std::atomic tail_ = {}; + + // The special member functions are not thread-safe. + + AtomicQueueCommon() noexcept = default; + + AtomicQueueCommon(AtomicQueueCommon const& b) noexcept + : head_(b.head_.load(X)), tail_(b.tail_.load(X)) + { + } + + AtomicQueueCommon& operator=(AtomicQueueCommon const& b) noexcept + { + head_.store(b.head_.load(X), X); + tail_.store(b.tail_.load(X), X); + return *this; + } + + void swap(AtomicQueueCommon& b) noexcept + { + unsigned h = head_.load(X); + unsigned t = tail_.load(X); + head_.store(b.head_.load(X), X); + tail_.store(b.tail_.load(X), X); + b.head_.store(h, X); + b.tail_.store(t, X); + } + + template + static T do_pop_atomic(std::atomic& q_element) noexcept + { + if (Derived::spsc_) { + for (;;) { + T element = q_element.load(A); + if (ATOMIC_QUEUE_LIKELY(element != NIL)) { + q_element.store(NIL, X); + return element; + } + if (Derived::maximize_throughput_) spin_loop_pause(); + } + } else { + for (;;) { + T element = q_element.exchange(NIL, A); // (2) The store to wait for. + if (ATOMIC_QUEUE_LIKELY(element != NIL)) return element; + // Do speculative loads while busy-waiting to avoid broadcasting RFO messages. + do + spin_loop_pause(); + while (Derived::maximize_throughput_ && q_element.load(X) == NIL); + } + } + } + + template + static void do_push_atomic(T element, std::atomic& q_element) noexcept + { + assert(element != NIL); + if (Derived::spsc_) { + while (ATOMIC_QUEUE_UNLIKELY(q_element.load(X) != NIL)) + if (Derived::maximize_throughput_) spin_loop_pause(); + q_element.store(element, R); + } else { + for (T expected = NIL; + ATOMIC_QUEUE_UNLIKELY(!q_element.compare_exchange_weak(expected, element, R, X)); + expected = NIL) { + do + spin_loop_pause(); // (1) Wait for store (2) to complete. 
+ while (Derived::maximize_throughput_ && q_element.load(X) != NIL); + } + } + } + + enum State : unsigned char { EMPTY, STORING, STORED, LOADING }; + + template + static T do_pop_any(std::atomic& state, T& q_element) noexcept + { + if (Derived::spsc_) { + while (ATOMIC_QUEUE_UNLIKELY(state.load(A) != STORED)) + if (Derived::maximize_throughput_) spin_loop_pause(); + T element{std::move(q_element)}; + state.store(EMPTY, R); + return element; + } else { + for (;;) { + unsigned char expected = STORED; + if (ATOMIC_QUEUE_LIKELY(state.compare_exchange_weak(expected, LOADING, A, X))) { + T element{std::move(q_element)}; + state.store(EMPTY, R); + return element; + } + // Do speculative loads while busy-waiting to avoid broadcasting RFO messages. + do + spin_loop_pause(); + while (Derived::maximize_throughput_ && state.load(X) != STORED); + } + } + } + + template + static void do_push_any(U&& element, std::atomic& state, T& q_element) noexcept + { + if (Derived::spsc_) { + while (ATOMIC_QUEUE_UNLIKELY(state.load(A) != EMPTY)) + if (Derived::maximize_throughput_) spin_loop_pause(); + q_element = std::forward(element); + state.store(STORED, R); + } else { + for (;;) { + unsigned char expected = EMPTY; + if (ATOMIC_QUEUE_LIKELY(state.compare_exchange_weak(expected, STORING, A, X))) { + q_element = std::forward(element); + state.store(STORED, R); + return; + } + // Do speculative loads while busy-waiting to avoid broadcasting RFO messages. + do + spin_loop_pause(); + while (Derived::maximize_throughput_ && state.load(X) != EMPTY); + } + } + } + + public: + template + bool try_push(T&& element) noexcept + { + auto head = head_.load(X); + if (Derived::spsc_) { + if (static_cast(head - tail_.load(X)) >= + static_cast(static_cast(*this).size_)) + return false; + head_.store(head + 1, X); + } else { + do { + if (static_cast(head - tail_.load(X)) >= + static_cast(static_cast(*this).size_)) + return false; + } while (ATOMIC_QUEUE_UNLIKELY( + !head_.compare_exchange_weak(head, head + 1, X, X))); // This loop is not FIFO. + } + + static_cast(*this).do_push(std::forward(element), head); + return true; + } + + template + bool try_pop(T& element) noexcept + { + auto tail = tail_.load(X); + if (Derived::spsc_) { + if (static_cast(head_.load(X) - tail) <= 0) return false; + tail_.store(tail + 1, X); + } else { + do { + if (static_cast(head_.load(X) - tail) <= 0) return false; + } while (ATOMIC_QUEUE_UNLIKELY( + !tail_.compare_exchange_weak(tail, tail + 1, X, X))); // This loop is not FIFO. + } + + element = static_cast(*this).do_pop(tail); + return true; + } + + template + void push(T&& element) noexcept + { + unsigned head; + if (Derived::spsc_) { + head = head_.load(X); + head_.store(head + 1, X); + } else { + constexpr auto memory_order = + Derived::total_order_ ? std::memory_order_seq_cst : std::memory_order_relaxed; + head = + head_.fetch_add(1, memory_order); // FIFO and total order on Intel regardless, as of 2019. + } + static_cast(*this).do_push(std::forward(element), head); + } + + auto pop() noexcept + { + unsigned tail; + if (Derived::spsc_) { + tail = tail_.load(X); + tail_.store(tail + 1, X); + } else { + constexpr auto memory_order = + Derived::total_order_ ? std::memory_order_seq_cst : std::memory_order_relaxed; + tail = + tail_.fetch_add(1, memory_order); // FIFO and total order on Intel regardless, as of 2019. 
+ } + return static_cast(*this).do_pop(tail); + } + + bool was_empty() const noexcept { return !was_size(); } + + bool was_full() const noexcept + { + return was_size() >= static_cast(static_cast(*this).size_); + } + + unsigned was_size() const noexcept + { + // tail_ can be greater than head_ because of consumers doing pop, rather that try_pop, when the + // queue is empty. + return std::max(static_cast(head_.load(X) - tail_.load(X)), 0); + } + + unsigned capacity() const noexcept { return static_cast(*this).size_; } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +template (), + bool MINIMIZE_CONTENTION = true, + bool MAXIMIZE_THROUGHPUT = true, + bool TOTAL_ORDER = false, + bool SPSC = false> +class AtomicQueue + : public AtomicQueueCommon< + AtomicQueue> { + using Base = AtomicQueueCommon< + AtomicQueue>; + friend Base; + + static constexpr unsigned size_ = + MINIMIZE_CONTENTION ? details::round_up_to_power_of_2(SIZE) : SIZE; + static constexpr int SHUFFLE_BITS = + details::GetIndexShuffleBits)>::value; + static constexpr bool total_order_ = TOTAL_ORDER; + static constexpr bool spsc_ = SPSC; + static constexpr bool maximize_throughput_ = MAXIMIZE_THROUGHPUT; + + alignas(CACHE_LINE_SIZE) std::atomic elements_[size_]; + + T do_pop(unsigned tail) noexcept + { + std::atomic& q_element = details::map(elements_, tail % size_); + return Base::template do_pop_atomic(q_element); + } + + void do_push(T element, unsigned head) noexcept + { + std::atomic& q_element = details::map(elements_, head % size_); + Base::template do_push_atomic(element, q_element); + } + + public: + using value_type = T; + + AtomicQueue() noexcept + { + assert( + std::atomic{NIL}.is_lock_free()); // Queue element type T is not atomic. Use + // AtomicQueue2/AtomicQueueB2 for such element types. + for (auto p = elements_, q = elements_ + size_; p != q; ++p) + p->store(NIL, X); + } + + AtomicQueue(AtomicQueue const&) = delete; + AtomicQueue& operator=(AtomicQueue const&) = delete; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace atomic_queue + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // ATOMIC_QUEUE_ATOMIC_QUEUE_H_INCLUDED diff --git a/cpp/include/atomic_queue/defs.h b/cpp/include/atomic_queue/defs.h new file mode 100644 index 0000000000..4601b1d46f --- /dev/null +++ b/cpp/include/atomic_queue/defs.h @@ -0,0 +1,99 @@ +/* -*- mode: c++; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 4 -*- */ +#ifndef ATOMIC_QUEUE_DEFS_H_INCLUDED +#define ATOMIC_QUEUE_DEFS_H_INCLUDED + +// Copyright (c) 2019 Maxim Egorushkin. MIT License. See the full licence in file LICENSE. 
+ +#include + +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86) +#include +namespace atomic_queue { +constexpr int CACHE_LINE_SIZE = 64; +static inline void spin_loop_pause() noexcept { + _mm_pause(); +} +} // namespace atomic_queue +#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM64) +namespace atomic_queue { +constexpr int CACHE_LINE_SIZE = 64; +static inline void spin_loop_pause() noexcept { +#if (defined(__ARM_ARCH_6K__) || \ + defined(__ARM_ARCH_6Z__) || \ + defined(__ARM_ARCH_6ZK__) || \ + defined(__ARM_ARCH_6T2__) || \ + defined(__ARM_ARCH_7__) || \ + defined(__ARM_ARCH_7A__) || \ + defined(__ARM_ARCH_7R__) || \ + defined(__ARM_ARCH_7M__) || \ + defined(__ARM_ARCH_7S__) || \ + defined(__ARM_ARCH_8A__) || \ + defined(__aarch64__)) + asm volatile ("yield" ::: "memory"); +#elif defined(_M_ARM64) + __yield(); +#else + asm volatile ("nop" ::: "memory"); +#endif +} +} // namespace atomic_queue +#elif defined(__ppc64__) || defined(__powerpc64__) +namespace atomic_queue { +constexpr int CACHE_LINE_SIZE = 128; // TODO: Review that this is the correct value. +static inline void spin_loop_pause() noexcept { + asm volatile("or 31,31,31 # very low priority"); // TODO: Review and benchmark that this is the right instruction. +} +} // namespace atomic_queue +#elif defined(__s390x__) +namespace atomic_queue { +constexpr int CACHE_LINE_SIZE = 256; // TODO: Review that this is the correct value. +static inline void spin_loop_pause() noexcept {} // TODO: Find the right instruction to use here, if any. +} // namespace atomic_queue +#elif defined(__riscv) +namespace atomic_queue { +constexpr int CACHE_LINE_SIZE = 64; +static inline void spin_loop_pause() noexcept { + asm volatile (".insn i 0x0F, 0, x0, x0, 0x010"); +} +} // namespace atomic_queue +#else +#ifdef _MSC_VER +#pragma message("Unknown CPU architecture. Using L1 cache line size of 64 bytes and no spinloop pause instruction.") +#else +#warning "Unknown CPU architecture. Using L1 cache line size of 64 bytes and no spinloop pause instruction." +#endif +namespace atomic_queue { +constexpr int CACHE_LINE_SIZE = 64; // TODO: Review that this is the correct value. 
+static inline void spin_loop_pause() noexcept {} +} // namespace atomic_queue +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace atomic_queue { + +#if defined(__GNUC__) || defined(__clang__) +#define ATOMIC_QUEUE_LIKELY(expr) __builtin_expect(static_cast(expr), 1) +#define ATOMIC_QUEUE_UNLIKELY(expr) __builtin_expect(static_cast(expr), 0) +#define ATOMIC_QUEUE_NOINLINE __attribute__((noinline)) +#else +#define ATOMIC_QUEUE_LIKELY(expr) (expr) +#define ATOMIC_QUEUE_UNLIKELY(expr) (expr) +#define ATOMIC_QUEUE_NOINLINE +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +auto constexpr A = std::memory_order_acquire; +auto constexpr R = std::memory_order_release; +auto constexpr X = std::memory_order_relaxed; +auto constexpr C = std::memory_order_seq_cst; +auto constexpr AR = std::memory_order_acq_rel; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace atomic_queue + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // ATOMIC_QUEUE_DEFS_H_INCLUDED From f177a81487f09b2f77c244612d5ee3bf4772eb18 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 25 Apr 2024 09:55:53 +0200 Subject: [PATCH 17/37] Tweak the CPU waiting behavior to avoid busy-spinning --- .../detail/cagra/search_single_cta_kernel-inl.cuh | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index ae990d83e6..c5bd718f34 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -1184,8 +1184,10 @@ struct search_kernel_config { } }; -constexpr uint32_t kMaxJobsNum = 1024; -constexpr uint32_t kMaxWorkersNum = 1024; +constexpr uint32_t kMaxJobsNum = 2048; +constexpr uint32_t kMaxWorkersNum = 2048; +constexpr uint32_t kMaxWorkersPerThread = 256; +constexpr uint32_t kSoftMaxWorkersPerThread = 16; struct persistent_runner_base_t { using job_queue_type = @@ -1209,7 +1211,7 @@ struct alignas(kCacheLineBytes) launcher_t { using job_queue_type = persistent_runner_base_t::job_queue_type; using worker_queue_type = persistent_runner_base_t::worker_queue_type; using pending_reads_queue_type = - atomic_queue::AtomicQueue; + atomic_queue::AtomicQueue; using completion_flag_type = cuda::atomic; pending_reads_queue_type pending_reads{}; @@ -1237,10 +1239,17 @@ struct alignas(kCacheLineBytes) launcher_t { uint32_t worker_id; while (!try_get_worker(worker_id)) { if (pending_reads.try_pop(worker_id)) { + // TODO optimization: avoid the roundtrip through pending_worker_ids if (!try_return_worker(worker_id)) { pending_reads.push(worker_id); } + } else { + std::this_thread::yield(); } } submit_query(worker_id, i); + // Try to not hold too many workers in one thread + if (i >= kSoftMaxWorkersPerThread && pending_reads.try_pop(worker_id)) { + if (!try_return_worker(worker_id)) { pending_reads.push(worker_id); } + } } } From db5b00251dbc78951b6f4095e16f3a3181f54f8c Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 25 Apr 2024 10:43:23 +0200 Subject: [PATCH 18/37] Add a single-threaded deque for 
pending_reads to reduce the cpu/cache load --- .../cagra/search_single_cta_kernel-inl.cuh | 85 ++++++++++++++++--- 1 file changed, 71 insertions(+), 14 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index c5bd718f34..bccfd6c6ce 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -1184,6 +1184,63 @@ struct search_kernel_config { } }; +/** Primitive fixed-size deque for single-threaded use. */ +template +struct local_deque_t { + explicit local_deque_t(uint32_t size) : store_(size) {} + + [[nodiscard]] auto capacity() const -> uint32_t { return store_.size(); } + [[nodiscard]] auto size() const -> uint32_t { return end_ - start_; } + + void push_back(T x) { store_[end_++ % capacity()] = x; } + + void push_front(T x) + { + if (start_ == 0) { + start_ += capacity(); + end_ += capacity(); + } + store_[--start_ % capacity()] = x; + } + + // NB: non-blocking, unsafe functions + auto pop_back() -> T { return store_[--end_ % capacity()]; } + auto pop_front() -> T { return store_[start_++ % capacity()]; } + + auto try_push_back(T x) -> bool + { + if (size() >= capacity()) { return false; } + push_back(x); + return true; + } + + auto try_push_front(T x) -> bool + { + if (size() >= capacity()) { return false; } + push_front(x); + return true; + } + + auto try_pop_back(T& x) -> bool + { + if (start_ >= end_) { return false; } + x = pop_back(); + return true; + } + + auto try_pop_front(T& x) -> bool + { + if (start_ >= end_) { return false; } + x = pop_front(); + return true; + } + + private: + std::vector store_; + uint32_t start_{0}; + uint32_t end_{0}; +}; + constexpr uint32_t kMaxJobsNum = 2048; constexpr uint32_t kMaxWorkersNum = 2048; constexpr uint32_t kMaxWorkersPerThread = 256; @@ -1208,13 +1265,12 @@ struct persistent_runner_base_t { }; struct alignas(kCacheLineBytes) launcher_t { - using job_queue_type = persistent_runner_base_t::job_queue_type; - using worker_queue_type = persistent_runner_base_t::worker_queue_type; - using pending_reads_queue_type = - atomic_queue::AtomicQueue; - using completion_flag_type = cuda::atomic; + using job_queue_type = persistent_runner_base_t::job_queue_type; + using worker_queue_type = persistent_runner_base_t::worker_queue_type; + using pending_reads_queue_type = local_deque_t; + using completion_flag_type = cuda::atomic; - pending_reads_queue_type pending_reads{}; + pending_reads_queue_type pending_reads; job_queue_type& job_ids; worker_queue_type& idle_worker_ids; worker_handle_t* worker_handles; @@ -1228,7 +1284,8 @@ struct alignas(kCacheLineBytes) launcher_t { worker_handle_t* worker_handles, uint32_t n_queries, RecordWork record_work) - : job_ids{job_ids}, + : pending_reads{std::min(n_queries, kMaxWorkersPerThread)}, + job_ids{job_ids}, idle_worker_ids{idle_worker_ids}, worker_handles{worker_handles}, job_id{job_ids.pop()}, @@ -1238,17 +1295,17 @@ struct alignas(kCacheLineBytes) launcher_t { for (uint32_t i = 0; i < n_queries; i++) { uint32_t worker_id; while (!try_get_worker(worker_id)) { - if (pending_reads.try_pop(worker_id)) { + if (pending_reads.try_pop_front(worker_id)) { // TODO optimization: avoid the roundtrip through pending_worker_ids - if (!try_return_worker(worker_id)) { pending_reads.push(worker_id); } + if (!try_return_worker(worker_id)) { pending_reads.push_front(worker_id); } } else { 
std::this_thread::yield(); } } submit_query(worker_id, i); // Try to not hold too many workers in one thread - if (i >= kSoftMaxWorkersPerThread && pending_reads.try_pop(worker_id)) { - if (!try_return_worker(worker_id)) { pending_reads.push(worker_id); } + if (i >= kSoftMaxWorkersPerThread && pending_reads.try_pop_front(worker_id)) { + if (!try_return_worker(worker_id)) { pending_reads.push_front(worker_id); } } } } @@ -1258,10 +1315,10 @@ struct alignas(kCacheLineBytes) launcher_t { worker_handles[worker_id].data.store(worker_handle_t::data_t{.value = {job_id, query_id}}, cuda::memory_order_relaxed); - while (!pending_reads.try_push(worker_id)) { + while (!pending_reads.try_push_back(worker_id)) { // The only reason pending_reads cannot push is that the job_queue is full. // It's local, so we must pop and wait for the returned worker to finish its work. - auto pending_worker_id = pending_reads.pop(); + auto pending_worker_id = pending_reads.pop_front(); while (!try_return_worker(pending_worker_id)) { std::this_thread::yield(); } @@ -1297,7 +1354,7 @@ struct alignas(kCacheLineBytes) launcher_t { void wait() { uint32_t worker_id; - while (pending_reads.try_pop(worker_id)) { + while (pending_reads.try_pop_front(worker_id)) { while (!try_return_worker(worker_id)) { if (!is_all_done()) { std::this_thread::yield(); } } From c7481600d957f2f1a6d5a86a41377497651cb7f4 Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 26 Apr 2024 08:55:34 +0200 Subject: [PATCH 19/37] ann_bench: minimize chances for GPU sync between benchmark cases --- cpp/bench/ann/src/common/benchmark.hpp | 12 +++++----- cpp/bench/ann/src/common/util.hpp | 18 +++++++++------ cpp/bench/ann/src/raft/raft_ann_bench_utils.h | 22 ++++++++++++------- 3 files changed, 30 insertions(+), 22 deletions(-) diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index ad392d53a2..408dc9f825 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -280,10 +280,8 @@ void bench_search(::benchmark::State& state, /** * Each thread will manage its own outputs */ - auto distances = - std::make_shared>(current_algo_props->query_memory_type, k * query_set_size); - auto neighbors = std::make_shared>(current_algo_props->query_memory_type, - k * query_set_size); + buf distances{current_algo_props->query_memory_type, k * query_set_size}; + buf neighbors{current_algo_props->query_memory_type, k * query_set_size}; { nvtx_case nvtx{state.name()}; @@ -305,8 +303,8 @@ void bench_search(::benchmark::State& state, algo->search(query_set + batch_offset * dataset->dim(), n_queries, k, - neighbors->data + out_offset * k, - distances->data + out_offset * k); + neighbors.data + out_offset * k, + distances.data + out_offset * k); } catch (const std::exception& e) { state.SkipWithError("Benchmark loop: " + std::string(e.what())); break; @@ -340,7 +338,7 @@ void bench_search(::benchmark::State& state, if (dataset->max_k() >= k) { const std::int32_t* gt = dataset->gt_set(); const std::uint32_t max_k = dataset->max_k(); - buf neighbors_host = neighbors->move(MemoryType::Host); + buf neighbors_host = neighbors.move(MemoryType::Host); std::size_t rows = std::min(queries_processed, query_set_size); std::size_t match_count = 0; std::size_t total_count = rows * static_cast(k); diff --git a/cpp/bench/ann/src/common/util.hpp b/cpp/bench/ann/src/common/util.hpp index 614b4ee3bd..e3ca1c5273 100644 --- a/cpp/bench/ann/src/common/util.hpp +++ b/cpp/bench/ann/src/common/util.hpp @@ -58,18 +58,20 @@ inline 
thread_local int benchmark_n_threads = 1; template struct buf { + cudaStream_t stream; MemoryType memory_type; std::size_t size; T* data; buf(MemoryType memory_type, std::size_t size) - : memory_type(memory_type), size(size), data(nullptr) + : stream(nullptr), memory_type(memory_type), size(size), data(nullptr) { switch (memory_type) { #ifndef BUILD_CPU_ONLY case MemoryType::Device: { - cudaMallocAsync(reinterpret_cast(&data), size * sizeof(T), cudaStreamPerThread); - cudaMemsetAsync(data, 0, size * sizeof(T), cudaStreamPerThread); - cudaStreamSynchronize(cudaStreamPerThread); + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + cudaMallocAsync(reinterpret_cast(&data), size * sizeof(T), stream); + cudaMemsetAsync(data, 0, size * sizeof(T), stream); + cudaStreamSynchronize(stream); } break; #endif default: { @@ -84,7 +86,9 @@ struct buf { switch (memory_type) { #ifndef BUILD_CPU_ONLY case MemoryType::Device: { - cudaFree(data); + cudaFreeAsync(data, stream); + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); } break; #endif default: { @@ -99,8 +103,8 @@ struct buf { #ifndef BUILD_CPU_ONLY if ((memory_type == MemoryType::Device && target_memory_type != MemoryType::Device) || (memory_type != MemoryType::Device && target_memory_type == MemoryType::Device)) { - cudaMemcpyAsync(r.data, data, size * sizeof(T), cudaMemcpyDefault, cudaStreamPerThread); - cudaStreamSynchronize(cudaStreamPerThread); + cudaMemcpyAsync(r.data, data, size * sizeof(T), cudaMemcpyDefault, stream); + cudaStreamSynchronize(stream); return r; } #endif diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h index 72a2c0bb05..6cadb26736 100644 --- a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h +++ b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h @@ -122,7 +122,9 @@ class configured_raft_resources { * It's used by the copy constructor. */ explicit configured_raft_resources(const std::shared_ptr& shared_res) - : shared_res_{shared_res}, res_{rmm::cuda_stream_view(get_stream_from_global_pool())} + : shared_res_{shared_res}, + res_{std::make_unique( + rmm::cuda_stream_view(get_stream_from_global_pool()))} { } @@ -131,9 +133,9 @@ class configured_raft_resources { { } - configured_raft_resources(configured_raft_resources&&) = delete; - configured_raft_resources& operator=(configured_raft_resources&&) = delete; - ~configured_raft_resources() = default; + configured_raft_resources(configured_raft_resources&&); + configured_raft_resources& operator=(configured_raft_resources&&); + ~configured_raft_resources() = default; configured_raft_resources(const configured_raft_resources& res) : configured_raft_resources{res.shared_res_} { @@ -144,11 +146,11 @@ class configured_raft_resources { return *this; } - operator raft::resources&() noexcept { return res_; } - operator const raft::resources&() const noexcept { return res_; } + operator raft::resources&() noexcept { return *res_; } + operator const raft::resources&() const noexcept { return *res_; } /** Get the main stream */ - [[nodiscard]] auto get_sync_stream() const noexcept { return resource::get_cuda_stream(res_); } + [[nodiscard]] auto get_sync_stream() const noexcept { return resource::get_cuda_stream(*res_); } private: /** The resources shared among multiple raft handles / threads. */ @@ -157,7 +159,11 @@ class configured_raft_resources { * Until we make the use of copies of raft::resources thread-safe, each benchmark wrapper must * have its own copy of it. 
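   * The object is held through a unique_ptr so that the defaulted move constructor and move
   * assignment declared above stay well-formed without moving the device_resources instance
   * itself.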
*/ - raft::device_resources res_; + std::unique_ptr res_ = std::make_unique(); }; +inline configured_raft_resources::configured_raft_resources(configured_raft_resources&&) = default; +inline configured_raft_resources& configured_raft_resources::operator=( + configured_raft_resources&&) = default; + } // namespace raft::bench::ann From d51729c8dd0f6944f5d610234fc88ed5622dd0bc Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 26 Apr 2024 09:43:06 +0200 Subject: [PATCH 20/37] Fix OOB bugs revealed on GH --- .../cagra/search_single_cta_kernel-inl.cuh | 54 +++++++++---------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index bccfd6c6ce..3e8a2a735f 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -575,7 +575,7 @@ __device__ void search_core( if (small_hash_bitlen) { local_visited_hashmap_ptr = visited_hash_buffer; } else { - local_visited_hashmap_ptr = visited_hashmap_ptr + (hashmap::get_size(hash_bitlen) * gridDim.y); + local_visited_hashmap_ptr = visited_hashmap_ptr + (hashmap::get_size(hash_bitlen) * blockIdx.y); } hashmap::init(local_visited_hashmap_ptr, hash_bitlen, 0); __syncthreads(); @@ -895,6 +895,11 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( // This makes sync atomic significantly faster. constexpr size_t kCacheLineBytes = 64; +constexpr uint32_t kMaxJobsNum = 2048; +constexpr uint32_t kMaxWorkersNum = 2048; +constexpr uint32_t kMaxWorkersPerThread = 256; +constexpr uint32_t kSoftMaxWorkersPerThread = 16; + template struct alignas(kCacheLineBytes) job_desc_t { using index_type = typename DATASET_DESCRIPTOR_T::INDEX_T; @@ -997,7 +1002,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel_p( // Sync one warp and copy descriptor data static_assert(job_desc_type::kBlobSize <= WarpSize); job_ix = raft::shfl(job_ix, 0); - if (threadIdx.x < job_desc_type::kBlobSize) { + if (threadIdx.x < job_desc_type::kBlobSize && job_ix < kMaxJobsNum) { job_descriptor.blob[threadIdx.x] = job_descriptors[job_ix].input.blob[threadIdx.x]; } } @@ -1241,11 +1246,6 @@ struct local_deque_t { uint32_t end_{0}; }; -constexpr uint32_t kMaxJobsNum = 2048; -constexpr uint32_t kMaxWorkersNum = 2048; -constexpr uint32_t kMaxWorkersPerThread = 256; -constexpr uint32_t kSoftMaxWorkersPerThread = 16; - struct persistent_runner_base_t { using job_queue_type = atomic_queue::AtomicQueue; @@ -1390,7 +1390,7 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b rmm::device_uvector job_descriptors; rmm::device_uvector completion_counters; rmm::device_uvector hashmap; - std::atomic> last_touch; + std::atomic heartbeat; persistent_runner_t(DATASET_DESCRIPTOR_T dataset_desc, raft::device_matrix_view graph, @@ -1452,7 +1452,10 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b } index_type* hashmap_ptr = nullptr; - if (small_hash_bitlen == 0) { hashmap.resize(gs.y * hashmap::get_size(hash_bitlen), stream); } + if (small_hash_bitlen == 0) { + hashmap.resize(gs.y * hashmap::get_size(hash_bitlen), stream); + hashmap_ptr = hashmap.data(); + } // launch the kernel auto* graph_ptr = graph.data_handle(); @@ -1493,21 +1496,11 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b ~persistent_runner_t() noexcept override { - auto whs = 
worker_handles.data(); - auto whl = worker_handles.size(); - uint32_t worker_id = 0; - auto count = whl; - // wait for all the jobs to finish nicely - while (job_queue.try_pop(worker_id)) { - whs[worker_id].data.store({kNoMoreWork}, cuda::memory_order_relaxed); - count--; - } - if (count > 0) { - // try to kill stuck threads if any - for (uint32_t i = 0; i < whl; i++) { - whs[i].data.store({kNoMoreWork}, cuda::memory_order_relaxed); - } + auto whs = worker_handles.data(); + for (auto i = worker_handles.size(); i > 0; i--) { + whs[worker_queue.pop()].data.store({kNoMoreWork}, cuda::memory_order_relaxed); } + RAFT_CUDA_TRY_NO_THROW(cudaStreamSynchronize(stream)); RAFT_LOG_INFO("Destroyed the persistent runner."); } @@ -1532,7 +1525,7 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b return cflag; }}; // update the keep-alive atomic in the meanwhile - last_touch.store(std::chrono::system_clock::now(), std::memory_order_relaxed); + heartbeat.fetch_add(1, std::memory_order_relaxed); // wait for the results to arrive launcher.wait(); } @@ -1586,8 +1579,7 @@ auto create_runner(Args... args) -> std::shared_ptr // it's ok.. pass runner_outer = std::dynamic_pointer_cast( std::atomic_load_explicit(&persistent.runner, std::memory_order_relaxed)); if (runner_outer) { - runner_outer->last_touch.store(std::chrono::system_clock::now(), - std::memory_order_relaxed); + runner_outer->heartbeat.fetch_add(1, std::memory_order_relaxed); ready.test_and_set(cuda::std::memory_order_release); ready.notify_one(); return; @@ -1598,32 +1590,34 @@ auto create_runner(Args... args) -> std::shared_ptr // it's ok.. pass std::memory_order_relaxed); runner_outer = std::make_shared(thread_args...); runner_weak = runner_outer; + runner_outer->heartbeat.store(1, std::memory_order_relaxed); std::atomic_store_explicit(&persistent.runner, std::static_pointer_cast(runner_outer), std::memory_order_relaxed); - runner_outer->last_touch.store(std::chrono::system_clock::now(), std::memory_order_relaxed); ready.test_and_set(cuda::std::memory_order_release); ready.notify_one(); } constexpr auto kInterval = std::chrono::milliseconds(500); + size_t last_beat = 0; while (true) { std::this_thread::sleep_for(kInterval); auto runner = runner_weak.lock(); // runner_weak is local - thread-safe if (!runner) { return; // dead already } - if (runner->last_touch.load(std::memory_order_relaxed) + kInterval < - std::chrono::system_clock::now()) { + size_t this_beat = runner->heartbeat.load(std::memory_order_relaxed); + if (this_beat == last_beat) { + std::lock_guard guard(persistent.lock); if (runner == std::atomic_load_explicit( &persistent.runner, std::memory_order_relaxed)) { // compare pointers: this is thread-safe - std::lock_guard guard(persistent.lock); std::atomic_store_explicit(&persistent.runner, std::shared_ptr{nullptr}, std::memory_order_relaxed); } return; } + last_beat = this_beat; } }, args...) 
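
A minimal standalone sketch of the keep-alive pattern used by the persistent runner in the patch above: users bump a heartbeat counter, while a detached watchdog thread holds only a weak_ptr and releases the globally shared runner once the heartbeat stops advancing. The names (dummy_runner_t, global_state_t, create_runner) and the simplified structure are illustrative assumptions for this sketch, not the RAFT implementation.

#include <atomic>
#include <chrono>
#include <memory>
#include <mutex>
#include <thread>

struct dummy_runner_t {
  std::atomic<size_t> heartbeat{1};  // bumped by every user of the runner
};

struct global_state_t {
  std::mutex lock;
  std::shared_ptr<dummy_runner_t> runner;
} global_state;

inline auto create_runner() -> std::shared_ptr<dummy_runner_t>
{
  std::lock_guard<std::mutex> guard(global_state.lock);
  if (global_state.runner) {
    // Reuse the existing runner and signal that it is still in use.
    global_state.runner->heartbeat.fetch_add(1, std::memory_order_relaxed);
    return global_state.runner;
  }
  auto runner          = std::make_shared<dummy_runner_t>();
  global_state.runner  = runner;
  std::weak_ptr<dummy_runner_t> weak = runner;  // the watchdog must not extend the lifetime
  std::thread([weak] {
    constexpr auto kInterval = std::chrono::milliseconds(500);
    size_t last_beat         = 0;
    while (true) {
      std::this_thread::sleep_for(kInterval);
      auto alive = weak.lock();
      if (!alive) { return; }  // already destroyed elsewhere
      size_t this_beat = alive->heartbeat.load(std::memory_order_relaxed);
      if (this_beat == last_beat) {
        // No new work since the last check: release the shared instance (if still ours).
        std::lock_guard<std::mutex> guard(global_state.lock);
        if (alive == global_state.runner) { global_state.runner.reset(); }
        return;
      }
      last_beat = this_beat;
    }
  }).detach();
  return runner;
}

Comparing a counter snapshot against the previous one (rather than storing timestamps on every call) is what lets the hot path touch the atomic with a cheap relaxed increment, which is the motivation behind the heartbeat change in the patch above.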
From 9dd3d327afb84e890aa462694a245beb344710a7 Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 26 Apr 2024 10:02:08 +0200 Subject: [PATCH 21/37] Add a thread-local weak_ptr for the runner to further reduce possible contention --- .../detail/cagra/search_single_cta_kernel-inl.cuh | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 3e8a2a735f..b147888c16 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -1627,7 +1627,7 @@ auto create_runner(Args... args) -> std::shared_ptr // it's ok.. pass } template -auto get_runner(Args&&... args) -> std::shared_ptr +auto get_runner_nocache(Args&&... args) -> std::shared_ptr { // We copy the shared pointer here, then using the copy is thread-safe. auto runner = std::dynamic_pointer_cast( @@ -1636,6 +1636,19 @@ auto get_runner(Args&&... args) -> std::shared_ptr return create_runner(std::forward(args)...); } +template +auto get_runner(Args&&... args) -> std::shared_ptr +{ + // Using a thread-local weak pointer allows us to avoid an extra atomic load of the persistent + // runner shared pointer. + static thread_local std::weak_ptr weak; + auto runner = weak.lock(); + if (runner) { return runner; } + runner = get_runner_nocache(std::forward(args)...); + weak = runner; + return runner; +} + template Date: Mon, 29 Apr 2024 15:52:54 +0200 Subject: [PATCH 22/37] Keep result buffers between runs to avoid blocking the persistent kernel. --- cpp/bench/ann/src/common/benchmark.hpp | 27 ++++-- cpp/bench/ann/src/common/util.hpp | 125 ++++++++++++++----------- 2 files changed, 86 insertions(+), 66 deletions(-) diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index 408dc9f825..03a5496daf 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -280,8 +280,15 @@ void bench_search(::benchmark::State& state, /** * Each thread will manage its own outputs */ - buf distances{current_algo_props->query_memory_type, k * query_set_size}; - buf neighbors{current_algo_props->query_memory_type, k * query_set_size}; + constexpr size_t kAlignResultBuf = 64; + size_t result_elem_count = k * query_set_size; + result_elem_count = + ((result_elem_count + kAlignResultBuf - 1) / kAlignResultBuf) * kAlignResultBuf; + auto& result_buf = get_result_buffer_from_global_pool( + result_elem_count * (sizeof(float) + sizeof(AnnBase::index_type))); + auto* neighbors_ptr = + reinterpret_cast(result_buf.data(current_algo_props->query_memory_type)); + auto* distances_ptr = reinterpret_cast(neighbors_ptr + result_elem_count); { nvtx_case nvtx{state.name()}; @@ -303,8 +310,8 @@ void bench_search(::benchmark::State& state, algo->search(query_set + batch_offset * dataset->dim(), n_queries, k, - neighbors.data + out_offset * k, - distances.data + out_offset * k); + neighbors_ptr + out_offset * k, + distances_ptr + out_offset * k); } catch (const std::exception& e) { state.SkipWithError("Benchmark loop: " + std::string(e.what())); break; @@ -338,10 +345,12 @@ void bench_search(::benchmark::State& state, if (dataset->max_k() >= k) { const std::int32_t* gt = dataset->gt_set(); const std::uint32_t max_k = dataset->max_k(); - buf neighbors_host = neighbors.move(MemoryType::Host); - std::size_t rows = std::min(queries_processed, 
query_set_size); - std::size_t match_count = 0; - std::size_t total_count = rows * static_cast(k); + result_buf.transfer_data(MemoryType::Host, current_algo_props->query_memory_type); + auto* neighbors_host = + reinterpret_cast(result_buf.data(MemoryType::Host)); + std::size_t rows = std::min(queries_processed, query_set_size); + std::size_t match_count = 0; + std::size_t total_count = rows * static_cast(k); // We go through the groundtruth with same stride as the benchmark loop. size_t out_offset = 0; @@ -352,7 +361,7 @@ void bench_search(::benchmark::State& state, size_t i_out_idx = out_offset + i; if (i_out_idx < rows) { for (std::uint32_t j = 0; j < k; j++) { - auto act_idx = std::int32_t(neighbors_host.data[i_out_idx * k + j]); + auto act_idx = std::int32_t(neighbors_host[i_out_idx * k + j]); for (std::uint32_t l = 0; l < k; l++) { auto exp_idx = gt[i_orig_idx * max_k + l]; if (act_idx == exp_idx) { diff --git a/cpp/bench/ann/src/common/util.hpp b/cpp/bench/ann/src/common/util.hpp index e3ca1c5273..55c4bf0063 100644 --- a/cpp/bench/ann/src/common/util.hpp +++ b/cpp/bench/ann/src/common/util.hpp @@ -56,63 +56,6 @@ inline thread_local int benchmark_thread_id = 0; */ inline thread_local int benchmark_n_threads = 1; -template -struct buf { - cudaStream_t stream; - MemoryType memory_type; - std::size_t size; - T* data; - buf(MemoryType memory_type, std::size_t size) - : stream(nullptr), memory_type(memory_type), size(size), data(nullptr) - { - switch (memory_type) { -#ifndef BUILD_CPU_ONLY - case MemoryType::Device: { - cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); - cudaMallocAsync(reinterpret_cast(&data), size * sizeof(T), stream); - cudaMemsetAsync(data, 0, size * sizeof(T), stream); - cudaStreamSynchronize(stream); - } break; -#endif - default: { - data = reinterpret_cast(malloc(size * sizeof(T))); - std::memset(data, 0, size * sizeof(T)); - } - } - } - ~buf() noexcept - { - if (data == nullptr) { return; } - switch (memory_type) { -#ifndef BUILD_CPU_ONLY - case MemoryType::Device: { - cudaFreeAsync(data, stream); - cudaStreamSynchronize(stream); - cudaStreamDestroy(stream); - } break; -#endif - default: { - free(data); - } - } - } - - [[nodiscard]] auto move(MemoryType target_memory_type) -> buf - { - buf r{target_memory_type, size}; -#ifndef BUILD_CPU_ONLY - if ((memory_type == MemoryType::Device && target_memory_type != MemoryType::Device) || - (memory_type != MemoryType::Device && target_memory_type == MemoryType::Device)) { - cudaMemcpyAsync(r.data, data, size * sizeof(T), cudaMemcpyDefault, stream); - cudaStreamSynchronize(stream); - return r; - } -#endif - std::swap(data, r.data); - return r; - } -}; - struct cuda_timer { private: std::optional stream_; @@ -250,6 +193,73 @@ inline auto get_stream_from_global_pool() -> cudaStream_t #endif } +struct result_buffer { + explicit result_buffer(size_t size, cudaStream_t stream) : size_{size}, stream_{stream} + { + if (size_ == 0) { return; } + data_host_ = malloc(size_); +#ifndef BUILD_CPU_ONLY + cudaMallocAsync(&data_device_, size_, stream_); + cudaStreamSynchronize(stream_); +#endif + } + result_buffer() = delete; + result_buffer(result_buffer&&) = delete; + result_buffer& operator=(result_buffer&&) = delete; + result_buffer(const result_buffer&) = delete; + result_buffer& operator=(const result_buffer&) = delete; + ~result_buffer() noexcept + { + if (size_ == 0) { return; } +#ifndef BUILD_CPU_ONLY + cudaFreeAsync(data_device_, stream_); + cudaStreamSynchronize(stream_); +#endif + free(data_host_); + } + + [[nodiscard]] 
auto size() const noexcept { return size_; } + [[nodiscard]] auto data(ann::MemoryType loc) const noexcept + { + switch (loc) { + case MemoryType::Device: return data_device_; + default: return data_host_; + } + } + + void transfer_data(ann::MemoryType dst, ann::MemoryType src) + { + auto dst_ptr = data(dst); + auto src_ptr = data(src); + if (dst_ptr == src_ptr) { return; } + cudaMemcpyAsync(dst_ptr, src_ptr, size_, cudaMemcpyDefault, stream_); + cudaStreamSynchronize(stream_); + } + + private: + size_t size_{0}; + cudaStream_t stream_ = nullptr; + void* data_host_ = nullptr; + void* data_device_ = nullptr; +}; + +namespace detail { +inline std::vector> global_result_buffer_pool(0); +inline std::mutex grp_mutex; +} // namespace detail + +inline auto get_result_buffer_from_global_pool(size_t size) -> result_buffer& +{ + auto stream = get_stream_from_global_pool(); + std::lock_guard guard(detail::grp_mutex); + if (int(detail::global_result_buffer_pool.size()) < benchmark_n_threads) { + detail::global_result_buffer_pool.resize(benchmark_n_threads); + } + auto& rb = detail::global_result_buffer_pool[benchmark_thread_id]; + if (!rb || rb->size() < size) { rb = std::make_unique(size, stream); } + return *rb; +} + /** * Delete all streams in the global pool. * It's called at the end of the `main` function - before global/static variables and cuda context @@ -260,6 +270,7 @@ inline void reset_global_stream_pool() { #ifndef BUILD_CPU_ONLY std::lock_guard guard(detail::gsp_mutex); + detail::global_result_buffer_pool.resize(0); detail::global_stream_pool.resize(0); #endif } From e96cc0f19885ee1a6002a12472c356d0b0596202 Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 29 Apr 2024 16:20:28 +0200 Subject: [PATCH 23/37] Avoid an extra layer of atomics on the persistent runner (shared_ptr) --- .../cagra/search_single_cta_kernel-inl.cuh | 73 +++++++------------ 1 file changed, 25 insertions(+), 48 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index b147888c16..9c119f248e 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -1565,38 +1565,31 @@ inline persistent_state persistent{}; template auto create_runner(Args... args) -> std::shared_ptr // it's ok.. pass everything by values { - // NB: storing pointer-to-shared_ptr; otherwise, notify_one()/wait() do not seem to work. - std::shared_ptr runner_outer{nullptr}; + std::lock_guard guard(persistent.lock); + // Check if the runner has already been created + std::shared_ptr runner_outer = std::dynamic_pointer_cast(persistent.runner); + if (runner_outer) { + runner_outer->heartbeat.fetch_add(1, std::memory_order_relaxed); + return runner_outer; + } + // Runner has not yet been created (or it's incompatible): + // create it in another thread and only then release the lock. + // Free the resources (if any) in advance + persistent.runner.reset(); + cuda::std::atomic_flag ready{}; ready.clear(cuda::std::memory_order_relaxed); std::thread( [&runner_outer, &ready](Args... 
thread_args) { // pass everything by values - std::weak_ptr runner_weak; - { - std::lock_guard guard(persistent.lock); - // Try to check the runner again: - // it may have been created by another thread since the last check - runner_outer = std::dynamic_pointer_cast( - std::atomic_load_explicit(&persistent.runner, std::memory_order_relaxed)); - if (runner_outer) { - runner_outer->heartbeat.fetch_add(1, std::memory_order_relaxed); - ready.test_and_set(cuda::std::memory_order_release); - ready.notify_one(); - return; - } - // Free the resources (if any) in advance - std::atomic_store_explicit(&persistent.runner, - std::shared_ptr{nullptr}, - std::memory_order_relaxed); - runner_outer = std::make_shared(thread_args...); - runner_weak = runner_outer; - runner_outer->heartbeat.store(1, std::memory_order_relaxed); - std::atomic_store_explicit(&persistent.runner, - std::static_pointer_cast(runner_outer), - std::memory_order_relaxed); - ready.test_and_set(cuda::std::memory_order_release); - ready.notify_one(); - } + // create the runner (the lock is acquired in the parent thread). + runner_outer = std::make_shared(thread_args...); + runner_outer->heartbeat.store(1, std::memory_order_relaxed); + persistent.runner = std::static_pointer_cast(runner_outer); + std::weak_ptr runner_weak = runner_outer; + ready.test_and_set(cuda::std::memory_order_release); + ready.notify_one(); + // NB: runner_outer is passed by reference and may be dead by this time. + constexpr auto kInterval = std::chrono::milliseconds(500); size_t last_beat = 0; while (true) { @@ -1608,13 +1601,7 @@ auto create_runner(Args... args) -> std::shared_ptr // it's ok.. pass size_t this_beat = runner->heartbeat.load(std::memory_order_relaxed); if (this_beat == last_beat) { std::lock_guard guard(persistent.lock); - if (runner == std::atomic_load_explicit( - &persistent.runner, - std::memory_order_relaxed)) { // compare pointers: this is thread-safe - std::atomic_store_explicit(&persistent.runner, - std::shared_ptr{nullptr}, - std::memory_order_relaxed); - } + if (runner == persistent.runner) { persistent.runner.reset(); } return; } last_beat = this_beat; @@ -1626,25 +1613,15 @@ auto create_runner(Args... args) -> std::shared_ptr // it's ok.. pass return runner_outer; } -template -auto get_runner_nocache(Args&&... args) -> std::shared_ptr -{ - // We copy the shared pointer here, then using the copy is thread-safe. - auto runner = std::dynamic_pointer_cast( - std::atomic_load_explicit(&persistent.runner, std::memory_order_relaxed)); - if (runner) { return runner; } - return create_runner(std::forward(args)...); -} - template auto get_runner(Args&&... args) -> std::shared_ptr { - // Using a thread-local weak pointer allows us to avoid an extra atomic load of the persistent - // runner shared pointer. + // Using a thread-local weak pointer allows us to avoid using locks/atomics, + // since the control block of weak/shared pointers is thread-safe. 
static thread_local std::weak_ptr weak; auto runner = weak.lock(); if (runner) { return runner; } - runner = get_runner_nocache(std::forward(args)...); + runner = create_runner(std::forward(args)...); weak = runner; return runner; } From c86dfcfd7b2266ce274e4dfbf3ab925b95880abf Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 29 Apr 2024 20:25:12 +0200 Subject: [PATCH 24/37] Reducing congestions: avoid too many writes to the last_touch/heartbeat atomic --- .../cagra/search_single_cta_kernel-inl.cuh | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 9c119f248e..73497b0707 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -1248,9 +1248,9 @@ struct local_deque_t { struct persistent_runner_base_t { using job_queue_type = - atomic_queue::AtomicQueue; + atomic_queue::AtomicQueue; using worker_queue_type = - atomic_queue::AtomicQueue; + atomic_queue::AtomicQueue; rmm::mr::pinned_host_memory_resource worker_handles_mr; rmm::mr::pinned_host_memory_resource job_descriptor_mr; rmm::mr::cuda_memory_resource device_mr; @@ -1390,7 +1390,9 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b rmm::device_uvector job_descriptors; rmm::device_uvector completion_counters; rmm::device_uvector hashmap; - std::atomic heartbeat; + std::atomic> last_touch; + + constexpr static auto kLiveInterval = std::chrono::milliseconds(1000); persistent_runner_t(DATASET_DESCRIPTOR_T dataset_desc, raft::device_matrix_view graph, @@ -1525,7 +1527,13 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b return cflag; }}; // update the keep-alive atomic in the meanwhile - heartbeat.fetch_add(1, std::memory_order_relaxed); + auto prev_touch = last_touch.load(std::memory_order_relaxed); + auto this_touch = std::chrono::system_clock::now(); + if (prev_touch + kLiveInterval / 10 < this_touch) { + // to avoid congestion at this atomic, we only update it if a significant fraction of the live + // interval has passed. + last_touch.store(this_touch, std::memory_order_relaxed); + } // wait for the results to arrive launcher.wait(); } @@ -1569,7 +1577,7 @@ auto create_runner(Args... args) -> std::shared_ptr // it's ok.. pass // Check if the runner has already been created std::shared_ptr runner_outer = std::dynamic_pointer_cast(persistent.runner); if (runner_outer) { - runner_outer->heartbeat.fetch_add(1, std::memory_order_relaxed); + runner_outer->last_touch.store(std::chrono::system_clock::now(), std::memory_order_relaxed); return runner_outer; } // Runner has not yet been created (or it's incompatible): @@ -1583,28 +1591,25 @@ auto create_runner(Args... args) -> std::shared_ptr // it's ok.. pass [&runner_outer, &ready](Args... thread_args) { // pass everything by values // create the runner (the lock is acquired in the parent thread). 
runner_outer = std::make_shared(thread_args...); - runner_outer->heartbeat.store(1, std::memory_order_relaxed); + runner_outer->last_touch.store(std::chrono::system_clock::now(), std::memory_order_relaxed); persistent.runner = std::static_pointer_cast(runner_outer); std::weak_ptr runner_weak = runner_outer; ready.test_and_set(cuda::std::memory_order_release); ready.notify_one(); // NB: runner_outer is passed by reference and may be dead by this time. - constexpr auto kInterval = std::chrono::milliseconds(500); - size_t last_beat = 0; while (true) { - std::this_thread::sleep_for(kInterval); + std::this_thread::sleep_for(RunnerT::kLiveInterval); auto runner = runner_weak.lock(); // runner_weak is local - thread-safe if (!runner) { return; // dead already } - size_t this_beat = runner->heartbeat.load(std::memory_order_relaxed); - if (this_beat == last_beat) { + if (runner->last_touch.load(std::memory_order_relaxed) + RunnerT::kLiveInterval < + std::chrono::system_clock::now()) { std::lock_guard guard(persistent.lock); if (runner == persistent.runner) { persistent.runner.reset(); } return; } - last_beat = this_beat; } }, args...) From aaba9121d4b38b577dc0e2eca5fa4f0cb27d44da Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 2 May 2024 15:00:55 +0200 Subject: [PATCH 25/37] Make a custom implementation of the shared resource queue to optimize the throughput when .pop is the bottleneck --- cpp/include/atomic_queue/LICENSE | 21 - cpp/include/atomic_queue/atomic_queue.h | 457 ------------------ cpp/include/atomic_queue/defs.h | 99 ---- .../cagra/search_single_cta_kernel-inl.cuh | 117 ++++- 4 files changed, 105 insertions(+), 589 deletions(-) delete mode 100644 cpp/include/atomic_queue/LICENSE delete mode 100644 cpp/include/atomic_queue/atomic_queue.h delete mode 100644 cpp/include/atomic_queue/defs.h diff --git a/cpp/include/atomic_queue/LICENSE b/cpp/include/atomic_queue/LICENSE deleted file mode 100644 index c1d3466926..0000000000 --- a/cpp/include/atomic_queue/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2019 Maxim Egorushkin - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
diff --git a/cpp/include/atomic_queue/atomic_queue.h b/cpp/include/atomic_queue/atomic_queue.h deleted file mode 100644 index 93c237da4c..0000000000 --- a/cpp/include/atomic_queue/atomic_queue.h +++ /dev/null @@ -1,457 +0,0 @@ -/* -*- mode: c++; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 4 -*- */ -#ifndef ATOMIC_QUEUE_ATOMIC_QUEUE_H_INCLUDED -#define ATOMIC_QUEUE_ATOMIC_QUEUE_H_INCLUDED - -// Copyright (c) 2019 Maxim Egorushkin. MIT License. See the full licence in file LICENSE. - -#include "defs.h" - -#include -#include -#include -#include -#include -#include - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace atomic_queue { - -using std::uint32_t; -using std::uint64_t; -using std::uint8_t; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace details { - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -template -struct GetCacheLineIndexBits { - static int constexpr value = 0; -}; -template <> -struct GetCacheLineIndexBits<256> { - static int constexpr value = 8; -}; -template <> -struct GetCacheLineIndexBits<128> { - static int constexpr value = 7; -}; -template <> -struct GetCacheLineIndexBits<64> { - static int constexpr value = 6; -}; -template <> -struct GetCacheLineIndexBits<32> { - static int constexpr value = 5; -}; -template <> -struct GetCacheLineIndexBits<16> { - static int constexpr value = 4; -}; -template <> -struct GetCacheLineIndexBits<8> { - static int constexpr value = 3; -}; -template <> -struct GetCacheLineIndexBits<4> { - static int constexpr value = 2; -}; -template <> -struct GetCacheLineIndexBits<2> { - static int constexpr value = 1; -}; - -template -struct GetIndexShuffleBits { - static int constexpr bits = GetCacheLineIndexBits::value; - static unsigned constexpr min_size = 1u << (bits * 2); - static int constexpr value = array_size < min_size ? 0 : bits; -}; - -template -struct GetIndexShuffleBits { - static int constexpr value = 0; -}; - -// Multiple writers/readers contend on the same cache line when storing/loading elements at -// subsequent indexes, aka false sharing. For power of 2 ring buffer size it is possible to re-map -// the index in such a way that each subsequent element resides on another cache line, which -// minimizes contention. This is done by swapping the lowest order N bits (which are the index of -// the element within the cache line) with the next N bits (which are the index of the cache line) -// of the element index. -template -constexpr unsigned remap_index(unsigned index) noexcept -{ - unsigned constexpr mix_mask{(1u << BITS) - 1}; - unsigned const mix{(index ^ (index >> BITS)) & mix_mask}; - return index ^ mix ^ (mix << BITS); -} - -template <> -constexpr unsigned remap_index<0>(unsigned index) noexcept -{ - return index; -} - -template -constexpr T& map(T* elements, unsigned index) noexcept -{ - return elements[remap_index(index)]; -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -// Implement a "bit-twiddling hack" for finding the next power of 2 in either 32 bits or 64 bits -// in C++11 compatible constexpr functions. The library no longer maintains C++11 compatibility. 
- -// "Runtime" version for 32 bits -// --a; -// a |= a >> 1; -// a |= a >> 2; -// a |= a >> 4; -// a |= a >> 8; -// a |= a >> 16; -// ++a; - -template -constexpr T decrement(T x) noexcept -{ - return x - 1; -} - -template -constexpr T increment(T x) noexcept -{ - return x + 1; -} - -template -constexpr T or_equal(T x, unsigned u) noexcept -{ - return x | x >> u; -} - -template -constexpr T or_equal(T x, unsigned u, Args... rest) noexcept -{ - return or_equal(or_equal(x, u), rest...); -} - -constexpr uint32_t round_up_to_power_of_2(uint32_t a) noexcept -{ - return increment(or_equal(decrement(a), 1, 2, 4, 8, 16)); -} - -constexpr uint64_t round_up_to_power_of_2(uint64_t a) noexcept -{ - return increment(or_equal(decrement(a), 1, 2, 4, 8, 16, 32)); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -template -constexpr T nil() noexcept -{ -#if __cpp_lib_atomic_is_always_lock_free // Better compile-time error message requires C++17. - static_assert( - std::atomic::is_always_lock_free, - "Queue element type T is not atomic. Use AtomicQueue2/AtomicQueueB2 for such element types."); -#endif - return {}; -} - -template -inline void destroy_n(T* p, unsigned n) noexcept -{ - for (auto q = p + n; p != q;) - (p++)->~T(); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace details - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -template -class AtomicQueueCommon { - protected: - // Put these on different cache lines to avoid false sharing between readers and writers. - alignas(CACHE_LINE_SIZE) std::atomic head_ = {}; - alignas(CACHE_LINE_SIZE) std::atomic tail_ = {}; - - // The special member functions are not thread-safe. - - AtomicQueueCommon() noexcept = default; - - AtomicQueueCommon(AtomicQueueCommon const& b) noexcept - : head_(b.head_.load(X)), tail_(b.tail_.load(X)) - { - } - - AtomicQueueCommon& operator=(AtomicQueueCommon const& b) noexcept - { - head_.store(b.head_.load(X), X); - tail_.store(b.tail_.load(X), X); - return *this; - } - - void swap(AtomicQueueCommon& b) noexcept - { - unsigned h = head_.load(X); - unsigned t = tail_.load(X); - head_.store(b.head_.load(X), X); - tail_.store(b.tail_.load(X), X); - b.head_.store(h, X); - b.tail_.store(t, X); - } - - template - static T do_pop_atomic(std::atomic& q_element) noexcept - { - if (Derived::spsc_) { - for (;;) { - T element = q_element.load(A); - if (ATOMIC_QUEUE_LIKELY(element != NIL)) { - q_element.store(NIL, X); - return element; - } - if (Derived::maximize_throughput_) spin_loop_pause(); - } - } else { - for (;;) { - T element = q_element.exchange(NIL, A); // (2) The store to wait for. - if (ATOMIC_QUEUE_LIKELY(element != NIL)) return element; - // Do speculative loads while busy-waiting to avoid broadcasting RFO messages. 
- do - spin_loop_pause(); - while (Derived::maximize_throughput_ && q_element.load(X) == NIL); - } - } - } - - template - static void do_push_atomic(T element, std::atomic& q_element) noexcept - { - assert(element != NIL); - if (Derived::spsc_) { - while (ATOMIC_QUEUE_UNLIKELY(q_element.load(X) != NIL)) - if (Derived::maximize_throughput_) spin_loop_pause(); - q_element.store(element, R); - } else { - for (T expected = NIL; - ATOMIC_QUEUE_UNLIKELY(!q_element.compare_exchange_weak(expected, element, R, X)); - expected = NIL) { - do - spin_loop_pause(); // (1) Wait for store (2) to complete. - while (Derived::maximize_throughput_ && q_element.load(X) != NIL); - } - } - } - - enum State : unsigned char { EMPTY, STORING, STORED, LOADING }; - - template - static T do_pop_any(std::atomic& state, T& q_element) noexcept - { - if (Derived::spsc_) { - while (ATOMIC_QUEUE_UNLIKELY(state.load(A) != STORED)) - if (Derived::maximize_throughput_) spin_loop_pause(); - T element{std::move(q_element)}; - state.store(EMPTY, R); - return element; - } else { - for (;;) { - unsigned char expected = STORED; - if (ATOMIC_QUEUE_LIKELY(state.compare_exchange_weak(expected, LOADING, A, X))) { - T element{std::move(q_element)}; - state.store(EMPTY, R); - return element; - } - // Do speculative loads while busy-waiting to avoid broadcasting RFO messages. - do - spin_loop_pause(); - while (Derived::maximize_throughput_ && state.load(X) != STORED); - } - } - } - - template - static void do_push_any(U&& element, std::atomic& state, T& q_element) noexcept - { - if (Derived::spsc_) { - while (ATOMIC_QUEUE_UNLIKELY(state.load(A) != EMPTY)) - if (Derived::maximize_throughput_) spin_loop_pause(); - q_element = std::forward(element); - state.store(STORED, R); - } else { - for (;;) { - unsigned char expected = EMPTY; - if (ATOMIC_QUEUE_LIKELY(state.compare_exchange_weak(expected, STORING, A, X))) { - q_element = std::forward(element); - state.store(STORED, R); - return; - } - // Do speculative loads while busy-waiting to avoid broadcasting RFO messages. - do - spin_loop_pause(); - while (Derived::maximize_throughput_ && state.load(X) != EMPTY); - } - } - } - - public: - template - bool try_push(T&& element) noexcept - { - auto head = head_.load(X); - if (Derived::spsc_) { - if (static_cast(head - tail_.load(X)) >= - static_cast(static_cast(*this).size_)) - return false; - head_.store(head + 1, X); - } else { - do { - if (static_cast(head - tail_.load(X)) >= - static_cast(static_cast(*this).size_)) - return false; - } while (ATOMIC_QUEUE_UNLIKELY( - !head_.compare_exchange_weak(head, head + 1, X, X))); // This loop is not FIFO. - } - - static_cast(*this).do_push(std::forward(element), head); - return true; - } - - template - bool try_pop(T& element) noexcept - { - auto tail = tail_.load(X); - if (Derived::spsc_) { - if (static_cast(head_.load(X) - tail) <= 0) return false; - tail_.store(tail + 1, X); - } else { - do { - if (static_cast(head_.load(X) - tail) <= 0) return false; - } while (ATOMIC_QUEUE_UNLIKELY( - !tail_.compare_exchange_weak(tail, tail + 1, X, X))); // This loop is not FIFO. - } - - element = static_cast(*this).do_pop(tail); - return true; - } - - template - void push(T&& element) noexcept - { - unsigned head; - if (Derived::spsc_) { - head = head_.load(X); - head_.store(head + 1, X); - } else { - constexpr auto memory_order = - Derived::total_order_ ? std::memory_order_seq_cst : std::memory_order_relaxed; - head = - head_.fetch_add(1, memory_order); // FIFO and total order on Intel regardless, as of 2019. 
- } - static_cast(*this).do_push(std::forward(element), head); - } - - auto pop() noexcept - { - unsigned tail; - if (Derived::spsc_) { - tail = tail_.load(X); - tail_.store(tail + 1, X); - } else { - constexpr auto memory_order = - Derived::total_order_ ? std::memory_order_seq_cst : std::memory_order_relaxed; - tail = - tail_.fetch_add(1, memory_order); // FIFO and total order on Intel regardless, as of 2019. - } - return static_cast(*this).do_pop(tail); - } - - bool was_empty() const noexcept { return !was_size(); } - - bool was_full() const noexcept - { - return was_size() >= static_cast(static_cast(*this).size_); - } - - unsigned was_size() const noexcept - { - // tail_ can be greater than head_ because of consumers doing pop, rather that try_pop, when the - // queue is empty. - return std::max(static_cast(head_.load(X) - tail_.load(X)), 0); - } - - unsigned capacity() const noexcept { return static_cast(*this).size_; } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -template (), - bool MINIMIZE_CONTENTION = true, - bool MAXIMIZE_THROUGHPUT = true, - bool TOTAL_ORDER = false, - bool SPSC = false> -class AtomicQueue - : public AtomicQueueCommon< - AtomicQueue> { - using Base = AtomicQueueCommon< - AtomicQueue>; - friend Base; - - static constexpr unsigned size_ = - MINIMIZE_CONTENTION ? details::round_up_to_power_of_2(SIZE) : SIZE; - static constexpr int SHUFFLE_BITS = - details::GetIndexShuffleBits)>::value; - static constexpr bool total_order_ = TOTAL_ORDER; - static constexpr bool spsc_ = SPSC; - static constexpr bool maximize_throughput_ = MAXIMIZE_THROUGHPUT; - - alignas(CACHE_LINE_SIZE) std::atomic elements_[size_]; - - T do_pop(unsigned tail) noexcept - { - std::atomic& q_element = details::map(elements_, tail % size_); - return Base::template do_pop_atomic(q_element); - } - - void do_push(T element, unsigned head) noexcept - { - std::atomic& q_element = details::map(elements_, head % size_); - Base::template do_push_atomic(element, q_element); - } - - public: - using value_type = T; - - AtomicQueue() noexcept - { - assert( - std::atomic{NIL}.is_lock_free()); // Queue element type T is not atomic. Use - // AtomicQueue2/AtomicQueueB2 for such element types. - for (auto p = elements_, q = elements_ + size_; p != q; ++p) - p->store(NIL, X); - } - - AtomicQueue(AtomicQueue const&) = delete; - AtomicQueue& operator=(AtomicQueue const&) = delete; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace atomic_queue - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#endif // ATOMIC_QUEUE_ATOMIC_QUEUE_H_INCLUDED diff --git a/cpp/include/atomic_queue/defs.h b/cpp/include/atomic_queue/defs.h deleted file mode 100644 index 4601b1d46f..0000000000 --- a/cpp/include/atomic_queue/defs.h +++ /dev/null @@ -1,99 +0,0 @@ -/* -*- mode: c++; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 4 -*- */ -#ifndef ATOMIC_QUEUE_DEFS_H_INCLUDED -#define ATOMIC_QUEUE_DEFS_H_INCLUDED - -// Copyright (c) 2019 Maxim Egorushkin. MIT License. See the full licence in file LICENSE. 
- -#include - -#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86) -#include -namespace atomic_queue { -constexpr int CACHE_LINE_SIZE = 64; -static inline void spin_loop_pause() noexcept { - _mm_pause(); -} -} // namespace atomic_queue -#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM64) -namespace atomic_queue { -constexpr int CACHE_LINE_SIZE = 64; -static inline void spin_loop_pause() noexcept { -#if (defined(__ARM_ARCH_6K__) || \ - defined(__ARM_ARCH_6Z__) || \ - defined(__ARM_ARCH_6ZK__) || \ - defined(__ARM_ARCH_6T2__) || \ - defined(__ARM_ARCH_7__) || \ - defined(__ARM_ARCH_7A__) || \ - defined(__ARM_ARCH_7R__) || \ - defined(__ARM_ARCH_7M__) || \ - defined(__ARM_ARCH_7S__) || \ - defined(__ARM_ARCH_8A__) || \ - defined(__aarch64__)) - asm volatile ("yield" ::: "memory"); -#elif defined(_M_ARM64) - __yield(); -#else - asm volatile ("nop" ::: "memory"); -#endif -} -} // namespace atomic_queue -#elif defined(__ppc64__) || defined(__powerpc64__) -namespace atomic_queue { -constexpr int CACHE_LINE_SIZE = 128; // TODO: Review that this is the correct value. -static inline void spin_loop_pause() noexcept { - asm volatile("or 31,31,31 # very low priority"); // TODO: Review and benchmark that this is the right instruction. -} -} // namespace atomic_queue -#elif defined(__s390x__) -namespace atomic_queue { -constexpr int CACHE_LINE_SIZE = 256; // TODO: Review that this is the correct value. -static inline void spin_loop_pause() noexcept {} // TODO: Find the right instruction to use here, if any. -} // namespace atomic_queue -#elif defined(__riscv) -namespace atomic_queue { -constexpr int CACHE_LINE_SIZE = 64; -static inline void spin_loop_pause() noexcept { - asm volatile (".insn i 0x0F, 0, x0, x0, 0x010"); -} -} // namespace atomic_queue -#else -#ifdef _MSC_VER -#pragma message("Unknown CPU architecture. Using L1 cache line size of 64 bytes and no spinloop pause instruction.") -#else -#warning "Unknown CPU architecture. Using L1 cache line size of 64 bytes and no spinloop pause instruction." -#endif -namespace atomic_queue { -constexpr int CACHE_LINE_SIZE = 64; // TODO: Review that this is the correct value. 
-static inline void spin_loop_pause() noexcept {} -} // namespace atomic_queue -#endif - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace atomic_queue { - -#if defined(__GNUC__) || defined(__clang__) -#define ATOMIC_QUEUE_LIKELY(expr) __builtin_expect(static_cast(expr), 1) -#define ATOMIC_QUEUE_UNLIKELY(expr) __builtin_expect(static_cast(expr), 0) -#define ATOMIC_QUEUE_NOINLINE __attribute__((noinline)) -#else -#define ATOMIC_QUEUE_LIKELY(expr) (expr) -#define ATOMIC_QUEUE_UNLIKELY(expr) (expr) -#define ATOMIC_QUEUE_NOINLINE -#endif - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -auto constexpr A = std::memory_order_acquire; -auto constexpr R = std::memory_order_release; -auto constexpr X = std::memory_order_relaxed; -auto constexpr C = std::memory_order_seq_cst; -auto constexpr AR = std::memory_order_acq_rel; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace atomic_queue - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#endif // ATOMIC_QUEUE_DEFS_H_INCLUDED diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 73497b0707..f534d37f3f 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -33,7 +33,7 @@ #include #include #include -#include // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp +#include #include #include @@ -44,9 +44,8 @@ #include #include -#include - #include +#include #include #include #include @@ -1189,6 +1188,98 @@ struct search_kernel_config { } }; +/** + * @brief Resource queue + * + * A shared atomic ring buffer based queue optimized for throughput when bottlenecked on `pop` + * operation. + */ +template ::max()> +struct alignas(kCacheLineBytes) resource_queue_t { + using value_type = T; + static constexpr uint32_t kSize = Size; + static constexpr value_type kEmpty = Empty; + static_assert(cuda::std::atomic::is_always_lock_free, + "The value type must be lock-free."); + static_assert(raft::is_a_power_of_two(kSize), "The size must be a power-of-two for efficiency."); + static constexpr uint32_t kElemsPerCacheLine = + raft::div_rounding_up_safe(kCacheLineBytes, sizeof(value_type)); + static constexpr uint32_t kCounterIncrement = raft::bound_by_power_of_two(kElemsPerCacheLine) + 1; + static constexpr uint32_t kCounterLocMask = kSize - 1; + // These props hold by design, but we add them here as a documentation and a sanity check. 
+ static_assert( + kCounterIncrement * sizeof(value_type) >= kCacheLineBytes, + "The counter increment should be larger than the cache line size to avoid false sharing."); + static_assert( + std::gcd(kCounterIncrement, kSize) == 1, + "The counter increment and the size must be coprime to allow using all of the queue slots."); + + static constexpr auto kMemOrder = cuda::std::memory_order_relaxed; + + explicit resource_queue_t() noexcept + { + head_.store(0, kMemOrder); + tail_.store(0, kMemOrder); + for (uint32_t i = 0; i < kSize; i++) { + buf_[i].store(kEmpty, kMemOrder); + } + } + + void push(value_type x) noexcept + { + auto& loc = buf_[head_.fetch_add(kCounterIncrement, kMemOrder) & kCounterLocMask]; + loc_push(loc, x); + } + + auto pop() noexcept -> value_type + { + auto& loc = buf_[tail_.fetch_add(kCounterIncrement, kMemOrder) & kCounterLocMask]; + return loc_pop(loc); + } + + auto try_pop(value_type& e) noexcept -> bool + { + auto tail = tail_.load(kMemOrder); + do { + // NB: static cast is here to avoid the case when the head has recently been incremented + // beyond the uint32_t max value. + if (static_cast(head_.load(kMemOrder) - tail) <= 0) { return false; } + } while (!tail_.compare_exchange_weak(tail, tail + kCounterIncrement, kMemOrder, kMemOrder)); + e = loc_pop(buf_[tail & kCounterLocMask]); + return true; + } + + private: + alignas(kCacheLineBytes) cuda::std::atomic head_{}; + alignas(kCacheLineBytes) cuda::std::atomic tail_{}; + alignas(kCacheLineBytes) std::array, kSize> buf_{}; + + void loc_push(cuda::std::atomic& loc, value_type x) noexcept + { + /* [NOT A HOT SPOT] + We expect there's always enough place in the queue to push the item, + but also we expect a few pop waiters - notify them the data is available. + */ + value_type e = kEmpty; + while (!loc.compare_exchange_weak(e, x, kMemOrder, kMemOrder)) { + e = kEmpty; + } + loc.notify_one(); + } + + auto loc_pop(cuda::std::atomic& loc) noexcept -> value_type + { + // [HOT SPOT] + // Optimize for the case of contention: expect the loc is empty. + value_type x = kEmpty; + do { + loc.wait(kEmpty, kMemOrder); + x = loc.exchange(kEmpty, kMemOrder); + } while (x == kEmpty); + return x; + } +}; + /** Primitive fixed-size deque for single-threaded use. */ template struct local_deque_t { @@ -1247,10 +1338,8 @@ struct local_deque_t { }; struct persistent_runner_base_t { - using job_queue_type = - atomic_queue::AtomicQueue; - using worker_queue_type = - atomic_queue::AtomicQueue; + using job_queue_type = resource_queue_t; + using worker_queue_type = resource_queue_t; rmm::mr::pinned_host_memory_resource worker_handles_mr; rmm::mr::pinned_host_memory_resource job_descriptor_mr; rmm::mr::cuda_memory_resource device_mr; @@ -1291,8 +1380,12 @@ struct alignas(kCacheLineBytes) launcher_t { job_id{job_ids.pop()}, completion_flag{record_work(job_id)} { - // Submit all queries in the batch - for (uint32_t i = 0; i < n_queries; i++) { + // Wait for the first worker and submit the query immediately. + // This is supposed to be slightly faster than `try_get_worker`, + // because it does not loop on the queue counter. 
+ submit_query(idle_worker_ids.pop(), 0); + // Submit the rest of the queries in the batch + for (uint32_t i = 1; i < n_queries; i++) { uint32_t worker_id; while (!try_get_worker(worker_id)) { if (pending_reads.try_pop_front(worker_id)) { @@ -1316,7 +1409,7 @@ struct alignas(kCacheLineBytes) launcher_t { cuda::memory_order_relaxed); while (!pending_reads.try_push_back(worker_id)) { - // The only reason pending_reads cannot push is that the job_queue is full. + // The only reason pending_reads cannot push is that the queue is full. // It's local, so we must pop and wait for the returned worker to finish its work. auto pending_worker_id = pending_reads.pop_front(); while (!try_return_worker(pending_worker_id)) { @@ -1492,8 +1585,8 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b RAFT_LOG_INFO( "Initialized the kernel in stream %zd; job_queue size = %u; worker_queue size = %u", int64_t((cudaStream_t)stream), - job_queue.was_size(), - worker_queue.was_size()); + kMaxJobsNum, + gs.y); } ~persistent_runner_t() noexcept override From 8a4ff2e172f3da83aa9a662de3e44c329128dc82 Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 3 May 2024 06:46:10 +0200 Subject: [PATCH 26/37] Add expectation-based sleep to the waiting loop --- .../cagra/search_single_cta_kernel-inl.cuh | 63 ++++++++++++++----- 1 file changed, 49 insertions(+), 14 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index f534d37f3f..e00ae7f606 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -1444,7 +1444,8 @@ struct alignas(kCacheLineBytes) launcher_t { } /** Wait for all work to finish and don't forget to return the workers to the shared pool. */ - void wait() + auto wait(std::chrono::time_point start, + std::chrono::nanoseconds expected_latency) { uint32_t worker_id; while (pending_reads.try_pop_front(worker_id)) { @@ -1454,14 +1455,32 @@ struct alignas(kCacheLineBytes) launcher_t { } // terminal state, should be engaged only after the `pending_reads` is empty // and `queries_submitted == n_queries` + auto now = std::chrono::system_clock::now(); + /* [Note: sleeping] + This code segment is a hot spot when the number of queries is low. + When the number of threads is greater than the number of cores, the threads start to fight for + the core time, which reduces the throughput. + To ease the competition, we track the expected GPU latency and let a thread sleep for some + time, and only start to spin when it's about a time to get the result. + + The constants below balance the sleep/spin time to achieve the best throughput while keeping + the latency at adequate levels. + */ + constexpr auto kMinWakeTime = std::chrono::nanoseconds(10000); + constexpr double kSleepLimit = 0.6; while (!is_all_done()) { - // Not sure if this improves the perf, but it does not seem to hurt it. 
- // Let's hope this reduces cpu utilization - std::this_thread::yield(); + auto till_time = start + expected_latency * kSleepLimit - kMinWakeTime; + if (now < till_time) { + std::this_thread::sleep_until(till_time); + } else { + std::this_thread::yield(); + } + now = std::chrono::system_clock::now(); } // Return the job descriptor job_ids.push(job_id); + return now - start; } }; @@ -1485,7 +1504,9 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b rmm::device_uvector hashmap; std::atomic> last_touch; - constexpr static auto kLiveInterval = std::chrono::milliseconds(1000); + // This should be large enough to make the runner live through restarts of the benchmark cases. + // Otherwise, the benchmarks slowdown significantly. + constexpr static auto kLiveInterval = std::chrono::milliseconds(2000); persistent_runner_t(DATASET_DESCRIPTOR_T dataset_desc, raft::device_matrix_view graph, @@ -1605,6 +1626,10 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b uint32_t num_queries, uint32_t top_k) { + // The clock is going to be used to estimate the expected latency, control the waiting behavior, + // and touch the keep-alive atomic. + auto start_time = std::chrono::system_clock::now(); + thread_local auto expected_latency = std::chrono::nanoseconds(50000); // submit all queries launcher_t launcher{ job_queue, worker_queue, worker_handles.data(), num_queries, [=](uint32_t job_ix) { @@ -1619,25 +1644,35 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_system); return cflag; }}; - // update the keep-alive atomic in the meanwhile + // Update the state of the keep-alive atomic in the meanwhile auto prev_touch = last_touch.load(std::memory_order_relaxed); - auto this_touch = std::chrono::system_clock::now(); - if (prev_touch + kLiveInterval / 10 < this_touch) { + if (prev_touch + kLiveInterval / 10 < start_time) { // to avoid congestion at this atomic, we only update it if a significant fraction of the live // interval has passed. - last_touch.store(this_touch, std::memory_order_relaxed); + last_touch.store(start_time + expected_latency, std::memory_order_relaxed); } // wait for the results to arrive - launcher.wait(); + auto measured_latency = launcher.wait(start_time, expected_latency); + // bookkeeping: update the expected latency to wait more efficiently + constexpr size_t kWindow = 100; + expected_latency = ((kWindow - 1) * expected_latency + measured_latency) / kWindow; } auto calc_coop_grid_size(uint32_t block_size, uint32_t smem_size) -> dim3 { // We may need to run other kernels alongside this persistent kernel. - // Leave a few SMs idle. - // Note: even when we know there are no other kernels working at the same time, setting - // kDeviceUsage to 1.0 surprisingly hurts performance. - constexpr double kDeviceUsage = 0.9; + // So we can leave a few SMs idle. + // Note: running any other work on GPU alongside with the persistent kernel make the setup + // fragile. + // - Running another kernel in another thread usually works, but no progress guaranteed + // - Any CUDA allocations block the context (this issue may be obscured by using pools) + // - Memory copies to not-pinned host memory may block the context + // + // Even when we know there are no other kernels working at the same time, setting + // kDeviceUsage to 1.0 surprisingly sometimes hurts performance. Proceed with care. 
+ // If you suspect this is an issue, you can reduce this number to ~0.9 without a significant + // impact on the throughput. + constexpr double kDeviceUsage = 1.0; // determine the grid size int ctas_per_sm = 1; From 732072da4bbd0ab0949f46411fdcc704bb42defa Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 7 May 2024 07:57:20 +0200 Subject: [PATCH 27/37] Make the gpu worker report reading the handle is done earlier. Since the worker and job queues were decoupled, it's not necessary to wait for the job to be read anymore. As soon as the descriptor handle is read, it can be returned to the queue. --- .../neighbors/detail/cagra/search_single_cta_kernel-inl.cuh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index e00ae7f606..65b185c9b4 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -993,6 +993,9 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel_p( do { worker_data_local = worker_handle.load(cuda::memory_order_relaxed); } while (worker_data_local.handle == kWaitForWork); + if (worker_data_local.handle != kNoMoreWork) { + worker_handle.store({kWaitForWork}, cuda::memory_order_relaxed); + } job_ix = worker_data_local.value.desc_id; cuda::atomic_thread_fence(cuda::memory_order_acquire, cuda::thread_scope_system); worker_data = worker_data_local; @@ -1007,7 +1010,6 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel_p( } __syncthreads(); if (worker_data.handle == kNoMoreWork) { break; } - if (threadIdx.x == 0) { worker_handle.store({kWaitForWork}, cuda::memory_order_relaxed); } // reading phase auto* result_indices_ptr = job_descriptor.value.result_indices_ptr; From 7450f6f4d4c9ccf8b34888a6c156f009919f2158 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 7 May 2024 08:05:36 +0200 Subject: [PATCH 28/37] Move the last_touch initialization into the constructor of the containing type --- .../detail/cagra/search_single_cta_kernel-inl.cuh | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 65b185c9b4..4f28fa9b17 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -1606,10 +1606,12 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b RAFT_CUDA_TRY(cudaLaunchCooperativeKernel>( kernel, gs, bs, args, smem_size, stream)); RAFT_LOG_INFO( - "Initialized the kernel in stream %zd; job_queue size = %u; worker_queue size = %u", + "Initialized the kernel %p in stream %zd; job_queue size = %u; worker_queue size = %u", + reinterpret_cast(kernel), int64_t((cudaStream_t)stream), kMaxJobsNum, gs.y); + last_touch.store(std::chrono::system_clock::now(), std::memory_order_relaxed); } ~persistent_runner_t() noexcept override @@ -1706,10 +1708,7 @@ auto create_runner(Args... args) -> std::shared_ptr // it's ok.. 
pass std::lock_guard guard(persistent.lock); // Check if the runner has already been created std::shared_ptr runner_outer = std::dynamic_pointer_cast(persistent.runner); - if (runner_outer) { - runner_outer->last_touch.store(std::chrono::system_clock::now(), std::memory_order_relaxed); - return runner_outer; - } + if (runner_outer) { return runner_outer; } // Runner has not yet been created (or it's incompatible): // create it in another thread and only then release the lock. // Free the resources (if any) in advance @@ -1720,8 +1719,7 @@ auto create_runner(Args... args) -> std::shared_ptr // it's ok.. pass std::thread( [&runner_outer, &ready](Args... thread_args) { // pass everything by values // create the runner (the lock is acquired in the parent thread). - runner_outer = std::make_shared(thread_args...); - runner_outer->last_touch.store(std::chrono::system_clock::now(), std::memory_order_relaxed); + runner_outer = std::make_shared(thread_args...); persistent.runner = std::static_pointer_cast(runner_outer); std::weak_ptr runner_weak = runner_outer; ready.test_and_set(cuda::std::memory_order_release); From 8920dfc067cde97f7612a1f6712416475b2f2e08 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 7 May 2024 08:22:15 +0200 Subject: [PATCH 29/37] Modify the resource queue to never loop on head/tail counters --- .../cagra/search_single_cta_kernel-inl.cuh | 98 ++++++++++--------- 1 file changed, 53 insertions(+), 45 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 4f28fa9b17..566eb045f6 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -1227,37 +1227,51 @@ struct alignas(kCacheLineBytes) resource_queue_t { } } - void push(value_type x) noexcept - { - auto& loc = buf_[head_.fetch_add(kCounterIncrement, kMemOrder) & kCounterLocMask]; - loc_push(loc, x); - } + /** + * A slot in the queue to take the value from. + * Once it's obtained, the corresponding value in the queue is lost for other users. + */ + struct promise_t { + explicit promise_t(cuda::std::atomic& loc) : loc_{loc}, val_{Empty} {} + ~promise_t() noexcept { wait(); } + + auto test() noexcept -> bool + { + if (val_ != Empty) { return true; } + val_ = loc_.exchange(kEmpty, kMemOrder); + return val_ != Empty; + } - auto pop() noexcept -> value_type - { - auto& loc = buf_[tail_.fetch_add(kCounterIncrement, kMemOrder) & kCounterLocMask]; - return loc_pop(loc); - } + auto test(value_type& e) noexcept -> bool + { + if (test()) { + e = val_; + return true; + } + return false; + } - auto try_pop(value_type& e) noexcept -> bool - { - auto tail = tail_.load(kMemOrder); - do { - // NB: static cast is here to avoid the case when the head has recently been incremented - // beyond the uint32_t max value. - if (static_cast(head_.load(kMemOrder) - tail) <= 0) { return false; } - } while (!tail_.compare_exchange_weak(tail, tail + kCounterIncrement, kMemOrder, kMemOrder)); - e = loc_pop(buf_[tail & kCounterLocMask]); - return true; - } + auto wait() noexcept -> value_type + { + if (val_ == Empty) { + // [HOT SPOT] + // Optimize for the case of contention: expect the loc is empty. 
+ do { + loc_.wait(kEmpty, kMemOrder); + val_ = loc_.exchange(kEmpty, kMemOrder); + } while (val_ == kEmpty); + } + return val_; + } - private: - alignas(kCacheLineBytes) cuda::std::atomic head_{}; - alignas(kCacheLineBytes) cuda::std::atomic tail_{}; - alignas(kCacheLineBytes) std::array, kSize> buf_{}; + private: + cuda::std::atomic& loc_; + value_type val_; + }; - void loc_push(cuda::std::atomic& loc, value_type x) noexcept + void push(value_type x) noexcept { + auto& loc = buf_[head_.fetch_add(kCounterIncrement, kMemOrder) & kCounterLocMask]; /* [NOT A HOT SPOT] We expect there's always enough place in the queue to push the item, but also we expect a few pop waiters - notify them the data is available. @@ -1269,17 +1283,16 @@ struct alignas(kCacheLineBytes) resource_queue_t { loc.notify_one(); } - auto loc_pop(cuda::std::atomic& loc) noexcept -> value_type + auto pop() noexcept -> promise_t { - // [HOT SPOT] - // Optimize for the case of contention: expect the loc is empty. - value_type x = kEmpty; - do { - loc.wait(kEmpty, kMemOrder); - x = loc.exchange(kEmpty, kMemOrder); - } while (x == kEmpty); - return x; + auto& loc = buf_[tail_.fetch_add(kCounterIncrement, kMemOrder) & kCounterLocMask]; + return promise_t{loc}; } + + private: + alignas(kCacheLineBytes) cuda::std::atomic head_{}; + alignas(kCacheLineBytes) cuda::std::atomic tail_{}; + alignas(kCacheLineBytes) std::array, kSize> buf_{}; }; /** Primitive fixed-size deque for single-threaded use. */ @@ -1379,17 +1392,15 @@ struct alignas(kCacheLineBytes) launcher_t { job_ids{job_ids}, idle_worker_ids{idle_worker_ids}, worker_handles{worker_handles}, - job_id{job_ids.pop()}, - completion_flag{record_work(job_id)} + job_id{job_ids.pop().wait()}, { // Wait for the first worker and submit the query immediately. - // This is supposed to be slightly faster than `try_get_worker`, - // because it does not loop on the queue counter. - submit_query(idle_worker_ids.pop(), 0); + submit_query(idle_worker_ids.pop().wait(), 0); // Submit the rest of the queries in the batch for (uint32_t i = 1; i < n_queries; i++) { + auto promised_worker = idle_worker_ids.pop(); uint32_t worker_id; - while (!try_get_worker(worker_id)) { + while (!promised_worker.test(worker_id)) { if (pending_reads.try_pop_front(worker_id)) { // TODO optimization: avoid the roundtrip through pending_worker_ids if (!try_return_worker(worker_id)) { pending_reads.push_front(worker_id); } @@ -1433,9 +1444,6 @@ struct alignas(kCacheLineBytes) launcher_t { } } - /** Try get a free worker if any. */ - auto try_get_worker(uint32_t& worker_id) -> bool { return idle_worker_ids.try_pop(worker_id); } - /** Check if all workers finished their work. 
*/ auto is_all_done() { @@ -1618,7 +1626,7 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b { auto whs = worker_handles.data(); for (auto i = worker_handles.size(); i > 0; i--) { - whs[worker_queue.pop()].data.store({kNoMoreWork}, cuda::memory_order_relaxed); + whs[worker_queue.pop().wait()].data.store({kNoMoreWork}, cuda::memory_order_relaxed); } RAFT_CUDA_TRY_NO_THROW(cudaStreamSynchronize(stream)); RAFT_LOG_INFO("Destroyed the persistent runner."); From ba7895794727fc5d2b2766d975ceff0586b38411 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 7 May 2024 11:52:17 +0200 Subject: [PATCH 30/37] Replace yield() with a smarter, work-aware pause() to ease the CPU usage by threads --- .../cagra/search_single_cta_kernel-inl.cuh | 148 +++++++++++++----- 1 file changed, 107 insertions(+), 41 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 566eb045f6..7c29f47e3e 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -1218,7 +1218,7 @@ struct alignas(kCacheLineBytes) resource_queue_t { static constexpr auto kMemOrder = cuda::std::memory_order_relaxed; - explicit resource_queue_t() noexcept + explicit resource_queue_t(uint32_t capacity = Size) noexcept : capacity_{capacity} { head_.store(0, kMemOrder); tail_.store(0, kMemOrder); @@ -1227,6 +1227,12 @@ struct alignas(kCacheLineBytes) resource_queue_t { } } + /** Nominal capacity of the queue. */ + [[nodiscard]] auto capacity() const { return capacity_; } + + /** This does not affect the queue behavior, but merely declares a nominal capacity. */ + void set_capacity(uint32_t capacity) { capacity_ = capacity; } + /** * A slot in the queue to take the value from. * Once it's obtained, the corresponding value in the queue is lost for other users. @@ -1293,6 +1299,7 @@ struct alignas(kCacheLineBytes) resource_queue_t { alignas(kCacheLineBytes) cuda::std::atomic head_{}; alignas(kCacheLineBytes) cuda::std::atomic tail_{}; alignas(kCacheLineBytes) std::array, kSize> buf_{}; + alignas(kCacheLineBytes) uint32_t capacity_; }; /** Primitive fixed-size deque for single-threaded use. */ @@ -1382,6 +1389,19 @@ struct alignas(kCacheLineBytes) launcher_t { completion_flag_type* completion_flag; bool all_done = false; + /* [Note: sleeping] + When the number of threads is greater than the number of cores, the threads start to fight for + the CPU time, which reduces the throughput. + To ease the competition, we track the expected GPU latency and let a thread sleep for some + time, and only start to spin when it's about a time to get the result. + */ + static inline constexpr auto kDefaultLatency = std::chrono::nanoseconds(50000); + static inline thread_local auto expected_latency = kDefaultLatency; + const std::chrono::time_point start; + std::chrono::time_point now; + const int64_t pause_factor; + int pause_count = 0; + template launcher_t(job_queue_type& job_ids, worker_queue_type& idle_worker_ids, @@ -1393,6 +1413,10 @@ struct alignas(kCacheLineBytes) launcher_t { idle_worker_ids{idle_worker_ids}, worker_handles{worker_handles}, job_id{job_ids.pop().wait()}, + completion_flag{record_work(job_id)}, + start{std::chrono::system_clock::now()}, + pause_factor{calc_pause_factor(n_queries)}, + now{start} { // Wait for the first worker and submit the query immediately. 
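// [Editor's note - a standalone sketch, not part of the patch.]
// The submission loop above relies on the promise-based pop() of resource_queue_t introduced in
// patch 29: popping claims a slot immediately and returns a promise that can either be polled
// with test() (while the launcher drains pending_reads) or blocked on with wait(). The
// self-contained model below mirrors that contract with plain C++20 atomics; the names
// slot_promise and kEmptySlot are invented for the illustration and do not exist in RAFT.
#include <atomic>
#include <cassert>
#include <cstdint>
#include <thread>

constexpr uint32_t kEmptySlot = 0xFFFFFFFFu;

struct slot_promise {
  std::atomic<uint32_t>& loc;
  uint32_t val = kEmptySlot;

  // Non-blocking: succeeds once the producer has filled the slot.
  auto test(uint32_t& out) -> bool
  {
    if (val == kEmptySlot) { val = loc.exchange(kEmptySlot, std::memory_order_acq_rel); }
    if (val == kEmptySlot) { return false; }
    out = val;
    return true;
  }

  // Blocking: park on the atomic until the producer notifies.
  auto wait() -> uint32_t
  {
    while (val == kEmptySlot) {
      loc.wait(kEmptySlot, std::memory_order_acquire);
      val = loc.exchange(kEmptySlot, std::memory_order_acq_rel);
    }
    return val;
  }
};

int main()
{
  std::atomic<uint32_t> slot{kEmptySlot};
  slot_promise promise{slot};

  std::thread producer([&] {
    slot.store(7, std::memory_order_release);  // a worker id becomes available
    slot.notify_one();
  });

  uint32_t worker_id = 0;
  while (!promise.test(worker_id)) {
    // ... a real launcher would poll pending_reads here instead of just yielding ...
    std::this_thread::yield();
  }
  assert(worker_id == 7);
  producer.join();
  return 0;
}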
submit_query(idle_worker_ids.pop().wait(), 0); @@ -1402,10 +1426,9 @@ struct alignas(kCacheLineBytes) launcher_t { uint32_t worker_id; while (!promised_worker.test(worker_id)) { if (pending_reads.try_pop_front(worker_id)) { - // TODO optimization: avoid the roundtrip through pending_worker_ids if (!try_return_worker(worker_id)) { pending_reads.push_front(worker_id); } } else { - std::this_thread::yield(); + pause(); } } submit_query(worker_id, i); @@ -1416,7 +1439,14 @@ struct alignas(kCacheLineBytes) launcher_t { } } - void submit_query(uint32_t worker_id, uint32_t query_id) + inline ~launcher_t() noexcept // NOLINT + { + // bookkeeping: update the expected latency to wait more efficiently later + constexpr size_t kWindow = 100; // moving average memory + expected_latency = ((kWindow - 1) * expected_latency + now - start) / kWindow; + } + + inline void submit_query(uint32_t worker_id, uint32_t query_id) { worker_handles[worker_id].data.store(worker_handle_t::data_t{.value = {job_id, query_id}}, cuda::memory_order_relaxed); @@ -1426,13 +1456,13 @@ struct alignas(kCacheLineBytes) launcher_t { // It's local, so we must pop and wait for the returned worker to finish its work. auto pending_worker_id = pending_reads.pop_front(); while (!try_return_worker(pending_worker_id)) { - std::this_thread::yield(); + pause(); } } } /** Check if the worker has finished the work; if so, return it to the shared pool. */ - auto try_return_worker(uint32_t worker_id) -> bool + inline auto try_return_worker(uint32_t worker_id) -> bool { // Use the cached `all_done` - makes sense when called from the `wait()` routine. if (all_done || @@ -1445,7 +1475,7 @@ struct alignas(kCacheLineBytes) launcher_t { } /** Check if all workers finished their work. */ - auto is_all_done() + inline auto is_all_done() { // Cache the result of the check to avoid doing unnecessary atomic loads. if (all_done) { return true; } @@ -1453,44 +1483,81 @@ struct alignas(kCacheLineBytes) launcher_t { return all_done; } + /** The launcher shouldn't attempt to wait past the returned time. */ + [[nodiscard]] inline auto sleep_limit() const + { + constexpr auto kMinWakeTime = std::chrono::nanoseconds(10000); + constexpr double kSleepLimit = 0.6; + return start + expected_latency * kSleepLimit - kMinWakeTime; + } + + /** + * When the latency is much larger than expected, it's a sign that there is a thread contention. + * Then we switch to sleeping instead of waiting to give the cpu cycles to other threads. + */ + [[nodiscard]] inline auto overtime_threshold() const + { + constexpr auto kOvertimeFactor = 3; + return start + expected_latency * kOvertimeFactor; + } + + /** + * Calculate the fraction of time can be spent sleeping in a single call to `pause()`. + * Naturally it depends on the number of queries in a batch and the number of parallel workers. + */ + [[nodiscard]] inline auto calc_pause_factor(uint32_t n_queries) const -> uint32_t + { + constexpr uint32_t kMultiplier = 10; + return kMultiplier * raft::div_rounding_up_safe(n_queries, idle_worker_ids.capacity()); + } + + /** Wait a little bit (called in a loop). 
*/ + inline void pause() + { + // Don't sleep this many times hoping for smoother run + constexpr auto kSpinLimit = 3; + // It doesn't make much sense to slee less than this + constexpr auto kPauseTimeMin = std::chrono::nanoseconds(1000); + // Bound sleeping time + constexpr auto kPauseTimeMax = std::chrono::nanoseconds(10000000); + if (pause_count++ < kSpinLimit) { + std::this_thread::yield(); + return; + } + now = std::chrono::system_clock::now(); + auto pause_time_base = std::max(now - start, expected_latency); + auto pause_time = std::clamp(pause_time_base / pause_factor, kPauseTimeMin, kPauseTimeMax); + if (now + pause_time < sleep_limit() || now > overtime_threshold()) { + std::this_thread::sleep_for(pause_time); + } else { + std::this_thread::yield(); + } + } + /** Wait for all work to finish and don't forget to return the workers to the shared pool. */ - auto wait(std::chrono::time_point start, - std::chrono::nanoseconds expected_latency) + inline void wait() { uint32_t worker_id; while (pending_reads.try_pop_front(worker_id)) { while (!try_return_worker(worker_id)) { - if (!is_all_done()) { std::this_thread::yield(); } + if (!is_all_done()) { pause(); } } } // terminal state, should be engaged only after the `pending_reads` is empty // and `queries_submitted == n_queries` - auto now = std::chrono::system_clock::now(); - /* [Note: sleeping] - This code segment is a hot spot when the number of queries is low. - When the number of threads is greater than the number of cores, the threads start to fight for - the core time, which reduces the throughput. - To ease the competition, we track the expected GPU latency and let a thread sleep for some - time, and only start to spin when it's about a time to get the result. - - The constants below balance the sleep/spin time to achieve the best throughput while keeping - the latency at adequate levels. - */ - constexpr auto kMinWakeTime = std::chrono::nanoseconds(10000); - constexpr double kSleepLimit = 0.6; + now = std::chrono::system_clock::now(); while (!is_all_done()) { - auto till_time = start + expected_latency * kSleepLimit - kMinWakeTime; + auto till_time = sleep_limit(); if (now < till_time) { std::this_thread::sleep_until(till_time); + now = std::chrono::system_clock::now(); } else { - std::this_thread::yield(); + pause(); } - now = std::chrono::system_clock::now(); } // Return the job descriptor job_ids.push(job_id); - return now - start; } }; @@ -1569,6 +1636,7 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b } // initialize the worker queue + worker_queue.set_capacity(gs.y); worker_handles.resize(gs.y, stream); auto* worker_handles_ptr = worker_handles.data(); RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); @@ -1617,8 +1685,8 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b "Initialized the kernel %p in stream %zd; job_queue size = %u; worker_queue size = %u", reinterpret_cast(kernel), int64_t((cudaStream_t)stream), - kMaxJobsNum, - gs.y); + job_queue.capacity(), + worker_queue.capacity()); last_touch.store(std::chrono::system_clock::now(), std::memory_order_relaxed); } @@ -1638,10 +1706,6 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b uint32_t num_queries, uint32_t top_k) { - // The clock is going to be used to estimate the expected latency, control the waiting behavior, - // and touch the keep-alive atomic. 
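// [Editor's note - a standalone sketch, not part of the patch.]
// Patch 30 replaces plain yield() spinning with a work-aware pause(): the launcher keeps a
// thread-local moving average of the kernel latency and sleeps for a bounded fraction of the
// expected remaining time, only falling back to yield() close to the expected completion. A
// condensed model of that policy follows; the constants and the wait_policy name are invented
// for the illustration.
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <thread>

struct wait_policy {
  using clock = std::chrono::system_clock;
  using ns    = std::chrono::nanoseconds;

  ns expected_latency{50'000};            // moving average over past batches
  clock::time_point start{clock::now()};  // submission time of this batch
  int64_t pause_factor{10};               // roughly: queries per GPU worker, scaled
  int spin_count{0};

  void pause()
  {
    if (spin_count++ < 3) {  // spin a few times before sleeping at all
      std::this_thread::yield();
      return;
    }
    auto now  = clock::now();
    auto base = std::max(std::chrono::duration_cast<ns>(now - start), expected_latency);
    auto nap  = std::clamp(base / pause_factor, ns{1'000}, ns{10'000'000});
    // Sleep only while far from the expected completion, or when badly over time
    // (the latter being a sign of CPU thread contention).
    if (now + nap < start + expected_latency * 6 / 10 || now > start + expected_latency * 3) {
      std::this_thread::sleep_for(nap);
    } else {
      std::this_thread::yield();
    }
  }

  void finish()
  {
    // 100-sample moving average of the observed latency, as in the patch.
    auto measured    = std::chrono::duration_cast<ns>(clock::now() - start);
    expected_latency = (99 * expected_latency + measured) / 100;
  }
};

int main()
{
  wait_policy w;
  for (int i = 0; i < 5; i++) { w.pause(); }  // stand-in for "results not ready yet"
  w.finish();
  return 0;
}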
- auto start_time = std::chrono::system_clock::now(); - thread_local auto expected_latency = std::chrono::nanoseconds(50000); // submit all queries launcher_t launcher{ job_queue, worker_queue, worker_handles.data(), num_queries, [=](uint32_t job_ix) { @@ -1656,18 +1720,16 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_system); return cflag; }}; + // Update the state of the keep-alive atomic in the meanwhile auto prev_touch = last_touch.load(std::memory_order_relaxed); - if (prev_touch + kLiveInterval / 10 < start_time) { + if (prev_touch + kLiveInterval / 10 < launcher.now) { // to avoid congestion at this atomic, we only update it if a significant fraction of the live // interval has passed. - last_touch.store(start_time + expected_latency, std::memory_order_relaxed); + last_touch.store(launcher.now, std::memory_order_relaxed); } // wait for the results to arrive - auto measured_latency = launcher.wait(start_time, expected_latency); - // bookkeeping: update the expected latency to wait more efficiently - constexpr size_t kWindow = 100; - expected_latency = ((kWindow - 1) * expected_latency + measured_latency) / kWindow; + launcher.wait(); } auto calc_coop_grid_size(uint32_t block_size, uint32_t smem_size) -> dim3 @@ -1762,8 +1824,12 @@ auto get_runner(Args&&... args) -> std::shared_ptr static thread_local std::weak_ptr weak; auto runner = weak.lock(); if (runner) { return runner; } - runner = create_runner(std::forward(args)...); - weak = runner; + // Thread-local variable expected_latency makes sense only for a current RunnerT configuration. + // If `weak` is not alive, it's a hint the configuration has changed and we should reset our + // estimate of the expected launch latency. 
+ launcher_t::expected_latency = launcher_t::kDefaultLatency; + runner = create_runner(std::forward(args)...); + weak = runner; return runner; } From 304a864c955aad94b04c291bbfe16afb139de8d7 Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 13 May 2024 08:06:35 +0200 Subject: [PATCH 31/37] Expose thread_block_size parameter --- cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h index bda7ba8d03..656aa72d71 100644 --- a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h +++ b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h @@ -249,6 +249,9 @@ void parse_search_param(const nlohmann::json& conf, if (conf.contains("itopk")) { param.p.itopk_size = conf.at("itopk"); } if (conf.contains("search_width")) { param.p.search_width = conf.at("search_width"); } if (conf.contains("max_iterations")) { param.p.max_iterations = conf.at("max_iterations"); } + if (conf.contains("thread_block_size")) { + param.p.thread_block_size = conf.at("thread_block_size"); + } if (conf.contains("algo")) { if (conf.at("algo") == "single_cta") { param.p.algo = raft::neighbors::experimental::cagra::search_algo::SINGLE_CTA; From 08799556e5396b3e9ce561ff96472f65fcece277 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 14 May 2024 10:48:06 +0200 Subject: [PATCH 32/37] Make the 'persistent' parameter in the search_params --- cpp/include/raft/neighbors/cagra_types.hpp | 2 ++ .../raft/neighbors/detail/cagra/search_plan.cuh | 12 ++---------- .../neighbors/detail/cagra/search_single_cta.cuh | 10 ++++++++-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/include/raft/neighbors/cagra_types.hpp b/cpp/include/raft/neighbors/cagra_types.hpp index 97c9c0d098..3ea75313c5 100644 --- a/cpp/include/raft/neighbors/cagra_types.hpp +++ b/cpp/include/raft/neighbors/cagra_types.hpp @@ -124,6 +124,8 @@ struct search_params : ann::search_params { uint32_t num_random_samplings = 1; /** Bit mask used for initial random seed node selection. */ uint64_t rand_xor_mask = 0x128394; + /** Whether to use the persistent version of the kernel (only SINGLE_CTA is supported a.t.m.) 
*/ + bool persistent = false; }; static_assert(std::is_aggregate_v); diff --git a/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh b/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh index 41a07388c7..a9dc894587 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh @@ -107,21 +107,13 @@ struct search_plan_impl_base : public search_params { int64_t graph_degree; uint32_t topk; raft::distance::DistanceType metric; - bool is_persistent; - - static constexpr uint64_t kPMask = 0x8000000000000000LL; search_plan_impl_base(search_params params, int64_t dim, int64_t graph_degree, uint32_t topk, raft::distance::DistanceType metric) - : search_params(params), - dim(dim), - graph_degree(graph_degree), - topk(topk), - metric(metric), - is_persistent(params.rand_xor_mask & kPMask) + : search_params(params), dim(dim), graph_degree(graph_degree), topk(topk), metric(metric) { set_dataset_block_and_team_size(dim); if (algo == search_algo::AUTO) { @@ -194,7 +186,7 @@ struct search_plan_impl : public search_plan_impl_base { check_params(); calc_hashmap_params(res); set_dataset_block_and_team_size(dim); - if (!is_persistent) { // Persistent kernel does not provide this functionality + if (!persistent) { // Persistent kernel does not provide this functionality num_executed_iterations.resize(max_queries, resource::get_cuda_stream(res)); } RAFT_LOG_DEBUG("# algo = %d", static_cast(algo)); diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh index 9e1215713f..442296aa40 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh @@ -201,7 +201,7 @@ struct search : search_plan_impl { } RAFT_LOG_DEBUG("# smem_size: %u", smem_size); hashmap_size = 0; - if (small_hash_bitlen == 0 && !this->is_persistent) { + if (small_hash_bitlen == 0 && !this->persistent) { hashmap_size = max_queries * hashmap::get_size(hash_bitlen); hashmap.resize(hashmap_size, resource::get_cuda_stream(res)); } @@ -221,6 +221,12 @@ struct search : search_plan_impl { SAMPLE_FILTER_T sample_filter) { cudaStream_t stream = resource::get_cuda_stream(res); + + // Set the 'persistent' flag as the first bit of rand_xor_mask to avoid changing the signature + // of the select_and_run for now. + constexpr uint64_t kPMask = 0x8000000000000000LL; + auto rand_xor_mask_augmented = + this->persistent ? 
(rand_xor_mask | kPMask) : (rand_xor_mask & ~kPMask); select_and_run( dataset_desc, graph, @@ -239,7 +245,7 @@ struct search : search_plan_impl { small_hash_bitlen, small_hash_reset_interval, num_random_samplings, - rand_xor_mask, + rand_xor_mask_augmented, num_seeds, itopk_size, search_width, From affdcb29288323e29fcdcb7edaee735ecf2dd9de Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 14 May 2024 15:43:09 +0200 Subject: [PATCH 33/37] Update the parameter parser to use the 'persistent' flag in the search_params struct --- cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h index 656aa72d71..289e7a293f 100644 --- a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h +++ b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h @@ -249,6 +249,7 @@ void parse_search_param(const nlohmann::json& conf, if (conf.contains("itopk")) { param.p.itopk_size = conf.at("itopk"); } if (conf.contains("search_width")) { param.p.search_width = conf.at("search_width"); } if (conf.contains("max_iterations")) { param.p.max_iterations = conf.at("max_iterations"); } + if (conf.contains("persistent")) { param.p.persistent = conf.at("persistent"); } if (conf.contains("thread_block_size")) { param.p.thread_block_size = conf.at("thread_block_size"); } @@ -274,12 +275,5 @@ void parse_search_param(const nlohmann::json& conf, } // Same ratio as in IVF-PQ param.refine_ratio = conf.value("refine_ratio", 1.0f); - - uint64_t pmask = 0x8000000000000000LL; - if (conf.contains("persistent") && conf.at("persistent")) { - param.p.rand_xor_mask |= pmask; - } else { - param.p.rand_xor_mask &= ~pmask; - } } #endif From a48d8f877e38026a1d1d2f50d8bcd0ce68d1d55d Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 15 May 2024 08:23:40 +0200 Subject: [PATCH 34/37] Fix the uses_stream() not adapted to the previous change introducing the 'persistent' search parameter --- cpp/bench/ann/src/raft/raft_cagra_wrapper.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h index 386d21f616..b9666be548 100644 --- a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h @@ -116,14 +116,11 @@ class RaftCagra : public ANN, public AnnGPU { [[nodiscard]] auto uses_stream() const noexcept -> bool override { - // To avoid too much api changes in the prototype, I encode whether the algorithm runs - // persistent kernel using the highest bit in the `rand_xor_mask` parameter. - uint64_t pmask = 0x8000000000000000LL; // If the algorithm uses persistent kernel, the CPU has to synchronize by the end of computing // the result. Hence it guarantees the benchmark CUDA stream is empty by the end of the - // execution. Hence we notify the benchmark to not waste the time on recording & synchronizing + // execution. Hence we inform the benchmark to not waste the time on recording & synchronizing // the event. 
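// [Editor's note - a usage sketch, not part of the patch.]
// After patches 32-34 the persistent mode is a regular member of cagra::search_params rather
// than a bit hidden in rand_xor_mask, so callers opt in directly. Roughly as follows (the
// resources/index setup and the mdspan views are assumed to exist already; only SINGLE_CTA
// supports the flag):
#include <raft/neighbors/cagra.cuh>

void search_with_persistent_kernel(
  raft::resources const& res,
  raft::neighbors::cagra::index<float, uint32_t> const& index,
  raft::device_matrix_view<const float, int64_t> queries,
  raft::device_matrix_view<uint32_t, int64_t> neighbors,
  raft::device_matrix_view<float, int64_t> distances)
{
  raft::neighbors::cagra::search_params params;
  params.algo       = raft::neighbors::cagra::search_algo::SINGLE_CTA;
  params.persistent = true;  // reuse one long-running cooperative kernel across calls
  raft::neighbors::cagra::search(res, params, index, queries, neighbors, distances);
  // The call returns only once the results are ready on the host side of the job queue, which is
  // why the benchmark wrapper reports uses_stream() == false in this mode.
}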
-    return !(search_params_.rand_xor_mask & pmask);
+    return !search_params_.persistent;
   }
 
   // to enable dataset access from GPU memory
From 6079cc957348205849f23019c928dccf993f139b Mon Sep 17 00:00:00 2001
From: achirkin
Date: Thu, 16 May 2024 16:52:55 +0200
Subject: [PATCH 35/37] Recover the uses_stream() function in the cagra_wrapper
 after the code merge

---
 cpp/bench/ann/src/raft/raft_cagra_wrapper.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h
index 0b892dec35..1e8e50fd01 100644
--- a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h
+++ b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h
@@ -112,6 +112,15 @@ class RaftCagra : public ANN, public AnnGPU {
     return handle_.get_sync_stream();
   }
 
+  [[nodiscard]] auto uses_stream() const noexcept -> bool override
+  {
+    // If the algorithm uses persistent kernel, the CPU has to synchronize by the end of computing
+    // the result. Hence it guarantees the benchmark CUDA stream is empty by the end of the
+    // execution. Hence we inform the benchmark to not waste the time on recording & synchronizing
+    // the event.
+    return !search_params_.persistent;
+  }
+
   // to enable dataset access from GPU memory
   AlgoProperty get_preference() const override
   {
From 74d47c2b006ed36d6e77df88d89c174266f41077 Mon Sep 17 00:00:00 2001
From: achirkin
Date: Mon, 24 Jun 2024 11:24:43 +0200
Subject: [PATCH 36/37] Restart the persistent kernel if launch parameters
 change

---
 .../cagra/search_single_cta_kernel-inl.cuh   | 69 +++++++++++++++++--
 1 file changed, 64 insertions(+), 5 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
index 7c29f47e3e..87e2073823 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
@@ -1580,11 +1580,40 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b
   rmm::device_uvector completion_counters;
   rmm::device_uvector hashmap;
   std::atomic> last_touch;
+  uint64_t param_hash;
 
   // This should be large enough to make the runner live through restarts of the benchmark cases.
   // Otherwise, the benchmarks slowdown significantly.
   constexpr static auto kLiveInterval = std::chrono::milliseconds(2000);
 
+  /**
+   * Calculate the hash of the parameters to detect if they've changed across the calls.
+   * NB: this must have the same argument types as the constructor.
+ */ + static inline auto calculate_parameter_hash( + DATASET_DESCRIPTOR_T dataset_desc, + raft::device_matrix_view graph, + uint32_t num_itopk_candidates, + uint32_t block_size, // + uint32_t smem_size, + int64_t hash_bitlen, + size_t small_hash_bitlen, + size_t small_hash_reset_interval, + uint32_t num_random_samplings, + uint64_t rand_xor_mask, + uint32_t num_seeds, + size_t itopk_size, + size_t search_width, + size_t min_iterations, + size_t max_iterations, + SAMPLE_FILTER_T sample_filter, + raft::distance::DistanceType metric) -> uint64_t + { + return uint64_t(graph.data_handle()) ^ num_itopk_candidates ^ block_size ^ smem_size ^ + hash_bitlen ^ small_hash_reset_interval ^ num_random_samplings ^ rand_xor_mask ^ + num_seeds ^ itopk_size ^ search_width ^ min_iterations ^ max_iterations ^ metric; + } + persistent_runner_t(DATASET_DESCRIPTOR_T dataset_desc, raft::device_matrix_view graph, uint32_t num_itopk_candidates, @@ -1609,7 +1638,24 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b worker_handles(0, stream, worker_handles_mr), job_descriptors(kMaxJobsNum, stream, job_descriptor_mr), completion_counters(kMaxJobsNum, stream, device_mr), - hashmap(0, stream, device_mr) + hashmap(0, stream, device_mr), + param_hash(calculate_parameter_hash(dataset_desc, + graph, + num_itopk_candidates, + block_size, + smem_size, + hash_bitlen, + small_hash_bitlen, + small_hash_reset_interval, + num_random_samplings, + rand_xor_mask, + num_seeds, + itopk_size, + search_width, + min_iterations, + max_iterations, + sample_filter, + metric)) { // set kernel attributes same as in normal kernel RAFT_CUDA_TRY( @@ -1778,7 +1824,13 @@ auto create_runner(Args... args) -> std::shared_ptr // it's ok.. pass std::lock_guard guard(persistent.lock); // Check if the runner has already been created std::shared_ptr runner_outer = std::dynamic_pointer_cast(persistent.runner); - if (runner_outer) { return runner_outer; } + if (runner_outer) { + if (runner_outer->param_hash == RunnerT::calculate_parameter_hash(args...)) { + return runner_outer; + } else { + runner_outer.reset(); + } + } // Runner has not yet been created (or it's incompatible): // create it in another thread and only then release the lock. // Free the resources (if any) in advance @@ -1817,18 +1869,25 @@ auto create_runner(Args... args) -> std::shared_ptr // it's ok.. pass } template -auto get_runner(Args&&... args) -> std::shared_ptr +auto get_runner(Args... args) -> std::shared_ptr { // Using a thread-local weak pointer allows us to avoid using locks/atomics, // since the control block of weak/shared pointers is thread-safe. static thread_local std::weak_ptr weak; auto runner = weak.lock(); - if (runner) { return runner; } + if (runner) { + if (runner->param_hash == RunnerT::calculate_parameter_hash(args...)) { + return runner; + } else { + weak.reset(); + runner.reset(); + } + } // Thread-local variable expected_latency makes sense only for a current RunnerT configuration. // If `weak` is not alive, it's a hint the configuration has changed and we should reset our // estimate of the expected launch latency. 
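// [Editor's note - a standalone sketch, not part of the patch.]
// Patch 36 makes the cached persistent runner configuration-aware: the runner stores a hash of
// its construction arguments, and get_runner()/create_runner() only reuse the cached pointer
// when the hash of the requested arguments matches; otherwise the old runner is dropped and the
// kernel is relaunched. A stripped-down model of the pattern (runner_t and get_runner_cached are
// invented names):
#include <cstdint>
#include <memory>

struct runner_t {
  uint64_t param_hash;
  explicit runner_t(uint64_t h) : param_hash{h} {}

  static auto hash_params(uint32_t block_size, uint32_t smem_size) -> uint64_t
  {
    // XOR-combine the parameters, as the patch does: any change flips the hash with high
    // probability, which is all that is needed to trigger a restart.
    return uint64_t{block_size} ^ (uint64_t{smem_size} << 32);
  }
};

auto get_runner_cached(uint32_t block_size, uint32_t smem_size) -> std::shared_ptr<runner_t>
{
  static thread_local std::weak_ptr<runner_t> weak;
  const auto wanted = runner_t::hash_params(block_size, smem_size);
  if (auto runner = weak.lock(); runner && runner->param_hash == wanted) {
    return runner;  // same configuration: keep the long-running kernel alive
  }
  // First call or changed configuration: in the real code this also tears down and relaunches
  // the persistent kernel (and resets the expected-latency estimate, as the comment above notes).
  auto runner = std::make_shared<runner_t>(wanted);
  weak        = runner;
  return runner;
}

int main()
{
  auto a = get_runner_cached(256, 1u << 14);
  auto b = get_runner_cached(256, 1u << 14);  // reused: identical parameters
  auto c = get_runner_cached(512, 1u << 14);  // different block size: a fresh runner
  return (a == b && a != c) ? 0 : 1;
}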
launcher_t::expected_latency = launcher_t::kDefaultLatency; - runner = create_runner(std::forward(args)...); + runner = create_runner(args...); weak = runner; return runner; } From 22a77c26022f2bad6d32abbc9190391e6c2b8908 Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 24 Jun 2024 17:39:38 +0200 Subject: [PATCH 37/37] Allow 'pinned' value for the benchmark queries and adjust host refinement helper to avoid unnecessary allocations and stream syncs in that case --- cpp/bench/ann/src/common/ann_types.hpp | 3 + cpp/bench/ann/src/common/dataset.hpp | 56 +++++++++++++++++-- cpp/bench/ann/src/common/util.hpp | 33 ++++++++++- cpp/bench/ann/src/raft/raft_ann_bench_utils.h | 56 +++++++++++++------ cpp/bench/ann/src/raft/raft_cagra_wrapper.h | 35 +++++++++--- .../cagra/search_single_cta_kernel-inl.cuh | 4 ++ 6 files changed, 154 insertions(+), 33 deletions(-) diff --git a/cpp/bench/ann/src/common/ann_types.hpp b/cpp/bench/ann/src/common/ann_types.hpp index 19cc36b17e..70c502da6a 100644 --- a/cpp/bench/ann/src/common/ann_types.hpp +++ b/cpp/bench/ann/src/common/ann_types.hpp @@ -33,6 +33,7 @@ enum Objective { enum class MemoryType { Host, HostMmap, + HostPinned, Device, }; @@ -58,6 +59,8 @@ inline auto parse_memory_type(const std::string& memory_type) -> MemoryType return MemoryType::Host; } else if (memory_type == "mmap") { return MemoryType::HostMmap; + } else if (memory_type == "pinned") { + return MemoryType::HostPinned; } else if (memory_type == "device") { return MemoryType::Device; } else { diff --git a/cpp/bench/ann/src/common/dataset.hpp b/cpp/bench/ann/src/common/dataset.hpp index 8fcff77d3c..7e8e7ba8f8 100644 --- a/cpp/bench/ann/src/common/dataset.hpp +++ b/cpp/bench/ann/src/common/dataset.hpp @@ -283,7 +283,28 @@ class Dataset { { switch (memory_type) { case MemoryType::Device: return query_set_on_gpu(); - default: return query_set(); + case MemoryType::Host: { + auto r = query_set(); +#ifndef BUILD_CPU_ONLY + if (query_set_pinned_) { + cudaHostUnregister(const_cast(r)); + query_set_pinned_ = false; + } +#endif + return r; + } + case MemoryType::HostPinned: { + auto r = query_set(); +#ifndef BUILD_CPU_ONLY + if (!query_set_pinned_) { + cudaHostRegister( + const_cast(r), query_set_size() * dim() * sizeof(T), cudaHostRegisterDefault); + query_set_pinned_ = true; + } +#endif + return r; + } + default: return nullptr; } } @@ -291,7 +312,27 @@ class Dataset { { switch (memory_type) { case MemoryType::Device: return base_set_on_gpu(); - case MemoryType::Host: return base_set(); + case MemoryType::Host: { + auto r = base_set(); +#ifndef BUILD_CPU_ONLY + if (base_set_pinned_) { + cudaHostUnregister(const_cast(r)); + base_set_pinned_ = false; + } +#endif + return r; + } + case MemoryType::HostPinned: { + auto r = base_set(); +#ifndef BUILD_CPU_ONLY + if (!base_set_pinned_) { + cudaHostRegister( + const_cast(r), base_set_size() * dim() * sizeof(T), cudaHostRegisterDefault); + base_set_pinned_ = true; + } +#endif + return r; + } case MemoryType::HostMmap: return mapped_base_set(); default: return nullptr; } @@ -312,18 +353,23 @@ class Dataset { mutable T* d_query_set_ = nullptr; mutable T* mapped_base_set_ = nullptr; mutable int32_t* gt_set_ = nullptr; + + mutable bool base_set_pinned_ = false; + mutable bool query_set_pinned_ = false; }; template Dataset::~Dataset() { - delete[] base_set_; - delete[] query_set_; - delete[] gt_set_; #ifndef BUILD_CPU_ONLY if (d_base_set_) { cudaFree(d_base_set_); } if (d_query_set_) { cudaFree(d_query_set_); } + if (base_set_pinned_) { 
cudaHostUnregister(base_set_); } + if (query_set_pinned_) { cudaHostUnregister(query_set_); } #endif + delete[] base_set_; + delete[] query_set_; + delete[] gt_set_; } template diff --git a/cpp/bench/ann/src/common/util.hpp b/cpp/bench/ann/src/common/util.hpp index 96185c79eb..c481f589bd 100644 --- a/cpp/bench/ann/src/common/util.hpp +++ b/cpp/bench/ann/src/common/util.hpp @@ -197,10 +197,12 @@ struct result_buffer { explicit result_buffer(size_t size, cudaStream_t stream) : size_{size}, stream_{stream} { if (size_ == 0) { return; } - data_host_ = malloc(size_); #ifndef BUILD_CPU_ONLY cudaMallocAsync(&data_device_, size_, stream_); + cudaMallocHost(&data_host_, size_); cudaStreamSynchronize(stream_); +#else + data_host_ = malloc(size_); #endif } result_buffer() = delete; @@ -213,9 +215,11 @@ struct result_buffer { if (size_ == 0) { return; } #ifndef BUILD_CPU_ONLY cudaFreeAsync(data_device_, stream_); + cudaFreeHost(data_host_); cudaStreamSynchronize(stream_); -#endif +#else free(data_host_); +#endif } [[nodiscard]] auto size() const noexcept { return size_; } @@ -278,6 +282,31 @@ inline auto get_result_buffer_from_global_pool(size_t size) -> result_buffer& return rb; } +namespace detail { +inline std::vector> global_tmp_buffer_pool(0); +inline std::mutex gtp_mutex; +} // namespace detail + +/** + * Global temporary buffer pool for use by algorithms. + * In contrast to `get_result_buffer_from_global_pool`, the content of these buffers is never + * initialized. + */ +inline auto get_tmp_buffer_from_global_pool(size_t size) -> result_buffer& +{ + auto stream = get_stream_from_global_pool(); + auto& rb = [stream, size]() -> result_buffer& { + std::lock_guard guard(detail::gtp_mutex); + if (static_cast(detail::global_tmp_buffer_pool.size()) < benchmark_n_threads) { + detail::global_tmp_buffer_pool.resize(benchmark_n_threads); + } + auto& rb = detail::global_tmp_buffer_pool[benchmark_thread_id]; + if (!rb || rb->size() < size) { rb = std::make_unique(size, stream); } + return *rb; + }(); + return rb; +} + /** * Delete all streams and memory allocations in the global pool. 
* It's called at the end of the `main` function - before global/static variables and cuda context diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h index 9b086fdb23..f754faa17b 100644 --- a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h +++ b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h @@ -228,27 +228,47 @@ void refine_helper(const raft::resources& res, } else { auto dataset_host = raft::make_host_matrix_view( dataset.data_handle(), dataset.extent(0), dataset.extent(1)); - auto queries_host = raft::make_host_matrix(batch_size, dim); - auto candidates_host = raft::make_host_matrix(batch_size, k0); - auto neighbors_host = raft::make_host_matrix(batch_size, k); - auto distances_host = raft::make_host_matrix(batch_size, k); - auto stream = resource::get_cuda_stream(res); - raft::copy(queries_host.data_handle(), queries.data_handle(), queries_host.size(), stream); - raft::copy( - candidates_host.data_handle(), candidates.data_handle(), candidates_host.size(), stream); + if (raft::get_device_for_address(queries.data_handle()) >= 0) { + // Queries & results are on the device - raft::resource::sync_stream(res); // wait for the queries and candidates - raft::neighbors::refine(res, - dataset_host, - queries_host.view(), - candidates_host.view(), - neighbors_host.view(), - distances_host.view(), - metric); + auto queries_host = raft::make_host_matrix(batch_size, dim); + auto candidates_host = raft::make_host_matrix(batch_size, k0); + auto neighbors_host = raft::make_host_matrix(batch_size, k); + auto distances_host = raft::make_host_matrix(batch_size, k); + + auto stream = resource::get_cuda_stream(res); + raft::copy(queries_host.data_handle(), queries.data_handle(), queries_host.size(), stream); + raft::copy( + candidates_host.data_handle(), candidates.data_handle(), candidates_host.size(), stream); + + raft::resource::sync_stream(res); // wait for the queries and candidates + raft::neighbors::refine(res, + dataset_host, + queries_host.view(), + candidates_host.view(), + neighbors_host.view(), + distances_host.view(), + metric); + + raft::copy(neighbors, neighbors_host.data_handle(), neighbors_host.size(), stream); + raft::copy(distances, distances_host.data_handle(), distances_host.size(), stream); + + } else { + // Queries & results are on the host - no device sync / copy needed + + auto queries_host = raft::make_host_matrix_view( + queries.data_handle(), batch_size, dim); + auto candidates_host = raft::make_host_matrix_view( + candidates.data_handle(), batch_size, k0); + auto neighbors_host = + raft::make_host_matrix_view(neighbors, batch_size, k); + auto distances_host = + raft::make_host_matrix_view(distances, batch_size, k); - raft::copy(neighbors, neighbors_host.data_handle(), neighbors_host.size(), stream); - raft::copy(distances, distances_host.data_handle(), distances_host.size(), stream); + raft::neighbors::refine( + res, dataset_host, queries_host, candidates_host, neighbors_host, distances_host, metric); + } } } diff --git a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h index 1e8e50fd01..2a3004c24c 100644 --- a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h @@ -329,14 +329,33 @@ void RaftCagra::search( } else { auto queries_v = raft::make_device_matrix_view(queries, batch_size, dimension_); - auto candidate_ixs = - raft::make_device_matrix(res, batch_size, k0); - auto candidate_dists = - raft::make_device_matrix(res, batch_size, k0); - 
search_base(
-      queries, batch_size, k0, candidate_ixs.data_handle(), candidate_dists.data_handle());
-    refine_helper(
-      res, *input_dataset_v_, queries_v, candidate_ixs, k, neighbors, distances, index_->metric());
+
+    auto& tmp_buf = get_tmp_buffer_from_global_pool((sizeof(float) + sizeof(AnnBase::index_type)) *
+                                                    batch_size * k0);
+    auto mem_type =
+      raft::get_device_for_address(neighbors) >= 0 ? MemoryType::Device : MemoryType::HostPinned;
+
+    auto candidate_ixs = raft::make_device_matrix_view(
+      reinterpret_cast(tmp_buf.data(mem_type)), batch_size, k0);
+    auto candidate_dists = reinterpret_cast(candidate_ixs.data_handle() + batch_size * k0);
+
+    search_base(queries, batch_size, k0, candidate_ixs.data_handle(), candidate_dists);
+
+    if (mem_type == MemoryType::HostPinned && uses_stream()) {
+      // If the algorithm uses a stream to synchronize (non-persistent kernel), but the data is in
+      // the pinned host memory, we need to synchronize before the refinement operation to wait
+      // for the data to be available on the host.
+      raft::resource::sync_stream(res);
+    }
+
+    refine_helper(res,
+                  *input_dataset_v_,
+                  queries_v,
+                  raft::make_const_mdspan(candidate_ixs),
+                  k,
+                  neighbors,
+                  distances,
+                  index_->metric());
   }
 }
 }  // namespace raft::bench::ann
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
index 87e2073823..48b98b9b93 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
@@ -1050,6 +1050,10 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel_p(
     sample_filter,
     metric);
 
+  // make sure all writes are visible even for the host
+  // (e.g. when result buffers are in pinned memory)
+  cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_system);
+
   // arrive to mark the end of the work phase
   __syncthreads();
   if (threadIdx.x == 0) {
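// [Editor's note - a standalone CUDA sketch, not part of the patch series.]
// Patch 37 keeps benchmark queries/results in pinned host memory and, as shown in the hunk
// above, ends the worker's compute phase with a system-scope release fence so the host can read
// the results without a stream synchronization. The miniature example below shows the same
// producer/consumer handshake; all names are invented for the illustration.
#include <cuda/atomic>
#include <cuda_runtime.h>
#include <cstdio>
#include <new>

using flag_t = cuda::atomic<int, cuda::thread_scope_system>;

__global__ void produce(int* result, flag_t* flag)
{
  *result = 42;  // stand-in for writing the search results
  // Make the result visible to the host before signalling completion.
  cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_system);
  flag->store(1, cuda::memory_order_relaxed);
}

int main()
{
  int* result  = nullptr;
  flag_t* flag = nullptr;
  cudaMallocHost(&result, sizeof(*result));  // pinned: visible to both host and device
  cudaMallocHost(&flag, sizeof(*flag));
  new (flag) flag_t{0};

  produce<<<1, 1>>>(result, flag);

  // The host spins on the flag instead of synchronizing the stream - the same property the
  // benchmark exploits when uses_stream() returns false for the persistent kernel.
  while (flag->load(cuda::memory_order_acquire) == 0) {}
  std::printf("result = %d\n", *result);

  cudaFreeHost(result);
  cudaFreeHost(flag);
  return 0;
}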