diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index fec1248bb..d8d554648 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -199,6 +199,96 @@ endif()
 
 # ##################################################################################################
 # * cuvs ---------------------------------------------------------------------
+add_library(
+  cuvs-cagra-search STATIC # CAGRA search + distance kernels, linked PRIVATE into cuvs
+  src/neighbors/cagra_search_float.cu
+  src/neighbors/cagra_search_int8.cu
+  src/neighbors/cagra_search_uint8.cu
+  src/neighbors/detail/cagra/compute_distance.cu
+  src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu
+  src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu
+  src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu
+  src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim128_t8.cu
+  src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu
+  src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim512_t32.cu
+  src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim128_t8.cu
+  src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim256_t16.cu
+  src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim512_t32.cu
+  src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim128_t8.cu
+  src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim256_t16.cu
+  src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim512_t32.cu
+  src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim128_t8.cu
+  src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu
+  src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu
+  src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu
+  src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu
+  src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu
+  src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu
+  src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu
+  src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim512_t32.cu
+  src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim128_t8.cu
+  src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim256_t16.cu
+  src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu
+  src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu
+  src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu
+  src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu
+  src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu
+  src/neighbors/detail/cagra/search_multi_cta_float_uint64.cu
+  src/neighbors/detail/cagra/search_multi_cta_half_uint64.cu
+  src/neighbors/detail/cagra/search_single_cta_float_uint32.cu
+  src/neighbors/detail/cagra/search_single_cta_half_uint32.cu
+  src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu
+  src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu
+  src/neighbors/detail/cagra/search_single_cta_float_uint64.cu
+  src/neighbors/detail/cagra/search_single_cta_half_uint64.cu
+)
+
+file(GLOB_RECURSE compute_distance_sources "src/neighbors/detail/cagra/compute_distance_*.cu") # the "_" keeps plain compute_distance.cu out of the match
+set_source_files_properties(${compute_distance_sources} PROPERTIES COMPILE_FLAGS -maxrregcount=64) # nvcc register cap, applied only to the globbed files
+
+set_target_properties(
+  cuvs-cagra-search
+  PROPERTIES BUILD_RPATH "\$ORIGIN" # NOTE(review): target is STATIC - confirm an RPATH entry is needed here
+             CXX_STANDARD 17
+             CXX_STANDARD_REQUIRED ON
+             CUDA_STANDARD 17
+             CUDA_STANDARD_REQUIRED ON
+             CUDA_SEPARABLE_COMPILATION ON # separate compilation of device code
+             INTERFACE_POSITION_INDEPENDENT_CODE ON
+             POSITION_INDEPENDENT_CODE ON # the archive is linked into the SHARED cuvs library
+)
+target_link_libraries(cuvs-cagra-search PRIVATE raft::raft) # PRIVATE: not propagated to consumers
+target_include_directories(
+  cuvs-cagra-search PRIVATE "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
+)
+target_compile_options(
+  cuvs-cagra-search PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUVS_CXX_FLAGS}>"
+                            "$<$<COMPILE_LANGUAGE:CUDA>:${CUVS_CUDA_FLAGS}>"
+)
 
 add_library(
   cuvs SHARED
@@ -266,109 +356,11 @@ add_library(
   src/neighbors/cagra_extend_int8.cu
   src/neighbors/cagra_extend_uint8.cu
   src/neighbors/cagra_optimize.cu
-  src/neighbors/cagra_search_float.cu
-  src/neighbors/cagra_search_int8.cu
-  src/neighbors/cagra_search_uint8.cu
   src/neighbors/cagra_serialize_float.cu
   src/neighbors/cagra_serialize_int8.cu
   src/neighbors/cagra_serialize_uint8.cu
   src/neighbors/detail/cagra/cagra_build.cpp
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu
-  src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu
-  src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu
-  src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu
-  src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu
-  src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu
-  src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu
-  src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu
-  src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu
-  src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu
-  src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu
-  src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu
-  src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
-  src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
-  src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
-  src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
-  src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu
-  src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu
-  src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu
-  src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu
-  src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu
-  src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu
-  src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu
-  src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu
-  src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu
-  src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu
-  src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu
-  src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu
-  src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
-  src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
-  src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
-  src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
+  src/neighbors/detail/cagra/topk_for_cagra/topk.cu
   $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:src/neighbors/hnsw.cpp>
   src/neighbors/ivf_flat_index.cpp
   src/neighbors/ivf_flat/ivf_flat_build_extend_float_int64_t.cu
@@ -463,7 +455,7 @@ if(NOT BUILD_CPU_ONLY)
   target_link_libraries(
     cuvs
     PUBLIC rmm::rmm raft::raft ${CUVS_CTK_MATH_DEPENDENCIES}
-    PRIVATE nvidia::cutlass::cutlass $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
+    PRIVATE nvidia::cutlass::cutlass $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX> cuvs-cagra-search
   )
 endif()
 
@@ -539,7 +531,7 @@ target_compile_options(
                "$<$<COMPILE_LANGUAGE:CUDA>:${CUVS_CUDA_FLAGS}>"
 )
 # ensure CUDA symbols aren't relocated to the middle of the debug build binaries
-target_link_options(cuvs PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
+target_link_options(cuvs PRIVATE $<HOST_LINK:${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld>)
 
 # ##################################################################################################
 # * cuvs_c -------------------------------------------------------------------------------
diff --git a/cpp/include/cuvs/neighbors/common.hpp b/cpp/include/cuvs/neighbors/common.hpp
index 414438067..8218b5f52 100644
--- a/cpp/include/cuvs/neighbors/common.hpp
+++ b/cpp/include/cuvs/neighbors/common.hpp
@@ -172,6 +172,22 @@ struct owning_dataset : public strided_dataset<DataT, IdxT> {
   };
 };
 
+/** True iff `DatasetT` is exactly a strided_dataset, non_owning_dataset or owning_dataset. */
+template <typename DatasetT>
+struct is_strided_dataset : std::false_type {};
+
+template <typename T, typename I>
+struct is_strided_dataset<strided_dataset<T, I>> : std::true_type {};
+
+template <typename T, typename I>
+struct is_strided_dataset<non_owning_dataset<T, I>> : std::true_type {};
+
+template <typename T, typename I, typename LayoutT, typename ContainerT>
+struct is_strided_dataset<owning_dataset<T, I, LayoutT, ContainerT>> : std::true_type {};
+
+template <typename DatasetT>
+inline constexpr bool is_strided_dataset_v = is_strided_dataset<DatasetT>::value;
+
 /**
  * @brief Contstruct a strided matrix from any mdarray or mdspan.
  *
@@ -284,23 +300,25 @@ auto make_aligned_dataset(const raft::resources& res, const SrcT& src, uint32_t
  */
 template <typename MathT, typename IdxT>
 struct vpq_dataset : public dataset<IdxT> {
+  using index_type = IdxT;
+  using math_type  = MathT;
   /** Vector Quantization codebook - "coarse cluster centers". */
-  raft::device_matrix<MathT, uint32_t, raft::row_major> vq_code_book;
+  raft::device_matrix<math_type, uint32_t, raft::row_major> vq_code_book;
   /** Product Quantization codebook - "fine cluster centers".  */
-  raft::device_matrix<MathT, uint32_t, raft::row_major> pq_code_book;
+  raft::device_matrix<math_type, uint32_t, raft::row_major> pq_code_book;
   /** Compressed dataset.  */
-  raft::device_matrix<uint8_t, IdxT, raft::row_major> data;
+  raft::device_matrix<uint8_t, index_type, raft::row_major> data;
 
-  vpq_dataset(raft::device_matrix<MathT, uint32_t, raft::row_major>&& vq_code_book,
-              raft::device_matrix<MathT, uint32_t, raft::row_major>&& pq_code_book,
-              raft::device_matrix<uint8_t, IdxT, raft::row_major>&& data)
+  vpq_dataset(raft::device_matrix<math_type, uint32_t, raft::row_major>&& vq_code_book,
+              raft::device_matrix<math_type, uint32_t, raft::row_major>&& pq_code_book,
+              raft::device_matrix<uint8_t, index_type, raft::row_major>&& data)
     : vq_code_book{std::move(vq_code_book)},
       pq_code_book{std::move(pq_code_book)},
       data{std::move(data)}
   {
   }
 
-  [[nodiscard]] auto n_rows() const noexcept -> IdxT final { return data.extent(0); }
+  [[nodiscard]] auto n_rows() const noexcept -> index_type final { return data.extent(0); }
   [[nodiscard]] auto dim() const noexcept -> uint32_t final { return vq_code_book.extent(1); }
   [[nodiscard]] auto is_owning() const noexcept -> bool final { return true; }
 
@@ -354,6 +372,15 @@ struct vpq_dataset : public dataset<IdxT> {
   }
 };
 
+/** True iff `DatasetT` is exactly some `vpq_dataset<MathT, IdxT>` (no cv/ref stripping). */
+template <typename DatasetT>
+struct is_vpq_dataset : std::false_type {};
+template <typename M, typename I>
+struct is_vpq_dataset<vpq_dataset<M, I>> : std::true_type {};
+
+template <typename DatasetT>
+inline constexpr bool is_vpq_dataset_v = is_vpq_dataset<DatasetT>::value;
+
 namespace filtering {
 
 /* A filter that filters nothing. This is the default behavior. */
diff --git a/cpp/src/neighbors/detail/ann_utils.cuh b/cpp/src/neighbors/detail/ann_utils.cuh
index 1db2dca64..29f790ec5 100644
--- a/cpp/src/neighbors/detail/ann_utils.cuh
+++ b/cpp/src/neighbors/detail/ann_utils.cuh
@@ -224,7 +224,7 @@ inline void memzero(T* ptr, IdxT n_elems, rmm::cuda_stream_view stream)
 }
 
 template <typename T, typename IdxT>
-RAFT_KERNEL outer_add_kernel(const T* a, IdxT len_a, const T* b, IdxT len_b, T* c)
+static __global__ void outer_add_kernel(const T* a, IdxT len_a, const T* b, IdxT len_b, T* c)
 {
   IdxT gid = threadIdx.x + blockDim.x * static_cast<IdxT>(blockIdx.x);
   IdxT i   = gid / len_b;
@@ -234,12 +234,12 @@ RAFT_KERNEL outer_add_kernel(const T* a, IdxT len_a, const T* b, IdxT len_b, T*
 }
 
 template <typename T, typename IdxT>
-RAFT_KERNEL block_copy_kernel(const IdxT* in_offsets,
-                              const IdxT* out_offsets,
-                              IdxT n_blocks,
-                              const T* in_data,
-                              T* out_data,
-                              IdxT n_mult)
+static __global__ void block_copy_kernel(const IdxT* in_offsets,
+                                         const IdxT* out_offsets,
+                                         IdxT n_blocks,
+                                         const T* in_data,
+                                         T* out_data,
+                                         IdxT n_mult)
 {
   IdxT i = static_cast<IdxT>(blockDim.x) * static_cast<IdxT>(blockIdx.x) + threadIdx.x;
   // find the source offset using the binary search.
@@ -317,7 +317,7 @@ void outer_add(const T* a, IdxT len_a, const T* b, IdxT len_b, T* c, rmm::cuda_s
 }
 
 template <typename T, typename S, typename IdxT, typename LabelT>
-RAFT_KERNEL copy_selected_kernel(
+static __global__ void copy_selected_kernel(
   IdxT n_rows, IdxT n_cols, const S* src, const LabelT* row_ids, IdxT ld_src, T* dst, IdxT ld_dst)
 {
   IdxT gid   = threadIdx.x + blockDim.x * static_cast<IdxT>(blockIdx.x);
diff --git a/cpp/src/neighbors/detail/cagra/bitonic.hpp b/cpp/src/neighbors/detail/cagra/bitonic.hpp
index 26195bd9c..ed609d6fd 100644
--- a/cpp/src/neighbors/detail/cagra/bitonic.hpp
+++ b/cpp/src/neighbors/detail/cagra/bitonic.hpp
@@ -26,7 +26,7 @@ namespace bitonic {
 namespace detail {
 
 template <class K, class V>
-_RAFT_DEVICE inline void swap_if_needed(K& k0, V& v0, K& k1, V& v1, const bool asc)
+RAFT_DEVICE_INLINE_FUNCTION void swap_if_needed(K& k0, V& v0, K& k1, V& v1, const bool asc)
 {
   if ((k0 != k1) && ((k0 < k1) != asc)) {
     const auto tmp_k = k0;
@@ -39,7 +39,10 @@ _RAFT_DEVICE inline void swap_if_needed(K& k0, V& v0, K& k1, V& v1, const bool a
 }
 
 template <class K, class V>
-_RAFT_DEVICE inline void swap_if_needed(K& k0, V& v0, const unsigned lane_offset, const bool asc)
+RAFT_DEVICE_INLINE_FUNCTION void swap_if_needed(K& k0,
+                                                V& v0,
+                                                const unsigned lane_offset,
+                                                const bool asc)
 {
   auto k1 = __shfl_xor_sync(~0u, k0, lane_offset);
   auto v1 = __shfl_xor_sync(~0u, v0, lane_offset);
@@ -51,7 +54,10 @@ _RAFT_DEVICE inline void swap_if_needed(K& k0, V& v0, const unsigned lane_offset
 
 template <class K, class V, unsigned N, unsigned warp_size = 32>
 struct warp_merge_core {
-  _RAFT_DEVICE inline void operator()(K k[N], V v[N], const std::uint32_t range, const bool asc)
+  RAFT_DEVICE_INLINE_FUNCTION void operator()(K k[N],
+                                              V v[N],
+                                              const std::uint32_t range,
+                                              const bool asc)
   {
     const auto lane_id = threadIdx.x % warp_size;
 
@@ -93,7 +99,10 @@ struct warp_merge_core {
 
 template <class K, class V, unsigned warp_size>
 struct warp_merge_core<K, V, 6, warp_size> {
-  _RAFT_DEVICE inline void operator()(K k[6], V v[6], const std::uint32_t range, const bool asc)
+  RAFT_DEVICE_INLINE_FUNCTION void operator()(K k[6],
+                                              V v[6],
+                                              const std::uint32_t range,
+                                              const bool asc)
   {
     constexpr unsigned N = 6;
     const auto lane_id   = threadIdx.x % warp_size;
@@ -141,7 +150,10 @@ struct warp_merge_core<K, V, 6, warp_size> {
 
 template <class K, class V, unsigned warp_size>
 struct warp_merge_core<K, V, 3, warp_size> {
-  _RAFT_DEVICE inline void operator()(K k[3], V v[3], const std::uint32_t range, const bool asc)
+  RAFT_DEVICE_INLINE_FUNCTION void operator()(K k[3],
+                                              V v[3],
+                                              const std::uint32_t range,
+                                              const bool asc)
   {
     constexpr unsigned N = 3;
     const auto lane_id   = threadIdx.x % warp_size;
@@ -171,7 +183,10 @@ struct warp_merge_core<K, V, 3, warp_size> {
 
 template <class K, class V, unsigned warp_size>
 struct warp_merge_core<K, V, 2, warp_size> {
-  _RAFT_DEVICE inline void operator()(K k[2], V v[2], const std::uint32_t range, const bool asc)
+  RAFT_DEVICE_INLINE_FUNCTION void operator()(K k[2],
+                                              V v[2],
+                                              const std::uint32_t range,
+                                              const bool asc)
   {
     constexpr unsigned N = 2;
     const auto lane_id   = threadIdx.x % warp_size;
@@ -197,7 +212,10 @@ struct warp_merge_core<K, V, 2, warp_size> {
 
 template <class K, class V, unsigned warp_size>
 struct warp_merge_core<K, V, 1, warp_size> {
-  _RAFT_DEVICE inline void operator()(K k[1], V v[1], const std::uint32_t range, const bool asc)
+  RAFT_DEVICE_INLINE_FUNCTION void operator()(K k[1],
+                                              V v[1],
+                                              const std::uint32_t range,
+                                              const bool asc)
   {
     const auto lane_id    = threadIdx.x % warp_size;
     const std::uint32_t b = range;
@@ -211,14 +229,15 @@ struct warp_merge_core<K, V, 1, warp_size> {
 }  // namespace detail
 
 template <class K, class V, unsigned N, unsigned warp_size = 32>
-__device__ void warp_merge(K k[N], V v[N], unsigned range, const bool asc = true)
+RAFT_DEVICE_INLINE_FUNCTION void warp_merge(K k[N], V v[N], unsigned range, const bool asc = true)
 {
   detail::warp_merge_core<K, V, N, warp_size>{}(k, v, range, asc);
 }
 
 template <class K, class V, unsigned N, unsigned warp_size = 32>
-__device__ void warp_sort(K k[N], V v[N], const bool asc = true)
+RAFT_DEVICE_INLINE_FUNCTION void warp_sort(K k[N], V v[N], const bool asc = true)
 {
+#pragma unroll
   for (std::uint32_t range = 1; range <= warp_size; range <<= 1) {
     warp_merge<K, V, N, warp_size>(k, v, range, asc);
   }
diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
index cfb5f7919..6dc601f32 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
@@ -16,7 +16,6 @@
 
 #pragma once
 
-#include "compute_distance_vpq.cuh"
 #include "factory.cuh"
 #include "search_plan.cuh"
 #include "search_single_cta_inst.cuh"
@@ -85,29 +84,22 @@ inline
   return filter;
 }
 
-template <typename DatasetDescriptorT, typename CagraSampleFilterT>
-void search_main_core(
-  raft::resources const& res,
-  search_params params,
-  DatasetDescriptorT dataset_desc,
-  raft::device_matrix_view<const typename DatasetDescriptorT::INDEX_T, int64_t, raft::row_major>
-    graph,
-  raft::device_matrix_view<const typename DatasetDescriptorT::DATA_T, int64_t, raft::row_major>
-    queries,
-  raft::device_matrix_view<typename DatasetDescriptorT::INDEX_T, int64_t, raft::row_major>
-    neighbors,
-  raft::device_matrix_view<typename DatasetDescriptorT::DISTANCE_T, int64_t, raft::row_major>
-    distances,
-  CagraSampleFilterT sample_filter    = CagraSampleFilterT(),
-  cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded)
+template <typename DataT, typename IndexT, typename DistanceT, typename CagraSampleFilterT>
+void search_main_core(raft::resources const& res,
+                      search_params params,
+                      const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc,
+                      raft::device_matrix_view<const IndexT, int64_t, raft::row_major> graph,
+                      raft::device_matrix_view<const DataT, int64_t, raft::row_major> queries,
+                      raft::device_matrix_view<IndexT, int64_t, raft::row_major> neighbors,
+                      raft::device_matrix_view<DistanceT, int64_t, raft::row_major> distances,
+                      CagraSampleFilterT sample_filter = CagraSampleFilterT())
 {
   RAFT_LOG_DEBUG("# dataset size = %lu, dim = %lu\n",
-                 static_cast<size_t>(dataset_desc.size),
-                 static_cast<size_t>(dataset_desc.dim));
+                 static_cast<size_t>(graph.extent(0)),
+                 static_cast<size_t>(queries.extent(1)));
   RAFT_LOG_DEBUG("# query size = %lu, dim = %lu\n",
                  static_cast<size_t>(queries.extent(0)),
                  static_cast<size_t>(queries.extent(1)));
-  RAFT_EXPECTS(queries.extent(1) == dataset_desc.dim, "Queries and index dim must match");
   const uint32_t topk = neighbors.extent(1);
 
   cudaDeviceProp deviceProp = raft::resource::get_device_properties(res);
@@ -119,12 +111,12 @@ void search_main_core(
     "cagra::search(max_queries = %u, k = %u, dim = %zu)",
     params.max_queries,
     topk,
-    dataset_desc.dim);
+    static_cast<size_t>(queries.extent(1)));  // %zu expects size_t; extent() yields int64_t
 
   using CagraSampleFilterT_s = typename CagraSampleFilterT_Selector<CagraSampleFilterT>::type;
-  std::unique_ptr<search_plan_impl<DatasetDescriptorT, CagraSampleFilterT_s>> plan =
-    factory<DatasetDescriptorT, CagraSampleFilterT_s>::create(
-      res, params, dataset_desc.dim, graph.extent(1), topk, metric);
+  std::unique_ptr<search_plan_impl<DataT, IndexT, DistanceT, CagraSampleFilterT_s>> plan =
+    factory<DataT, IndexT, DistanceT, CagraSampleFilterT_s>::create(
+      res, params, dataset_desc, queries.extent(1), graph.extent(1), topk);
 
   plan->check(topk);
 
@@ -134,21 +126,17 @@ void search_main_core(
 
   for (unsigned qid = 0; qid < queries.extent(0); qid += max_queries) {
     const uint32_t n_queries = std::min<std::size_t>(max_queries, queries.extent(0) - qid);
-    auto _topk_indices_ptr =
-      reinterpret_cast<typename DatasetDescriptorT::INDEX_T*>(neighbors.data_handle()) +
-      (topk * qid);
+    auto _topk_indices_ptr   = reinterpret_cast<IndexT*>(neighbors.data_handle()) + (topk * qid);
     auto _topk_distances_ptr = distances.data_handle() + (topk * qid);
     // todo(tfeher): one could keep distances optional and pass nullptr
     const auto* _query_ptr = queries.data_handle() + (query_dim * qid);
     const auto* _seed_ptr =
       plan->num_seeds > 0
-        ? reinterpret_cast<const typename DatasetDescriptorT::INDEX_T*>(plan->dev_seed.data()) +
-            (plan->num_seeds * qid)
+        ? reinterpret_cast<const IndexT*>(plan->dev_seed.data()) + (plan->num_seeds * qid)
         : nullptr;
     uint32_t* _num_executed_iterations = nullptr;
 
     (*plan)(res,
-            dataset_desc,
             graph,
             _topk_indices_ptr,
             _topk_distances_ptr,
@@ -161,77 +149,6 @@ void search_main_core(
   }
 }
 
-template <class T,
-          class DatasetT,
-          class DatasetIdxT,
-          class InternalIdxT,
-          class DistanceT,
-          class CagraSampleFilterT>
-void launch_vpq_search_main_core(
-  raft::resources const& res,
-  const vpq_dataset<DatasetT, DatasetIdxT>* vpq_dset,
-  search_params params,
-  raft::device_matrix_view<const InternalIdxT, int64_t, raft::row_major> graph,
-  raft::device_matrix_view<const T, int64_t, raft::row_major> queries,
-  raft::device_matrix_view<InternalIdxT, int64_t, raft::row_major> neighbors,
-  raft::device_matrix_view<DistanceT, int64_t, raft::row_major> distances,
-  CagraSampleFilterT sample_filter,
-  const cuvs::distance::DistanceType metric)
-{
-  RAFT_EXPECTS(vpq_dset->pq_bits() == 8, "Only pq_bits = 8 is supported for now");
-  RAFT_EXPECTS(vpq_dset->pq_len() == 2 || vpq_dset->pq_len() == 4,
-               "Only pq_len 2 or 4 is supported for now");
-  RAFT_EXPECTS(vpq_dset->dim() % vpq_dset->pq_dim() == 0,
-               "dim must be a multiple of pq_dim at the moment");
-
-  const float vq_scale = 1.0f;
-  const float pq_scale = 1.0f;
-
-  if (vpq_dset->pq_bits() == 8) {
-    if (vpq_dset->pq_len() == 2) {
-      using dataset_desc_t = cagra_q_dataset_descriptor_t<T,
-                                                          DatasetT,
-                                                          8 /*PQ bit*/,
-                                                          2 /* Subspace dimension*/,
-                                                          DistanceT,
-                                                          InternalIdxT>;
-      dataset_desc_t dataset_desc(vpq_dset->data.data_handle(),
-                                  vpq_dset->encoded_row_length(),
-                                  vpq_dset->pq_dim(),
-                                  vpq_dset->vq_code_book.data_handle(),
-                                  vq_scale,
-                                  vpq_dset->pq_code_book.data_handle(),
-                                  pq_scale,
-                                  size_t(vpq_dset->n_rows()),
-                                  vpq_dset->dim());
-      search_main_core(
-        res, params, dataset_desc, graph, queries, neighbors, distances, sample_filter, metric);
-    } else if (vpq_dset->pq_len() == 4) {
-      using dataset_desc_t = cagra_q_dataset_descriptor_t<T,
-                                                          DatasetT,
-                                                          8 /*PQ bit*/,
-                                                          4 /* Subspace dimension*/,
-                                                          DistanceT,
-                                                          InternalIdxT>;
-      dataset_desc_t dataset_desc(vpq_dset->data.data_handle(),
-                                  vpq_dset->encoded_row_length(),
-                                  vpq_dset->pq_dim(),
-                                  vpq_dset->vq_code_book.data_handle(),
-                                  vq_scale,
-                                  vpq_dset->pq_code_book.data_handle(),
-                                  pq_scale,
-                                  size_t(vpq_dset->n_rows()),
-                                  vpq_dset->dim());
-      search_main_core(
-        res, params, dataset_desc, graph, queries, neighbors, distances, sample_filter, metric);
-    } else {
-      RAFT_FAIL("Subspace dimension must be 2 or 4");
-    }
-  } else {
-    RAFT_FAIL("Only 8-bit PQ is supported now");
-  }
-}
-
 /**
  * @brief Search ANN using the constructed index.
  *
@@ -264,6 +181,7 @@ void search_main(raft::resources const& res,
                  raft::device_matrix_view<DistanceT, int64_t, raft::row_major> distances,
                  CagraSampleFilterT sample_filter = CagraSampleFilterT())
 {
+  auto stream         = raft::resource::get_cuda_stream(res);
   const auto& graph   = index.graph();
   auto graph_internal = raft::make_device_matrix_view<const InternalIdxT, int64_t, raft::row_major>(
     reinterpret_cast<const InternalIdxT*>(graph.data_handle()), graph.extent(0), graph.extent(1));
@@ -273,39 +191,21 @@ void search_main(raft::resources const& res,
   // Dispatch search parameters based on the dataset kind.
   if (auto* strided_dset = dynamic_cast<const strided_dataset<T, ds_idx_type>*>(&index.data());
       strided_dset != nullptr) {
-    // Set TEAM_SIZE and DATASET_BLOCK_SIZE to zero tentatively since these parameters cannot be
-    // determined here. They are set just before kernel launch.
-    using dataset_desc_t = standard_dataset_descriptor_t<T, InternalIdxT, DistanceT>;
     // Search using a plain (strided) row-major dataset
-    const dataset_desc_t dataset_desc(strided_dset->view().data_handle(),
-                                      strided_dset->n_rows(),
-                                      strided_dset->dim(),
-                                      strided_dset->stride());
-    search_main_core<dataset_desc_t, CagraSampleFilterT>(res,
-                                                         params,
-                                                         dataset_desc,
-                                                         graph_internal,
-                                                         queries,
-                                                         neighbors,
-                                                         distances,
-                                                         sample_filter,
-                                                         index.metric());
+    auto& desc = dataset_descriptor_init_with_cache<T, InternalIdxT, DistanceT>(
+      res, params, *strided_dset, index.metric());
+    search_main_core<T, InternalIdxT, DistanceT, CagraSampleFilterT>(
+      res, params, desc, graph_internal, queries, neighbors, distances, sample_filter);
   } else if (auto* vpq_dset = dynamic_cast<const vpq_dataset<float, ds_idx_type>*>(&index.data());
              vpq_dset != nullptr) {
     // Search using a compressed dataset
     RAFT_FAIL("FP32 VPQ dataset support is coming soon");
   } else if (auto* vpq_dset = dynamic_cast<const vpq_dataset<half, ds_idx_type>*>(&index.data());
              vpq_dset != nullptr) {
-    launch_vpq_search_main_core<T, half, ds_idx_type, InternalIdxT, DistanceT, CagraSampleFilterT>(
-      res,
-      vpq_dset,
-      params,
-      graph_internal,
-      queries,
-      neighbors,
-      distances,
-      sample_filter,
-      index.metric());
+    auto& desc = dataset_descriptor_init_with_cache<T, InternalIdxT, DistanceT>(
+      res, params, *vpq_dset, index.metric());
+    search_main_core<T, InternalIdxT, DistanceT, CagraSampleFilterT>(
+      res, params, desc, graph_internal, queries, neighbors, distances, sample_filter);
   } else if (auto* empty_dset = dynamic_cast<const empty_dataset<ds_idx_type>*>(&index.data());
              empty_dset != nullptr) {
     // Forgot to add a dataset.
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh
new file mode 100644
index 000000000..8407ef055
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh
@@ -0,0 +1,511 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#pragma once
+
+#include "compute_distance_standard.hpp"
+#include "compute_distance_vpq.hpp"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                8,
+                                                128,
+                                                float,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                8,
+                                                128,
+                                                float,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           2,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           4,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                16,
+                                                256,
+                                                float,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                16,
+                                                256,
+                                                float,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           2,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           4,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                32,
+                                                512,
+                                                float,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                32,
+                                                512,
+                                                float,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           2,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           4,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                8,
+                                                128,
+                                                half,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                8,
+                                                128,
+                                                half,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           2,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           4,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                16,
+                                                256,
+                                                half,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                16,
+                                                256,
+                                                half,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           2,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           4,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                32,
+                                                512,
+                                                half,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                32,
+                                                512,
+                                                half,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           2,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           4,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                8,
+                                                128,
+                                                int8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                8,
+                                                128,
+                                                int8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           2,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           4,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                16,
+                                                256,
+                                                int8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                16,
+                                                256,
+                                                int8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           2,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           4,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                32,
+                                                512,
+                                                int8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                32,
+                                                512,
+                                                int8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           2,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           4,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                8,
+                                                128,
+                                                uint8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                8,
+                                                128,
+                                                uint8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           2,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           4,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                16,
+                                                256,
+                                                uint8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                16,
+                                                256,
+                                                uint8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           2,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           4,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                32,
+                                                512,
+                                                uint8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                32,
+                                                512,
+                                                uint8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           2,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           4,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float>;
+
+extern template struct instance_selector<
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, uint8_t, uint32_t, float>>;
+
+using descriptor_instances = instance_selector<
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, uint8_t, uint32_t, float>>;
+
+template <typename DataT, typename IndexT, typename DistanceT, typename DatasetT>
+auto dataset_descriptor_init(const cagra::search_params& params,
+                             const DatasetT& dataset,
+                             cuvs::distance::DistanceType metric,
+                             rmm::cuda_stream_view stream)
+  -> dataset_descriptor_host<DataT, IndexT, DistanceT>
+{
+  auto [init, priority] =
+    descriptor_instances::select<DataT, IndexT, DistanceT>(params, dataset, metric);
+  if (init == nullptr || priority < 0) {  // no instance in `descriptor_instances` is usable here
+    RAFT_FAIL("No dataset descriptor instance compiled for this parameter combination.");
+  }
+  return init(params, dataset, metric, stream);  // run the selected instance's init function
+}
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.cu b/cpp/src/neighbors/detail/cagra/compute_distance.cu
new file mode 100644
index 000000000..45316e59b
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance.cu
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance-ext.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+
+template struct instance_selector<
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, uint8_t, uint32_t, float>>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp
index 2b0c750ff..4bed275ab 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp
+++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp
@@ -20,303 +20,363 @@
 #include "utils.hpp"
 
 #include <cuvs/distance/distance.hpp>
+#include <cuvs/neighbors/cagra.hpp>
+#include <cuvs/neighbors/common.hpp>
+#include <raft/core/logger-macros.hpp>
 #include <raft/core/operators.hpp>
 
 // TODO: This shouldn't be invoking spatial/knn
 #include "../ann_utils.cuh"
 
+#include <raft/util/device_loads_stores.cuh>
 #include <raft/util/vectorized.cuh>
 
+#include <functional>
+#include <memory>
 #include <type_traits>
 
 namespace cuvs::neighbors::cagra::detail {
-namespace device {
 
-// using LOAD_256BIT_T = ulonglong4;
-using LOAD_128BIT_T = uint4;
-using LOAD_64BIT_T  = uint64_t;
-
-template <class LOAD_T, class DATA_T>
-_RAFT_DEVICE constexpr unsigned get_vlen()
-{
-  return utils::size_of<LOAD_T>() / utils::size_of<DATA_T>();
-}
-
-template <unsigned TEAM_SIZE,
-          unsigned DATASET_BLOCK_DIM,
-          class DATASET_DESCRIPTOR_T,
-          class DISTANCE_T,
-          class INDEX_T>
-_RAFT_DEVICE void compute_distance_to_random_nodes(
-  INDEX_T* const result_indices_ptr,       // [num_pickup]
-  DISTANCE_T* const result_distances_ptr,  // [num_pickup]
-  const typename DATASET_DESCRIPTOR_T::QUERY_T* const query_buffer,
-  const DATASET_DESCRIPTOR_T& dataset_desc,
-  const std::size_t num_pickup,
-  const unsigned num_distilation,
-  const uint64_t rand_xor_mask,
-  const INDEX_T* const seed_ptr,  // [num_seeds]
-  const uint32_t num_seeds,
-  INDEX_T* const visited_hash_ptr,
-  const uint32_t hash_bitlen,
-  const cuvs::distance::DistanceType metric,
-  const uint32_t block_id   = 0,
-  const uint32_t num_blocks = 1)
-{
-  uint32_t max_i = num_pickup;
-  if (max_i % (32 / TEAM_SIZE)) { max_i += (32 / TEAM_SIZE) - (max_i % (32 / TEAM_SIZE)); }
-
-  for (uint32_t i = threadIdx.x / TEAM_SIZE; i < max_i; i += blockDim.x / TEAM_SIZE) {
-    const bool valid_i = (i < num_pickup);
-
-    INDEX_T best_index_team_local;
-    DISTANCE_T best_norm2_team_local = utils::get_max_value<DISTANCE_T>();
-    for (uint32_t j = 0; j < num_distilation; j++) {
-      // Select a node randomly and compute the distance to it
-      INDEX_T seed_index;
-      if (valid_i) {
-        // uint32_t gid = i + (num_pickup * (j + (num_distilation * block_id)));
-        uint32_t gid = block_id + (num_blocks * (i + (num_pickup * j)));
-        if (seed_ptr && (gid < num_seeds)) {
-          seed_index = seed_ptr[gid];
-        } else {
-          seed_index = device::xorshift64(gid ^ rand_xor_mask) % dataset_desc.size;
-        }
-      }
-
-      DISTANCE_T norm2;
-      switch (metric) {
-        case cuvs::distance::DistanceType::L2Expanded:
-          norm2 =
-            dataset_desc.template compute_similarity<DATASET_BLOCK_DIM,
-                                                     TEAM_SIZE,
-                                                     cuvs::distance::DistanceType::L2Expanded>(
-              query_buffer, seed_index, valid_i);
-          break;
-        case cuvs::distance::DistanceType::InnerProduct:
-          norm2 =
-            dataset_desc.template compute_similarity<DATASET_BLOCK_DIM,
-                                                     TEAM_SIZE,
-                                                     cuvs::distance::DistanceType::InnerProduct>(
-              query_buffer, seed_index, valid_i);
-          break;
-        default: break;
-      }
-
-      if (valid_i && (norm2 < best_norm2_team_local)) {
-        best_norm2_team_local = norm2;
-        best_index_team_local = seed_index;
+/**
+ * @brief Dataset and distance description.
+ *
+ * This is the base type for the dataset/distance descriptors.
+ * The actual implementations are hidden in `compute_distance_***-impl.cuh` files, which should be
+ * included only in `compute_distance_***.cu` files to enforce separable compilation.
+ *
+ * [Note: manual dispatch]
+ * The descriptor type hierarchy declared here resembles the usual C++ inheritance: the search
+ * kernels take a pointer to the base type as an argument, but the actual implementation types are
+ * passed by the host. The kernels only ever need two functions `setup_workspace` and
+ * `compute_distance`; the choice of the implementation happens at the runtime.
+ *
+ * However, for performance reasons, we don't use the C++ virtual dispatch mechanics here.
+ * The extra pointer-chasing and register usage overheads associated with virtual tables turn out to
+ * cause a significant slowdown in the performance-critical `compute_distance`.
+ * Instead, we manually dispatch the two polymorphic functions and store them as fields in the
+ * descriptor structure.
+ *
+ * [Note: initialization/dispatch]
+ * The host doesn't know the addresses of the device symbols. That means we either need to resolve
+ * the device functions and store them in the descriptor directly on the device, or use
+ * `cudaMemcpyFromSymbolAsync` to fetch them (note, classes have the same problem: if an object is
+ * created on the host, its pointer to the vtable would be invalid on the device).
+ * We take the first approach: there's an `***_init_kernel` for each descriptor instance that is
+ * called before the search kernel; all it does is call a (placement) new with an appropriate type
+ * and arguments in a single GPU thread.
+ *
+ */
+template <typename DataT, typename IndexT, typename DistanceT>
+struct alignas(device::LOAD_128BIT_T) dataset_descriptor_base_t {
+  using base_type  = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
+  using LOAD_T     = device::LOAD_128BIT_T;
+  using DATA_T     = DataT;
+  using INDEX_T    = IndexT;
+  using DISTANCE_T = DistanceT;
+
+  /**
+   * @brief "polymorphic" `compute_distance` arguments.
+   *
+   * This is a tightly-packed POD struct holding the arguments of `compute_distance`.
+   * **Important** this structure is passed by value to `compute_distance`; it's important it
+   * remains small.
+   *
+   * [Note: arguments layout]
+   * The descriptor implementations require different sets of arguments (with a couple of arguments
+   * overlapping). At the same time the `compute_distance` is defined such that it accepts the
+   * `args_t` by value. That means the layout of the struct must be identical for all descriptor
+   * implementations. We work around this requirement by defining generic fields in this struct and
+   * assigning the meaning to them on the implementation side.
+   */
+  struct alignas(LOAD_T) args_t {
+    void* extra_ptr1;
+    void* extra_ptr2;
+    /** Pointer to the workspace in the shared memory (filled in every copy by a thread block). */
+    uint32_t smem_ws_ptr;
+    /** Dimensionality of the data/queries. */
+    uint32_t dim;
+    uint32_t extra_word1;
+    uint32_t extra_word2;
+
+    /**
+     * Load this struct from shared memory.
+     *
+     * NB: until `compute_distance` is called, the arguments struct is stored in the shared memory
+     * as a member of the descriptor struct. This helper function saves a few instructions by
+     * forcing the compiler to assume it is indeed in the shared memory address space.
+     */
+    RAFT_DEVICE_INLINE_FUNCTION auto load() const -> args_t
+    {
+      constexpr int kCount = sizeof(*this) / sizeof(LOAD_T);
+      using blob_type      = LOAD_T[kCount];
+      args_t r;
+      auto& src = reinterpret_cast<const blob_type&>(*this);
+      auto& dst = reinterpret_cast<blob_type&>(r);
+#pragma unroll
+      for (int i = 0; i < kCount; i++) {
+        device::lds(dst[i], src + i);
       }
+      return r;
     }
-
-    const unsigned lane_id = threadIdx.x % TEAM_SIZE;
-    if (valid_i && lane_id == 0) {
-      if (hashmap::insert(visited_hash_ptr, hash_bitlen, best_index_team_local)) {
-        result_distances_ptr[i] = best_norm2_team_local;
-        result_indices_ptr[i]   = best_index_team_local;
-      } else {
-        result_distances_ptr[i] = utils::get_max_value<DISTANCE_T>();
-        result_indices_ptr[i]   = utils::get_max_value<INDEX_T>();
-      }
+  };
+
+  /** Shared memory usage and team_size packed into a single uint32_t to save on memory requests. */
+  struct smem_and_team_size_t {
+    uint32_t value;
+    RAFT_INLINE_FUNCTION constexpr smem_and_team_size_t(uint32_t smem_size_bytes,
+                                                        uint32_t team_size_bitshift)
+      : value{(team_size_bitshift << 24) | smem_size_bytes}
+    {
     }
-  }
-}
-
-template <unsigned TEAM_SIZE,
-          unsigned DATASET_BLOCK_DIM,
-          unsigned MAX_N_FRAGS,
-          class DATASET_DESCRIPTOR_T,
-          class DISTANCE_T,
-          class INDEX_T>
-_RAFT_DEVICE void compute_distance_to_child_nodes(
-  INDEX_T* const result_child_indices_ptr,
-  DISTANCE_T* const result_child_distances_ptr,
-  // query
-  const typename DATASET_DESCRIPTOR_T::QUERY_T* const query_buffer,
-  // [dataset_dim, dataset_size]
-  const DATASET_DESCRIPTOR_T& dataset_desc,
-  // [knn_k, dataset_size]
-  const INDEX_T* const knn_graph,
-  const std::uint32_t knn_k,
-  // hashmap
-  INDEX_T* const visited_hashmap_ptr,
-  const std::uint32_t hash_bitlen,
-  const INDEX_T* const parent_indices,
-  const INDEX_T* const internal_topk_list,
-  const std::uint32_t search_width,
-  const cuvs::distance::DistanceType metric)
-{
-  constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
-  const INDEX_T invalid_index        = utils::get_max_value<INDEX_T>();
-
-  // Read child indices of parents from knn graph and check if the distance
-  // computaiton is necessary.
-  for (uint32_t i = threadIdx.x; i < knn_k * search_width; i += blockDim.x) {
-    const INDEX_T smem_parent_id = parent_indices[i / knn_k];
-    INDEX_T child_id             = invalid_index;
-    if (smem_parent_id != invalid_index) {
-      const auto parent_id = internal_topk_list[smem_parent_id] & ~index_msb_1_mask;
-      child_id             = knn_graph[(i % knn_k) + (static_cast<int64_t>(knn_k) * parent_id)];
+    /** Total dynamic shared memory required by the descriptor.  */
+    RAFT_INLINE_FUNCTION constexpr auto smem_ws_size_in_bytes() const noexcept -> uint32_t
+    {
+      return value & 0xffffffu;
     }
-    if (child_id != invalid_index) {
-      if (hashmap::insert(visited_hashmap_ptr, hash_bitlen, child_id) == 0) {
-        child_id = invalid_index;
-      }
+    RAFT_INLINE_FUNCTION constexpr auto team_size_bitshift() const noexcept -> uint32_t
+    {
+      return (value >> 24) & 0xffu;
+    }
+    /** How many threads are involved in computing a single distance. */
+    RAFT_INLINE_FUNCTION constexpr auto team_size() const noexcept -> uint32_t
+    {
+      return 1u << team_size_bitshift();
     }
-    result_child_indices_ptr[i] = child_id;
+  };
+  static_assert(sizeof(smem_and_team_size_t) == sizeof(uint32_t));
+
+  using setup_workspace_type  = const base_type*(const base_type*, void*, const DATA_T*, uint32_t);
+  using compute_distance_type = DISTANCE_T(const args_t, const INDEX_T);
+
+  args_t args;
+
+  /** Copy the descriptor and the query into shared memory and do any other work, such as
+   * initializing the codebook. */
+  setup_workspace_type* setup_workspace_impl;
+  /** Compute the distance from the query vector (stored in the smem_workspace) and a dataset vector
+   * given by the dataset_index. */
+  compute_distance_type* compute_distance_impl;
+  /** A placeholder for an implementation-specific pointer. */
+  void* extra_ptr3;
+  smem_and_team_size_t smem_and_team_size;
+
+  /** Number of records in the database. */
+  INDEX_T size;
+
+  RAFT_INLINE_FUNCTION dataset_descriptor_base_t(setup_workspace_type* setup_workspace_impl,
+                                                 compute_distance_type* compute_distance_impl,
+                                                 INDEX_T size,
+                                                 uint32_t dim,
+                                                 uint32_t team_size_bitshift,
+                                                 uint32_t smem_ws_size_in_bytes)
+    : args{nullptr, nullptr, 0, dim, 0, 0},  // only `dim` is set; the other fields start zeroed
+      setup_workspace_impl(setup_workspace_impl),
+      compute_distance_impl(compute_distance_impl),
+      smem_and_team_size(smem_ws_size_in_bytes, team_size_bitshift),
+      size(size)  // NB: initializers follow the member declaration order; `extra_ptr3` is not set
+  {
   }
-  __syncthreads();
 
-  // Compute the distance to child nodes
-  std::uint32_t max_i = knn_k * search_width;
-  if (max_i % (32 / TEAM_SIZE)) { max_i += (32 / TEAM_SIZE) - (max_i % (32 / TEAM_SIZE)); }
-  for (std::uint32_t tid = threadIdx.x; tid < max_i * TEAM_SIZE; tid += blockDim.x) {
-    const auto i       = tid / TEAM_SIZE;
-    const bool valid_i = (i < (knn_k * search_width));
-    INDEX_T child_id   = invalid_index;
-    if (valid_i) { child_id = result_child_indices_ptr[i]; }
+  /** Total dynamic shared memory required by the descriptor.  */
+  RAFT_INLINE_FUNCTION constexpr auto smem_ws_size_in_bytes() const noexcept -> uint32_t
+  {
+    return smem_and_team_size.smem_ws_size_in_bytes();
+  }
+  RAFT_INLINE_FUNCTION constexpr auto team_size_bitshift() const noexcept -> uint32_t
+  {
+    return smem_and_team_size.team_size_bitshift();
+  }
+  /** Like `team_size_bitshift()`, but loads the word via `raft::lds` (presumably from smem). */
+  RAFT_DEVICE_INLINE_FUNCTION auto team_size_bitshift_from_smem() const noexcept -> uint32_t
+  {
+    uint32_t sts;  // filled by the load below
+    raft::lds(sts, reinterpret_cast<const uint32_t*>(&smem_and_team_size));
+    return reinterpret_cast<smem_and_team_size_t&>(sts).team_size_bitshift();
+  }
 
-    DISTANCE_T norm2;
-    switch (metric) {
-      case cuvs::distance::DistanceType::L2Expanded:
-        norm2 = dataset_desc.template compute_similarity<DATASET_BLOCK_DIM,
-                                                         TEAM_SIZE,
-                                                         cuvs::distance::DistanceType::L2Expanded>(
-          query_buffer, child_id, child_id != invalid_index);
-        break;
-      case cuvs::distance::DistanceType::InnerProduct:
-        norm2 =
-          dataset_desc.template compute_similarity<DATASET_BLOCK_DIM,
-                                                   TEAM_SIZE,
-                                                   cuvs::distance::DistanceType::InnerProduct>(
-            query_buffer, child_id, child_id != invalid_index);
-        break;
-      default: break;
-    }
+  /** How many threads are involved in computing a single distance. */
+  RAFT_INLINE_FUNCTION constexpr auto team_size() const noexcept -> uint32_t
+  {
+    return smem_and_team_size.team_size();
+  }
 
-    // Store the distance
-    const unsigned lane_id = threadIdx.x % TEAM_SIZE;
-    if (valid_i && lane_id == 0) {
-      if (child_id != invalid_index) {
-        result_child_distances_ptr[i] = norm2;
-      } else {
-        result_child_distances_ptr[i] = utils::get_max_value<DISTANCE_T>();
-      }
-    }
+  RAFT_DEVICE_INLINE_FUNCTION auto setup_workspace(void* smem_ptr,
+                                                   const DATA_T* queries_ptr,
+                                                   uint32_t query_id) const -> const base_type*
+  {
+    return setup_workspace_impl(this, smem_ptr, queries_ptr, query_id);
   }
-}
 
-}  // namespace device
+  RAFT_DEVICE_INLINE_FUNCTION auto compute_distance(INDEX_T dataset_index, bool valid) const
+    -> DISTANCE_T
+  {
+    auto per_thread_distances = valid ? compute_distance_impl(args.load(), dataset_index) : 0;
+    return device::team_sum(per_thread_distances, team_size_bitshift_from_smem());
+  }
+};
 
-template <class QUERY_T_, class DISTANCE_T_, class INDEX_T_>
-struct dataset_descriptor_base_t {
-  using INDEX_T    = INDEX_T_;
-  using QUERY_T    = QUERY_T_;
-  using DISTANCE_T = DISTANCE_T_;
+/**
+ * @brief Hosting a device descriptor.
+ *
+ * The dataset descriptor is initialized on the device side and stays there.
+ * The host struct manages the lifetime of the associated device pointer and a couple parameters
+ * affecting the search kernel launch config.
+ *
+ */
+template <typename DataT, typename IndexT, typename DistanceT>
+struct dataset_descriptor_host {
+  using dev_descriptor_t         = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
+  uint32_t smem_ws_size_in_bytes = 0;
+  uint32_t team_size             = 0;
+
+  template <typename DescriptorImpl>
+  dataset_descriptor_host(const DescriptorImpl& dd_host, rmm::cuda_stream_view stream)
+    : smem_ws_size_in_bytes{dd_host.smem_ws_size_in_bytes()},  // launch params cached on the host
+      team_size{dd_host.team_size()},
+      dev_ptr_{[stream]() {  // device storage sized for the concrete DescriptorImpl
+                 dev_descriptor_t* p;
+                 RAFT_CUDA_TRY(cudaMallocAsync(&p, sizeof(DescriptorImpl), stream));
+                 return p;
+               }(),
+               [stream](dev_descriptor_t* p) { RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(p, stream)); }}
+  {  // only allocates: `dd_host` is not copied to the device by this constructor
+  }
 
-  const INDEX_T size;
-  const std::uint32_t dim;
+  [[nodiscard]] auto dev_ptr() const -> const dev_descriptor_t* { return dev_ptr_.get(); }
+  [[nodiscard]] auto dev_ptr() -> dev_descriptor_t* { return dev_ptr_.get(); }
 
-  dataset_descriptor_base_t(const INDEX_T size, const std::uint32_t dim) : size(size), dim(dim) {}
+ private:
+  std::unique_ptr<dev_descriptor_t, std::function<void(dev_descriptor_t*)>> dev_ptr_;
 };
 
-template <class DATA_T_, class INDEX_T, class DISTANCE_T = float>
-struct standard_dataset_descriptor_t
-  : public dataset_descriptor_base_t<float, DISTANCE_T, INDEX_T> {
-  using LOAD_T  = device::LOAD_128BIT_T;
-  using DATA_T  = DATA_T_;
-  using QUERY_T = typename dataset_descriptor_base_t<float, DISTANCE_T, INDEX_T>::QUERY_T;
-
-  const DATA_T* const ptr;
-  const std::size_t ld;
-  using dataset_descriptor_base_t<float, DISTANCE_T, INDEX_T>::size;
-  using dataset_descriptor_base_t<float, DISTANCE_T, INDEX_T>::dim;
-
-  standard_dataset_descriptor_t(const DATA_T* const ptr,
-                                const std::size_t size,
-                                const std::uint32_t dim,
-                                const std::size_t ld)
-    : dataset_descriptor_base_t<float, DISTANCE_T, INDEX_T>(size, dim), ptr(ptr), ld(ld)
+/**
+ * @brief The signature for descriptor initialization.
+ *
+ * There is an init function associated with every descriptor implementation. It's responsible for
+ * initializing the device-side descriptor instance (calling the init kernel).
+ * It takes (search params, dataset, metric, CUDA stream view) and returns the host-side handle.
+ */
+template <typename DataT, typename IndexT, typename DistanceT, typename DatasetT>
+using init_desc_type =
+  dataset_descriptor_host<DataT, IndexT, DistanceT> (*)(const cagra::search_params&,
+                                                        const DatasetT&,
+                                                        cuvs::distance::DistanceType,
+                                                        rmm::cuda_stream_view);
+
+/**
+ * @brief Descriptor instance specification.
+ *
+ * This type provides a decentralized way for selecting a descriptor instance best suitable for the
+ * given dataset and distance metric.
+ * There is a spec for every descriptor (described in the interface files
+ * `compute_distance_***.hpp`).
+ *
+ * The `instance_spec` implementation must have the following static member template functions:
+ *   * constexpr bool accepts_dataset()
+ *     - tells whether the spec is compatible with the dataset type, executed at compile time.
+ *   * double priority(..)
+ *     - tells how to select a single spec out of possibly several compatible specs
+ *   * init_desc_type init
+ *     - (see `init_desc_type` above) the function to initialize the descriptor.
+ */
+template <typename DataT, typename IndexT, typename DistanceT>
+struct instance_spec {
+  using data_type     = DataT;
+  using index_type    = IndexT;
+  using distance_type = DistanceT;
+  using host_type     = dataset_descriptor_host<DataT, IndexT, DistanceT>;
+  /** Use this to constrain the input dataset type; this default accepts no dataset type. */
+  template <typename DatasetT>
+  constexpr static inline bool accepts_dataset()
   {
+    return false;
   }
+};
 
-  static const std::uint32_t smem_buffer_size_in_byte = 0;
-  __device__ void set_smem_ptr(void* const){};
-
-  template <uint32_t DATASET_BLOCK_DIM>
-  __device__ void copy_query(const DATA_T* const dmem_query_ptr,
-                             QUERY_T* const smem_query_ptr,
-                             const std::uint32_t query_smem_buffer_length)
-  {
-    for (unsigned i = threadIdx.x; i < query_smem_buffer_length; i += blockDim.x) {
-      unsigned j = device::swizzling(i);
-      if (i < dim) {
-        smem_query_ptr[j] =
-          cuvs::spatial::knn::detail::utils::mapping<QUERY_T>{}(dmem_query_ptr[i]);
-      } else {
-        smem_query_ptr[j] = 0.0;
-      }
-    }
+/** Whether the descriptor is compatible with the dataset and arguments at the type level
+ * (compile-time check).
+ */
+template <typename InstanceSpec,
+          typename DataT,
+          typename IndexT,
+          typename DistanceT,
+          typename DatasetT>
+constexpr bool spec_sound = std::is_same_v<DataT, typename InstanceSpec::data_type> &&
+                            std::is_same_v<IndexT, typename InstanceSpec::index_type> &&
+                            std::is_same_v<DistanceT, typename InstanceSpec::distance_type> &&
+                            InstanceSpec::template accepts_dataset<DatasetT>();
+
+/**
+ * @brief Get the init function and the priority of the descriptor given by the InstanceSpec.
+ *
+ * @return (init function, priority), or (nullptr, -1.0) when the spec fails the type-level check.
+ */
+template <typename InstanceSpec,
+          typename DataT,
+          typename IndexT,
+          typename DistanceT,
+          typename DatasetT>
+constexpr auto spec_match(const cagra::search_params& params,
+                          const DatasetT& dataset,
+                          cuvs::distance::DistanceType metric)
+  -> std::tuple<init_desc_type<DataT, IndexT, DistanceT, DatasetT>, double>
+{
+  if constexpr (spec_sound<InstanceSpec, DataT, IndexT, DistanceT, DatasetT>) {
+    return std::make_tuple(InstanceSpec::template init<DatasetT>,
+                           InstanceSpec::priority(params, dataset, metric));
   }
+  return std::make_tuple(nullptr, -1.0);
+}
 
-  template <typename T, cuvs::distance::DistanceType METRIC>
-  std::enable_if_t<METRIC == cuvs::distance::DistanceType::L2Expanded, T> __device__
-  dist_op(T a, T b) const
+/**
+ * @brief Select the best matching descriptor instance from the given type-level list.
+ *
+ * This is a helper struct that goes through the given list of specs (given as template arguments),
+ * filters it (partially at compile time and partially at runtime), and selects the descriptor with
+ * the highest priority.
+ *
+ * There is a single point in the codebase, where all specs are brought together; it's in the
+ * `neighbors/detail/cagra/compute_distance-ext.cuh`, which is generated by
+ * `neighbors/detail/cagra/compute_distance_00_generate.py`.
+ * Hence, `compute_distance_00_generate.py` is the only place you need to manually change to modify
+ * or extend the list of supported dataset descriptors.
+ * The logic of selecting the descriptor is fully defined in this file, whereas the priorities of
+ * specific implementations are defined next to the implementations.
+ */
+template <typename... Specs>
+struct instance_selector {
+  template <typename DataT, typename IndexT, typename DistanceT, typename DatasetT>
+  static auto select(const cagra::search_params&, const DatasetT&, cuvs::distance::DistanceType)
+    -> std::tuple<init_desc_type<DataT, IndexT, DistanceT, DatasetT>, double>
   {
-    T diff = a - b;
-    return diff * diff;
+    return std::make_tuple(nullptr, -1.0);
   }
+};
 
-  template <typename T, cuvs::distance::DistanceType METRIC>
-  std::enable_if_t<METRIC == cuvs::distance::DistanceType::InnerProduct, T> __device__
-  dist_op(T a, T b) const
+template <typename Spec, typename... Specs>
+struct instance_selector<Spec, Specs...> {
+  template <typename DataT, typename IndexT, typename DistanceT, typename DatasetT>
+  static auto select(const cagra::search_params& params,
+                     const DatasetT& dataset,
+                     cuvs::distance::DistanceType metric)
+    -> std::enable_if_t<spec_sound<Spec, DataT, IndexT, DistanceT, DatasetT>,
+                        std::tuple<init_desc_type<DataT, IndexT, DistanceT, DatasetT>, double>>
   {
-    return -a * b;
+    auto s0 = spec_match<Spec, DataT, IndexT, DistanceT, DatasetT>(params, dataset, metric);
+    auto ss = instance_selector<Specs...>::template select<DataT, IndexT, DistanceT, DatasetT>(
+      params, dataset, metric);
+    return std::get<1>(s0) >= std::get<1>(ss) ? s0 : ss;
   }
 
-  template <uint32_t DATASET_BLOCK_DIM, uint32_t TEAM_SIZE, cuvs::distance::DistanceType METRIC>
-  __device__ DISTANCE_T compute_similarity(const QUERY_T* const query_ptr,
-                                           const INDEX_T dataset_i,
-                                           const bool valid) const
+  template <typename DataT, typename IndexT, typename DistanceT, typename DatasetT>
+  static auto select(const cagra::search_params& params,
+                     const DatasetT& dataset,
+                     cuvs::distance::DistanceType metric)
+    -> std::enable_if_t<!spec_sound<Spec, DataT, IndexT, DistanceT, DatasetT>,
+                        std::tuple<init_desc_type<DataT, IndexT, DistanceT, DatasetT>, double>>
   {
-    const auto dataset_ptr  = ptr + dataset_i * ld;
-    const unsigned lane_id  = threadIdx.x % TEAM_SIZE;
-    constexpr unsigned vlen = device::get_vlen<LOAD_T, DATA_T>();
-    // #include <raft/util/cuda_dev_essentials.cuh
-    constexpr unsigned reg_nelem = raft::ceildiv<unsigned>(DATASET_BLOCK_DIM, TEAM_SIZE * vlen);
-    raft::TxN_t<DATA_T, vlen> dl_buff[reg_nelem];
-
-    DISTANCE_T norm2 = 0;
-    if (valid) {
-      for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += DATASET_BLOCK_DIM) {
-#pragma unroll
-        for (uint32_t e = 0; e < reg_nelem; e++) {
-          const uint32_t k = (lane_id + (TEAM_SIZE * e)) * vlen + elem_offset;
-          if (k >= dim) break;
-          dl_buff[e].load(dataset_ptr, k);
-        }
-#pragma unroll
-        for (uint32_t e = 0; e < reg_nelem; e++) {
-          const uint32_t k = (lane_id + (TEAM_SIZE * e)) * vlen + elem_offset;
-          if (k >= dim) break;
-#pragma unroll
-          for (uint32_t v = 0; v < vlen; v++) {
-            const uint32_t kv = k + v;
-            // Note this loop can go above the dataset_dim for padded arrays. This is not a problem
-            // because:
-            // - Above the last element (dataset_dim-1), the query array is filled with zeros.
-            // - The data buffer has to be also padded with zeros.
-            DISTANCE_T d = query_ptr[device::swizzling(kv)];
-            norm2 += dist_op<DISTANCE_T, METRIC>(
-              d, cuvs::spatial::knn::detail::utils::mapping<float>{}(dl_buff[e].val.data[v]));
-          }
-        }
-      }
-    }
-    for (uint32_t offset = TEAM_SIZE / 2; offset > 0; offset >>= 1) {
-      norm2 += __shfl_xor_sync(0xffffffff, norm2, offset);
-    }
-    return norm2;
+    return instance_selector<Specs...>::template select<DataT, IndexT, DistanceT, DatasetT>(
+      params, dataset, metric);
   }
 };
 
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py
new file mode 100644
index 000000000..52a15e2a1
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py
@@ -0,0 +1,162 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import glob
+
+template = """/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+{includes}
+
+namespace cuvs::neighbors::cagra::detail {{
+
+using namespace cuvs::distance;
+{content}
+
+}}  // namespace cuvs::neighbors::cagra::detail
+"""
+
+mxdim_team = [(128, 8), (256, 16), (512, 32)]
+#mxdim_team = [(64, 8), (128, 16), (256, 32)]
+#mxdim_team = [(32, 8), (64, 16), (128, 32)]
+
+pq_bits = [8]
+pq_lens = [2, 4]
+
+# rblock = [(256, 4), (512, 2), (1024, 1)]
+# rcandidates = [32]
+# rsize = [256, 512]
+code_book_types = ["half"]
+
+search_types = dict(
+    float_uint32=("float", "uint32_t", "float"),  # data_t, idx_t, distance_t
+    half_uint32=("half", "uint32_t", "float"),
+    int8_uint32=("int8_t", "uint32_t", "float"),
+    uint8_uint32=("uint8_t", "uint32_t", "float"),
+    # float_uint64=("float", "uint64_t", "float"),
+    # half_uint64=("half", "uint64_t", "float"),
+)
+
+metric_prefix = 'DistanceType::'
+
+specs = []
+descs = []
+cmake_list = []
+
+
+
+
+# Cleanup first
+for f in glob.glob("compute_distance_standard_*.cu"):
+  os.remove(f)
+for f in glob.glob("compute_distance_vpq_*.cu"):
+  os.remove(f)
+
+# Generate new files
+for type_path, (data_t, idx_t, distance_t) in search_types.items():
+    for (mxdim, team) in mxdim_team:
+        # CAGRA
+        for metric in ['L2Expanded', 'InnerProduct']:
+            path = f"compute_distance_standard_{metric}_{type_path}_dim{mxdim}_t{team}.cu"
+            includes = '#include "compute_distance_standard-impl.cuh"'
+            params = f"{metric_prefix}{metric}, {team}, {mxdim}, {data_t}, {idx_t}, {distance_t}"
+            spec = f"standard_descriptor_spec<{params}>"
+            content = f"""template struct {spec};"""
+            specs.append(spec)
+            with open(path, "w") as f:
+                f.write(template.format(includes=includes, content=content))
+                cmake_list.append(f"  src/neighbors/detail/cagra/{path}")
+
+        # CAGRA-Q
+        for code_book_t in code_book_types:
+            for pq_len in pq_lens:
+                for pq_bit in pq_bits:
+                    for metric in ['L2Expanded']:
+                        path = f"compute_distance_vpq_{metric}_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{pq_len}subd_{code_book_t}.cu"
+                        includes = '#include "compute_distance_vpq-impl.cuh"'
+                        params = f"{metric_prefix}{metric}, {team}, {mxdim}, {pq_bit}, {pq_len}, {code_book_t}, {data_t}, {idx_t}, {distance_t}"
+                        spec = f"vpq_descriptor_spec<{params}>"
+                        content = f"""template struct {spec};"""
+                        specs.append(spec)
+                        with open(path, "w") as f:
+                            f.write(template.format(includes=includes, content=content))
+                            cmake_list.append(f"  src/neighbors/detail/cagra/{path}")
+
+with open("compute_distance-ext.cuh", "w") as f:
+    includes = '''
+#pragma once
+
+#include "compute_distance_standard.hpp"
+#include "compute_distance_vpq.hpp"
+'''
+    newline = "\n"
+    contents = f'''
+{newline.join(map(lambda s: "extern template struct " + s + ";", specs))}
+
+extern template struct
+  instance_selector<{("," + newline + "                    ").join(specs)}>;
+
+using descriptor_instances =
+  instance_selector<{("," + newline + "                    ").join(specs)}>;
+
+template <typename DataT, typename IndexT, typename DistanceT, typename DatasetT>
+auto dataset_descriptor_init(const cagra::search_params& params,
+                             const DatasetT& dataset,
+                             cuvs::distance::DistanceType metric,
+                             rmm::cuda_stream_view stream)
+  -> dataset_descriptor_host<DataT, IndexT, DistanceT>
+{{
+  auto [init, priority] = descriptor_instances::select<DataT, IndexT, DistanceT>(params, dataset, metric);
+  if (init == nullptr || priority < 0) {{
+    RAFT_FAIL("No dataset descriptor instance compiled for this parameter combination.");
+  }}
+  return init(params, dataset, metric, stream);
+}}
+'''
+    f.write(template.format(includes=includes, content=contents))
+
+
+with open("compute_distance.cu", "w") as f:
+    includes = '#include "compute_distance-ext.cuh"'
+    newline = "\n"
+    contents = f'''
+template struct instance_selector<{("," + newline + "                    ").join(specs)}>;
+'''
+    f.write(template.format(includes=includes, content=contents))
+
+cmake_list.sort()
+for path in cmake_list:
+    print(path)
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh
new file mode 100644
index 000000000..b0205508a
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "compute_distance_standard.hpp"
+
+#include <cuvs/distance/distance.hpp>
+#include <raft/core/operators.hpp>
+#include <raft/util/pow2_utils.cuh>
+
+#include <type_traits>
+
+namespace cuvs::neighbors::cagra::detail {
+namespace {
+template <typename T, cuvs::distance::DistanceType Metric>
+RAFT_DEVICE_INLINE_FUNCTION constexpr auto dist_op(T a, T b)
+  -> std::enable_if_t<Metric == cuvs::distance::DistanceType::L2Expanded, T>
+{
+  T diff = a - b;
+  return diff * diff;
+}
+
+template <typename T, cuvs::distance::DistanceType Metric>
+RAFT_DEVICE_INLINE_FUNCTION constexpr auto dist_op(T a, T b)
+  -> std::enable_if_t<Metric == cuvs::distance::DistanceType::InnerProduct, T>
+{
+  return -a * b;
+}
+}  // namespace
+
+template <cuvs::distance::DistanceType Metric,
+          uint32_t TeamSize,
+          uint32_t DatasetBlockDim,
+          typename DataT,
+          typename IndexT,
+          typename DistanceT>
+struct standard_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, IndexT, DistanceT> {
+  using base_type = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
+  using QUERY_T   = float;
+  using base_type::args;
+  using base_type::smem_ws_size_in_bytes;
+  using typename base_type::args_t;
+  using typename base_type::compute_distance_type;
+  using typename base_type::DATA_T;
+  using typename base_type::DISTANCE_T;
+  using typename base_type::INDEX_T;
+  using typename base_type::LOAD_T;
+  using typename base_type::setup_workspace_type;
+  constexpr static inline auto kMetric          = Metric;
+  constexpr static inline auto kTeamSize        = TeamSize;
+  constexpr static inline auto kDatasetBlockDim = DatasetBlockDim;
+
+  static constexpr RAFT_INLINE_FUNCTION auto ptr(const args_t& args) noexcept
+    -> const DATA_T* const&
+  {
+    return (const DATA_T* const&)(args.extra_ptr1);
+  }
+  static constexpr RAFT_INLINE_FUNCTION auto ptr(args_t& args) noexcept -> const DATA_T*&
+  {
+    return (const DATA_T*&)(args.extra_ptr1);
+  }
+
+  static constexpr RAFT_INLINE_FUNCTION auto ld(const args_t& args) noexcept -> const uint32_t&
+  {
+    return args.extra_word1;
+  }
+  static constexpr RAFT_INLINE_FUNCTION auto ld(args_t& args) noexcept -> uint32_t&
+  {
+    return args.extra_word1;
+  }
+
+  _RAFT_HOST_DEVICE standard_dataset_descriptor_t(setup_workspace_type* setup_workspace_impl,
+                                                  compute_distance_type* compute_distance_impl,
+                                                  const DATA_T* ptr,
+                                                  INDEX_T size,
+                                                  uint32_t dim,
+                                                  uint32_t ld)
+    : base_type(setup_workspace_impl,
+                compute_distance_impl,
+                size,
+                dim,
+                raft::Pow2<TeamSize>::Log2,
+                get_smem_ws_size_in_bytes(dim))
+  {
+    standard_dataset_descriptor_t::ptr(args) = ptr;
+    standard_dataset_descriptor_t::ld(args)  = ld;
+    static_assert(sizeof(*this) == sizeof(base_type));
+    static_assert(alignof(standard_dataset_descriptor_t) == alignof(base_type));
+  }
+
+ private:
+  RAFT_INLINE_FUNCTION constexpr static auto get_smem_ws_size_in_bytes(uint32_t dim) -> uint32_t
+  {
+    return sizeof(standard_dataset_descriptor_t) +
+           raft::round_up_safe<uint32_t>(dim, DatasetBlockDim) * sizeof(QUERY_T);
+  }
+};
+
+template <typename DescriptorT>
+_RAFT_DEVICE __noinline__ auto setup_workspace_standard(
+  const DescriptorT* that,
+  void* smem_ptr,
+  const typename DescriptorT::DATA_T* queries_ptr,
+  uint32_t query_id) -> const DescriptorT*
+{
+  using DATA_T                    = typename DescriptorT::DATA_T;
+  using LOAD_T                    = typename DescriptorT::LOAD_T;
+  using base_type                 = typename DescriptorT::base_type;
+  using QUERY_T                   = typename DescriptorT::QUERY_T;
+  using word_type                 = uint32_t;
+  constexpr auto kTeamSize        = DescriptorT::kTeamSize;
+  constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim;
+  auto* r                         = reinterpret_cast<DescriptorT*>(smem_ptr);
+  auto* buf                       = reinterpret_cast<QUERY_T*>(r + 1);
+  if (r != that) {
+    constexpr uint32_t kCount = sizeof(DescriptorT) / sizeof(word_type);
+    using blob_type           = word_type[kCount];
+    auto& src                 = reinterpret_cast<const blob_type&>(*that);
+    auto& dst                 = reinterpret_cast<blob_type&>(*r);
+    for (uint32_t i = threadIdx.x; i < kCount; i += blockDim.x) {
+      dst[i] = src[i];
+    }
+    const auto smem_ptr_offset =
+      reinterpret_cast<uint8_t*>(&(r->args.smem_ws_ptr)) - reinterpret_cast<uint8_t*>(r);
+    if (threadIdx.x == uint32_t(smem_ptr_offset / sizeof(word_type))) {
+      r->args.smem_ws_ptr = uint32_t(__cvta_generic_to_shared(buf));
+    }
+    __syncthreads();
+  }
+
+  uint32_t dim        = r->args.dim;
+  auto buf_len        = raft::round_up_safe<uint32_t>(dim, kDatasetBlockDim);
+  constexpr auto vlen = device::get_vlen<LOAD_T, DATA_T>();
+  queries_ptr += dim * query_id;
+  for (unsigned i = threadIdx.x; i < buf_len; i += blockDim.x) {
+    unsigned j = device::swizzling<kDatasetBlockDim, vlen * kTeamSize>(i);
+    if (i < dim) {
+      buf[j] = cuvs::spatial::knn::detail::utils::mapping<QUERY_T>{}(queries_ptr[i]);
+    } else {
+      buf[j] = 0.0;
+    }
+  }
+
+  return const_cast<const DescriptorT*>(r);
+}
+
+template <typename DescriptorT>
+RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_standard_worker(
+  const typename DescriptorT::DATA_T* __restrict__ dataset_ptr,
+  uint32_t dim,
+  uint32_t query_smem_ptr) -> typename DescriptorT::DISTANCE_T
+{
+  using DATA_T                    = typename DescriptorT::DATA_T;
+  using DISTANCE_T                = typename DescriptorT::DISTANCE_T;
+  using LOAD_T                    = typename DescriptorT::LOAD_T;
+  using QUERY_T                   = typename DescriptorT::QUERY_T;
+  constexpr auto kTeamSize        = DescriptorT::kTeamSize;
+  constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim;
+  constexpr auto vlen             = device::get_vlen<LOAD_T, DATA_T>();
+  constexpr auto reg_nelem =
+    raft::div_rounding_up_unsafe<uint32_t>(kDatasetBlockDim, kTeamSize * vlen);
+
+  DISTANCE_T r = 0;
+  for (uint32_t elem_offset = (threadIdx.x % kTeamSize) * vlen; elem_offset < dim;
+       elem_offset += kDatasetBlockDim) {
+    DATA_T data[reg_nelem][vlen];
+#pragma unroll
+    for (uint32_t e = 0; e < reg_nelem; e++) {
+      const uint32_t k = e * (kTeamSize * vlen) + elem_offset;
+      if (k >= dim) break;
+      device::ldg_cg(reinterpret_cast<LOAD_T&>(data[e]),
+                     reinterpret_cast<const LOAD_T*>(dataset_ptr + k));
+    }
+#pragma unroll
+    for (uint32_t e = 0; e < reg_nelem; e++) {
+      const uint32_t k = e * (kTeamSize * vlen) + elem_offset;
+      if (k >= dim) break;
+#pragma unroll
+      for (uint32_t v = 0; v < vlen; v++) {
+        // Note this loop can go above the dataset_dim for padded arrays. This is not a problem
+        // because:
+        // - Above the last element (dataset_dim-1), the query array is filled with zeros.
+        // - The data buffer has to be also padded with zeros.
+        DISTANCE_T d;
+        device::lds(
+          d,
+          query_smem_ptr +
+            sizeof(QUERY_T) * device::swizzling<kDatasetBlockDim, vlen * kTeamSize>(k + v));
+        r += dist_op<DISTANCE_T, DescriptorT::kMetric>(
+          d, cuvs::spatial::knn::detail::utils::mapping<DISTANCE_T>{}(data[e][v]));
+      }
+    }
+  }
+  return r;
+}
+
+template <typename DescriptorT>
+_RAFT_DEVICE __noinline__ auto compute_distance_standard(
+  const typename DescriptorT::args_t args, const typename DescriptorT::INDEX_T dataset_index) ->
+  typename DescriptorT::DISTANCE_T
+{
+  return compute_distance_standard_worker<DescriptorT>(
+    DescriptorT::ptr(args) + (static_cast<std::uint64_t>(DescriptorT::ld(args)) * dataset_index),
+    args.dim,
+    args.smem_ws_ptr);
+}
+
+template <cuvs::distance::DistanceType Metric,
+          uint32_t TeamSize,
+          uint32_t DatasetBlockDim,
+          typename DataT,
+          typename IndexT,
+          typename DistanceT>
+RAFT_KERNEL __launch_bounds__(1, 1)
+  standard_dataset_descriptor_init_kernel(dataset_descriptor_base_t<DataT, IndexT, DistanceT>* out,
+                                          const DataT* ptr,
+                                          IndexT size,
+                                          uint32_t dim,
+                                          uint32_t ld)
+{
+  using desc_type =
+    standard_dataset_descriptor_t<Metric, TeamSize, DatasetBlockDim, DataT, IndexT, DistanceT>;
+  using base_type = typename desc_type::base_type;
+  new (out) desc_type(reinterpret_cast<typename base_type::setup_workspace_type*>(
+                        &setup_workspace_standard<desc_type>),
+                      reinterpret_cast<typename base_type::compute_distance_type*>(
+                        &compute_distance_standard<desc_type>),
+                      ptr,
+                      size,
+                      dim,
+                      ld);
+}
+
+template <cuvs::distance::DistanceType Metric,
+          uint32_t TeamSize,
+          uint32_t DatasetBlockDim,
+          typename DataT,
+          typename IndexT,
+          typename DistanceT>
+dataset_descriptor_host<DataT, IndexT, DistanceT>
+standard_descriptor_spec<Metric, TeamSize, DatasetBlockDim, DataT, IndexT, DistanceT>::init_(
+  const cagra::search_params& params,
+  const DataT* ptr,
+  IndexT size,
+  uint32_t dim,
+  uint32_t ld,
+  rmm::cuda_stream_view stream)
+{
+  using desc_type =
+    standard_dataset_descriptor_t<Metric, TeamSize, DatasetBlockDim, DataT, IndexT, DistanceT>;
+  using base_type = typename desc_type::base_type;
+  desc_type dd_host{nullptr, nullptr, ptr, size, dim, ld};
+  host_type result{dd_host, stream};
+
+  standard_dataset_descriptor_init_kernel<Metric,
+                                          TeamSize,
+                                          DatasetBlockDim,
+                                          DataT,
+                                          IndexT,
+                                          DistanceT>
+    <<<1, 1, 0, stream>>>(result.dev_ptr(), ptr, size, dim, desc_type::ld(dd_host.args));
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+  return result;
+}
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_standard.hpp
new file mode 100644
index 000000000..df1b77e86
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard.hpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "compute_distance.hpp"
+
+#include <cuvs/distance/distance.hpp>
+
+#include <type_traits>
+
+namespace cuvs::neighbors::cagra::detail {
+
+template <cuvs::distance::DistanceType Metric,
+          uint32_t TeamSize,
+          uint32_t DatasetBlockDim,
+          typename DataT,
+          typename IndexT,
+          typename DistanceT>
+struct standard_descriptor_spec : public instance_spec<DataT, IndexT, DistanceT> {
+  using base_type = instance_spec<DataT, IndexT, DistanceT>;
+  using typename base_type::data_type;
+  using typename base_type::distance_type;
+  using typename base_type::host_type;
+  using typename base_type::index_type;
+
+  template <typename DatasetT>
+  constexpr static inline bool accepts_dataset()
+  {
+    return is_strided_dataset_v<DatasetT>;
+  }
+
+  template <typename DatasetT>
+  static auto init(const cagra::search_params& params,
+                   const DatasetT& dataset,
+                   cuvs::distance::DistanceType metric,
+                   rmm::cuda_stream_view stream) -> host_type
+  {
+    return init_(params,
+                 dataset.view().data_handle(),
+                 IndexT(dataset.n_rows()),
+                 dataset.dim(),
+                 dataset.stride(),
+                 stream);
+  }
+
+  template <typename DatasetT>
+  static auto priority(const cagra::search_params& params,
+                       const DatasetT& dataset,
+                       cuvs::distance::DistanceType metric) -> double
+  {
+    // If explicit team_size is specified and doesn't match the instance, discard it
+    if (params.team_size != 0 && TeamSize != params.team_size) { return -1.0; }
+    if (Metric != metric) { return -1.0; }
+    // Otherwise, favor the closest dataset dimensionality.
+    return 1.0 / (0.1 + std::abs(double(dataset.dim()) - double(DatasetBlockDim)));
+  }
+
+ private:
+  static dataset_descriptor_host<DataT, IndexT, DistanceT> init_(const cagra::search_params& params,
+                                                                 const DataT* ptr,
+                                                                 IndexT size,
+                                                                 uint32_t dim,
+                                                                 uint32_t ld,
+                                                                 rmm::cuda_stream_view stream);
+};
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu
similarity index 51%
rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim128_t8_8pq_2subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu
index 1116eaaa4..af5e89a76 100644
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim128_t8_8pq_2subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu
@@ -15,22 +15,24 @@
  */
 
 /*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
+ * NOTE: this file is generated by compute_distance_00_generate.py
  *
  * Make changes there and run in this directory:
  *
- * > python q_search_multi_cta_00_generate.py
+ * > python compute_distance_00_generate.py
  *
  */
 
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
+#include "compute_distance_standard-impl.cuh"
 
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
+namespace cuvs::neighbors::cagra::detail {
 
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                         8,
+                                         128,
+                                         float,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu
similarity index 51%
rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim128_t8_8pq_4subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu
index 7e3ec363d..332eb6bf9 100644
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim128_t8_8pq_4subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu
@@ -15,22 +15,24 @@
  */
 
 /*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
+ * NOTE: this file is generated by compute_distance_00_generate.py
  *
  * Make changes there and run in this directory:
  *
- * > python q_search_multi_cta_00_generate.py
+ * > python compute_distance_00_generate.py
  *
  */
 
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
+#include "compute_distance_standard-impl.cuh"
 
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
+namespace cuvs::neighbors::cagra::detail {
 
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                         16,
+                                         256,
+                                         float,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu
similarity index 51%
rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim128_t8_8pq_2subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu
index af60c776a..3e5c11240 100644
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim128_t8_8pq_2subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu
@@ -15,22 +15,24 @@
  */
 
 /*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
+ * NOTE: this file is generated by compute_distance_00_generate.py
  *
  * Make changes there and run in this directory:
  *
- * > python q_search_multi_cta_00_generate.py
+ * > python compute_distance_00_generate.py
  *
  */
 
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
+#include "compute_distance_standard-impl.cuh"
 
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
+namespace cuvs::neighbors::cagra::detail {
 
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                         32,
+                                         512,
+                                         float,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim128_t8.cu
new file mode 100644
index 000000000..92ca114f7
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim128_t8.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, half, uint32_t, float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu
similarity index 51%
rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim128_t8_8pq_4subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu
index 5dd79a79b..cfad79f3a 100644
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim128_t8_8pq_4subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu
@@ -15,22 +15,24 @@
  */
 
 /*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
+ * NOTE: this file is generated by compute_distance_00_generate.py
  *
  * Make changes there and run in this directory:
  *
- * > python q_search_multi_cta_00_generate.py
+ * > python compute_distance_00_generate.py
  *
  */
 
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
+#include "compute_distance_standard-impl.cuh"
 
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
+namespace cuvs::neighbors::cagra::detail {
 
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                         16,
+                                         256,
+                                         half,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim512_t32.cu
new file mode 100644
index 000000000..8c208044b
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim512_t32.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                         32,
+                                         512,
+                                         half,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim128_t8.cu
new file mode 100644
index 000000000..929df5bbe
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim128_t8.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                         8,
+                                         128,
+                                         int8_t,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim256_t16.cu
new file mode 100644
index 000000000..3cc4a2c95
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim256_t16.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                         16,
+                                         256,
+                                         int8_t,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim512_t32.cu
new file mode 100644
index 000000000..a87e866eb
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim512_t32.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                         32,
+                                         512,
+                                         int8_t,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim128_t8.cu
new file mode 100644
index 000000000..650d9ecac
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim128_t8.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                         8,
+                                         128,
+                                         uint8_t,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim256_t16.cu
new file mode 100644
index 000000000..6f7f4b97f
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim256_t16.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                         16,
+                                         256,
+                                         uint8_t,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim512_t32.cu
new file mode 100644
index 000000000..e7b96ab49
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim512_t32.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                         32,
+                                         512,
+                                         uint8_t,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim128_t8.cu
new file mode 100644
index 000000000..b45cf3669
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim128_t8.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, float, uint32_t, float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu
new file mode 100644
index 000000000..7d1206c37
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, float, uint32_t, float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu
new file mode 100644
index 000000000..251316b2c
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, float, uint32_t, float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu
new file mode 100644
index 000000000..e3870df40
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, half, uint32_t, float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu
new file mode 100644
index 000000000..1253d7cd4
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, half, uint32_t, float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu
new file mode 100644
index 000000000..792532c2c
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, half, uint32_t, float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu
new file mode 100644
index 000000000..c9c960cf9
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, int8_t, uint32_t, float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu
new file mode 100644
index 000000000..d7a12804b
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                         16,
+                                         256,
+                                         int8_t,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim512_t32.cu
new file mode 100644
index 000000000..a4f06c283
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim512_t32.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                         32,
+                                         512,
+                                         int8_t,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim128_t8.cu
new file mode 100644
index 000000000..199f05e49
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim128_t8.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                         8,
+                                         128,
+                                         uint8_t,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim256_t16.cu
new file mode 100644
index 000000000..0962ecd82
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim256_t16.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;  // for the unqualified DistanceType below
+template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                         16,        // "t16" in the file name (likely team size)
+                                         256,       // "dim256" in the file name
+                                         uint8_t,   // "uint8" in the file name
+                                         uint32_t,  // "uint32" in the file name
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu
new file mode 100644
index 000000000..9c7e4ab03
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;  // for the unqualified DistanceType below
+template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                         32,        // "t32" in the file name (likely team size)
+                                         512,       // "dim512" in the file name
+                                         uint8_t,   // "uint8" in the file name
+                                         uint32_t,  // "uint32" in the file name
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
new file mode 100644
index 000000000..86c592502
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -0,0 +1,466 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "compute_distance_vpq.hpp"
+
+#include <cuvs/distance/distance.hpp>
+#include <raft/util/integer_utils.hpp>
+#include <raft/util/pow2_utils.cuh>
+
+#include <type_traits>
+
+namespace cuvs::neighbors::cagra::detail {
+
+template <cuvs::distance::DistanceType Metric,
+          uint32_t TeamSize,
+          uint32_t DatasetBlockDim,
+          uint32_t PQ_BITS,
+          uint32_t PQ_LEN,
+          typename CodebookT,
+          typename DataT,
+          typename IndexT,
+          typename DistanceT>
+struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, IndexT, DistanceT> {
+  using base_type   = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
+  using CODE_BOOK_T = CodebookT;
+  using QUERY_T     = half;  // element type of the query copy kept in the smem workspace
+  using base_type::args;
+  using base_type::extra_ptr3;
+  using typename base_type::args_t;
+  using typename base_type::compute_distance_type;
+  using typename base_type::DATA_T;
+  using typename base_type::DISTANCE_T;
+  using typename base_type::INDEX_T;
+  using typename base_type::LOAD_T;
+  using typename base_type::setup_workspace_type;
+  constexpr static inline auto kMetric          = Metric;
+  constexpr static inline auto kTeamSize        = TeamSize;
+  constexpr static inline auto kDatasetBlockDim = DatasetBlockDim;
+  constexpr static inline auto kPqBits          = PQ_BITS;
+  constexpr static inline auto kPqLen           = PQ_LEN;
+
+  static_assert(std::is_same_v<CODE_BOOK_T, half>, "Only CODE_BOOK_T = `half` is supported now");
+  // Typed views of the generic base-class slots (extra_ptr1/2/3, extra_word1); mutable flavor.
+  RAFT_INLINE_FUNCTION static constexpr auto encoded_dataset_ptr(args_t& args) noexcept
+    -> const uint8_t*&
+  {
+    return (const uint8_t*&)args.extra_ptr1;
+  }
+  RAFT_INLINE_FUNCTION static constexpr auto vq_code_book_ptr(args_t& args) noexcept
+    -> const CODE_BOOK_T*&
+  {
+    return (const CODE_BOOK_T*&)args.extra_ptr2;
+  }
+  RAFT_INLINE_FUNCTION constexpr auto pq_code_book_ptr() noexcept -> const CODE_BOOK_T*&
+  {
+    return (const CODE_BOOK_T*&)extra_ptr3;
+  }
+  RAFT_INLINE_FUNCTION static constexpr auto encoded_dataset_dim(args_t& args) noexcept -> uint32_t&
+  {
+    return args.extra_word1;
+  }
+  // The same accessors for read-only access.
+  RAFT_INLINE_FUNCTION static constexpr auto encoded_dataset_ptr(const args_t& args) noexcept
+    -> const uint8_t* const&
+  {
+    return (const uint8_t*&)args.extra_ptr1;
+  }
+  RAFT_INLINE_FUNCTION static constexpr auto vq_code_book_ptr(const args_t& args) noexcept
+    -> const CODE_BOOK_T* const&
+  {
+    return (const CODE_BOOK_T*&)args.extra_ptr2;
+  }
+  RAFT_INLINE_FUNCTION constexpr auto pq_code_book_ptr() const noexcept -> const CODE_BOOK_T* const&
+  {
+    return (const CODE_BOOK_T*&)extra_ptr3;
+  }
+  RAFT_INLINE_FUNCTION static constexpr auto encoded_dataset_dim(const args_t& args) noexcept
+    -> const uint32_t&
+  {
+    return args.extra_word1;
+  }
+  // Size of the PQ codebook: (1 << PQ_BITS) * PQ_LEN values of CODE_BOOK_T, in bytes.
+  static constexpr std::uint32_t kSMemCodeBookSizeInBytes =
+    (1 << PQ_BITS) * PQ_LEN * utils::size_of<CODE_BOOK_T>();
+  // Stores the VPQ-specific values in the base-class slots; adds no data members of its own.
+  _RAFT_HOST_DEVICE cagra_q_dataset_descriptor_t(setup_workspace_type* setup_workspace_impl,
+                                                 compute_distance_type* compute_distance_impl,
+                                                 const std::uint8_t* encoded_dataset_ptr,
+                                                 std::uint32_t encoded_dataset_dim,
+                                                 const CODE_BOOK_T* vq_code_book_ptr,
+                                                 const CODE_BOOK_T* pq_code_book_ptr,
+                                                 IndexT size,
+                                                 std::uint32_t dim)
+    : base_type(setup_workspace_impl,
+                compute_distance_impl,
+                size,
+                dim,
+                raft::Pow2<TeamSize>::Log2,
+                get_smem_ws_size_in_bytes(dim))
+  {
+    cagra_q_dataset_descriptor_t::encoded_dataset_ptr(args) = encoded_dataset_ptr;
+    cagra_q_dataset_descriptor_t::vq_code_book_ptr(args)    = vq_code_book_ptr;
+    this->pq_code_book_ptr()                                = pq_code_book_ptr;
+    cagra_q_dataset_descriptor_t::encoded_dataset_dim(args) = encoded_dataset_dim;
+    static_assert(sizeof(*this) == sizeof(base_type));
+    static_assert(alignof(cagra_q_dataset_descriptor_t) == alignof(base_type));
+  }
+
+ private:
+  RAFT_INLINE_FUNCTION constexpr static auto get_smem_ws_size_in_bytes(uint32_t dim) -> uint32_t
+  {
+    /* SMEM workspace layout:
+      1. The descriptor itself
+      2. Codebook (kSMemCodeBookSizeInBytes bytes)
+      3. Queries (dim rounded up to a multiple of DatasetBlockDim, in QUERY_T elems)
+    */
+    return sizeof(cagra_q_dataset_descriptor_t) + kSMemCodeBookSizeInBytes +
+           raft::round_up_safe<uint32_t>(dim, DatasetBlockDim) * sizeof(QUERY_T);
+  }
+};
+
+template <auto Block, auto Stride, typename T>
+RAFT_DEVICE_INLINE_FUNCTION constexpr auto transpose(T x) -> T
+{
+  // Treat every run of `Block` indices as a row-major (Block / Stride) x Stride tile and swap
+  // row/column inside that tile; the tile's base offset is left unchanged.
+  const auto in_tile   = x % Block;
+  const auto tile_base = (x / Block) * Block;
+  return tile_base + (in_tile % Stride) * (Block / Stride) + in_tile / Stride;
+}
+
+template <typename DescriptorT>
+_RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
+                                                   void* smem_ptr,
+                                                   const typename DescriptorT::DATA_T* queries_ptr,
+                                                   uint32_t query_id) -> const DescriptorT*
+{
+  using QUERY_T                   = typename DescriptorT::QUERY_T;
+  using CODE_BOOK_T               = typename DescriptorT::CODE_BOOK_T;
+  using word_type                 = uint32_t;
+  constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim;
+  constexpr auto PQ_BITS          = DescriptorT::kPqBits;
+  constexpr auto PQ_LEN           = DescriptorT::kPqLen;
+
+  // Workspace at smem_ptr: [descriptor copy | PQ codebook | query as QUERY_T]; returns the copy.
+  auto* r = reinterpret_cast<DescriptorT*>(smem_ptr);
+  if (r != that) {  // `that` is not the workspace copy yet: copy it and stage the codebook
+    constexpr uint32_t kCount = sizeof(DescriptorT) / sizeof(word_type);
+    using blob_type           = word_type[kCount];
+    auto& src                 = reinterpret_cast<const blob_type&>(*that);
+    auto& dst                 = reinterpret_cast<blob_type&>(*r);
+    for (uint32_t i = threadIdx.x; i < kCount; i += blockDim.x) {
+      dst[i] = src[i];
+    }
+    // Redirect the copy's smem_ws_ptr to the codebook area that starts right behind the copy.
+    auto codebook_buf = uint32_t(__cvta_generic_to_shared(r + 1));
+    const auto smem_ptr_offset =
+      reinterpret_cast<uint8_t*>(&(r->args.smem_ws_ptr)) - reinterpret_cast<uint8_t*>(r);
+    if (threadIdx.x == uint32_t(smem_ptr_offset / sizeof(word_type))) {
+      r->args.smem_ws_ptr = codebook_buf;  // NOTE(review): skipped if blockDim.x <= that index
+    }
+    __syncthreads();  // the copy must be complete before r->pq_code_book_ptr() is read below
+    // Stage the PQ codebook ((1 << PQ_BITS) * PQ_LEN values of CODE_BOOK_T) in the workspace;
+    // each thread moves two consecutive values per iteration.
+    for (unsigned i = threadIdx.x * 2; i < (1 << PQ_BITS) * PQ_LEN; i += blockDim.x * 2) {
+      half2 buf2;
+      buf2.x = r->pq_code_book_ptr()[i];
+      buf2.y = r->pq_code_book_ptr()[i + 1];
+
+      // Change the order of PQ code book array to reduce the frequency of bank conflicts:
+      // chunk `c` of codebook entry `e` lands in half2 slot `e + c * (1 << PQ_BITS)`.
+      constexpr auto num_elements_per_bank  = 4 / utils::size_of<CODE_BOOK_T>();
+      constexpr auto num_banks_per_subspace = PQ_LEN / num_elements_per_bank;
+      const auto j                          = i / num_elements_per_bank;
+      const auto smem_index =
+        (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS);
+
+      device::sts(codebook_buf + smem_index * sizeof(half2), buf2);
+    }
+  }
+  // Per-query part: pass query `query_id` through mapping<QUERY_T> and store it after the codebook.
+  uint32_t dim = r->args.dim;
+  queries_ptr += static_cast<std::uint64_t>(dim) * query_id;  // 64-bit: dim * query_id may wrap
+
+  constexpr cuvs::spatial::knn::detail::utils::mapping<QUERY_T> mapping{};
+  auto smem_query_ptr =
+    reinterpret_cast<QUERY_T*>(reinterpret_cast<uint8_t*>(smem_ptr) + sizeof(DescriptorT) +
+                               DescriptorT::kSMemCodeBookSizeInBytes);
+  for (unsigned i = threadIdx.x * 2; i < dim; i += blockDim.x * 2) {
+    half2 buf2{0, 0};
+    buf2.x = mapping(queries_ptr[i]);  // i < dim is guaranteed by the loop condition
+    if (i + 1 < dim) { buf2.y = mapping(queries_ptr[i + 1]); }
+    if constexpr ((PQ_BITS == 8) && (PQ_LEN % 2 == 0)) {
+      // Transpose the queries buffer to avoid bank conflicts in compute_distance.
+      constexpr uint32_t vlen = 4;  // **** DO NOT CHANGE ****
+      constexpr auto kStride  = vlen * PQ_LEN / 2;
+      reinterpret_cast<half2*>(smem_query_ptr)[transpose<kDatasetBlockDim / 2, kStride>(i / 2)] =
+        buf2;
+    } else {
+      (reinterpret_cast<half2*>(smem_query_ptr + i))[0] = buf2;
+    }
+  }
+
+  return const_cast<const DescriptorT*>(r);
+}
+
+template <typename DescriptorT>
+_RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
+  const uint8_t* __restrict__ dataset_ptr,
+  const typename DescriptorT::CODE_BOOK_T* __restrict__ vq_code_book_ptr,
+  uint32_t dim,
+  uint32_t pq_codebook_ptr) -> typename DescriptorT::DISTANCE_T
+{
+  using DISTANCE_T               = typename DescriptorT::DISTANCE_T;
+  using LOAD_T                   = typename DescriptorT::LOAD_T;
+  using QUERY_T                  = typename DescriptorT::QUERY_T;
+  using CODE_BOOK_T              = typename DescriptorT::CODE_BOOK_T;
+  constexpr auto TeamSize        = DescriptorT::kTeamSize;
+  constexpr auto DatasetBlockDim = DescriptorT::kDatasetBlockDim;
+  constexpr auto PQ_BITS         = DescriptorT::kPqBits;
+  constexpr auto PQ_LEN          = DescriptorT::kPqLen;
+  // Returns this lane's partial sum of squared differences; no cross-lane reduction is done here.
+  const uint32_t query_ptr = pq_codebook_ptr + DescriptorT::kSMemCodeBookSizeInBytes;
+  static_assert(PQ_BITS == 8, "Only pq_bits == 8 is supported at the moment.");
+  constexpr uint32_t vlen = 4;  // **** DO NOT CHANGE ****
+  constexpr uint32_t nelem =
+    raft::div_rounding_up_unsafe<uint32_t>(DatasetBlockDim / PQ_LEN, TeamSize * vlen);
+
+  constexpr auto kTeamMask = DescriptorT::kTeamSize - 1;
+  constexpr auto kTeamVLen = TeamSize * vlen;
+
+  const auto n_subspace = raft::div_rounding_up_unsafe(dim, PQ_LEN);
+  const auto laneId     = threadIdx.x & kTeamMask;  // index of this thread within its team
+  DISTANCE_T norm       = 0;
+  for (uint32_t elem_offset = 0; elem_offset * PQ_LEN < dim;
+       elem_offset += DatasetBlockDim / PQ_LEN) {
+    // Loading PQ codes: up to nelem 32-bit words per lane, each holding vlen one-byte codes
+    uint32_t pq_codes[nelem];
+#pragma unroll
+    for (std::uint32_t e = 0; e < nelem; e++) {
+      const std::uint32_t k = e * kTeamVLen + elem_offset + laneId * vlen;
+      if (k >= n_subspace) break;
+      // Loading 4 x 8-bit PQ-codes using 32-bit load ops (from device memory); +4 skips vq_code
+      device::ldg_cg(pq_codes[e], reinterpret_cast<const std::uint32_t*>(dataset_ptr + 4 + k));
+    }
+    // Accumulate (query - PQ codebook value - VQ codebook value)^2 for this lane's subspaces.
+    if constexpr (PQ_LEN % 2 == 0) {
+      // **** Use half2 for distance computation ****
+#pragma unroll
+      for (std::uint32_t e = 0; e < nelem; e++) {
+        const std::uint32_t k = e * kTeamVLen + elem_offset + laneId * vlen;
+        if (k >= n_subspace) break;
+        // Loading VQ code-book
+        half2 vq_vals[PQ_LEN][vlen / 2];
+#pragma unroll
+        for (std::uint32_t m = 0; m < PQ_LEN; m++) {
+          const uint32_t d = (vlen * m) + (PQ_LEN * k);
+          if (d >= dim) break;
+          device::ldg_ca(vq_vals[m], vq_code_book_ptr + d);
+        }
+        // Compute distance
+        std::uint32_t pq_code = pq_codes[e];
+#pragma unroll
+        for (std::uint32_t v = 0; v < vlen; v++) {
+          if (PQ_LEN * (v + k) >= dim) break;
+#pragma unroll
+          for (std::uint32_t m = 0; m < PQ_LEN / 2; m++) {
+            constexpr auto kQueryBlock = DatasetBlockDim / (vlen * PQ_LEN);
+            const std::uint32_t d1     = m + (PQ_LEN / 2) * v;
+            const std::uint32_t d =
+              d1 * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId;
+            half2 q2, c2;
+            // Loading query vector from smem (d indexes the transposed layout of the setup step)
+            device::lds(q2, query_ptr + sizeof(half2) * d);
+            // Loading PQ code book from smem
+            device::lds(c2,
+                        pq_codebook_ptr +
+                          sizeof(CODE_BOOK_T) * ((1 << PQ_BITS) * 2 * m + (2 * (pq_code & 0xff))));
+            // L2 distance
+            auto dist = q2 - c2 - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[d1];
+            dist      = dist * dist;
+            norm += static_cast<DISTANCE_T>(dist.x + dist.y);
+          }
+          pq_code >>= 8;  // advance to the next 8-bit code
+        }
+      }
+    } else {
+      // **** Use float for distance computation ****
+#pragma unroll
+      for (std::uint32_t e = 0; e < nelem; e++) {
+        const std::uint32_t k = e * kTeamVLen + elem_offset + laneId * vlen;
+        if (k >= n_subspace) break;
+        // Loading VQ code-book
+        CODE_BOOK_T vq_vals[PQ_LEN][vlen];
+#pragma unroll
+        for (std::uint32_t m = 0; m < PQ_LEN; m++) {
+          const std::uint32_t d = (vlen * m) + (PQ_LEN * k);
+          if (d >= dim) break;
+          // Loading 4 x 8/16-bit VQ-values using 32/64-bit load ops (from L2$ or device memory)
+          device::ldg_ca(vq_vals[m], vq_code_book_ptr + d);
+        }
+        // Compute distance
+        std::uint32_t pq_code = pq_codes[e];
+#pragma unroll
+        for (std::uint32_t v = 0; v < vlen; v++) {
+          if (PQ_LEN * (v + k) >= dim) break;
+          CODE_BOOK_T pq_vals[PQ_LEN];
+          device::lds(pq_vals, pq_codebook_ptr + sizeof(CODE_BOOK_T) * PQ_LEN * (pq_code & 0xff));
+#pragma unroll
+          for (std::uint32_t m = 0; m < PQ_LEN; m++) {
+            const std::uint32_t d1 = m + (PQ_LEN * v);
+            const std::uint32_t d  = d1 + (PQ_LEN * k);
+            // if (d >= dataset_dim) break;
+            DISTANCE_T diff;
+            device::lds(diff, query_ptr + sizeof(QUERY_T) * d);
+            diff -= static_cast<DISTANCE_T>(pq_vals[m]);
+            diff -=
+              static_cast<DISTANCE_T>(reinterpret_cast<CODE_BOOK_T(&)[PQ_LEN * vlen]>(vq_vals)[d1]);
+            norm += diff * diff;
+          }
+          pq_code >>= 8;  // advance to the next 8-bit code
+        }
+      }
+    }
+  }
+  return norm;
+}
+
+template <typename DescriptorT>
+_RAFT_DEVICE __noinline__ auto compute_distance_vpq(
+  const typename DescriptorT::args_t args, const typename DescriptorT::INDEX_T dataset_index) ->
+  typename DescriptorT::DISTANCE_T
+{
+  const auto* dataset_ptr =
+    DescriptorT::encoded_dataset_ptr(args) +
+    (static_cast<std::uint64_t>(DescriptorT::encoded_dataset_dim(args)) * dataset_index);
+  uint32_t vq_code;  // leading 32-bit word of the record; picks the VQ codebook row used below
+  device::ldg_cg(vq_code, reinterpret_cast<const std::uint32_t*>(dataset_ptr));
+  return compute_distance_vpq_worker<DescriptorT>(
+    dataset_ptr /* record start; the worker itself skips the 4-byte vq_code (reads at +4) */,
+    DescriptorT::vq_code_book_ptr(args) + args.dim * vq_code,
+    args.dim,
+    args.smem_ws_ptr);
+}
+
+template <cuvs::distance::DistanceType Metric,
+          uint32_t TeamSize,
+          uint32_t DatasetBlockDim,
+          uint32_t PqBits,
+          uint32_t PqLen,
+          typename CodebookT,
+          typename DataT,
+          typename IndexT,
+          typename DistanceT>
+RAFT_KERNEL __launch_bounds__(1, 1)
+  vpq_dataset_descriptor_init_kernel(dataset_descriptor_base_t<DataT, IndexT, DistanceT>* out,
+                                     const std::uint8_t* encoded_dataset_ptr,
+                                     uint32_t encoded_dataset_dim,
+                                     const CodebookT* vq_code_book_ptr,
+                                     const CodebookT* pq_code_book_ptr,
+                                     IndexT size,
+                                     uint32_t dim)
+{
+  using desc_t = cagra_q_dataset_descriptor_t<Metric,
+                                              TeamSize,
+                                              DatasetBlockDim,
+                                              PqBits,
+                                              PqLen,
+                                              CodebookT,
+                                              DataT,
+                                              IndexT,
+                                              DistanceT>;
+  // Built in place; the worker entry points are passed on as base-class function pointer types.
+  new (out) desc_t(
+    reinterpret_cast<typename desc_t::setup_workspace_type*>(&setup_workspace_vpq<desc_t>),
+    reinterpret_cast<typename desc_t::compute_distance_type*>(&compute_distance_vpq<desc_t>),
+    encoded_dataset_ptr,
+    encoded_dataset_dim,
+    vq_code_book_ptr,
+    pq_code_book_ptr,
+    size,
+    dim);
+}
+
+template <cuvs::distance::DistanceType Metric,
+          uint32_t TeamSize,
+          uint32_t DatasetBlockDim,
+          uint32_t PqBits,
+          uint32_t PqLen,
+          typename CodebookT,
+          typename DataT,
+          typename IndexT,
+          typename DistanceT>
+dataset_descriptor_host<DataT, IndexT, DistanceT>
+vpq_descriptor_spec<Metric,
+                    TeamSize,
+                    DatasetBlockDim,
+                    PqBits,
+                    PqLen,
+                    CodebookT,
+                    DataT,
+                    IndexT,
+                    DistanceT>::init_(const cagra::search_params& params,
+                                      const std::uint8_t* encoded_dataset_ptr,
+                                      uint32_t encoded_dataset_dim,
+                                      const CodebookT* vq_code_book_ptr,
+                                      const CodebookT* pq_code_book_ptr,
+                                      IndexT size,
+                                      uint32_t dim,
+                                      rmm::cuda_stream_view stream)
+{
+  using desc_t = cagra_q_dataset_descriptor_t<Metric,
+                                              TeamSize,
+                                              DatasetBlockDim,
+                                              PqBits,
+                                              PqLen,
+                                              CodebookT,
+                                              DataT,
+                                              IndexT,
+                                              DistanceT>;
+  // Host image has null entry points; the kernel builds the device copy with the real ones.
+  desc_t host_image{nullptr,
+                    nullptr,
+                    encoded_dataset_ptr,
+                    encoded_dataset_dim,
+                    vq_code_book_ptr,
+                    pq_code_book_ptr,
+                    size,
+                    dim};
+  host_type result{host_image, stream};
+  auto* init_kernel = vpq_dataset_descriptor_init_kernel<Metric,
+                                                         TeamSize,
+                                                         DatasetBlockDim,
+                                                         PqBits,
+                                                         PqLen,
+                                                         CodebookT,
+                                                         DataT,
+                                                         IndexT,
+                                                         DistanceT>;
+  init_kernel<<<1, 1, 0, stream>>>(result.dev_ptr(),
+                                   encoded_dataset_ptr,
+                                   encoded_dataset_dim,
+                                   vq_code_book_ptr,
+                                   pq_code_book_ptr,
+                                   size,
+                                   dim);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+  return result;
+}
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh
deleted file mode 100644
index 68973662f..000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "compute_distance.hpp"
-
-#include <cuvs/distance/distance.hpp>
-#include <raft/util/integer_utils.hpp>
-
-namespace cuvs::neighbors::cagra::detail {
-template <class DATA_T_,
-          class CODE_BOOK_T_,
-          unsigned PQ_BITS,
-          unsigned PQ_LEN,
-          class DISTANCE_T,
-          class INDEX_T>
-struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<half, DISTANCE_T, INDEX_T> {
-  using LOAD_T      = device::LOAD_128BIT_T;
-  using DATA_T      = DATA_T_;
-  using CODE_BOOK_T = CODE_BOOK_T_;
-  using QUERY_T     = typename dataset_descriptor_base_t<half, DISTANCE_T, INDEX_T>::QUERY_T;
-
-  static_assert(std::is_same_v<CODE_BOOK_T, half>, "Only CODE_BOOK_T = `half` is supported now");
-
-  const std::uint8_t* encoded_dataset_ptr;
-  const std::uint32_t encoded_dataset_dim;
-  const std::uint32_t n_subspace;
-  const CODE_BOOK_T* vq_code_book_ptr;
-  const float vq_scale;
-  const CODE_BOOK_T* pq_code_book_ptr;
-  const float pq_scale;
-  using dataset_descriptor_base_t<half, DISTANCE_T, INDEX_T>::size;
-  using dataset_descriptor_base_t<half, DISTANCE_T, INDEX_T>::dim;
-
-  // Set on device
-  CODE_BOOK_T* smem_pq_code_book_ptr;
-  static const std::uint32_t smem_buffer_size_in_byte =
-    (1 << PQ_BITS) * PQ_LEN * utils::size_of<CODE_BOOK_T>();
-
-  __device__ void set_smem_ptr(void* const smem_ptr)
-  {
-    smem_pq_code_book_ptr = reinterpret_cast<CODE_BOOK_T*>(smem_ptr);
-
-    // Copy PQ table
-    for (unsigned i = threadIdx.x * 2; i < (1 << PQ_BITS) * PQ_LEN; i += blockDim.x * 2) {
-      half2 buf2;
-      buf2.x = pq_code_book_ptr[i];
-      buf2.y = pq_code_book_ptr[i + 1];
-
-      // Change the order of PQ code book array to reduce the
-      // frequency of bank conflicts.
-      constexpr auto num_elements_per_bank  = 4 / utils::size_of<CODE_BOOK_T>();
-      constexpr auto num_banks_per_subspace = PQ_LEN / num_elements_per_bank;
-      const auto j                          = i / num_elements_per_bank;
-      const auto smem_index =
-        (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS);
-      reinterpret_cast<half2*>(smem_pq_code_book_ptr)[smem_index] = buf2;
-    }
-  }
-
-  cagra_q_dataset_descriptor_t(const std::uint8_t* encoded_dataset_ptr,
-                               const std::uint32_t encoded_dataset_dim,
-                               const std::uint32_t n_subspace,
-                               const CODE_BOOK_T* const vq_code_book_ptr,
-                               const float vq_scale,
-                               const CODE_BOOK_T* const pq_code_book_ptr,
-                               const float pq_scale,
-                               const std::size_t size,
-                               const std::uint32_t dim)
-    : dataset_descriptor_base_t<half, DISTANCE_T, INDEX_T>(size, dim),
-      encoded_dataset_ptr(encoded_dataset_ptr),
-      encoded_dataset_dim(encoded_dataset_dim),
-      n_subspace(n_subspace),
-      vq_code_book_ptr(vq_code_book_ptr),
-      vq_scale(vq_scale),
-      pq_code_book_ptr(pq_code_book_ptr),
-      pq_scale(pq_scale)
-  {
-  }
-
-  template <uint32_t DATASET_BLOCK_DIM>
-  __device__ void copy_query(const DATA_T* const dmem_query_ptr,
-                             QUERY_T* const smem_query_ptr,
-                             const std::uint32_t query_smem_buffer_length)
-  {
-    constexpr cuvs::spatial::knn::detail::utils::mapping<half> mapping{};
-    for (unsigned i = threadIdx.x * 2; i < dim; i += blockDim.x * 2) {
-      half2 buf2{0, 0};
-      if (i < dim) { buf2.x = mapping(dmem_query_ptr[i]); }
-      if (i + 1 < dim) { buf2.y = mapping(dmem_query_ptr[i + 1]); }
-      if ((PQ_BITS == 8) && (PQ_LEN % 2 == 0)) {
-        // Use swizzling in the condition to reduce bank conflicts in shared
-        // memory, which are likely to occur when pq_code_book_dim is large.
-        ((half2*)smem_query_ptr)[device::swizzling<std::uint32_t, DATASET_BLOCK_DIM / 2>(i / 2)] =
-          buf2;
-      } else {
-        (reinterpret_cast<half2*>(smem_query_ptr + i))[0] = buf2;
-      }
-    }
-  }
-
-  template <uint32_t DATASET_BLOCK_DIM, uint32_t TEAM_SIZE, cuvs::distance::DistanceType METRIC>
-  __device__ DISTANCE_T compute_similarity(const QUERY_T* const query_ptr,
-                                           const INDEX_T node_id,
-                                           const bool valid) const
-  {
-    float norm = 0;
-    if (valid) {
-      const unsigned lane_id = threadIdx.x % TEAM_SIZE;
-      const uint32_t vq_code = *(reinterpret_cast<const std::uint32_t*>(
-        encoded_dataset_ptr + (static_cast<std::uint64_t>(encoded_dataset_dim) * node_id)));
-      if (PQ_BITS == 8) {
-        for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += DATASET_BLOCK_DIM) {
-          constexpr unsigned vlen = 4;  // **** DO NOT CHANGE ****
-          constexpr unsigned nelem =
-            raft::div_rounding_up_unsafe<unsigned>(DATASET_BLOCK_DIM / PQ_LEN, TEAM_SIZE * vlen);
-          // Loading PQ codes
-          uint32_t pq_codes[nelem];
-#pragma unroll
-          for (std::uint32_t e = 0; e < nelem; e++) {
-            const std::uint32_t k = (lane_id + (TEAM_SIZE * e)) * vlen + elem_offset / PQ_LEN;
-            if (k >= n_subspace) break;
-            // Loading 4 x 8-bit PQ-codes using 32-bit load ops (from device memory)
-            pq_codes[e] = *(reinterpret_cast<const std::uint32_t*>(
-              encoded_dataset_ptr + (static_cast<std::uint64_t>(encoded_dataset_dim) * node_id) +
-              4 + k));
-          }
-          //
-          if constexpr (PQ_LEN % 2 == 0) {
-            // **** Use half2 for distance computation ****
-            half2 norm2{0, 0};
-#pragma unroll
-            for (std::uint32_t e = 0; e < nelem; e++) {
-              const std::uint32_t k = (lane_id + (TEAM_SIZE * e)) * vlen + elem_offset / PQ_LEN;
-              if (k >= n_subspace) break;
-              // Loading VQ code-book
-              raft::TxN_t<half2, vlen / 2> vq_vals[PQ_LEN];
-#pragma unroll
-              for (std::uint32_t m = 0; m < PQ_LEN; m += 1) {
-                const uint32_t d = (vlen * m) + (PQ_LEN * k);
-                if (d >= dim) break;
-                vq_vals[m].load(
-                  reinterpret_cast<const half2*>(vq_code_book_ptr + d + (dim * vq_code)), 0);
-              }
-              // Compute distance
-              std::uint32_t pq_code = pq_codes[e];
-#pragma unroll
-              for (std::uint32_t v = 0; v < vlen; v++) {
-                if (PQ_LEN * (v + k) >= dim) break;
-#pragma unroll
-                for (std::uint32_t m = 0; m < PQ_LEN; m += 2) {
-                  const std::uint32_t d1 = m + (PQ_LEN * v);
-                  const std::uint32_t d  = d1 + (PQ_LEN * k);
-                  // Loading query vector in smem
-                  half2 diff2 = (reinterpret_cast<const half2*>(
-                    query_ptr))[device::swizzling<std::uint32_t, DATASET_BLOCK_DIM / 2>(d / 2)];
-                  // Loading PQ code book in smem
-                  diff2 -= *(reinterpret_cast<half2*>(
-                    smem_pq_code_book_ptr + (1 << PQ_BITS) * 2 * (m / 2) + (2 * (pq_code & 0xff))));
-                  diff2 -= vq_vals[d1 / vlen].val.data[(d1 % vlen) / 2];
-                  norm2 += diff2 * diff2;
-                }
-                pq_code >>= 8;
-              }
-            }
-            norm += static_cast<float>(norm2.x + norm2.y);
-          } else {
-            // **** Use float for distance computation ****
-#pragma unroll
-            for (std::uint32_t e = 0; e < nelem; e++) {
-              const std::uint32_t k = (lane_id + (TEAM_SIZE * e)) * vlen + elem_offset / PQ_LEN;
-              if (k >= n_subspace) break;
-              // Loading VQ code-book
-              raft::TxN_t<CODE_BOOK_T, vlen> vq_vals[PQ_LEN];
-#pragma unroll
-              for (std::uint32_t m = 0; m < PQ_LEN; m++) {
-                const std::uint32_t d = (vlen * m) + (PQ_LEN * k);
-                if (d >= dim) break;
-                // Loading 4 x 8/16-bit VQ-values using 32/64-bit load ops (from L2$ or device
-                // memory)
-                vq_vals[m].load(
-                  reinterpret_cast<const half2*>(vq_code_book_ptr + d + (dim * vq_code)), 0);
-              }
-              // Compute distance
-              std::uint32_t pq_code = pq_codes[e];
-#pragma unroll
-              for (std::uint32_t v = 0; v < vlen; v++) {
-                if (PQ_LEN * (v + k) >= dim) break;
-                raft::TxN_t<CODE_BOOK_T, PQ_LEN> pq_vals;
-                pq_vals.load(
-                  reinterpret_cast<const half2*>(smem_pq_code_book_ptr + PQ_LEN * (pq_code & 0xff)),
-                  0);  // (from L1$ or smem)
-#pragma unroll
-                for (std::uint32_t m = 0; m < PQ_LEN; m++) {
-                  const std::uint32_t d1 = m + (PQ_LEN * v);
-                  const std::uint32_t d  = d1 + (PQ_LEN * k);
-                  // if (d >= dataset_dim) break;
-                  DISTANCE_T diff = query_ptr[d];  // (from smem)
-                  diff -= pq_scale * static_cast<float>(pq_vals.data[m]);
-                  diff -= vq_scale * static_cast<float>(vq_vals[d1 / vlen].val.data[d1 % vlen]);
-                  norm += diff * diff;
-                }
-                pq_code >>= 8;
-              }
-            }
-          }
-        }
-      }
-    }
-    for (uint32_t offset = TEAM_SIZE / 2; offset > 0; offset >>= 1) {
-      norm += __shfl_xor_sync(0xffffffff, norm, offset);
-    }
-    return norm;
-  }
-};
-
-}  // namespace cuvs::neighbors::cagra::detail
\ No newline at end of file
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
new file mode 100644
index 000000000..378d2943e
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "compute_distance.hpp"
+
+#include <cuvs/distance/distance.hpp>
+
+#include <cmath>
+#include <type_traits>
+
+namespace cuvs::neighbors::cagra::detail {
+
+/**
+ * @brief Compile-time description of one VPQ dataset-descriptor instance.
+ *
+ * `priority()` scores how well the instance fits a search request and `init()` builds the
+ * host-side descriptor for a VPQ dataset.
+ *
+ * @tparam Metric          distance metric tag of the instance (not read by the code in this header)
+ * @tparam TeamSize        team size; checked against an explicit `search_params::team_size`
+ * @tparam DatasetBlockDim dimensionality the instance is ranked by in `priority()`
+ * @tparam PqBits          must equal `dataset.pq_bits()` for the instance to be eligible
+ * @tparam PqLen           must equal `dataset.pq_len()` for the instance to be eligible
+ * @tparam CodebookT       element type of the VQ and PQ code books
+ * @tparam DataT           data type forwarded to `instance_spec`
+ * @tparam IndexT          index type forwarded to `instance_spec`
+ * @tparam DistanceT       distance type forwarded to `instance_spec`
+ */
+template <cuvs::distance::DistanceType Metric,
+          uint32_t TeamSize,
+          uint32_t DatasetBlockDim,
+          uint32_t PqBits,
+          uint32_t PqLen,
+          typename CodebookT,
+          typename DataT,
+          typename IndexT,
+          typename DistanceT>
+struct vpq_descriptor_spec : public instance_spec<DataT, IndexT, DistanceT> {
+  using base_type = instance_spec<DataT, IndexT, DistanceT>;
+  using typename base_type::data_type;
+  using typename base_type::distance_type;
+  using typename base_type::host_type;
+  using typename base_type::index_type;
+
+  /**
+   * @brief Tell whether this instance can serve datasets of type `DatasetT`.
+   *
+   * A VPQ dataset is accepted only when its `math_type` is exactly `CodebookT`; any other
+   * dataset type is rejected.
+   */
+  template <typename DatasetT>
+  constexpr static inline auto accepts_dataset() -> bool
+  {
+    if constexpr (is_vpq_dataset_v<DatasetT>) {
+      return std::is_same_v<typename DatasetT::math_type, CodebookT>;
+    } else {
+      return false;
+    }
+  }
+
+  /**
+   * @brief Build the host-side descriptor for `dataset`.
+   *
+   * Unpacks the encoded rows, both code books and the dataset extents and forwards them to
+   * `init_()`.
+   *
+   * @param params  search parameters, forwarded unchanged
+   * @param dataset VPQ dataset to describe
+   * @param metric  not consulted here; `priority()` is where the metric is checked
+   * @param stream  CUDA stream, forwarded unchanged
+   */
+  template <typename DatasetT>
+  static auto init(const cagra::search_params& params,
+                   const DatasetT& dataset,
+                   [[maybe_unused]] cuvs::distance::DistanceType metric,
+                   rmm::cuda_stream_view stream) -> host_type
+  {
+    return init_(params,
+                 dataset.data.data_handle(),
+                 dataset.encoded_row_length(),
+                 dataset.vq_code_book.data_handle(),
+                 dataset.pq_code_book.data_handle(),
+                 IndexT(dataset.n_rows()),
+                 dataset.dim(),
+                 stream);
+  }
+
+  /**
+   * @brief Score how well this instance matches a search request.
+   *
+   * @return -1.0 when the instance must not be used (an explicit team size that differs from
+   *         `TeamSize`, a metric other than L2Expanded, or mismatching PQ parameters);
+   *         otherwise a positive score that grows as `dataset.dim()` approaches
+   *         `DatasetBlockDim`.
+   */
+  template <typename DatasetT>
+  static auto priority(const cagra::search_params& params,
+                       const DatasetT& dataset,
+                       cuvs::distance::DistanceType metric) -> double
+  {
+    // If explicit team_size is specified and doesn't match the instance, discard it
+    if (params.team_size != 0 && TeamSize != params.team_size) { return -1.0; }
+    // Only L2Expanded is accepted; the `Metric` template argument is not consulted here.
+    if (cuvs::distance::DistanceType::L2Expanded != metric) { return -1.0; }
+    // Match codebook params
+    if (dataset.pq_bits() != PqBits) { return -1.0; }
+    if (dataset.pq_len() != PqLen) { return -1.0; }
+    // Otherwise, favor the closest dataset dimensionality; the 0.1 offset keeps the score
+    // finite when the dimensions match exactly.
+    const auto dim_gap =
+      std::abs(static_cast<double>(dataset.dim()) - static_cast<double>(DatasetBlockDim));
+    return 1.0 / (0.1 + dim_gap);
+  }
+
+ private:
+  /// Back end of `init()`; only declared in this header.
+  static dataset_descriptor_host<DataT, IndexT, DistanceT> init_(
+    const cagra::search_params& params,
+    const std::uint8_t* encoded_dataset_ptr,
+    uint32_t encoded_dataset_dim,
+    const CodebookT* vq_code_book_ptr,
+    const CodebookT* pq_code_book_ptr,
+    IndexT size,
+    uint32_t dim,
+    rmm::cuda_stream_view stream);
+};
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu
similarity index 50%
rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu
index 9ec7ce3dd..a56a5a9df 100644
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu
@@ -15,22 +15,27 @@
  */
 
 /*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
+ * NOTE: this file is generated by compute_distance_00_generate.py
  *
  * Make changes there and run in this directory:
  *
- * > python q_search_multi_cta_00_generate.py
+ * > python compute_distance_00_generate.py
  *
  */
 
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
+#include "compute_distance_vpq-impl.cuh"
 
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
+namespace cuvs::neighbors::cagra::detail {
 
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // Metric
+                                    8,                         // TeamSize
+                                    128,                       // DatasetBlockDim
+                                    8,                         // PqBits
+                                    2,                         // PqLen
+                                    half,                      // CodebookT
+                                    float,                     // DataT
+                                    uint32_t,                  // IndexT
+                                    float>;                    // DistanceT
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu
similarity index 50%
rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu
index 292a1429a..f58a8c7df 100644
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu
@@ -15,22 +15,27 @@
  */
 
 /*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
+ * NOTE: this file is generated by compute_distance_00_generate.py
  *
  * Make changes there and run in this directory:
  *
- * > python q_search_multi_cta_00_generate.py
+ * > python compute_distance_00_generate.py
  *
  */
 
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
+#include "compute_distance_vpq-impl.cuh"
 
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
+namespace cuvs::neighbors::cagra::detail {
 
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // Metric
+                                    8,                         // TeamSize
+                                    128,                       // DatasetBlockDim
+                                    8,                         // PqBits
+                                    4,                         // PqLen
+                                    half,                      // CodebookT
+                                    float,                     // DataT
+                                    uint32_t,                  // IndexT
+                                    float>;                    // DistanceT
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu
new file mode 100644
index 000000000..bdc072e61
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // Metric
+                                    16,                        // TeamSize
+                                    256,                       // DatasetBlockDim
+                                    8,                         // PqBits
+                                    2,                         // PqLen
+                                    half,                      // CodebookT
+                                    float,                     // DataT
+                                    uint32_t,                  // IndexT
+                                    float>;                    // DistanceT
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu
new file mode 100644
index 000000000..301c8c55b
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // Metric
+                                    16,                        // TeamSize
+                                    256,                       // DatasetBlockDim
+                                    8,                         // PqBits
+                                    4,                         // PqLen
+                                    half,                      // CodebookT
+                                    float,                     // DataT
+                                    uint32_t,                  // IndexT
+                                    float>;                    // DistanceT
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu
new file mode 100644
index 000000000..05ebeae2b
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // Metric
+                                    32,                        // TeamSize
+                                    512,                       // DatasetBlockDim
+                                    8,                         // PqBits
+                                    2,                         // PqLen
+                                    half,                      // CodebookT
+                                    float,                     // DataT
+                                    uint32_t,                  // IndexT
+                                    float>;                    // DistanceT
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu
new file mode 100644
index 000000000..e343d938c
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // Metric
+                                    32,                        // TeamSize
+                                    512,                       // DatasetBlockDim
+                                    8,                         // PqBits
+                                    4,                         // PqLen
+                                    half,                      // CodebookT
+                                    float,                     // DataT
+                                    uint32_t,                  // IndexT
+                                    float>;                    // DistanceT
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu
similarity index 50%
rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_2subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu
index 1a5ad50e3..5d950351f 100644
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_2subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu
@@ -15,22 +15,27 @@
  */
 
 /*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
+ * NOTE: this file is generated by compute_distance_00_generate.py
  *
  * Make changes there and run in this directory:
  *
- * > python q_search_multi_cta_00_generate.py
+ * > python compute_distance_00_generate.py
  *
  */
 
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
+#include "compute_distance_vpq-impl.cuh"
 
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
+namespace cuvs::neighbors::cagra::detail {
 
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // Metric
+                                    8,                         // TeamSize
+                                    128,                       // DatasetBlockDim
+                                    8,                         // PqBits
+                                    2,                         // PqLen
+                                    half,                      // CodebookT
+                                    half,                      // DataT
+                                    uint32_t,                  // IndexT
+                                    float>;                    // DistanceT
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu
similarity index 50%
rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_4subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu
index 0ab23d7eb..453e15df3 100644
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_4subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu
@@ -15,22 +15,27 @@
  */
 
 /*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
+ * NOTE: this file is generated by compute_distance_00_generate.py
  *
  * Make changes there and run in this directory:
  *
- * > python q_search_multi_cta_00_generate.py
+ * > python compute_distance_00_generate.py
  *
  */
 
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
+#include "compute_distance_vpq-impl.cuh"
 
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
+namespace cuvs::neighbors::cagra::detail {
 
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // Metric
+                                    8,                         // TeamSize
+                                    128,                       // DatasetBlockDim
+                                    8,                         // PqBits
+                                    4,                         // PqLen
+                                    half,                      // CodebookT
+                                    half,                      // DataT
+                                    uint32_t,                  // IndexT
+                                    float>;                    // DistanceT
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu
new file mode 100644
index 000000000..c79cb74b6
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // Metric
+                                    16,                        // TeamSize
+                                    256,                       // DatasetBlockDim
+                                    8,                         // PqBits
+                                    2,                         // PqLen
+                                    half,                      // CodebookT
+                                    half,                      // DataT
+                                    uint32_t,                  // IndexT
+                                    float>;                    // DistanceT
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu
new file mode 100644
index 000000000..dee326d54
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // Metric
+                                    16,                        // TeamSize
+                                    256,                       // DatasetBlockDim
+                                    8,                         // PqBits
+                                    4,                         // PqLen
+                                    half,                      // CodebookT
+                                    half,                      // DataT
+                                    uint32_t,                  // IndexT
+                                    float>;                    // DistanceT
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu
new file mode 100644
index 000000000..a1ef9ba92
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // Metric
+                                    32,                        // TeamSize
+                                    512,                       // DatasetBlockDim
+                                    8,                         // PqBits
+                                    2,                         // PqLen
+                                    half,                      // CodebookT
+                                    half,                      // DataT
+                                    uint32_t,                  // IndexT
+                                    float>;                    // DistanceT
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu
new file mode 100644
index 000000000..f2f01c8d4
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // Metric
+                                    32,                        // TeamSize
+                                    512,                       // DatasetBlockDim
+                                    8,                         // PqBits
+                                    4,                         // PqLen
+                                    half,                      // CodebookT
+                                    half,                      // DataT
+                                    uint32_t,                  // IndexT
+                                    float>;                    // DistanceT
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu
new file mode 100644
index 000000000..1afccb8fd
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // Metric
+                                    8,                         // TeamSize
+                                    128,                       // DatasetBlockDim
+                                    8,                         // PqBits
+                                    2,                         // PqLen
+                                    half,                      // CodebookT
+                                    int8_t,                    // DataT
+                                    uint32_t,                  // IndexT
+                                    float>;                    // DistanceT
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu
new file mode 100644
index 000000000..28ea523ee
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // Metric
+                                    8,                         // TeamSize
+                                    128,                       // DatasetBlockDim
+                                    8,                         // PqBits
+                                    4,                         // PqLen
+                                    half,                      // CodebookT
+                                    int8_t,                    // DataT
+                                    uint32_t,                  // IndexT
+                                    float>;                    // DistanceT
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu
new file mode 100644
index 000000000..eca36cc36
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // Metric
+                                    16,                        // TeamSize
+                                    256,                       // DatasetBlockDim
+                                    8,                         // PqBits
+                                    2,                         // PqLen
+                                    half,                      // CodebookT
+                                    int8_t,                    // DataT
+                                    uint32_t,                  // IndexT
+                                    float>;                    // DistanceT
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu
new file mode 100644
index 000000000..89aed8afc
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    256,
+                                    8,
+                                    4,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu
new file mode 100644
index 000000000..ff646b22c
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    512,
+                                    8,
+                                    2,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu
new file mode 100644
index 000000000..633a805c7
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    512,
+                                    8,
+                                    4,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu
new file mode 100644
index 000000000..3a09161ea
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    128,
+                                    8,
+                                    2,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu
new file mode 100644
index 000000000..85331d243
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    128,
+                                    8,
+                                    4,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu
new file mode 100644
index 000000000..a7719074a
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    256,
+                                    8,
+                                    2,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu
new file mode 100644
index 000000000..7dd028b82
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    256,
+                                    8,
+                                    4,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu
new file mode 100644
index 000000000..78f37b135
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    512,
+                                    8,
+                                    2,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu
new file mode 100644
index 000000000..d3eb20a05
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    512,
+                                    8,
+                                    4,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp
index 192d81aa8..b7cb9c42d 100644
--- a/cpp/src/neighbors/detail/cagra/device_common.hpp
+++ b/cpp/src/neighbors/detail/cagra/device_common.hpp
@@ -15,10 +15,15 @@
  */
 #pragma once
 
+#include "hashmap.hpp"
 #include "utils.hpp"
 
+#include <cuvs/distance/distance.hpp>
+
 // TODO: This shouldn't be invoking anything in detail APIs outside of cuvs/neighbors
 #include <raft/core/detail/macros.hpp>
+#include <raft/util/cudart_utils.hpp>
+#include <raft/util/warp_primitives.cuh>
 
 #include <cuda_fp16.h>
 
@@ -31,6 +36,16 @@ namespace device {
 // warpSize for compile time calculation
 constexpr unsigned warp_size = 32;
 
+// using LOAD_256BIT_T = ulonglong4;
+using LOAD_128BIT_T = uint4;
+using LOAD_64BIT_T  = uint64_t;
+// Number of DATA_T elements spanned by one LOAD_T (integer quotient of their utils::size_of).
+template <class LOAD_T, class DATA_T>
+RAFT_DEVICE_INLINE_FUNCTION constexpr auto get_vlen() -> unsigned
+{
+  return utils::size_of<LOAD_T>() / utils::size_of<DATA_T>();
+}
+
 /** Xorshift rondem number generator.
  *
  * See https://en.wikipedia.org/wiki/Xorshift#xorshift for reference.
@@ -43,18 +58,299 @@ _RAFT_HOST_DEVICE inline uint64_t xorshift64(uint64_t u)
   return u * 0x2545F4914F6CDD1DULL;
 }
 
-template <class T, unsigned X_MAX = 1024>
-_RAFT_DEVICE inline T swizzling(T x)
+template <uint32_t Dim = 1024, uint32_t Stride = 128, typename T>
+RAFT_DEVICE_INLINE_FUNCTION constexpr auto swizzling(T x) -> T
 {
   // Address swizzling reduces bank conflicts in shared memory, but increases
   // the amount of operation instead.
   // return x;
-  if constexpr (X_MAX <= 1024) {
-    return (x) ^ ((x) >> 5);
+  if constexpr (Stride <= 32) {  // small stride: index is used as-is (no swizzling)
+    return x;
+  } else if constexpr (Dim <= 1024) {  // XOR the index with itself shifted right by five bits
+    return x ^ (x >> 5);
   } else {
-    return (x) ^ (((x) >> 5) & 0x1f);
+    return x ^ ((x >> 5) & 0x1f);  // larger Dim: only bits 5..9 perturb the low five bits
+  }
+}
+// Sums x across a team using XOR shuffles with lane masks TeamSize/2, ..., 1.
+template <uint32_t TeamSize, typename T>
+RAFT_DEVICE_INLINE_FUNCTION auto team_sum(T x) -> T
+{
+#pragma unroll
+  for (uint32_t lane_mask = TeamSize >> 1; lane_mask != 0; lane_mask >>= 1) {
+    x += raft::shfl_xor(x, lane_mask, TeamSize);
+  }
+  return x;
+}
+// Runtime variant: `team_size_bitshift` (log2 of the team size) picks the entry into the ladder.
+template <typename T>
+RAFT_DEVICE_INLINE_FUNCTION auto team_sum(T x, uint32_t team_size_bitshift) -> T
+{
+  switch (team_size_bitshift) {
+    case 5: x += raft::shfl_xor(x, 16);  // intentional fall-through on every case below
+    case 4: x += raft::shfl_xor(x, 8);
+    case 3: x += raft::shfl_xor(x, 4);
+    case 2: x += raft::shfl_xor(x, 2);
+    case 1: x += raft::shfl_xor(x, 1);
+    default: return x;  // 0 or any value above 5 jumps here directly: x is returned unchanged
+  }
+}
+
+// Each team tries `num_distilation` candidates per result slot (seeds from `seed_ptr` while
+// `gid < num_seeds`, xorshift64-derived ids otherwise) and its lead lane stores the closest one.
+template <typename IndexT, typename DistanceT, typename DATASET_DESCRIPTOR_T>
+RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes(
+  IndexT* __restrict__ result_indices_ptr,       // [num_pickup]
+  DistanceT* __restrict__ result_distances_ptr,  // [num_pickup]
+  const DATASET_DESCRIPTOR_T& dataset_desc,
+  const uint32_t num_pickup,
+  const uint32_t num_distilation,
+  const uint64_t rand_xor_mask,
+  const IndexT* __restrict__ seed_ptr,  // [num_seeds]
+  const uint32_t num_seeds,
+  IndexT* __restrict__ visited_hash_ptr,
+  const uint32_t hash_bitlen,
+  const uint32_t block_id   = 0,
+  const uint32_t num_blocks = 1)
+{
+  const auto team_size_bits = dataset_desc.team_size_bitshift_from_smem();
+  const auto max_i = raft::round_up_safe<uint32_t>(num_pickup, warp_size >> team_size_bits);
+  constexpr IndexT invalid_index = raft::upper_bound<IndexT>();
+
+  for (uint32_t i = threadIdx.x >> team_size_bits; i < max_i; i += (blockDim.x >> team_size_bits)) {
+    const bool valid_i = (i < num_pickup);
+
+    IndexT best_index_team_local    = invalid_index;  // stays invalid if no candidate wins
+    DistanceT best_norm2_team_local = raft::upper_bound<DistanceT>();
+    for (uint32_t j = 0; j < num_distilation; j++) {
+      // Pick a candidate node (explicit seed or pseudo-random id) and compute the distance to it
+      IndexT seed_index = 0;  // kept defined for !valid_i lanes, which still pass it below
+      if (valid_i) {
+        uint32_t gid = block_id + (num_blocks * (i + (num_pickup * j)));
+        if (seed_ptr && (gid < num_seeds)) {
+          seed_index = seed_ptr[gid];
+        } else {
+          seed_index = device::xorshift64(gid ^ rand_xor_mask) % dataset_desc.size;
+        }
+      }
+
+      const auto norm2 = dataset_desc.compute_distance(seed_index, valid_i);
+
+      if (valid_i && (norm2 < best_norm2_team_local)) {
+        best_norm2_team_local = norm2;
+        best_index_team_local = seed_index;
+      }
+    }
+
+    const unsigned lane_id = threadIdx.x & ((1u << team_size_bits) - 1u);
+    if (valid_i && lane_id == 0) {
+      if ((best_index_team_local != invalid_index) &&
+          hashmap::insert(visited_hash_ptr, hash_bitlen, best_index_team_local)) {
+        result_distances_ptr[i] = best_norm2_team_local;
+        result_indices_ptr[i]   = best_index_team_local;
+      } else {
+        result_distances_ptr[i] = raft::upper_bound<DistanceT>();
+        result_indices_ptr[i]   = invalid_index;
+      }
+    }
   }
 }
 
+template <typename IndexT, typename DistanceT, typename DATASET_DESCRIPTOR_T>
+RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes(
+  IndexT* __restrict__ result_child_indices_ptr,
+  DistanceT* __restrict__ result_child_distances_ptr,
+  // [dataset_dim, dataset_size]
+  const DATASET_DESCRIPTOR_T& dataset_desc,
+  // [knn_k, dataset_size]
+  const IndexT* __restrict__ knn_graph,
+  const uint32_t knn_k,
+  // hashmap
+  IndexT* __restrict__ visited_hashmap_ptr,
+  const uint32_t hash_bitlen,
+  const IndexT* __restrict__ parent_indices,
+  const IndexT* __restrict__ internal_topk_list,
+  const uint32_t search_width)
+{
+  constexpr IndexT index_msb_1_mask = utils::gen_index_msb_1_mask<IndexT>::value;
+  constexpr IndexT invalid_index    = raft::upper_bound<IndexT>();
+
+  // Read child indices of parents from knn graph and check if the distance
+  // computaiton is necessary.
+  for (uint32_t i = threadIdx.x; i < knn_k * search_width; i += blockDim.x) {
+    const IndexT smem_parent_id = parent_indices[i / knn_k];
+    IndexT child_id             = invalid_index;
+    if (smem_parent_id != invalid_index) {
+      const auto parent_id = internal_topk_list[smem_parent_id] & ~index_msb_1_mask;
+      child_id             = knn_graph[(i % knn_k) + (static_cast<int64_t>(knn_k) * parent_id)];
+    }
+    if (child_id != invalid_index) {
+      if (hashmap::insert(visited_hashmap_ptr, hash_bitlen, child_id) == 0) {
+        child_id = invalid_index;
+      }
+    }
+    result_child_indices_ptr[i] = child_id;
+  }
+  __syncthreads();
+
+  // Compute the distance to child nodes
+  const auto team_size_bits   = dataset_desc.team_size_bitshift_from_smem();
+  const auto num_k            = knn_k * search_width;
+  const auto max_i            = raft::round_up_safe(num_k, warp_size >> team_size_bits);
+  const auto compute_distance = dataset_desc.compute_distance_impl;
+  const auto args             = dataset_desc.args.load();
+  const bool lead_lane        = (threadIdx.x & ((1u << team_size_bits) - 1u)) == 0;
+  for (uint32_t i = threadIdx.x >> team_size_bits; i < max_i; i += blockDim.x >> team_size_bits) {
+    const bool valid_i  = i < num_k;
+    const auto child_id = valid_i ? result_child_indices_ptr[i] : invalid_index;
+
+    // We should be calling `dataset_desc.compute_distance(..)` here as follows:
+    // > const auto child_dist = dataset_desc.compute_distance(child_id, child_id != invalid_index);
+    // Instead, we manually inline this function for performance reasons.
+    // This allows us to move the fetching of the arguments from shared memory out of the loop.
+    const DistanceT child_dist = device::team_sum(
+      (child_id != invalid_index) ? compute_distance(args, child_id)
+                                  : (lead_lane ? raft::upper_bound<DistanceT>() : 0),
+      team_size_bits);
+
+    // Store the distance
+    if (valid_i && lead_lane) { result_child_distances_ptr[i] = child_dist; }
+  }
+}
+// Shared-memory loads via inline PTX (`ld.shared`); `addr` is a 32-bit shared-space address.
+RAFT_DEVICE_INLINE_FUNCTION void lds(float& x, uint32_t addr)
+{
+  asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x) : "r"(addr));
+}
+RAFT_DEVICE_INLINE_FUNCTION void lds(half& x, uint32_t addr)
+{
+  asm volatile("ld.shared.u16 {%0}, [%1];" : "=h"(reinterpret_cast<uint16_t&>(x)) : "r"(addr));
+}
+RAFT_DEVICE_INLINE_FUNCTION void lds(half2& x, uint32_t addr)
+{
+  asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(reinterpret_cast<uint32_t&>(x)) : "r"(addr));
+}
+RAFT_DEVICE_INLINE_FUNCTION void lds(half (&x)[1], uint32_t addr)
+{
+  asm volatile("ld.shared.u16 {%0}, [%1];" : "=h"(*reinterpret_cast<uint16_t*>(x)) : "r"(addr));
+}
+RAFT_DEVICE_INLINE_FUNCTION void lds(half (&x)[2], uint32_t addr)
+{
+  asm volatile("ld.shared.v2.u16 {%0, %1}, [%2];"
+               : "=h"(*reinterpret_cast<uint16_t*>(x)), "=h"(*reinterpret_cast<uint16_t*>(x + 1))
+               : "r"(addr));
+}
+RAFT_DEVICE_INLINE_FUNCTION void lds(half (&x)[4], uint32_t addr)
+{
+  asm volatile("ld.shared.v4.u16 {%0, %1, %2, %3}, [%4];"
+               : "=h"(*reinterpret_cast<uint16_t*>(x)),
+                 "=h"(*reinterpret_cast<uint16_t*>(x + 1)),
+                 "=h"(*reinterpret_cast<uint16_t*>(x + 2)),
+                 "=h"(*reinterpret_cast<uint16_t*>(x + 3))
+               : "r"(addr));
+}
+// 32-bit and 128-bit `ld.shared` loads; the pointer overloads first convert a generic pointer.
+RAFT_DEVICE_INLINE_FUNCTION void lds(uint32_t& x, uint32_t addr)
+{
+  asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x) : "r"(addr));
+}
+
+RAFT_DEVICE_INLINE_FUNCTION void lds(uint32_t& x, const uint32_t* addr)
+{
+  lds(x, uint32_t(__cvta_generic_to_shared(addr)));
+}
+
+RAFT_DEVICE_INLINE_FUNCTION void lds(uint4& x, uint32_t addr)
+{
+  asm volatile("ld.shared.v4.u32 {%0, %1, %2, %3}, [%4];"
+               : "=r"(x.x), "=r"(x.y), "=r"(x.z), "=r"(x.w)
+               : "r"(addr));
+}
+
+RAFT_DEVICE_INLINE_FUNCTION void lds(uint4& x, const uint4* addr)
+{
+  lds(x, uint32_t(__cvta_generic_to_shared(addr)));
+}
+// Stores both halves of `x` with a single `st.shared.v2.u16` at shared-space address `addr`.
+RAFT_DEVICE_INLINE_FUNCTION void sts(uint32_t addr, const half2& x)
+{
+  asm volatile("st.shared.v2.u16 [%0], {%1, %2};"
+               :
+               : "r"(addr),
+                 "h"(reinterpret_cast<const uint16_t&>(x.x)),
+                 "h"(reinterpret_cast<const uint16_t&>(x.y)));
+}
+// Global loads with explicit PTX cache operators (.ca: all cache levels, .cg: L2 but not L1).
+RAFT_DEVICE_INLINE_FUNCTION void ldg_cg(uint4& x, const uint4* addr)
+{
+  asm volatile("ld.global.cg.v4.u32 {%0, %1, %2, %3}, [%4];"
+               : "=r"(x.x), "=r"(x.y), "=r"(x.z), "=r"(x.w)
+               : "l"(addr));
+}
+
+RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(uint4& x, const uint4* addr)
+{
+  asm volatile("ld.global.ca.v4.u32 {%0, %1, %2, %3}, [%4];"
+               : "=r"(x.x), "=r"(x.y), "=r"(x.z), "=r"(x.w)
+               : "l"(addr));
+}
+
+RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(uint32_t& x, const uint32_t* addr)
+{
+  asm volatile("ld.global.ca.u32 %0, [%1];" : "=r"(x) : "l"(addr));
+}
+
+RAFT_DEVICE_INLINE_FUNCTION void ldg_cg(uint32_t& x, const uint32_t* addr)
+{
+  asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(x) : "l"(addr));
+}
+// `ld.global.ca` loads of 1, 2 or 4 consecutive halves, transferred as raw 16-bit values.
+RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half& x, const half* addr)
+{
+  asm volatile("ld.global.ca.u16 {%0}, [%1];"
+               : "=h"(reinterpret_cast<uint16_t&>(x))
+               : "l"(reinterpret_cast<const uint16_t*>(addr)));
+}
+RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half (&x)[1], const half* addr)
+{
+  asm volatile("ld.global.ca.u16 {%0}, [%1];"
+               : "=h"(*reinterpret_cast<uint16_t*>(x))
+               : "l"(reinterpret_cast<const uint16_t*>(addr)));
+}
+RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half (&x)[2], const half* addr)
+{
+  asm volatile("ld.global.ca.v2.u16 {%0, %1}, [%2];"
+               : "=h"(*reinterpret_cast<uint16_t*>(x)), "=h"(*reinterpret_cast<uint16_t*>(x + 1))
+               : "l"(reinterpret_cast<const uint16_t*>(addr)));
+}
+RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half (&x)[4], const half* addr)
+{
+  asm volatile("ld.global.ca.v4.u16 {%0, %1, %2, %3}, [%4];"
+               : "=h"(*reinterpret_cast<uint16_t*>(x)),
+                 "=h"(*reinterpret_cast<uint16_t*>(x + 1)),
+                 "=h"(*reinterpret_cast<uint16_t*>(x + 2)),
+                 "=h"(*reinterpret_cast<uint16_t*>(x + 3))
+               : "l"(reinterpret_cast<const uint16_t*>(addr)));
+}
+// `ld.global.ca` loads of one or two half2 values, transferred as raw 32-bit words.
+RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half2& x, const half* addr)
+{
+  asm volatile("ld.global.ca.u32 %0, [%1];"
+               : "=r"(reinterpret_cast<uint32_t&>(x))
+               : "l"(reinterpret_cast<const uint32_t*>(addr)));
+}
+RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half2 (&x)[1], const half* addr)
+{
+  asm volatile("ld.global.ca.u32 %0, [%1];"
+               : "=r"(*reinterpret_cast<uint32_t*>(x))
+               : "l"(reinterpret_cast<const uint32_t*>(addr)));
+}
+RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half2 (&x)[2], const half* addr)
+{
+  asm volatile("ld.global.ca.v2.u32 {%0, %1}, [%2];"
+               : "=r"(*reinterpret_cast<uint32_t*>(x)), "=r"(*reinterpret_cast<uint32_t*>(x + 1))
+               : "l"(reinterpret_cast<const uint32_t*>(addr)));
+}
+
 }  // namespace device
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/factory.cuh b/cpp/src/neighbors/detail/cagra/factory.cuh
index 183d6051f..1c99f72f7 100644
--- a/cpp/src/neighbors/detail/cagra/factory.cuh
+++ b/cpp/src/neighbors/detail/cagra/factory.cuh
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include "compute_distance-ext.cuh"
 #include "search_multi_cta.cuh"
 #include "search_multi_kernel.cuh"
 #include "search_plan.cuh"
@@ -25,71 +26,153 @@
 
 namespace cuvs::neighbors::cagra::detail {
 
-template <typename DATASET_DESCRIPTOR_T,
+template <typename DataT,
+          typename IndexT,
+          typename DistanceT,
           typename CagraSampleFilterT = cuvs::neighbors::filtering::none_cagra_sample_filter>
 class factory {
-  using T         = typename DATASET_DESCRIPTOR_T::DATA_T;
-  using IdxT      = typename DATASET_DESCRIPTOR_T::INDEX_T;
-  using DistanceT = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
-
  public:
   /**
    * Create a search structure for dataset with dim features.
    */
-  static std::unique_ptr<search_plan_impl<DATASET_DESCRIPTOR_T, CagraSampleFilterT>> create(
+  static std::unique_ptr<search_plan_impl<DataT, IndexT, DistanceT, CagraSampleFilterT>> create(
     raft::resources const& res,
     search_params const& params,
+    const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc,
     int64_t dim,
     int64_t graph_degree,
-    uint32_t topk,
-    const cuvs::distance::DistanceType metric)
+    uint32_t topk)
   {
-    search_plan_impl_base plan(params, dim, graph_degree, topk, metric);
-    switch (plan.dataset_block_dim) {
-      case 128:
-        switch (plan.team_size) {
-          case 8: return dispatch_kernel<128, 8>(res, plan); break;
-          default: THROW("Incorrect team size %lu", plan.team_size);
-        }
-        break;
-      case 256:
-        switch (plan.team_size) {
-          case 16: return dispatch_kernel<256, 16>(res, plan); break;
-          default: THROW("Incorrect team size %lu", plan.team_size);
-        }
-        break;
-      case 512:
-        switch (plan.team_size) {
-          case 32: return dispatch_kernel<512, 32>(res, plan); break;
-          default: THROW("Incorrect team size %lu", plan.team_size);
-        }
-        break;
-      default: THROW("Incorrect dataset_block_dim (%lu)\n", plan.dataset_block_dim);
-    }
-    return std::unique_ptr<search_plan_impl<DATASET_DESCRIPTOR_T, CagraSampleFilterT>>();
+    search_plan_impl_base plan(params, dim, graph_degree, topk);
+    return dispatch_kernel(res, plan, dataset_desc);
   }
 
  private:
-  template <unsigned DATASET_BLOCK_DIM, unsigned TEAM_SIZE>
-  static std::unique_ptr<search_plan_impl<DATASET_DESCRIPTOR_T, CagraSampleFilterT>>
-  dispatch_kernel(raft::resources const& res, search_plan_impl_base& plan)
+  static std::unique_ptr<search_plan_impl<DataT, IndexT, DistanceT, CagraSampleFilterT>>
+  dispatch_kernel(raft::resources const& res,
+                  search_plan_impl_base& plan,
+                  const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc)
   {
     if (plan.algo == search_algo::SINGLE_CTA) {
-      return std::unique_ptr<search_plan_impl<DATASET_DESCRIPTOR_T, CagraSampleFilterT>>(
-        new single_cta_search::
-          search<TEAM_SIZE, DATASET_BLOCK_DIM, DATASET_DESCRIPTOR_T, CagraSampleFilterT>(
-            res, plan, plan.dim, plan.graph_degree, plan.topk, plan.metric));
+      return std::make_unique<
+        single_cta_search::search<DataT, IndexT, DistanceT, CagraSampleFilterT>>(
+        res, plan, dataset_desc, plan.dim, plan.graph_degree, plan.topk);
     } else if (plan.algo == search_algo::MULTI_CTA) {
-      return std::unique_ptr<search_plan_impl<DATASET_DESCRIPTOR_T, CagraSampleFilterT>>(
-        new multi_cta_search::
-          search<TEAM_SIZE, DATASET_BLOCK_DIM, DATASET_DESCRIPTOR_T, CagraSampleFilterT>(
-            res, plan, plan.dim, plan.graph_degree, plan.topk, plan.metric));
+      return std::make_unique<
+        multi_cta_search::search<DataT, IndexT, DistanceT, CagraSampleFilterT>>(
+        res, plan, dataset_desc, plan.dim, plan.graph_degree, plan.topk);
     } else {
-      return std::unique_ptr<search_plan_impl<DATASET_DESCRIPTOR_T, CagraSampleFilterT>>(
-        new multi_kernel_search::
-          search<TEAM_SIZE, DATASET_BLOCK_DIM, DATASET_DESCRIPTOR_T, CagraSampleFilterT>(
-            res, plan, plan.dim, plan.graph_degree, plan.topk, plan.metric));
+      return std::make_unique<
+        multi_kernel_search::search<DataT, IndexT, DistanceT, CagraSampleFilterT>>(
+        res, plan, dataset_desc, plan.dim, plan.graph_degree, plan.topk);
     }
   }
 };
+
+/*
+Caching of dataset/distance descriptor initialization
+  (see `dataset_descriptor_init_with_cache` below).
+ */
+namespace descriptor_cache {
+
+/**
+ * The key for caching consists of a minimal set of fields that uniquely define the descriptor.
+ * The key field names are the same as those of the descriptor and the contents are not relevant
+ * for caching.
+ */
+struct key {
+  uint64_t data_ptr;
+  uint64_t n_rows;
+  uint32_t dim;
+  uint32_t extra_val;  // stride (strided dataset) or PQ code book address >> 6 (VPQ dataset)
+  uint32_t team_size;
+  uint32_t metric;
+};
+
+template <typename DatasetT>
+auto make_key(const cagra::search_params& params,
+              const DatasetT& dataset,
+              cuvs::distance::DistanceType metric)
+  -> std::enable_if_t<is_strided_dataset_v<DatasetT>, key>
+{
+  return key{reinterpret_cast<uint64_t>(dataset.view().data_handle()),
+             uint64_t(dataset.n_rows()),
+             dataset.dim(),
+             dataset.stride(),
+             uint32_t(params.team_size),
+             uint32_t(metric)};
+}
+
+template <typename DatasetT>
+auto make_key(const cagra::search_params& params,
+              const DatasetT& dataset,
+              cuvs::distance::DistanceType metric)
+  -> std::enable_if_t<is_vpq_dataset_v<DatasetT>, key>
+{
+  return key{reinterpret_cast<uint64_t>(dataset.data.data_handle()),
+             uint64_t(dataset.n_rows()),
+             dataset.dim(),
+             uint32_t(reinterpret_cast<uint64_t>(dataset.pq_code_book.data_handle()) >> 6),
+             uint32_t(params.team_size),
+             uint32_t(metric)};
+}
+
+inline auto operator==(const key& a, const key& b) -> bool
+{
+  return a.data_ptr == b.data_ptr && a.n_rows == b.n_rows && a.dim == b.dim &&
+         a.extra_val == b.extra_val && a.team_size == b.team_size && a.metric == b.metric;
+}
+
+struct key_hash {
+  inline auto operator()(const key& x) const noexcept -> std::size_t
+  {
+    return size_t{x.data_ptr} + size_t{x.n_rows} * size_t{x.dim} * size_t{x.extra_val} +
+           (size_t{x.team_size} ^ size_t{x.metric});
+  }
+};
+
+template <typename DataT, typename IndexT, typename DistanceT>
+struct store {
+  /** Number of descriptors to cache. */
+  static constexpr size_t kDefaultSize = 100;
+  raft::cache::lru<key,
+                   key_hash,
+                   std::equal_to<>,
+                   std::shared_ptr<dataset_descriptor_host<DataT, IndexT, DistanceT>>>
+    value{kDefaultSize};
+};
+
+}  // namespace descriptor_cache
+
+/**
+ * Call `dataset_descriptor_init` with memoization.
+ * (NB: `dataset_descriptor_init` is a function in a generated header file
+ * `neighbors/detail/cagra/compute_distance-ext.cuh`).
+ *
+ * `dataset_descriptor_init` involves calling a CUDA kernel to resolve device symbols before the
+ * main search kernel runs. This adds extra, unwanted latency.
+ * Caching the descriptor helps to hide this latency for repeated searches.
+ * NOTE(review): the result is owned by the cache entry; confirm it is not used after eviction.
+ */
+template <typename DataT, typename IndexT, typename DistanceT, typename DatasetT>
+auto dataset_descriptor_init_with_cache(const raft::resources& res,
+                                        const cagra::search_params& params,
+                                        const DatasetT& dataset,
+                                        cuvs::distance::DistanceType metric)
+  -> const dataset_descriptor_host<DataT, IndexT, DistanceT>&
+{
+  using desc_t = dataset_descriptor_host<DataT, IndexT, DistanceT>;
+  auto key     = descriptor_cache::make_key(params, dataset, metric);
+  auto& cache =
+    raft::resource::get_custom_resource<descriptor_cache::store<DataT, IndexT, DistanceT>>(res)
+      ->value;
+  std::shared_ptr<desc_t> desc{nullptr};
+  if (!cache.get(key, &desc)) {
+    desc = std::make_shared<desc_t>(std::move(dataset_descriptor_init<DataT, IndexT, DistanceT>(
+      params, dataset, metric, raft::resource::get_cuda_stream(res))));
+    cache.set(key, desc);
+  }
+  return *desc;
+}
+
 };  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index 515be75df..9edbbf5c1 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -73,12 +73,12 @@ __device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2, bool a
 }
 
 template <class DATA_T, class IdxT, int numElementsPerThread>
-RAFT_KERNEL kern_sort(const DATA_T* const dataset,  // [dataset_chunk_size, dataset_dim]
-                      const IdxT dataset_size,
-                      const uint32_t dataset_dim,
-                      IdxT* const knn_graph,  // [graph_chunk_size, graph_degree]
-                      const uint32_t graph_size,
-                      const uint32_t graph_degree)
+__global__ void kern_sort(const DATA_T* const dataset,  // [dataset_chunk_size, dataset_dim]
+                          const IdxT dataset_size,
+                          const uint32_t dataset_dim,
+                          IdxT* const knn_graph,  // [graph_chunk_size, graph_degree]
+                          const uint32_t graph_size,
+                          const uint32_t graph_degree)
 {
   const IdxT srcNode = (blockDim.x * blockIdx.x + threadIdx.x) / raft::WarpSize;
   if (srcNode >= graph_size) { return; }
@@ -129,15 +129,15 @@ RAFT_KERNEL kern_sort(const DATA_T* const dataset,  // [dataset_chunk_size, data
 }
 
 template <int MAX_DEGREE, class IdxT>
-RAFT_KERNEL kern_prune(const IdxT* const knn_graph,  // [graph_chunk_size, graph_degree]
-                       const uint32_t graph_size,
-                       const uint32_t graph_degree,
-                       const uint32_t degree,
-                       const uint32_t batch_size,
-                       const uint32_t batch_id,
-                       uint8_t* const detour_count,          // [graph_chunk_size, graph_degree]
-                       uint32_t* const num_no_detour_edges,  // [graph_size]
-                       uint64_t* const stats)
+__global__ void kern_prune(const IdxT* const knn_graph,  // [graph_chunk_size, graph_degree]
+                           const uint32_t graph_size,
+                           const uint32_t graph_degree,
+                           const uint32_t degree,
+                           const uint32_t batch_size,
+                           const uint32_t batch_id,
+                           uint8_t* const detour_count,          // [graph_chunk_size, graph_degree]
+                           uint32_t* const num_no_detour_edges,  // [graph_size]
+                           uint64_t* const stats)
 {
   __shared__ uint32_t smem_num_detour[MAX_DEGREE];
   uint64_t* const num_retain = stats;
@@ -192,11 +192,11 @@ RAFT_KERNEL kern_prune(const IdxT* const knn_graph,  // [graph_chunk_size, graph
 }
 
 template <class IdxT>
-RAFT_KERNEL kern_make_rev_graph(const IdxT* const dest_nodes,     // [graph_size]
-                                IdxT* const rev_graph,            // [size, degree]
-                                uint32_t* const rev_graph_count,  // [graph_size]
-                                const uint32_t graph_size,
-                                const uint32_t degree)
+__global__ void kern_make_rev_graph(const IdxT* const dest_nodes,     // [graph_size]
+                                    IdxT* const rev_graph,            // [size, degree]
+                                    uint32_t* const rev_graph_count,  // [graph_size]
+                                    const uint32_t graph_size,
+                                    const uint32_t degree)
 {
   const uint32_t tid  = threadIdx.x + (blockDim.x * blockIdx.x);
   const uint32_t tnum = blockDim.x * gridDim.x;
@@ -221,16 +221,16 @@ __device__ __host__ LabelT get_root_label(IdxT i, const LabelT* label)
 }
 
 template <class IdxT>
-RAFT_KERNEL kern_mst_opt_update_graph(IdxT* mst_graph,                 // [graph_size, graph_degree]
-                                      const IdxT* candidate_edges,     // [graph_size]
-                                      IdxT* outgoing_num_edges,        // [graph_size]
-                                      IdxT* incoming_num_edges,        // [graph_size]
-                                      const IdxT* outgoing_max_edges,  // [graph_size]
-                                      const IdxT* incoming_max_edges,  // [graph_size]
-                                      const IdxT* label,               // [graph_size]
-                                      const uint32_t graph_size,
-                                      const uint32_t graph_degree,
-                                      uint64_t* stats)
+__global__ void kern_mst_opt_update_graph(IdxT* mst_graph,  // [graph_size, graph_degree]
+                                          const IdxT* candidate_edges,     // [graph_size]
+                                          IdxT* outgoing_num_edges,        // [graph_size]
+                                          IdxT* incoming_num_edges,        // [graph_size]
+                                          const IdxT* outgoing_max_edges,  // [graph_size]
+                                          const IdxT* incoming_max_edges,  // [graph_size]
+                                          const IdxT* label,               // [graph_size]
+                                          const uint32_t graph_size,
+                                          const uint32_t graph_degree,
+                                          uint64_t* stats)
 {
   const uint64_t i = threadIdx.x + (blockDim.x * blockIdx.x);
   if (i >= graph_size) return;
@@ -310,11 +310,11 @@ RAFT_KERNEL kern_mst_opt_update_graph(IdxT* mst_graph,                 // [graph
 }
 
 template <class IdxT>
-RAFT_KERNEL kern_mst_opt_labeling(IdxT* label,            // [graph_size]
-                                  const IdxT* mst_graph,  // [graph_size, graph_degree]
-                                  const uint32_t graph_size,
-                                  const uint32_t graph_degree,
-                                  uint64_t* stats)
+__global__ void kern_mst_opt_labeling(IdxT* label,            // [graph_size]
+                                      const IdxT* mst_graph,  // [graph_size, graph_degree]
+                                      const uint32_t graph_size,
+                                      const uint32_t graph_degree,
+                                      uint64_t* stats)
 {
   const uint64_t i = threadIdx.x + (blockDim.x * blockIdx.x);
   if (i >= graph_size) return;
@@ -348,10 +348,10 @@ RAFT_KERNEL kern_mst_opt_labeling(IdxT* label,            // [graph_size]
 }
 
 template <class IdxT>
-RAFT_KERNEL kern_mst_opt_cluster_size(IdxT* cluster_size,  // [graph_size]
-                                      const IdxT* label,   // [graph_size]
-                                      const uint32_t graph_size,
-                                      uint64_t* stats)
+__global__ void kern_mst_opt_cluster_size(IdxT* cluster_size,  // [graph_size]
+                                          const IdxT* label,   // [graph_size]
+                                          const uint32_t graph_size,
+                                          uint64_t* stats)
 {
   const uint64_t i = threadIdx.x + (blockDim.x * blockIdx.x);
   if (i >= graph_size) return;
@@ -375,14 +375,14 @@ RAFT_KERNEL kern_mst_opt_cluster_size(IdxT* cluster_size,  // [graph_size]
 }
 
 template <class IdxT>
-RAFT_KERNEL kern_mst_opt_postprocessing(IdxT* outgoing_num_edges,  // [graph_size]
-                                        IdxT* incoming_num_edges,  // [graph_size]
-                                        IdxT* outgoing_max_edges,  // [graph_size]
-                                        IdxT* incoming_max_edges,  // [graph_size]
-                                        const IdxT* cluster_size,  // [graph_size]
-                                        const uint32_t graph_size,
-                                        const uint32_t graph_degree,
-                                        uint64_t* stats)
+__global__ void kern_mst_opt_postprocessing(IdxT* outgoing_num_edges,  // [graph_size]
+                                            IdxT* incoming_num_edges,  // [graph_size]
+                                            IdxT* outgoing_max_edges,  // [graph_size]
+                                            IdxT* incoming_max_edges,  // [graph_size]
+                                            const IdxT* cluster_size,  // [graph_size]
+                                            const uint32_t graph_size,
+                                            const uint32_t graph_degree,
+                                            uint64_t* stats)
 {
   const uint64_t i = threadIdx.x + (blockDim.x * blockIdx.x);
   if (i >= graph_size) return;
diff --git a/cpp/src/neighbors/detail/cagra/hashmap.hpp b/cpp/src/neighbors/detail/cagra/hashmap.hpp
index dd6c6c844..2c62dda90 100644
--- a/cpp/src/neighbors/detail/cagra/hashmap.hpp
+++ b/cpp/src/neighbors/detail/cagra/hashmap.hpp
@@ -29,10 +29,12 @@
 namespace cuvs::neighbors::cagra::detail {
 namespace hashmap {
 
-_RAFT_HOST_DEVICE inline uint32_t get_size(const uint32_t bitlen) { return 1U << bitlen; }
+RAFT_INLINE_FUNCTION uint32_t get_size(const uint32_t bitlen) { return 1U << bitlen; }
 
 template <class IdxT>
-_RAFT_DEVICE inline void init(IdxT* const table, const unsigned bitlen, unsigned FIRST_TID = 0)
+RAFT_DEVICE_INLINE_FUNCTION void init(IdxT* const table,
+                                      const unsigned bitlen,
+                                      unsigned FIRST_TID = 0)
 {
   if (threadIdx.x < FIRST_TID) return;
   for (unsigned i = threadIdx.x - FIRST_TID; i < get_size(bitlen); i += blockDim.x - FIRST_TID) {
@@ -41,7 +43,9 @@ _RAFT_DEVICE inline void init(IdxT* const table, const unsigned bitlen, unsigned
 }
 
 template <class IdxT>
-_RAFT_DEVICE inline uint32_t insert(IdxT* const table, const uint32_t bitlen, const IdxT key)
+RAFT_DEVICE_INLINE_FUNCTION uint32_t insert(IdxT* const table,
+                                            const uint32_t bitlen,
+                                            const IdxT key)
 {
   // Open addressing is used for collision resolution
   const uint32_t size     = get_size(bitlen);
@@ -68,7 +72,9 @@ _RAFT_DEVICE inline uint32_t insert(IdxT* const table, const uint32_t bitlen, co
 }
 
 template <unsigned TEAM_SIZE, class IdxT>
-_RAFT_DEVICE inline uint32_t insert(IdxT* const table, const uint32_t bitlen, const IdxT key)
+RAFT_DEVICE_INLINE_FUNCTION uint32_t insert(IdxT* const table,
+                                            const uint32_t bitlen,
+                                            const IdxT key)
 {
   IdxT ret = 0;
   if (threadIdx.x % TEAM_SIZE == 0) { ret = insert(table, bitlen, key); }
@@ -78,5 +84,17 @@ _RAFT_DEVICE inline uint32_t insert(IdxT* const table, const uint32_t bitlen, co
   return ret;
 }
 
+template <class IdxT>
+RAFT_DEVICE_INLINE_FUNCTION uint32_t
+insert(unsigned team_size, IdxT* const table, const uint32_t bitlen, const IdxT key)
+{
+  IdxT ret = 0;
+  if (threadIdx.x % team_size == 0) { ret = insert(table, bitlen, key); }
+  for (unsigned offset = 1; offset < team_size; offset *= 2) {
+    ret |= __shfl_xor_sync(0xffffffff, ret, offset);
+  }
+  return ret;
+}
+
 }  // namespace hashmap
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_00_generate.py
deleted file mode 100644
index 63171373f..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_00_generate.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-header = """/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-#include "compute_distance_vpq.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-"""
-
-trailer = """
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
-"""
-
-mxdim_team = [(128, 8), (256, 16), (512, 32), (1024, 32)]
-pq_bits = [8]
-subspace_dims = [2, 4]
-# block = [(64, 16), (128, 8), (256, 4), (512, 2), (1024, 1)]
-# mxelem = [64, 128, 256]
-load_types = ["uint4"]
-code_book_types = ["half"]
-search_types = dict(
-    float_uint32=(
-        "float",
-        "uint32_t",
-        "float",
-    ),  # data_t, vec_idx_t, distance_t
-    half_uint32=("half", "uint32_t", "float"),
-    int8_uint32=("int8_t", "uint32_t", "float"),
-    uint8_uint32=("uint8_t", "uint32_t", "float"),
-    float_uint64=("float", "uint64_t", "float"),
-    half_uint64=("half", "uint64_t", "float"),
-)
-# knn
-for type_path, (data_t, idx_t, distance_t) in search_types.items():
-    for (mxdim, team) in mxdim_team:
-        for code_book_t in code_book_types:
-            for subspace_dim in subspace_dims:
-                for pq_bit in pq_bits:
-                    path = f"q_search_multi_cta_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{subspace_dim}subd_{code_book_t}.cu"
-                    with open(path, "w") as f:
-                        f.write(header)
-                        f.write(
-                                f"instantiate_kernel_selection(\n  {team}, {mxdim}, cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<{data_t} COMMA {code_book_t} COMMA {pq_bit} COMMA {subspace_dim} COMMA {distance_t} COMMA {idx_t}>, cuvs::neighbors::filtering::none_cagra_sample_filter);\n"
-                        )
-                        f.write(trailer)
-                        # For pasting into CMakeLists.txt
-                    print(f"src/neighbors/detail/cagra/{path}")
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_2subd_half.cu
deleted file mode 100644
index 5d94a501a..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_4subd_half.cu
deleted file mode 100644
index 56534dc05..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_2subd_half.cu
deleted file mode 100644
index 7ff962058..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_4subd_half.cu
deleted file mode 100644
index 3387a32a3..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu
deleted file mode 100644
index 2d3f2cb1d..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu
deleted file mode 100644
index 73dd8cd4b..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_2subd_half.cu
deleted file mode 100644
index b5e33602d..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_4subd_half.cu
deleted file mode 100644
index 32fe0d628..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_2subd_half.cu
deleted file mode 100644
index e2726ea26..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_4subd_half.cu
deleted file mode 100644
index b4ebd49c4..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_2subd_half.cu
deleted file mode 100644
index 72f198c92..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_4subd_half.cu
deleted file mode 100644
index dfb667a7f..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu
deleted file mode 100644
index c583569f6..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu
deleted file mode 100644
index fedfb5146..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_2subd_half.cu
deleted file mode 100644
index 2b6e8e3da..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_4subd_half.cu
deleted file mode 100644
index 4a97fb752..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_2subd_half.cu
deleted file mode 100644
index 675cd3c93..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_4subd_half.cu
deleted file mode 100644
index b42b3289c..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu
deleted file mode 100644
index 0db4296f1..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu
deleted file mode 100644
index 4a2610dc7..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_2subd_half.cu
deleted file mode 100644
index b1c15662e..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_4subd_half.cu
deleted file mode 100644
index 201f68fb5..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_2subd_half.cu
deleted file mode 100644
index 26744ed76..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_4subd_half.cu
deleted file mode 100644
index 1bce71bef..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu
deleted file mode 100644
index 694304f3c..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu
deleted file mode 100644
index e6a563731..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu
deleted file mode 100644
index 5c554af3f..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu
deleted file mode 100644
index 965b43c07..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu
deleted file mode 100644
index 97a4f8092..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu
deleted file mode 100644
index bdd1719b3..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu
deleted file mode 100644
index e39bc1e2d..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu
deleted file mode 100644
index 599cf327a..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu
deleted file mode 100644
index 621c5a249..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu
deleted file mode 100644
index cbed3ef8a..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu
deleted file mode 100644
index 7428bfd9e..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu
deleted file mode 100644
index 70efefdb0..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu
deleted file mode 100644
index 4039b8582..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu
deleted file mode 100644
index 022eb0e05..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu
deleted file mode 100644
index e48b2ed71..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu
deleted file mode 100644
index 64f08530f..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/q_search_single_cta_00_generate.py
deleted file mode 100644
index bc5f506ac..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_00_generate.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-header = """/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-#include "compute_distance_vpq.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-"""
-
-trailer = """
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
-"""
-
-mxdim_team = [(128, 8), (256, 16), (512, 32), (1024, 32)]
-# block = [(64, 16), (128, 8), (256, 4), (512, 2), (1024, 1)]
-# itopk_candidates = [64, 128, 256]
-# itopk_size = [64, 128, 256, 512]
-# mxelem = [64, 128, 256]
-
-pq_bits = [8]
-subspace_dims = [2, 4]
-
-# rblock = [(256, 4), (512, 2), (1024, 1)]
-# rcandidates = [32]
-# rsize = [256, 512]
-code_book_types = ["half"]
-
-search_types = dict(
-    float_uint32=("float", "uint32_t", "float"),  # data_t, idx_t, distance_t
-    half_uint32=("half", "uint32_t", "float"),
-    int8_uint32=("int8_t", "uint32_t", "float"),
-    uint8_uint32=("uint8_t", "uint32_t", "float"),
-    float_uint64=("float", "uint64_t", "float"),
-    half_uint64=("half", "uint64_t", "float"),
-)
-
-# knn
-for type_path, (data_t, idx_t, distance_t) in search_types.items():
-    for (mxdim, team) in mxdim_team:
-        for code_book_t in code_book_types:
-            for subspace_dim in subspace_dims:
-                for pq_bit in pq_bits:
-                    path = f"q_search_single_cta_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{subspace_dim}subd_{code_book_t}.cu"
-                    with open(path, "w") as f:
-                        f.write(header)
-                        f.write(
-                                f"instantiate_kernel_selection(\n  {team}, {mxdim}, cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<{data_t} COMMA {code_book_t} COMMA {pq_bit} COMMA {subspace_dim} COMMA {distance_t} COMMA {idx_t}>, cuvs::neighbors::filtering::none_cagra_sample_filter);\n"
-                        )
-
-                        f.write(trailer)
-                        # For pasting into CMakeLists.txt
-                        print(f"src/neighbors/detail/cagra/{path}")
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu
deleted file mode 100644
index b40322741..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu
deleted file mode 100644
index 36273d0d4..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_2subd_half.cu
deleted file mode 100644
index ef483437a..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_4subd_half.cu
deleted file mode 100644
index d9ebb1b85..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_2subd_half.cu
deleted file mode 100644
index e86524ee0..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_4subd_half.cu
deleted file mode 100644
index 9f2b7fbc7..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_2subd_half.cu
deleted file mode 100644
index 1ce4f5520..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_4subd_half.cu
deleted file mode 100644
index 2d6f93ef0..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu
deleted file mode 100644
index 5f3267410..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu
deleted file mode 100644
index 631ac7938..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_2subd_half.cu
deleted file mode 100644
index ea8faee1c..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_4subd_half.cu
deleted file mode 100644
index 061b1a04e..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_2subd_half.cu
deleted file mode 100644
index 15610d853..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_4subd_half.cu
deleted file mode 100644
index f984b46f0..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_2subd_half.cu
deleted file mode 100644
index 45299f272..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_4subd_half.cu
deleted file mode 100644
index fcb91be8c..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu
deleted file mode 100644
index b594fedab..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu
deleted file mode 100644
index a82be6b55..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_2subd_half.cu
deleted file mode 100644
index d80fef52c..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_4subd_half.cu
deleted file mode 100644
index e2c3ef4f7..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_2subd_half.cu
deleted file mode 100644
index 98889811d..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_4subd_half.cu
deleted file mode 100644
index f5e9d12c9..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_2subd_half.cu
deleted file mode 100644
index 4f14910b4..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_4subd_half.cu
deleted file mode 100644
index 67d52f8d5..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu
deleted file mode 100644
index 1420918a1..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu
deleted file mode 100644
index eb0a72da3..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_2subd_half.cu
deleted file mode 100644
index 7a98b59a9..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_4subd_half.cu
deleted file mode 100644
index 7e07033c7..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_2subd_half.cu
deleted file mode 100644
index 857f32712..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_4subd_half.cu
deleted file mode 100644
index 3c00c5223..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_2subd_half.cu
deleted file mode 100644
index e5c4c7b69..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_4subd_half.cu
deleted file mode 100644
index 22359d71b..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu
deleted file mode 100644
index 37c783f19..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu
deleted file mode 100644
index 0a4049d79..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu
deleted file mode 100644
index 773f567c4..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu
deleted file mode 100644
index dfc176abd..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu
deleted file mode 100644
index 680c32655..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu
deleted file mode 100644
index e57881e82..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu
deleted file mode 100644
index 525004f2e..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu
deleted file mode 100644
index 7af2ef124..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu
deleted file mode 100644
index 0fd36c31b..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu
deleted file mode 100644
index d4cc5f449..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu
deleted file mode 100644
index aa58ac2b7..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu
deleted file mode 100644
index 189c3ed9c..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu
deleted file mode 100644
index 9dc9aaae3..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu
deleted file mode 100644
index 100110313..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu
deleted file mode 100644
index 8d4e0aeee..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu
deleted file mode 100644
index 4c7318735..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh
index efbf9b56d..9bcccd9f9 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh
@@ -16,12 +16,12 @@
 #pragma once
 
 #include "bitonic.hpp"
-#include "compute_distance.hpp"
+#include "compute_distance-ext.cuh"
 #include "device_common.hpp"
 #include "hashmap.hpp"
 #include "search_multi_cta_kernel.cuh"
 #include "search_plan.cuh"
-#include "topk_for_cagra/topk_core.cuh"  // TODO replace with raft topk if possible
+#include "topk_for_cagra/topk.h"  // TODO replace with raft topk if possible
 #include "utils.hpp"
 
 #include <raft/core/detail/macros.hpp>
@@ -51,48 +51,46 @@
 namespace cuvs::neighbors::cagra::detail {
 namespace multi_cta_search {
 
-template <unsigned TEAM_SIZE,
-          unsigned DATASET_BLOCK_DIM,
-          typename DATASET_DESCRIPTOR_T,
-          typename SAMPLE_FILTER_T>
-
-struct search : public search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
-  using DATA_T     = typename DATASET_DESCRIPTOR_T::DATA_T;
-  using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
-  using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::max_queries;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::itopk_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::algo;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::team_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::search_width;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::min_iterations;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::max_iterations;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::thread_block_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap_mode;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap_min_bitlen;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap_max_fill_rate;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::num_random_samplings;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::rand_xor_mask;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::dim;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::graph_degree;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::topk;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hash_bitlen;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::small_hash_bitlen;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::small_hash_reset_interval;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::dataset_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::result_buffer_size;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::smem_size;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::num_executed_iterations;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::dev_seed;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::num_seeds;
+template <typename DataT, typename IndexT, typename DistanceT, typename SAMPLE_FILTER_T>
+struct search : public search_plan_impl<DataT, IndexT, DistanceT, SAMPLE_FILTER_T> {
+  using base_type  = search_plan_impl<DataT, IndexT, DistanceT, SAMPLE_FILTER_T>;
+  using DATA_T     = typename base_type::DATA_T;
+  using INDEX_T    = typename base_type ::INDEX_T;
+  using DISTANCE_T = typename base_type::DISTANCE_T;
+
+  using base_type::algo;
+  using base_type::hashmap_max_fill_rate;
+  using base_type::hashmap_min_bitlen;
+  using base_type::hashmap_mode;
+  using base_type::itopk_size;
+  using base_type::max_iterations;
+  using base_type::max_queries;
+  using base_type::min_iterations;
+  using base_type::num_random_samplings;
+  using base_type::rand_xor_mask;
+  using base_type::search_width;
+  using base_type::team_size;
+  using base_type::thread_block_size;
+
+  using base_type::dim;
+  using base_type::graph_degree;
+  using base_type::topk;
+
+  using base_type::hash_bitlen;
+
+  using base_type::dataset_size;
+  using base_type::hashmap_size;
+  using base_type::result_buffer_size;
+  using base_type::small_hash_bitlen;
+  using base_type::small_hash_reset_interval;
+
+  using base_type::smem_size;
+
+  using base_type::dataset_desc;
+  using base_type::dev_seed;
+  using base_type::hashmap;
+  using base_type::num_executed_iterations;
+  using base_type::num_seeds;
 
   uint32_t num_cta_per_query;
   rmm::device_uvector<INDEX_T> intermediate_indices;
@@ -102,12 +100,11 @@ struct search : public search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
 
   search(raft::resources const& res,
          search_params params,
+         const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc,
          int64_t dim,
          int64_t graph_degree,
-         uint32_t topk,
-         cuvs::distance::DistanceType metric)
-    : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>(
-        res, params, dim, graph_degree, topk, metric),
+         uint32_t topk)
+    : base_type(res, params, dataset_desc, dim, graph_degree, topk),
       intermediate_indices(0, raft::resource::get_cuda_stream(res)),
       intermediate_distances(0, raft::resource::get_cuda_stream(res)),
       topk_workspace(0, raft::resource::get_cuda_stream(res))
@@ -129,13 +126,9 @@ struct search : public search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
     // constexpr unsigned max_result_buffer_size = 256;
     RAFT_EXPECTS(result_buffer_size_32 <= 256, "Result buffer size cannot exceed 256");
 
-    const auto query_smem_buffer_length =
-      raft::ceildiv<uint32_t>(dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM;
-
-    smem_size = sizeof(float) * query_smem_buffer_length +
+    smem_size = dataset_desc.smem_ws_size_in_bytes +
                 (sizeof(INDEX_T) + sizeof(DISTANCE_T)) * result_buffer_size_32 +
-                sizeof(uint32_t) * search_width + sizeof(uint32_t) +
-                DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte;
+                sizeof(uint32_t) * search_width + sizeof(uint32_t);
     RAFT_LOG_DEBUG("# smem_size: %u", smem_size);
 
     //
@@ -204,44 +197,37 @@ struct search : public search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
 
   ~search() {}
 
-  void operator()(
-    raft::resources const& res,
-    // raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,
-    DATASET_DESCRIPTOR_T dataset_desc,
-    raft::device_matrix_view<const typename DATASET_DESCRIPTOR_T::INDEX_T, int64_t, raft::row_major>
-      graph,
-    typename DATASET_DESCRIPTOR_T::INDEX_T* const topk_indices_ptr,       // [num_queries, topk]
-    typename DATASET_DESCRIPTOR_T::DISTANCE_T* const topk_distances_ptr,  // [num_queries, topk]
-    const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
-    const uint32_t num_queries,
-    const typename DATASET_DESCRIPTOR_T::INDEX_T* dev_seed_ptr,  // [num_queries, num_seeds]
-    uint32_t* const num_executed_iterations,                     // [num_queries,]
-    uint32_t topk,
-    SAMPLE_FILTER_T sample_filter)
+  void operator()(raft::resources const& res,
+                  raft::device_matrix_view<const INDEX_T, int64_t, raft::row_major> graph,
+                  INDEX_T* const topk_indices_ptr,       // [num_queries, topk]
+                  DISTANCE_T* const topk_distances_ptr,  // [num_queries, topk]
+                  const DATA_T* const queries_ptr,       // [num_queries, dataset_dim]
+                  const uint32_t num_queries,
+                  const INDEX_T* dev_seed_ptr,              // [num_queries, num_seeds]
+                  uint32_t* const num_executed_iterations,  // [num_queries,]
+                  uint32_t topk,
+                  SAMPLE_FILTER_T sample_filter)
   {
     cudaStream_t stream = raft::resource::get_cuda_stream(res);
-
-    select_and_run<TEAM_SIZE, DATASET_BLOCK_DIM, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>(
-      dataset_desc,
-      graph,
-      intermediate_indices.data(),
-      intermediate_distances.data(),
-      queries_ptr,
-      num_queries,
-      dev_seed_ptr,
-      num_executed_iterations,
-      *this,
-      topk,
-      thread_block_size,
-      result_buffer_size,
-      smem_size,
-      hash_bitlen,
-      hashmap.data(),
-      num_cta_per_query,
-      num_seeds,
-      sample_filter,
-      this->metric,
-      stream);
+    select_and_run(dataset_desc.dev_ptr(),
+                   graph,
+                   intermediate_indices.data(),
+                   intermediate_distances.data(),
+                   queries_ptr,
+                   num_queries,
+                   dev_seed_ptr,
+                   num_executed_iterations,
+                   *this,
+                   topk,
+                   thread_block_size,
+                   result_buffer_size,
+                   smem_size,
+                   hash_bitlen,
+                   hashmap.data(),
+                   num_cta_per_query,
+                   num_seeds,
+                   sample_filter,
+                   stream);
     RAFT_CUDA_TRY(cudaPeekAtLastError());
 
     // Select the top-k results from the intermediate results
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py
index cb63c0e03..3153a3a9f 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py
@@ -39,8 +39,6 @@
 
 #include "search_multi_cta_inst.cuh"
 
-#include "compute_distance.hpp"
-
 namespace cuvs::neighbors::cagra::detail::multi_cta_search {
 """
 
@@ -48,7 +46,6 @@
 }  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
 """
 
-mxdim_team = [(128, 8), (256, 16), (512, 32), (1024, 32)]
 # block = [(64, 16), (128, 8), (256, 4), (512, 2), (1024, 1)]
 # mxelem = [64, 128, 256]
 load_types = ["uint4"]
@@ -66,13 +63,12 @@
 )
 # knn
 for type_path, (data_t, idx_t, distance_t) in search_types.items():
-    for (mxdim, team) in mxdim_team:
-        path = f"search_multi_cta_{type_path}_dim{mxdim}_t{team}.cu"
-        with open(path, "w") as f:
-            f.write(header)
-            f.write(
-                    f"instantiate_kernel_selection(\n  {team}, {mxdim}, cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<{data_t} COMMA {idx_t} COMMA {distance_t}>, cuvs::neighbors::filtering::none_cagra_sample_filter);\n"
-            )
-            f.write(trailer)
-            # For pasting into CMakeLists.txt
-        print(f"src/neighbors/detail/cagra/{path}")
+    path = f"search_multi_cta_{type_path}.cu"
+    with open(path, "w") as f:
+        f.write(header)
+        f.write(
+                f"instantiate_kernel_selection(\n  {data_t}, {idx_t}, {distance_t}, cuvs::neighbors::filtering::none_cagra_sample_filter);\n"
+        )
+        f.write(trailer)
+        # For pasting into CMakeLists.txt
+    print(f"src/neighbors/detail/cagra/{path}")
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu
similarity index 80%
rename from cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu
rename to cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu
index 2a14699f4..fae5a9387 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu
@@ -25,13 +25,10 @@
 
 #include "search_multi_cta_inst.cuh"
 
-#include "compute_distance.hpp"
-
 namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  8,
-  128,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(float,
+                             uint32_t,
+                             float,
+                             cuvs::neighbors::filtering::none_cagra_sample_filter);
 
 }  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu
deleted file mode 100644
index 0bf4a192f..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  32,
-  1024,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu
deleted file mode 100644
index a77859b7d..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  16,
-  256,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu
deleted file mode 100644
index ab49fa9f2..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  32,
-  512,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64.cu
similarity index 80%
rename from cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim128_t8.cu
rename to cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64.cu
index 157942dc5..88167b843 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64.cu
@@ -25,13 +25,10 @@
 
 #include "search_multi_cta_inst.cuh"
 
-#include "compute_distance.hpp"
-
 namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  8,
-  128,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(float,
+                             uint64_t,
+                             float,
+                             cuvs::neighbors::filtering::none_cagra_sample_filter);
 
 }  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
deleted file mode 100644
index c38eeb009..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  32,
-  1024,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
deleted file mode 100644
index 3094ddaeb..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  8,
-  128,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
deleted file mode 100644
index 91725d185..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  16,
-  256,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
deleted file mode 100644
index 0f452a6fa..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  32,
-  512,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu
similarity index 80%
rename from cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu
rename to cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu
index ea38b60c0..9606d510f 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu
@@ -25,13 +25,10 @@
 
 #include "search_multi_cta_inst.cuh"
 
-#include "compute_distance.hpp"
-
 namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  8,
-  128,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(half,
+                             uint32_t,
+                             float,
+                             cuvs::neighbors::filtering::none_cagra_sample_filter);
 
 }  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim1024_t32.cu
deleted file mode 100644
index cfe7a7aef..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim1024_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  32,
-  1024,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim512_t32.cu
deleted file mode 100644
index 292859382..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim512_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  32,
-  512,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64.cu
similarity index 80%
rename from cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim256_t16.cu
rename to cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64.cu
index ee2400037..dafb89cc3 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim256_t16.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64.cu
@@ -25,13 +25,10 @@
 
 #include "search_multi_cta_inst.cuh"
 
-#include "compute_distance.hpp"
-
 namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  16,
-  256,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(half,
+                             uint64_t,
+                             float,
+                             cuvs::neighbors::filtering::none_cagra_sample_filter);
 
 }  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu
deleted file mode 100644
index 13044f12d..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  32,
-  1024,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu
deleted file mode 100644
index 2ce6f292d..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  16,
-  256,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu
deleted file mode 100644
index 2d607eb8d..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  32,
-  512,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_inst.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_inst.cuh
index b1cfaf870..036a4e414 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_inst.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_inst.cuh
@@ -21,30 +21,26 @@
 
 namespace cuvs::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATASET_DESC_T, SAMPLE_FILTER_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATASET_DESC_T, SAMPLE_FILTER_T>(      \
-    DATASET_DESC_T dataset_desc,                                                                  \
-    raft::device_matrix_view<const typename DATASET_DESC_T::INDEX_T, int64_t, raft::row_major>    \
-      graph,                                                                                      \
-    typename DATASET_DESC_T::INDEX_T* const topk_indices_ptr,                                     \
-    typename DATASET_DESC_T::DISTANCE_T* const topk_distances_ptr,                                \
-    const typename DATASET_DESC_T::DATA_T* const queries_ptr,                                     \
-    const uint32_t num_queries,                                                                   \
-    const typename DATASET_DESC_T::INDEX_T* dev_seed_ptr,                                         \
-    uint32_t* const num_executed_iterations,                                                      \
-    const search_params& ps,                                                                      \
-    uint32_t topk,                                                                                \
-    uint32_t block_size,                                                                          \
-    uint32_t result_buffer_size,                                                                  \
-    uint32_t smem_size,                                                                           \
-    int64_t hash_bitlen,                                                                          \
-    typename DATASET_DESC_T::INDEX_T* hashmap_ptr,                                                \
-    uint32_t num_cta_per_query,                                                                   \
-    uint32_t num_seeds,                                                                           \
-    SAMPLE_FILTER_T sample_filter,                                                                \
-    cuvs::distance::DistanceType metric,                                                          \
+#define instantiate_kernel_selection(DataT, IndexT, DistanceT, SampleFilterT) \
+  template void select_and_run<DataT, IndexT, DistanceT, SampleFilterT>(      \
+    const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* dataset_desc,  \
+    raft::device_matrix_view<const IndexT, int64_t, raft::row_major> graph,   \
+    IndexT* topk_indices_ptr,                                                 \
+    DistanceT* topk_distances_ptr,                                            \
+    const DataT* queries_ptr,                                                 \
+    uint32_t num_queries,                                                     \
+    const IndexT* dev_seed_ptr,                                               \
+    uint32_t* num_executed_iterations,                                        \
+    const search_params& ps,                                                  \
+    uint32_t topk,                                                            \
+    uint32_t block_size,                                                      \
+    uint32_t result_buffer_size,                                              \
+    uint32_t smem_size,                                                       \
+    int64_t hash_bitlen,                                                      \
+    IndexT* hashmap_ptr,                                                      \
+    uint32_t num_cta_per_query,                                               \
+    uint32_t num_seeds,                                                       \
+    SampleFilterT sample_filter,                                              \
     cudaStream_t stream);
 
-#define COMMA ,
-
 }  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu
new file mode 100644
index 000000000..a3322c435
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by search_multi_cta_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python search_multi_cta_00_generate.py
+ *
+ */
+
+#include "search_multi_cta_inst.cuh"
+
+namespace cuvs::neighbors::cagra::detail::multi_cta_search {
+instantiate_kernel_selection(int8_t,
+                             uint32_t,
+                             float,
+                             cuvs::neighbors::filtering::none_cagra_sample_filter);
+
+}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu
deleted file mode 100644
index c28adbf80..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  32,
-  1024,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<int8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu
deleted file mode 100644
index af5f13397..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  8,
-  128,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<int8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu
deleted file mode 100644
index bcc7b9b8c..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  16,
-  256,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<int8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu
deleted file mode 100644
index 916196c35..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  32,
-  512,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<int8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh
deleted file mode 100644
index e907568f5..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh
+++ /dev/null
@@ -1,405 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include "compute_distance_vpq.cuh"
-#include <cuvs/neighbors/common.hpp>    // none_cagra_sample_filter
-#include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
-
-#include <cuda_fp16.h>
-
-namespace cuvs::neighbors::cagra::detail {
-namespace multi_cta_search {
-
-#ifdef _CUVS_EXPLICIT_INSTANTIATE_ONLY
-
-template <unsigned TEAM_SIZE,
-          unsigned DATASET_BLOCK_DIM,
-          typename DATASET_DESCRIPTOR_T,
-          typename SAMPLE_FILTER_T>
-void select_and_run(
-  DATASET_DESCRIPTOR_T dataset_desc,
-  raft::device_matrix_view<const typename DATASET_DESCRIPTOR_T::INDEX_T, int64_t, raft::row_major>
-    graph,
-  typename DATASET_DESCRIPTOR_T::INDEX_T* const topk_indices_ptr,       // [num_queries, topk]
-  typename DATASET_DESCRIPTOR_T::DISTANCE_T* const topk_distances_ptr,  // [num_queries, topk]
-  const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
-  const uint32_t num_queries,
-  const typename DATASET_DESCRIPTOR_T::INDEX_T* dev_seed_ptr,  // [num_queries, num_seeds]
-  uint32_t* const num_executed_iterations,                     // [num_queries,]
-  const search_params& ps,
-  uint32_t topk,
-  // multi_cta_search (params struct)
-  uint32_t block_size,  //
-  uint32_t result_buffer_size,
-  uint32_t smem_size,
-  int64_t hash_bitlen,
-  typename DATASET_DESCRIPTOR_T::INDEX_T* hashmap_ptr,
-  uint32_t num_cta_per_query,
-  uint32_t num_seeds,
-  SAMPLE_FILTER_T sample_filter,
-  cuvs::distance::DistanceType metric,
-  cudaStream_t stream) RAFT_EXPLICIT;
-#endif  // CUVS_EXPLICIT_INSTANTIATE_ONLY
-
-#define instantiate_kernel_selection(                                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                     \
-  extern template void select_and_run<                                                          \
-    TEAM_SIZE,                                                                                  \
-    MAX_DATASET_DIM,                                                                            \
-    cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<DATA_T, INDEX_T, DISTANCE_T>, \
-    SAMPLE_FILTER_T>(                                                                           \
-    cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<DATA_T, INDEX_T, DISTANCE_T>  \
-      dataset_desc,                                                                             \
-    raft::device_matrix_view<const INDEX_T, int64_t, raft::row_major> graph,                    \
-    INDEX_T* const topk_indices_ptr,                                                            \
-    DISTANCE_T* const topk_distances_ptr,                                                       \
-    const DATA_T* const queries_ptr,                                                            \
-    const uint32_t num_queries,                                                                 \
-    const INDEX_T* dev_seed_ptr,                                                                \
-    uint32_t* const num_executed_iterations,                                                    \
-    const search_params& ps,                                                                    \
-    uint32_t topk,                                                                              \
-    uint32_t block_size,                                                                        \
-    uint32_t result_buffer_size,                                                                \
-    uint32_t smem_size,                                                                         \
-    int64_t hash_bitlen,                                                                        \
-    INDEX_T* hashmap_ptr,                                                                       \
-    uint32_t num_cta_per_query,                                                                 \
-    uint32_t num_seeds,                                                                         \
-    SAMPLE_FILTER_T sample_filter,                                                              \
-    cuvs::distance::DistanceType metric,                                                        \
-    cudaStream_t stream);
-
-instantiate_kernel_selection(
-  32, 1024, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  8, 128, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  16, 256, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  32, 512, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  32, 1024, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  8, 128, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  16, 256, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  32, 512, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  32, 1024, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  8, 128, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  16, 256, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  32, 512, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  32, 1024, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  8, 128, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  16, 256, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  32, 512, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-#undef instantiate_kernel_selection
-
-#define instantiate_q_kernel_selection(TEAM_SIZE,                                               \
-                                       MAX_DATASET_DIM,                                         \
-                                       CODE_BOOK_T,                                             \
-                                       PQ_BITS,                                                 \
-                                       PQ_CODE_BOOK_DIM,                                        \
-                                       DATA_T,                                                  \
-                                       INDEX_T,                                                 \
-                                       DISTANCE_T,                                              \
-                                       SAMPLE_FILTER_T)                                         \
-  extern template void                                                                          \
-  select_and_run<TEAM_SIZE,                                                                     \
-                 MAX_DATASET_DIM,                                                               \
-                 cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<DATA_T,           \
-                                                                              CODE_BOOK_T,      \
-                                                                              PQ_BITS,          \
-                                                                              PQ_CODE_BOOK_DIM, \
-                                                                              DISTANCE_T,       \
-                                                                              INDEX_T>,         \
-                 SAMPLE_FILTER_T>(                                                              \
-    cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<DATA_T,                        \
-                                                                 CODE_BOOK_T,                   \
-                                                                 PQ_BITS,                       \
-                                                                 PQ_CODE_BOOK_DIM,              \
-                                                                 DISTANCE_T,                    \
-                                                                 INDEX_T> dataset_desc,         \
-    raft::device_matrix_view<const INDEX_T, int64_t, raft::row_major> graph,                    \
-    INDEX_T* const topk_indices_ptr,                                                            \
-    DISTANCE_T* const topk_distances_ptr,                                                       \
-    const DATA_T* const queries_ptr,                                                            \
-    const uint32_t num_queries,                                                                 \
-    const INDEX_T* dev_seed_ptr,                                                                \
-    uint32_t* const num_executed_iterations,                                                    \
-    const search_params& ps,                                                                    \
-    uint32_t topk,                                                                              \
-    uint32_t block_size,                                                                        \
-    uint32_t result_buffer_size,                                                                \
-    uint32_t smem_size,                                                                         \
-    int64_t hash_bitlen,                                                                        \
-    INDEX_T* hashmap_ptr,                                                                       \
-    uint32_t num_cta_per_query,                                                                 \
-    uint32_t num_seeds,                                                                         \
-    SAMPLE_FILTER_T sample_filter,                                                              \
-    cuvs::distance::DistanceType metric,                                                        \
-    cudaStream_t stream);
-
-instantiate_q_kernel_selection(
-  8, 128, half, 8, 2, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(
-  16, 256, half, 8, 2, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(
-  32, 512, half, 8, 2, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               1024,
-                               half,
-                               8,
-                               2,
-                               half,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(
-  8, 128, half, 8, 4, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(
-  16, 256, half, 8, 4, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(
-  32, 512, half, 8, 4, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               1024,
-                               half,
-                               8,
-                               4,
-                               half,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-instantiate_q_kernel_selection(
-  8, 128, half, 8, 2, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(16,
-                               256,
-                               half,
-                               8,
-                               2,
-                               float,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               512,
-                               half,
-                               8,
-                               2,
-                               float,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               1024,
-                               half,
-                               8,
-                               2,
-                               float,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(
-  8, 128, half, 8, 4, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(16,
-                               256,
-                               half,
-                               8,
-                               4,
-                               float,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               512,
-                               half,
-                               8,
-                               4,
-                               float,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               1024,
-                               half,
-                               8,
-                               4,
-                               float,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-instantiate_q_kernel_selection(8,
-                               128,
-                               half,
-                               8,
-                               2,
-                               uint8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(16,
-                               256,
-                               half,
-                               8,
-                               2,
-                               uint8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               512,
-                               half,
-                               8,
-                               2,
-                               uint8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               1024,
-                               half,
-                               8,
-                               2,
-                               uint8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(8,
-                               128,
-                               half,
-                               8,
-                               4,
-                               uint8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(16,
-                               256,
-                               half,
-                               8,
-                               4,
-                               uint8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               512,
-                               half,
-                               8,
-                               4,
-                               uint8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               1024,
-                               half,
-                               8,
-                               4,
-                               uint8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-instantiate_q_kernel_selection(8,
-                               128,
-                               half,
-                               8,
-                               2,
-                               int8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(16,
-                               256,
-                               half,
-                               8,
-                               2,
-                               int8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               512,
-                               half,
-                               8,
-                               2,
-                               int8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               1024,
-                               half,
-                               8,
-                               2,
-                               int8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(8,
-                               128,
-                               half,
-                               8,
-                               4,
-                               int8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(16,
-                               256,
-                               half,
-                               8,
-                               4,
-                               int8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               512,
-                               half,
-                               8,
-                               4,
-                               int8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               1024,
-                               half,
-                               8,
-                               4,
-                               int8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-#undef instantiate_q_kernel_selection
-}  // namespace multi_cta_search
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
index 4d2030c6c..dd74ba44b 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
@@ -15,12 +15,14 @@
  */
 #pragma once
 
+#include "search_multi_cta_kernel.cuh"
+
 #include "bitonic.hpp"
-#include "compute_distance.hpp"
+#include "compute_distance-ext.cuh"
 #include "device_common.hpp"
 #include "hashmap.hpp"
 #include "search_plan.cuh"
-#include "topk_for_cagra/topk_core.cuh"  // TODO replace with raft topk if possible
+#include "topk_for_cagra/topk.h"  // TODO replace with raft topk if possible
 #include "utils.hpp"
 
 #include <raft/core/device_mdspan.hpp>
@@ -53,11 +55,12 @@ namespace multi_cta_search {
 // #define _CLK_BREAKDOWN
 
 template <class INDEX_T>
-__device__ void pickup_next_parents(INDEX_T* const next_parent_indices,  // [search_width]
-                                    const uint32_t search_width,
-                                    INDEX_T* const itopk_indices,  // [num_itopk]
-                                    const size_t num_itopk,
-                                    uint32_t* const terminate_flag)
+RAFT_DEVICE_INLINE_FUNCTION void pickup_next_parents(
+  INDEX_T* const next_parent_indices,  // [search_width]
+  const uint32_t search_width,
+  INDEX_T* const itopk_indices,  // [num_itopk]
+  const size_t num_itopk,
+  uint32_t* const terminate_flag)
 {
   constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
   const unsigned warp_id             = threadIdx.x / 32;
@@ -93,10 +96,11 @@ __device__ void pickup_next_parents(INDEX_T* const next_parent_indices,  // [sea
 }
 
 template <unsigned MAX_ELEMENTS, class INDEX_T>
-__device__ inline void topk_by_bitonic_sort(float* distances,  // [num_elements]
-                                            INDEX_T* indices,  // [num_elements]
-                                            const uint32_t num_elements,
-                                            const uint32_t num_itopk  // num_itopk <= num_elements
+RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort(
+  float* distances,  // [num_elements]
+  INDEX_T* indices,  // [num_elements]
+  const uint32_t num_elements,
+  const uint32_t num_itopk  // num_itopk <= num_elements
 )
 {
   const unsigned warp_id = threadIdx.x / 32;
@@ -130,17 +134,13 @@ __device__ inline void topk_by_bitonic_sort(float* distances,  // [num_elements]
 //
 // multiple CTAs per single query
 //
-template <int32_t TEAM_SIZE,
-          uint32_t DATASET_BLOCK_DIM,
-          std::uint32_t MAX_ELEMENTS,
-          class DATASET_DESCRIPTOR_T,
-          class SAMPLE_FILTER_T>
-__launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
+template <std::uint32_t MAX_ELEMENTS, class DATASET_DESCRIPTOR_T, class SAMPLE_FILTER_T>
+RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel(
   typename DATASET_DESCRIPTOR_T::INDEX_T* const
     result_indices_ptr,  // [num_queries, num_cta_per_query, itopk_size]
   typename DATASET_DESCRIPTOR_T::DISTANCE_T* const
     result_distances_ptr,  // [num_queries, num_cta_per_query, itopk_size]
-  DATASET_DESCRIPTOR_T dataset_desc,
+  const DATASET_DESCRIPTOR_T* dataset_desc,
   const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
   const typename DATASET_DESCRIPTOR_T::INDEX_T* const knn_graph,   // [dataset_size, graph_degree]
   const uint32_t graph_degree,
@@ -156,13 +156,11 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
   const uint32_t min_iteration,
   const uint32_t max_iteration,
   uint32_t* const num_executed_iterations, /* stats */
-  SAMPLE_FILTER_T sample_filter,
-  const cuvs::distance::DistanceType metric)
+  SAMPLE_FILTER_T sample_filter)
 {
   using DATA_T     = typename DATASET_DESCRIPTOR_T::DATA_T;
   using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
   using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
-  using QUERY_T    = typename DATASET_DESCRIPTOR_T::QUERY_T;
 
   const auto num_queries       = gridDim.y;
   const auto query_id          = blockIdx.y;
@@ -184,7 +182,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
 #endif
   _CLK_START();
 
-  extern __shared__ uint32_t smem[];
+  extern __shared__ uint8_t smem[];
 
   // Layout of result_buffer
   // +----------------+------------------------------+---------+
@@ -192,26 +190,21 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
   // | <itopk_size>   | <search_width * graph_degree> | upto 32 |
   // +----------------+------------------------------+---------+
   // |<---          result_buffer_size           --->|
-  uint32_t result_buffer_size    = itopk_size + (search_width * graph_degree);
-  uint32_t result_buffer_size_32 = result_buffer_size;
-  if (result_buffer_size % 32) { result_buffer_size_32 += 32 - (result_buffer_size % 32); }
+  const auto result_buffer_size    = itopk_size + (search_width * graph_degree);
+  const auto result_buffer_size_32 = raft::round_up_safe<uint32_t>(result_buffer_size, 32);
   assert(result_buffer_size_32 <= MAX_ELEMENTS);
 
-  const auto query_smem_buffer_length =
-    raft::ceildiv<uint32_t>(dataset_desc.dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM;
-  auto query_buffer          = reinterpret_cast<QUERY_T*>(smem);
-  auto result_indices_buffer = reinterpret_cast<INDEX_T*>(query_buffer + query_smem_buffer_length);
-  auto result_distances_buffer =
+  // Set smem working buffer for the distance calculation
+  dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id);
+
+  auto* __restrict__ result_indices_buffer =
+    reinterpret_cast<INDEX_T*>(smem + dataset_desc->smem_ws_size_in_bytes());
+  auto* __restrict__ result_distances_buffer =
     reinterpret_cast<DISTANCE_T*>(result_indices_buffer + result_buffer_size_32);
-  auto parent_indices_buffer =
+  auto* __restrict__ parent_indices_buffer =
     reinterpret_cast<INDEX_T*>(result_distances_buffer + result_buffer_size_32);
-  auto distance_work_buffer_ptr =
-    reinterpret_cast<std::uint8_t*>(parent_indices_buffer + search_width);
-  auto terminate_flag = reinterpret_cast<uint32_t*>(distance_work_buffer_ptr +
-                                                    DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte);
-
-  // Set smem working buffer for the distance calculation
-  dataset_desc.set_smem_ptr(distance_work_buffer_ptr);
+  auto* __restrict__ terminate_flag =
+    reinterpret_cast<uint32_t*>(parent_indices_buffer + search_width);
 
 #if 0
     /* debug */
@@ -220,9 +213,6 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
         result_distances_buffer[i] = utils::get_max_value<DISTANCE_T>();
     }
 #endif
-  const DATA_T* const query_ptr = queries_ptr + (dataset_desc.dim * query_id);
-  dataset_desc.template copy_query<DATASET_BLOCK_DIM>(
-    query_ptr, query_buffer, query_smem_buffer_length);
 
   if (threadIdx.x == 0) { terminate_flag[0] = 0; }
   INDEX_T* const local_visited_hashmap_ptr =
@@ -236,20 +226,18 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
   uint32_t block_id                   = cta_id + (num_cta_per_query * query_id);
   uint32_t num_blocks                 = num_cta_per_query * num_queries;
 
-  device::compute_distance_to_random_nodes<TEAM_SIZE, DATASET_BLOCK_DIM>(result_indices_buffer,
-                                                                         result_distances_buffer,
-                                                                         query_buffer,
-                                                                         dataset_desc,
-                                                                         result_buffer_size,
-                                                                         num_distilation,
-                                                                         rand_xor_mask,
-                                                                         local_seed_ptr,
-                                                                         num_seeds,
-                                                                         local_visited_hashmap_ptr,
-                                                                         hash_bitlen,
-                                                                         metric,
-                                                                         block_id,
-                                                                         num_blocks);
+  device::compute_distance_to_random_nodes(result_indices_buffer,
+                                           result_distances_buffer,
+                                           *dataset_desc,
+                                           result_buffer_size,
+                                           num_distilation,
+                                           rand_xor_mask,
+                                           local_seed_ptr,
+                                           num_seeds,
+                                           local_visited_hashmap_ptr,
+                                           hash_bitlen,
+                                           block_id,
+                                           num_blocks);
   __syncthreads();
   _CLK_REC(clk_compute_1st_distance);
 
@@ -279,21 +267,16 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
 
     // compute the norms between child nodes and query node
     _CLK_START();
-    // constexpr unsigned max_n_frags = 16;
-    constexpr unsigned max_n_frags = 0;
-    device::compute_distance_to_child_nodes<TEAM_SIZE, DATASET_BLOCK_DIM, max_n_frags>(
-      result_indices_buffer + itopk_size,
-      result_distances_buffer + itopk_size,
-      query_buffer,
-      dataset_desc,
-      knn_graph,
-      graph_degree,
-      local_visited_hashmap_ptr,
-      hash_bitlen,
-      parent_indices_buffer,
-      result_indices_buffer,
-      search_width,
-      metric);
+    device::compute_distance_to_child_nodes(result_indices_buffer + itopk_size,
+                                            result_distances_buffer + itopk_size,
+                                            *dataset_desc,
+                                            knn_graph,
+                                            graph_degree,
+                                            local_visited_hashmap_ptr,
+                                            hash_bitlen,
+                                            parent_indices_buffer,
+                                            result_indices_buffer,
+                                            search_width);
     _CLK_REC(clk_compute_distance);
     __syncthreads();
 
@@ -409,84 +392,58 @@ void set_value_batch(T* const dev_ptr,
     <<<grid_size, block_size, 0, cuda_stream>>>(dev_ptr, ld, val, count, batch_size);
 }
 
-template <uint32_t TEAM_SIZE,
-          uint32_t DATASET_BLOCK_DIM,
-          typename DATASET_DESCRIPTOR_T,
-          typename SAMPLE_FILTER_T>
+template <typename DATASET_DESCRIPTOR_T, typename SAMPLE_FILTER_T>
 struct search_kernel_config {
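-  // Search kernel function type. Note that the actual values for the template value
-  // parameters do not matter, because they are not part of the function signature. The
-  // second to fourth value parameters will be selected by the choose_* functions below.
+  // Search kernel function type. Note that the actual value of the template value parameter
+  // does not matter, because it is not part of the function signature. That value (64, 128
+  // or 256) will be selected by the choose_buffer_size function below.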
-  using kernel_t = decltype(&search_kernel<TEAM_SIZE,
-                                           DATASET_BLOCK_DIM,
-                                           128,
-                                           DATASET_DESCRIPTOR_T,
-                                           SAMPLE_FILTER_T>);
+  using kernel_t = decltype(&search_kernel<128, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>);
 
   static auto choose_buffer_size(unsigned result_buffer_size, unsigned block_size) -> kernel_t
   {
     if (result_buffer_size <= 64) {
-      return search_kernel<TEAM_SIZE, DATASET_BLOCK_DIM, 64, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>;
+      return search_kernel<64, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>;
     } else if (result_buffer_size <= 128) {
-      return search_kernel<TEAM_SIZE,
-                           DATASET_BLOCK_DIM,
-                           128,
-                           DATASET_DESCRIPTOR_T,
-                           SAMPLE_FILTER_T>;
+      return search_kernel<128, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>;
     } else if (result_buffer_size <= 256) {
-      return search_kernel<TEAM_SIZE,
-                           DATASET_BLOCK_DIM,
-                           256,
-                           DATASET_DESCRIPTOR_T,
-                           SAMPLE_FILTER_T>;
+      return search_kernel<256, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>;
     }
     THROW("Result buffer size %u larger than max buffer size %u", result_buffer_size, 256);
   }
 };
 
-template <unsigned TEAM_SIZE,
-          unsigned DATASET_BLOCK_DIM,
-          typename DATASET_DESCRIPTOR_T,
-          typename SAMPLE_FILTER_T>
-void select_and_run(
-  DATASET_DESCRIPTOR_T dataset_desc,
-  raft::device_matrix_view<const typename DATASET_DESCRIPTOR_T::INDEX_T, int64_t, raft::row_major>
-    graph,
-  typename DATASET_DESCRIPTOR_T::INDEX_T* const topk_indices_ptr,       // [num_queries, topk]
-  typename DATASET_DESCRIPTOR_T::DISTANCE_T* const topk_distances_ptr,  // [num_queries, topk]
-  const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
-  const uint32_t num_queries,
-  const typename DATASET_DESCRIPTOR_T::INDEX_T* dev_seed_ptr,  // [num_queries, num_seeds]
-  uint32_t* const num_executed_iterations,                     // [num_queries,]
-  const search_params& ps,
-  uint32_t topk,
-  // multi_cta_search (params struct)
-  uint32_t block_size,  //
-  uint32_t result_buffer_size,
-  uint32_t smem_size,
-  int64_t hash_bitlen,
-  typename DATASET_DESCRIPTOR_T::INDEX_T* hashmap_ptr,
-  uint32_t num_cta_per_query,
-  uint32_t num_seeds,
-  SAMPLE_FILTER_T sample_filter,
-  cuvs::distance::DistanceType metric,
-  cudaStream_t stream)
+template <typename DataT, typename IndexT, typename DistanceT, typename SampleFilterT>
+void select_and_run(const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* dataset_desc,
+                    raft::device_matrix_view<const IndexT, int64_t, raft::row_major> graph,
+                    IndexT* topk_indices_ptr,       // [num_queries, topk]
+                    DistanceT* topk_distances_ptr,  // [num_queries, topk]
+                    const DataT* queries_ptr,       // [num_queries, dataset_dim]
+                    uint32_t num_queries,
+                    const IndexT* dev_seed_ptr,         // [num_queries, num_seeds]
+                    uint32_t* num_executed_iterations,  // [num_queries,]
+                    const search_params& ps,
+                    uint32_t topk,
+                    // multi_cta_search (params struct)
+                    uint32_t block_size,  //
+                    uint32_t result_buffer_size,
+                    uint32_t smem_size,
+                    int64_t hash_bitlen,
+                    IndexT* hashmap_ptr,
+                    uint32_t num_cta_per_query,
+                    uint32_t num_seeds,
+                    SampleFilterT sample_filter,
+                    cudaStream_t stream)
 {
   auto kernel =
-    search_kernel_config<TEAM_SIZE, DATASET_BLOCK_DIM, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::
-      choose_buffer_size(result_buffer_size, block_size);
+    search_kernel_config<dataset_descriptor_base_t<DataT, IndexT, DistanceT>,
+                         SampleFilterT>::choose_buffer_size(result_buffer_size, block_size);
 
-  RAFT_CUDA_TRY(cudaFuncSetAttribute(kernel,
-                                     cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                     smem_size + DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte));
+  RAFT_CUDA_TRY(
+    cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
   // Initialize hash table
   const uint32_t hash_size = hashmap::get_size(hash_bitlen);
-  set_value_batch(hashmap_ptr,
-                  hash_size,
-                  utils::get_max_value<typename DATASET_DESCRIPTOR_T::INDEX_T>(),
-                  hash_size,
-                  num_queries,
-                  stream);
+  set_value_batch(
+    hashmap_ptr, hash_size, utils::get_max_value<IndexT>(), hash_size, num_queries, stream);
 
   dim3 block_dims(block_size, 1, 1);
   dim3 grid_dims(num_cta_per_query, num_queries, 1);
@@ -513,8 +470,7 @@ void select_and_run(
                                                        ps.min_iterations,
                                                        ps.max_iterations,
                                                        num_executed_iterations,
-                                                       sample_filter,
-                                                       metric);
+                                                       sample_filter);
 }
 
 }  // namespace multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh
index 673fc5473..1ef35f947 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,10 +15,32 @@
  */
 #pragma once
 
-#ifndef _CUVS_EXPLICIT_INSTANTIATE_ONLY
-#include "search_multi_cta_kernel-inl.cuh"
-#endif
+#include "compute_distance-ext.cuh"
 
-#ifdef RAFT_COMPILED
-#include "search_multi_cta_kernel-ext.cuh"
-#endif
+#include <cuvs/neighbors/cagra.hpp>
+
+namespace cuvs::neighbors::cagra::detail::multi_cta_search {
+
+template <typename DataT, typename IndexT, typename DistanceT, typename SampleFilterT>
+void select_and_run(const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* dataset_desc,
+                    raft::device_matrix_view<const IndexT, int64_t, raft::row_major> graph,
+                    IndexT* topk_indices_ptr,       // [num_queries, topk]
+                    DistanceT* topk_distances_ptr,  // [num_queries, topk]
+                    const DataT* queries_ptr,       // [num_queries, dataset_dim]
+                    uint32_t num_queries,
+                    const IndexT* dev_seed_ptr,         // [num_queries, num_seeds]
+                    uint32_t* num_executed_iterations,  // [num_queries,]
+                    const search_params& ps,
+                    uint32_t topk,
+                    // multi_cta_search (params struct)
+                    uint32_t block_size,  //
+                    uint32_t result_buffer_size,
+                    uint32_t smem_size,
+                    int64_t hash_bitlen,
+                    IndexT* hashmap_ptr,
+                    uint32_t num_cta_per_query,
+                    uint32_t num_seeds,
+                    SampleFilterT sample_filter,
+                    cudaStream_t stream);
+
+}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu
new file mode 100644
index 000000000..51fc6526f
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by search_multi_cta_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python search_multi_cta_00_generate.py
+ *
+ */
+
+#include "search_multi_cta_inst.cuh"
+
+namespace cuvs::neighbors::cagra::detail::multi_cta_search {
+instantiate_kernel_selection(uint8_t,
+                             uint32_t,
+                             float,
+                             cuvs::neighbors::filtering::none_cagra_sample_filter);
+
+}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu
deleted file mode 100644
index 3fa12d933..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  32,
-  1024,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<uint8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu
deleted file mode 100644
index e2f25a1c2..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  8,
-  128,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<uint8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu
deleted file mode 100644
index 4cd206d8c..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  16,
-  256,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<uint8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu
deleted file mode 100644
index 56989a1d5..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  32,
-  512,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<uint8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
index bc1266fb4..7b3ecabf3 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
@@ -15,12 +15,11 @@
  */
 #pragma once
 
-#include "compute_distance.hpp"
-#include "compute_distance_vpq.cuh"
+#include "compute_distance-ext.cuh"
 #include "device_common.hpp"
 #include "hashmap.hpp"
 #include "search_plan.cuh"
-#include "topk_for_cagra/topk_core.cuh"  //todo replace with raft kernel
+#include "topk_for_cagra/topk.h"  //todo replace with raft kernel
 #include "utils.hpp"
 
 #include <raft/core/device_mdspan.hpp>
@@ -93,9 +92,9 @@ void get_value(T* const host_ptr, const T* const dev_ptr, cudaStream_t cuda_stre
 }
 
 // MAX_DATASET_DIM : must equal to or greater than dataset_dim
-template <unsigned TEAM_SIZE, unsigned DATASET_BLOCK_DIM, class DATASET_DESCRIPTOR_T>
+template <class DATASET_DESCRIPTOR_T>
 RAFT_KERNEL random_pickup_kernel(
-  const DATASET_DESCRIPTOR_T dataset_desc,
+  const DATASET_DESCRIPTOR_T* dataset_desc,
   const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
   const std::size_t num_pickup,
   const unsigned num_distilation,
@@ -106,30 +105,19 @@ RAFT_KERNEL random_pickup_kernel(
   typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr,  // [num_queries, ldr]
   const std::uint32_t ldr,                                                // (*) ldr >= num_pickup
   typename DATASET_DESCRIPTOR_T::INDEX_T* const visited_hashmap_ptr,  // [num_queries, 1 << bitlen]
-  const std::uint32_t hash_bitlen,
-  const cuvs::distance::DistanceType metric)
+  const std::uint32_t hash_bitlen)
 {
   using DATA_T     = typename DATASET_DESCRIPTOR_T::DATA_T;
   using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
   using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
 
+  const auto team_size_bits    = dataset_desc->team_size_bitshift();
   const auto ldb               = hashmap::get_size(hash_bitlen);
-  const auto global_team_index = (blockIdx.x * blockDim.x + threadIdx.x) / TEAM_SIZE;
+  const auto global_team_index = (blockIdx.x * blockDim.x + threadIdx.x) >> team_size_bits;
   const uint32_t query_id      = blockIdx.y;
   if (global_team_index >= num_pickup) { return; }
-  // Load a query
-  extern __shared__ float query_buffer[];
-  const auto query_smem_buffer_length =
-    raft::ceildiv<uint32_t>(dataset_desc.dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM;
-  for (uint32_t i = threadIdx.x; i < query_smem_buffer_length; i += blockDim.x) {
-    unsigned j = device::swizzling(i);
-    if (i < dataset_desc.dim) {
-      query_buffer[j] = cuvs::spatial::knn::detail::utils::mapping<float>{}(
-        (queries_ptr + query_id * dataset_desc.dim)[i]);
-    } else {
-      query_buffer[j] = 0.0;
-    }
-  }
+  extern __shared__ uint8_t smem[];
+  dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id);
   __syncthreads();
 
   INDEX_T best_index_team_local;
@@ -141,27 +129,10 @@ RAFT_KERNEL random_pickup_kernel(
     } else {
       // Chose a seed node randomly
       seed_index =
-        device::xorshift64((global_team_index ^ rand_xor_mask) * (i + 1)) % dataset_desc.size;
-    }
-
-    DISTANCE_T norm2;
-    switch (metric) {
-      case cuvs::distance::DistanceType::L2Expanded:
-        norm2 = dataset_desc.template compute_similarity<DATASET_BLOCK_DIM,
-                                                         TEAM_SIZE,
-                                                         cuvs::distance::DistanceType::L2Expanded>(
-          query_buffer, seed_index, true);
-        break;
-      case cuvs::distance::DistanceType::InnerProduct:
-        norm2 =
-          dataset_desc.template compute_similarity<DATASET_BLOCK_DIM,
-                                                   TEAM_SIZE,
-                                                   cuvs::distance::DistanceType::InnerProduct>(
-            query_buffer, seed_index, true);
-        break;
-      default: break;
+        device::xorshift64((global_team_index ^ rand_xor_mask) * (i + 1)) % dataset_desc->size;
     }
 
+    DISTANCE_T norm2 = dataset_desc->compute_distance(seed_index, true);
     if (norm2 < best_norm2_team_local) {
       best_norm2_team_local = norm2;
       best_index_team_local = seed_index;
@@ -169,7 +140,7 @@ RAFT_KERNEL random_pickup_kernel(
   }
 
   const auto store_gmem_index = global_team_index + (ldr * query_id);
-  if (threadIdx.x % TEAM_SIZE == 0) {
+  if ((threadIdx.x & ((1u << team_size_bits) - 1u)) == 0) {
     if (hashmap::insert(
           visited_hashmap_ptr + (ldb * query_id), hash_bitlen, best_index_team_local)) {
       result_distances_ptr[store_gmem_index] = best_norm2_team_local;
@@ -182,47 +153,40 @@ RAFT_KERNEL random_pickup_kernel(
 }
 
 // MAX_DATASET_DIM : must be equal to or greater than dataset_dim
-template <unsigned TEAM_SIZE, unsigned DATASET_BLOCK_DIM, class DATASET_DESCRIPTOR_T>
-void random_pickup(
-  const DATASET_DESCRIPTOR_T dataset_desc,
-  const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
-  const std::size_t num_queries,
-  const std::size_t num_pickup,
-  const unsigned num_distilation,
-  const uint64_t rand_xor_mask,
-  const typename DATASET_DESCRIPTOR_T::INDEX_T* seed_ptr,  // [num_queries, num_seeds]
-  const uint32_t num_seeds,
-  typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr,       // [num_queries, ldr]
-  typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr,  // [num_queries, ldr]
-  const std::size_t ldr,                                                  // (*) ldr >= num_pickup
-  typename DATASET_DESCRIPTOR_T::INDEX_T* const visited_hashmap_ptr,  // [num_queries, 1 << bitlen]
-  const std::uint32_t hash_bitlen,
-  const cuvs::distance::DistanceType metric,
-  cudaStream_t const cuda_stream = 0)
+template <typename DataT, typename IndexT, typename DistanceT>
+void random_pickup(const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc,
+                   const DataT* queries_ptr,  // [num_queries, dataset_dim]
+                   std::size_t num_queries,
+                   std::size_t num_pickup,
+                   unsigned num_distilation,
+                   uint64_t rand_xor_mask,
+                   const IndexT* seed_ptr,  // [num_queries, num_seeds]
+                   uint32_t num_seeds,
+                   IndexT* result_indices_ptr,       // [num_queries, ldr]
+                   DistanceT* result_distances_ptr,  // [num_queries, ldr]
+                   std::size_t ldr,                  // (*) ldr >= num_pickup
+                   IndexT* visited_hashmap_ptr,      // [num_queries, 1 << bitlen]
+                   std::uint32_t hash_bitlen,
+                   cudaStream_t cuda_stream)
 {
   const auto block_size                = 256u;
-  const auto num_teams_per_threadblock = block_size / TEAM_SIZE;
+  const auto num_teams_per_threadblock = block_size / dataset_desc.team_size;
   const dim3 grid_size((num_pickup + num_teams_per_threadblock - 1) / num_teams_per_threadblock,
                        num_queries);
 
-  const auto query_smem_buffer_length =
-    raft::ceildiv<uint32_t>(dataset_desc.dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM;
-  const auto smem_size = query_smem_buffer_length * sizeof(float);
-
-  random_pickup_kernel<TEAM_SIZE, DATASET_BLOCK_DIM, DATASET_DESCRIPTOR_T>
-    <<<grid_size, block_size, smem_size, cuda_stream>>>(dataset_desc,
-                                                        queries_ptr,
-                                                        num_pickup,
-                                                        num_distilation,
-                                                        rand_xor_mask,
-                                                        seed_ptr,
-                                                        num_seeds,
-                                                        result_indices_ptr,
-                                                        result_distances_ptr,
-                                                        ldr,
-                                                        visited_hashmap_ptr,
-                                                        hash_bitlen,
-                                                        metric);
+  random_pickup_kernel<<<grid_size, block_size, dataset_desc.smem_ws_size_in_bytes, cuda_stream>>>(
+    dataset_desc.dev_ptr(),
+    queries_ptr,
+    num_pickup,
+    num_distilation,
+    rand_xor_mask,
+    seed_ptr,
+    num_seeds,
+    result_indices_ptr,
+    result_distances_ptr,
+    ldr,
+    visited_hashmap_ptr,
+    hash_bitlen);
 }
 
 template <class INDEX_T>
@@ -325,9 +289,7 @@ void pickup_next_parents(INDEX_T* const parent_candidates_ptr,  // [num_queries,
                                                   terminate_flag);
 }
 
-template <unsigned TEAM_SIZE,
-          unsigned DATASET_BLOCK_DIM,
-          class DATASET_DESCRIPTOR_T,
+template <class DATASET_DESCRIPTOR_T,
           class SAMPLE_FILTER_T>
 RAFT_KERNEL compute_distance_to_child_nodes_kernel(
   const typename DATASET_DESCRIPTOR_T::INDEX_T* const
@@ -338,7 +300,7 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel(
     parent_distance_ptr,  // [num_queries, search_width]
   const std::size_t lds,
   const std::uint32_t search_width,
-  const DATASET_DESCRIPTOR_T dataset_desc,
+  const DATASET_DESCRIPTOR_T* dataset_desc,
   const typename DATASET_DESCRIPTOR_T::INDEX_T* const
     neighbor_graph_ptr,  // [dataset_size, graph_degree]
   const std::uint32_t graph_degree,
@@ -349,29 +311,22 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel(
   typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr,       // [num_queries, ldd]
   typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr,  // [num_queries, ldd]
   const std::uint32_t ldd,  // (*) ldd >= search_width * graph_degree
-  SAMPLE_FILTER_T sample_filter,
-  const cuvs::distance::DistanceType metric)
+  SAMPLE_FILTER_T sample_filter)
 {
   using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
   using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
 
+  const auto team_size_bits = dataset_desc->team_size_bitshift();
+  const auto team_size      = 1u << team_size_bits;
   const uint32_t ldb        = hashmap::get_size(hash_bitlen);
   const auto tid            = threadIdx.x + blockDim.x * blockIdx.x;
-  const auto global_team_id = tid / TEAM_SIZE;
+  const auto global_team_id = tid >> team_size_bits;
   const auto query_id       = blockIdx.y;
 
-  extern __shared__ float query_buffer[];
-  const auto query_smem_buffer_length =
-    raft::ceildiv<uint32_t>(dataset_desc.dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM;
-  for (uint32_t i = threadIdx.x; i < query_smem_buffer_length; i += blockDim.x) {
-    unsigned j = device::swizzling(i);
-    if (i < dataset_desc.dim) {
-      query_buffer[j] = cuvs::spatial::knn::detail::utils::mapping<float>{}(
-        (query_ptr + query_id * dataset_desc.dim)[i]);
-    } else {
-      query_buffer[j] = 0.0;
-    }
-  }
+  extern __shared__ uint8_t smem[];
+  // Load a query
+  dataset_desc = dataset_desc->setup_workspace(smem, query_ptr, query_id);
+
   __syncthreads();
   if (global_team_id >= search_width * graph_degree) { return; }
 
@@ -393,33 +348,18 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel(
 
   const std::size_t child_id = neighbor_list_head_ptr[global_team_id % graph_degree];
 
-  const auto compute_distance_flag = hashmap::insert<TEAM_SIZE, INDEX_T>(
-    visited_hashmap_ptr + (ldb * blockIdx.y), hash_bitlen, child_id);
-
-  DISTANCE_T norm2;
-  switch (metric) {
-    case cuvs::distance::DistanceType::L2Expanded:
-      norm2 = dataset_desc.template compute_similarity<DATASET_BLOCK_DIM,
-                                                       TEAM_SIZE,
-                                                       cuvs::distance::DistanceType::L2Expanded>(
-        query_buffer, child_id, compute_distance_flag);
-      break;
-    case cuvs::distance::DistanceType::InnerProduct:
-      norm2 = dataset_desc.template compute_similarity<DATASET_BLOCK_DIM,
-                                                       TEAM_SIZE,
-                                                       cuvs::distance::DistanceType::InnerProduct>(
-        query_buffer, child_id, compute_distance_flag);
-      break;
-    default: break;
-  }
+  const auto compute_distance_flag = hashmap::insert<INDEX_T>(
+    team_size, visited_hashmap_ptr + (ldb * blockIdx.y), hash_bitlen, child_id);
+
+  DISTANCE_T norm2 = dataset_desc->compute_distance(child_id, compute_distance_flag);
 
   if (compute_distance_flag) {
-    if (threadIdx.x % TEAM_SIZE == 0) {
+    if ((threadIdx.x & (team_size - 1)) == 0) {
       result_indices_ptr[ldd * blockIdx.y + global_team_id]   = child_id;
       result_distances_ptr[ldd * blockIdx.y + global_team_id] = norm2;
     }
   } else {
-    if (threadIdx.x % TEAM_SIZE == 0) {
+    if ((threadIdx.x & (team_size - 1)) == 0) {
       result_distances_ptr[ldd * blockIdx.y + global_team_id] = utils::get_max_value<DISTANCE_T>();
     }
   }
@@ -434,66 +374,52 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel(
   }
 }
 
-template <unsigned TEAM_SIZE,
-          unsigned DATASET_BLOCK_DIM,
-          class SAMPLE_FILTER_T,
-          class DATASET_DESCRIPTOR_T>
+template <typename DataT,
+          typename IndexT,
+          typename DistanceT,
+          class SAMPLE_FILTER_T>  // host launcher of compute_distance_to_child_nodes_kernel
 void compute_distance_to_child_nodes(
-  const typename DATASET_DESCRIPTOR_T::INDEX_T* const
-    parent_node_list,  // [num_queries, search_width]
-  typename DATASET_DESCRIPTOR_T::INDEX_T* const
-    parent_candidates_ptr,  // [num_queries, search_width]
-  typename DATASET_DESCRIPTOR_T::DISTANCE_T* const
-    parent_distance_ptr,  // [num_queries, search_width]
-  const std::size_t lds,
-  const uint32_t search_width,
-  const DATASET_DESCRIPTOR_T dataset_desc,
-  const typename DATASET_DESCRIPTOR_T::INDEX_T* const
-    neighbor_graph_ptr,  // [dataset_size, graph_degree]
-  const std::uint32_t graph_degree,
-  const typename DATASET_DESCRIPTOR_T::DATA_T* query_ptr,  // [num_queries, data_dim]
-  const std::uint32_t num_queries,
-  typename DATASET_DESCRIPTOR_T::INDEX_T* const
-    visited_hashmap_ptr,  // [num_queries, 1 << hash_bitlen]
-  const std::uint32_t hash_bitlen,
-  typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr,       // [num_queries, ldd]
-  typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr,  // [num_queries, ldd]
-  const std::uint32_t ldd,  // (*) ldd >= search_width * graph_degree
+  const IndexT* parent_node_list,        // [num_queries, search_width]
+  IndexT* const parent_candidates_ptr,   // [num_queries, search_width]
+  DistanceT* const parent_distance_ptr,  // [num_queries, search_width]
+  std::size_t lds,  // presumably the row stride of the parent_* arrays -- TODO confirm
+  uint32_t search_width,
+  const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc,
+  const IndexT* neighbor_graph_ptr,  // [dataset_size, graph_degree]
+  std::uint32_t graph_degree,
+  const DataT* query_ptr,  // [num_queries, data_dim]
+  std::uint32_t num_queries,  // also used as grid_size.y (one grid row per query)
+  IndexT* visited_hashmap_ptr,  // [num_queries, 1 << hash_bitlen]
+  std::uint32_t hash_bitlen,
+  IndexT* result_indices_ptr,       // [num_queries, ldd]
+  DistanceT* result_distances_ptr,  // [num_queries, ldd]
+  std::uint32_t ldd,                // (*) ldd >= search_width * graph_degree
   SAMPLE_FILTER_T sample_filter,
-  const cuvs::distance::DistanceType metric,
-  cudaStream_t cuda_stream = 0)
+  cudaStream_t cuda_stream)  // stream the kernel is enqueued on
 {
-  const auto block_size = 128;
-  const dim3 grid_size(
-    (search_width * graph_degree + (block_size / TEAM_SIZE) - 1) / (block_size / TEAM_SIZE),
-    num_queries);
-
-  const auto query_smem_buffer_length =
-    raft::ceildiv<uint32_t>(dataset_desc.dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM;
-
-  const auto smem_size =
-    query_smem_buffer_length * sizeof(float) + DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte;
-
-  compute_distance_to_child_nodes_kernel<TEAM_SIZE,
-                                         DATASET_BLOCK_DIM,
-                                         DATASET_DESCRIPTOR_T,
-                                         SAMPLE_FILTER_T>
-    <<<grid_size, block_size, smem_size, cuda_stream>>>(parent_node_list,
-                                                        parent_candidates_ptr,
-                                                        parent_distance_ptr,
-                                                        lds,
-                                                        search_width,
-                                                        dataset_desc,
-                                                        neighbor_graph_ptr,
-                                                        graph_degree,
-                                                        query_ptr,
-                                                        visited_hashmap_ptr,
-                                                        hash_bitlen,
-                                                        result_indices_ptr,
-                                                        result_distances_ptr,
-                                                        ldd,
-                                                        sample_filter,
-                                                        metric);
+  const auto block_size      = 128;  // threads per block used for the launch below
+  const auto teams_per_block = block_size / dataset_desc.team_size;  // teams in one block
+  const dim3 grid_size((search_width * graph_degree + teams_per_block - 1) / teams_per_block,
+                       num_queries);  // x: ceil-div of search_width*graph_degree; y: query
+
+  compute_distance_to_child_nodes_kernel<<<grid_size,
+                                           block_size,
+                                           dataset_desc.smem_ws_size_in_bytes,  // dynamic smem
+                                           cuda_stream>>>(parent_node_list,
+                                                          parent_candidates_ptr,
+                                                          parent_distance_ptr,
+                                                          lds,
+                                                          search_width,
+                                                          dataset_desc.dev_ptr(),
+                                                          neighbor_graph_ptr,
+                                                          graph_degree,
+                                                          query_ptr,
+                                                          visited_hashmap_ptr,
+                                                          hash_bitlen,
+                                                          result_indices_ptr,
+                                                          result_distances_ptr,
+                                                          ldd,
+                                                          sample_filter);
 }
 
 template <class INDEX_T>
@@ -639,49 +565,48 @@ void set_value_batch(T* const dev_ptr,
 // |<---                 result_buffer_allocation_size                 --->|
 // |<---                       result_buffer_size  --->|                     // Double buffer (A)
 //                      |<---  result_buffer_size                      --->| // Double buffer (B)
-template <unsigned TEAM_SIZE,
-          unsigned DATASET_BLOCK_DIM,
-          typename DATASET_DESCRIPTOR_T,
-          typename SAMPLE_FILTER_T>
-struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
-  using DATA_T     = typename DATASET_DESCRIPTOR_T::DATA_T;
-  using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
-  using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
+template <typename DataT, typename IndexT, typename DistanceT, typename SAMPLE_FILTER_T>
+struct search : search_plan_impl<DataT, IndexT, DistanceT, SAMPLE_FILTER_T> {
+  using base_type  = search_plan_impl<DataT, IndexT, DistanceT, SAMPLE_FILTER_T>;
+  using DATA_T     = typename base_type::DATA_T;
+  using INDEX_T    = typename base_type::INDEX_T;
+  using DISTANCE_T = typename base_type::DISTANCE_T;
 
   static_assert(std::is_same_v<DISTANCE_T, float>, "Only float is supported as resulting distance");
 
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::max_queries;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::itopk_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::algo;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::team_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::search_width;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::min_iterations;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::max_iterations;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::thread_block_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap_mode;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap_min_bitlen;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap_max_fill_rate;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::num_random_samplings;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::rand_xor_mask;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::dim;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::graph_degree;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::topk;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hash_bitlen;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::small_hash_bitlen;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::small_hash_reset_interval;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::dataset_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::result_buffer_size;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::smem_size;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::num_executed_iterations;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::dev_seed;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::num_seeds;
+  using base_type::algo;
+  using base_type::hashmap_max_fill_rate;
+  using base_type::hashmap_min_bitlen;
+  using base_type::hashmap_mode;
+  using base_type::itopk_size;
+  using base_type::max_iterations;
+  using base_type::max_queries;
+  using base_type::min_iterations;
+  using base_type::num_random_samplings;
+  using base_type::rand_xor_mask;
+  using base_type::search_width;
+  using base_type::team_size;
+  using base_type::thread_block_size;
+
+  using base_type::dim;
+  using base_type::graph_degree;
+  using base_type::topk;
+
+  using base_type::hash_bitlen;
+
+  using base_type::dataset_size;
+  using base_type::hashmap_size;
+  using base_type::result_buffer_size;
+  using base_type::small_hash_bitlen;
+  using base_type::small_hash_reset_interval;
+
+  using base_type::smem_size;
+
+  using base_type::dataset_desc;
+  using base_type::dev_seed;
+  using base_type::hashmap;
+  using base_type::num_executed_iterations;
+  using base_type::num_seeds;
 
   size_t result_buffer_allocation_size;
   rmm::device_uvector<INDEX_T> result_indices;       // results_indices_buffer
@@ -699,12 +624,11 @@ struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
 
   search(raft::resources const& res,
          search_params params,
+         const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc,
          int64_t dim,
          int64_t graph_degree,
-         uint32_t topk,
-         cuvs::distance::DistanceType metric)
-    : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>(
-        res, params, dim, graph_degree, topk, metric),
+         uint32_t topk)
+    : base_type(res, params, dataset_desc, dim, graph_degree, topk),
       result_indices(0, raft::resource::get_cuda_stream(res)),
       result_distances(0, raft::resource::get_cuda_stream(res)),
       parent_node_list(0, raft::resource::get_cuda_stream(res)),
@@ -837,7 +761,6 @@ struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
   }
 
   void operator()(raft::resources const& res,
-                  DATASET_DESCRIPTOR_T dataset_desc,
                   raft::device_matrix_view<const INDEX_T, int64_t, raft::row_major> graph,
                   INDEX_T* const topk_indices_ptr,       // [num_queries, topk]
                   DISTANCE_T* const topk_distances_ptr,  // [num_queries, topk]
@@ -865,21 +788,20 @@ struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
     }
 
     // Choose initial entry point candidates at random
-    random_pickup<TEAM_SIZE, DATASET_BLOCK_DIM>(dataset_desc,
-                                                queries_ptr,
-                                                num_queries,
-                                                result_buffer_size,
-                                                num_random_samplings,
-                                                rand_xor_mask,
-                                                dev_seed_ptr,
-                                                num_seeds,
-                                                result_indices.data(),
-                                                result_distances.data(),
-                                                result_buffer_allocation_size,
-                                                hashmap.data(),
-                                                hash_bitlen,
-                                                this->metric,
-                                                stream);
+    random_pickup<DataT, IndexT, DistanceT>(dataset_desc,
+                                            queries_ptr,
+                                            num_queries,
+                                            result_buffer_size,
+                                            num_random_samplings,
+                                            rand_xor_mask,
+                                            dev_seed_ptr,
+                                            num_seeds,
+                                            result_indices.data(),
+                                            result_distances.data(),
+                                            result_buffer_allocation_size,
+                                            hashmap.data(),
+                                            hash_bitlen,
+                                            stream);
 
     unsigned iter = 0;
     while (1) {
@@ -931,7 +853,7 @@ struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
       }
 
       // Compute distance to child nodes that are adjacent to the parent node
-      compute_distance_to_child_nodes<TEAM_SIZE, DATASET_BLOCK_DIM, SAMPLE_FILTER_T>(
+      compute_distance_to_child_nodes(
         parent_node_list.data(),
         result_indices.data() + (1 - (iter & 0x1)) * result_buffer_size,
         result_distances.data() + (1 - (iter & 0x1)) * result_buffer_size,
@@ -948,7 +870,6 @@ struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
         result_distances.data() + itopk_size,
         result_buffer_allocation_size,
         sample_filter,
-        this->metric,
         stream);
 
       iter++;
@@ -1025,70 +946,5 @@ struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
   }
 };
 
-template <unsigned TEAM_SIZE,
-          unsigned DATASET_BLOCK_DIM,
-          class CODE_BOOK_T,
-          unsigned PQ_BITS,
-          unsigned PQ_CODE_BOOK_DIM,
-          class DATA_T,
-          class DISTANCE_T_,
-          class INDEX_T_,
-          typename SAMPLE_FILTER_T>
-struct search<TEAM_SIZE,
-              DATASET_BLOCK_DIM,
-              cagra_q_dataset_descriptor_t<DATA_T,
-                                           CODE_BOOK_T,
-                                           PQ_BITS,
-                                           PQ_CODE_BOOK_DIM,
-                                           DISTANCE_T_,
-                                           INDEX_T_>,
-              SAMPLE_FILTER_T>
-  : public search_plan_impl<cagra_q_dataset_descriptor_t<DATA_T,
-                                                         CODE_BOOK_T,
-                                                         PQ_BITS,
-                                                         PQ_CODE_BOOK_DIM,
-
-                                                         DISTANCE_T_,
-                                                         INDEX_T_>,
-                            SAMPLE_FILTER_T> {
-  using DATASET_DESCRIPTOR_T = cagra_q_dataset_descriptor_t<DATA_T,
-                                                            CODE_BOOK_T,
-                                                            PQ_BITS,
-                                                            PQ_CODE_BOOK_DIM,
-
-                                                            DISTANCE_T_,
-                                                            INDEX_T_>;
-  using INDEX_T              = typename DATASET_DESCRIPTOR_T::INDEX_T;
-  using DISTANCE_T           = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
-
-  search(raft::resources const& res,
-         search_params params,
-         int64_t dim,
-         int64_t graph_degree,
-         uint32_t topk,
-         cuvs::distance::DistanceType metric)
-    : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>(
-        res, params, dim, graph_degree, topk, metric)
-  {
-    THROW("The multi-kernel mode does not support VPQ");
-  }
-
-  void set_params(raft::resources const& res) {}
-
-  void operator()(raft::resources const& res,
-                  DATASET_DESCRIPTOR_T dataset_desc,
-                  raft::device_matrix_view<const INDEX_T, int64_t, raft::row_major> graph,
-                  INDEX_T* const topk_indices_ptr,       // [num_queries, topk]
-                  DISTANCE_T* const topk_distances_ptr,  // [num_queries, topk]
-                  const DATA_T* const queries_ptr,       // [num_queries, dataset_dim]
-                  const uint32_t num_queries,
-                  const INDEX_T* dev_seed_ptr,              // [num_queries, num_seeds]
-                  uint32_t* const num_executed_iterations,  // [num_queries,]
-                  uint32_t topk,
-                  SAMPLE_FILTER_T sample_filter)
-  {
-  }
-};
-
 }  // namespace multi_kernel_search
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/search_plan.cuh b/cpp/src/neighbors/detail/cagra/search_plan.cuh
index 0543224b3..16864ed19 100644
--- a/cpp/src/neighbors/detail/cagra/search_plan.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_plan.cuh
@@ -18,10 +18,11 @@
 
 #include "hashmap.hpp"
 
+#include "compute_distance-ext.cuh"
 #include <cuvs/neighbors/common.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 // #include "search_single_cta_inst.cuh"
-// #include "topk_for_cagra/topk_core.cuh"
+// #include "topk_for_cagra/topk.h"
 
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resources.hpp>
@@ -34,19 +35,12 @@
 namespace cuvs::neighbors::cagra::detail {
 
 struct search_plan_impl_base : public search_params {
-  int64_t dataset_block_dim;
   int64_t dim;
   int64_t graph_degree;
   uint32_t topk;
-  cuvs::distance::DistanceType metric;
-  search_plan_impl_base(search_params params,
-                        int64_t dim,
-                        int64_t graph_degree,
-                        uint32_t topk,
-                        cuvs::distance::DistanceType metric)
-    : search_params(params), dim(dim), graph_degree(graph_degree), topk(topk), metric(metric)
+  search_plan_impl_base(search_params params, int64_t dim, int64_t graph_degree, uint32_t topk)
+    : search_params(params), dim(dim), graph_degree(graph_degree), topk(topk)
   {
-    set_dataset_block_and_team_size(dim);
     if (algo == search_algo::AUTO) {
       const size_t num_sm = raft::getMultiProcessorCount();
       if (itopk_size <= 512 && search_params::max_queries >= num_sm * 2lu) {
@@ -61,29 +55,13 @@ struct search_plan_impl_base : public search_params {
       }
     }
   }
-
-  void set_dataset_block_and_team_size(int64_t dim)
-  {
-    constexpr int64_t max_dataset_block_dim = 512;
-    dataset_block_dim                       = 128;
-    while (dataset_block_dim < dim && dataset_block_dim < max_dataset_block_dim) {
-      dataset_block_dim *= 2;
-    }
-    // To keep binary size in check we limit only one team size specialization for each max_dim.
-    // TODO(tfeher): revise this decision.
-    switch (dataset_block_dim) {
-      case 128: team_size = 8; break;
-      case 256: team_size = 16; break;
-      default: team_size = 32; break;
-    }
-  }
 };
 
-template <class DATASET_DESCRIPTOR_T, class SAMPLE_FILTER_T>
+template <typename DataT, typename IndexT, typename DistanceT, typename SAMPLE_FILTER_T>
 struct search_plan_impl : public search_plan_impl_base {
-  using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
-  using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
-  using DATA_T     = typename DATASET_DESCRIPTOR_T::DATA_T;
+  using DATA_T     = DataT;
+  using INDEX_T    = IndexT;
+  using DISTANCE_T = DistanceT;
 
   int64_t hash_bitlen;
 
@@ -100,23 +78,24 @@ struct search_plan_impl : public search_plan_impl_base {
   rmm::device_uvector<INDEX_T> hashmap;
   rmm::device_uvector<uint32_t> num_executed_iterations;  // device or managed?
   rmm::device_uvector<INDEX_T> dev_seed;
+  const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc;
 
   search_plan_impl(raft::resources const& res,
                    search_params params,
+                   const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc,
                    int64_t dim,
                    int64_t graph_degree,
-                   uint32_t topk,
-                   cuvs::distance::DistanceType metric)
-    : search_plan_impl_base(params, dim, graph_degree, topk, metric),
+                   uint32_t topk)
+    : search_plan_impl_base(params, dim, graph_degree, topk),
       hashmap(0, raft::resource::get_cuda_stream(res)),
       num_executed_iterations(0, raft::resource::get_cuda_stream(res)),
       dev_seed(0, raft::resource::get_cuda_stream(res)),
-      num_seeds(0)
+      num_seeds(0),
+      dataset_desc(dataset_desc)  // reference member, not a copy: must outlive this plan
   {
     adjust_search_params();
     check_params();
     calc_hashmap_params(res);
-    set_dataset_block_and_team_size(dim);
     num_executed_iterations.resize(max_queries, raft::resource::get_cuda_stream(res));
     RAFT_LOG_DEBUG("# algo = %d", static_cast<int>(algo));
   }
@@ -124,7 +103,6 @@ struct search_plan_impl : public search_plan_impl_base {
   virtual ~search_plan_impl() {}
 
   virtual void operator()(raft::resources const& res,
-                          DATASET_DESCRIPTOR_T dataset_desc,
                           raft::device_matrix_view<const INDEX_T, int64_t, raft::row_major> graph,
                           INDEX_T* const result_indices_ptr,       // [num_queries, topk]
                           DISTANCE_T* const result_distances_ptr,  // [num_queries, topk]
@@ -160,6 +138,7 @@ struct search_plan_impl : public search_plan_impl_base {
                      itopk32);
       itopk_size = itopk32;
     }
+    team_size = dataset_desc.team_size;
   }
 
   // defines hash_bitlen, small_hash_bitlen, small_hash_reset interval, hash_size
@@ -292,10 +271,6 @@ struct search_plan_impl : public search_plan_impl_base {
         algo != search_algo::MULTI_KERNEL) {
       error_message += "An invalid kernel mode has been given: " + std::to_string((int)algo) + "";
     }
-    if (team_size != 0 && team_size != 4 && team_size != 8 && team_size != 16 && team_size != 32) {
-      error_message +=
-        "`team_size` must be 0, 4, 8, 16 or 32. " + std::to_string(team_size) + " has been given.";
-    }
     if (thread_block_size != 0 && thread_block_size != 64 && thread_block_size != 128 &&
         thread_block_size != 256 && thread_block_size != 512 && thread_block_size != 1024) {
       error_message += "`thread_block_size` must be 0, 64, 128, 256 or 512. " +
@@ -330,20 +305,4 @@ struct search_plan_impl : public search_plan_impl_base {
   }
 };
 
-// template <class DATA_T, class DISTANCE_T, class INDEX_T>
-// struct search_plan {
-//   search_plan(raft::resources const& res,
-//               search_params param,
-//               int64_t dim,
-//               int64_t graph_degree)
-//     : plan(res, param, dim, graph_degree)
-//   {
-//   }
-//   void check(uint32_t topk) { plan.check(topk); }
-
-//   // private:
-//   detail::search_plan_impl<DATA_T, DISTANCE_T, INDEX_T> plan;
-// };
-/** @} */  // end group cagra
-
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh
index 0a101cbfe..4abed6760 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh
@@ -16,13 +16,13 @@
 #pragma once
 
 #include "bitonic.hpp"
-#include "compute_distance.hpp"
+#include "compute_distance-ext.cuh"
 #include "device_common.hpp"
 #include "hashmap.hpp"
 #include "search_plan.cuh"
 #include "search_single_cta_kernel.cuh"
 #include "topk_by_radix.cuh"
-#include "topk_for_cagra/topk_core.cuh"  // TODO replace with raft topk
+#include "topk_for_cagra/topk.h"  // TODO replace with raft topk
 #include "utils.hpp"
 
 #include <raft/core/device_mdspan.hpp>
@@ -49,58 +49,56 @@
 namespace cuvs::neighbors::cagra::detail {
 namespace single_cta_search {
 
-template <unsigned TEAM_SIZE,
-          unsigned DATASET_BLOCK_DIM,
-          typename DATASET_DESCRIPTOR_T,
-          typename SAMPLE_FILTER_T>
-struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
-  using DATA_T     = typename DATASET_DESCRIPTOR_T::DATA_T;
-  using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
-  using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::max_queries;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::itopk_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::algo;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::team_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::search_width;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::min_iterations;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::max_iterations;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::thread_block_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap_mode;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap_min_bitlen;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap_max_fill_rate;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::num_random_samplings;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::rand_xor_mask;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::dim;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::graph_degree;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::topk;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hash_bitlen;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::small_hash_bitlen;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::small_hash_reset_interval;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::dataset_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::result_buffer_size;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::smem_size;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::num_executed_iterations;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::dev_seed;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::num_seeds;
+template <typename DataT, typename IndexT, typename DistanceT, typename SAMPLE_FILTER_T>
+struct search : search_plan_impl<DataT, IndexT, DistanceT, SAMPLE_FILTER_T> {
+  using base_type  = search_plan_impl<DataT, IndexT, DistanceT, SAMPLE_FILTER_T>;
+  using DATA_T     = typename base_type::DATA_T;
+  using INDEX_T    = typename base_type::INDEX_T;
+  using DISTANCE_T = typename base_type::DISTANCE_T;
+
+  using base_type::algo;
+  using base_type::hashmap_max_fill_rate;
+  using base_type::hashmap_min_bitlen;
+  using base_type::hashmap_mode;
+  using base_type::itopk_size;
+  using base_type::max_iterations;
+  using base_type::max_queries;
+  using base_type::min_iterations;
+  using base_type::num_random_samplings;
+  using base_type::rand_xor_mask;
+  using base_type::search_width;
+  using base_type::team_size;
+  using base_type::thread_block_size;
+
+  using base_type::dim;
+  using base_type::graph_degree;
+  using base_type::topk;
+
+  using base_type::hash_bitlen;
+
+  using base_type::dataset_size;
+  using base_type::hashmap_size;
+  using base_type::result_buffer_size;
+  using base_type::small_hash_bitlen;
+  using base_type::small_hash_reset_interval;
+
+  using base_type::smem_size;
+
+  using base_type::dataset_desc;
+  using base_type::dev_seed;
+  using base_type::hashmap;
+  using base_type::num_executed_iterations;
+  using base_type::num_seeds;
 
   uint32_t num_itopk_candidates;
 
   search(raft::resources const& res,
          search_params params,
+         const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc,
          int64_t dim,
          int64_t graph_degree,
-         uint32_t topk,
-         cuvs::distance::DistanceType metric)
-    : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>(
-        res, params, dim, graph_degree, topk, metric)
+         uint32_t topk)  // every argument is forwarded unchanged to base_type below
+    : base_type(res, params, dataset_desc, dim, graph_degree, topk)
   {
     set_params(res);
   }
@@ -128,14 +126,11 @@ struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
     constexpr unsigned max_block_size       = 1024;
     //
     const std::uint32_t topk_ws_size = 3;
-    const auto query_smem_buffer_length =
-      raft::ceildiv<uint32_t>(dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM;
     const std::uint32_t base_smem_size =
-      sizeof(float) * query_smem_buffer_length +
+      dataset_desc.smem_ws_size_in_bytes +
       (sizeof(INDEX_T) + sizeof(DISTANCE_T)) * result_buffer_size_32 +
       sizeof(INDEX_T) * hashmap::get_size(small_hash_bitlen) + sizeof(INDEX_T) * search_width +
-      sizeof(std::uint32_t) * topk_ws_size + sizeof(std::uint32_t) +
-      DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte;
+      sizeof(std::uint32_t) * topk_ws_size + sizeof(std::uint32_t);
     smem_size = base_smem_size;
     if (num_itopk_candidates > 256) {
       // Tentatively calculate the required share memory size when radix
@@ -212,7 +207,6 @@ struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
   }
 
   void operator()(raft::resources const& res,
-                  DATASET_DESCRIPTOR_T dataset_desc,
                   raft::device_matrix_view<const INDEX_T, int64_t, raft::row_major> graph,
                   INDEX_T* const result_indices_ptr,       // [num_queries, topk]
                   DISTANCE_T* const result_distances_ptr,  // [num_queries, topk]
@@ -224,28 +218,26 @@ struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
                   SAMPLE_FILTER_T sample_filter)
   {
     cudaStream_t stream = raft::resource::get_cuda_stream(res);
-    select_and_run<TEAM_SIZE, DATASET_BLOCK_DIM, DATASET_DESCRIPTOR_T>(
-      dataset_desc,
-      graph,
-      result_indices_ptr,
-      result_distances_ptr,
-      queries_ptr,
-      num_queries,
-      dev_seed_ptr,
-      num_executed_iterations,
-      *this,
-      topk,
-      num_itopk_candidates,
-      static_cast<uint32_t>(thread_block_size),
-      smem_size,
-      hash_bitlen,
-      hashmap.data(),
-      small_hash_bitlen,
-      small_hash_reset_interval,
-      num_seeds,
-      sample_filter,
-      this->metric,
-      stream);
+    select_and_run(dataset_desc.dev_ptr(),
+                   graph,
+                   result_indices_ptr,
+                   result_distances_ptr,
+                   queries_ptr,
+                   num_queries,
+                   dev_seed_ptr,
+                   num_executed_iterations,
+                   *this,
+                   topk,
+                   num_itopk_candidates,
+                   static_cast<uint32_t>(thread_block_size),
+                   smem_size,
+                   hash_bitlen,
+                   hashmap.data(),
+                   small_hash_bitlen,
+                   small_hash_reset_interval,
+                   num_seeds,
+                   sample_filter,
+                   stream);
   }
 };
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py
index a361269a6..e37ceb1fa 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py
@@ -39,8 +39,6 @@
 
 #include "search_single_cta_inst.cuh"
 
-#include "compute_distance.hpp"
-
 namespace cuvs::neighbors::cagra::detail::single_cta_search {
 """
 
@@ -48,7 +46,6 @@
 }  // namespace cuvs::neighbors::cagra::detail::single_cta_search
 """
 
-mxdim_team = [(128, 8), (256, 16), (512, 32), (1024, 32)]
 # block = [(64, 16), (128, 8), (256, 4), (512, 2), (1024, 1)]
 # itopk_candidates = [64, 128, 256]
 # itopk_size = [64, 128, 256, 512]
@@ -69,14 +66,13 @@
 
 # knn
 for type_path, (data_t, idx_t, distance_t) in search_types.items():
-    for (mxdim, team) in mxdim_team:
-        path = f"search_single_cta_{type_path}_dim{mxdim}_t{team}.cu"
-        with open(path, "w") as f:
-            f.write(header)
-            f.write(
-                    f"instantiate_kernel_selection(\n  {team}, {mxdim}, cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<{data_t} COMMA {idx_t} COMMA  {distance_t}>, cuvs::neighbors::filtering::none_cagra_sample_filter);\n"
-            )
+    path = f"search_single_cta_{type_path}.cu"
+    with open(path, "w") as f:
+        f.write(header)
+        f.write(
+                f"instantiate_kernel_selection(\n  {data_t}, {idx_t}, {distance_t}, cuvs::neighbors::filtering::none_cagra_sample_filter);\n"
+        )
 
-            f.write(trailer)
-            # For pasting into CMakeLists.txt
-            print(f"src/neighbors/detail/cagra/{path}")
+        f.write(trailer)
+        # For pasting into CMakeLists.txt
+        print(f"src/neighbors/detail/cagra/{path}")
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32.cu
similarity index 80%
rename from cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu
rename to cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32.cu
index c2cfb13c4..f8495bc01 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32.cu
@@ -25,13 +25,10 @@
 
 #include "search_single_cta_inst.cuh"
 
-#include "compute_distance.hpp"
-
 namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  8,
-  128,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(float,
+                             uint32_t,
+                             float,
+                             cuvs::neighbors::filtering::none_cagra_sample_filter);
 
 }  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu
deleted file mode 100644
index 4cf4a26f7..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  32,
-  1024,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu
deleted file mode 100644
index 692710476..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  16,
-  256,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu
deleted file mode 100644
index ed3a900ff..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  32,
-  512,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64.cu
similarity index 80%
rename from cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim128_t8.cu
rename to cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64.cu
index 1e2b83492..0ef5c366f 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64.cu
@@ -25,13 +25,10 @@
 
 #include "search_single_cta_inst.cuh"
 
-#include "compute_distance.hpp"
-
 namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  8,
-  128,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(float,
+                             uint64_t,
+                             float,
+                             cuvs::neighbors::filtering::none_cagra_sample_filter);
 
 }  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
deleted file mode 100644
index 2c4da00db..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  32,
-  1024,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
deleted file mode 100644
index 8b26a595f..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  8,
-  128,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
deleted file mode 100644
index a93f893d4..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  16,
-  256,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
deleted file mode 100644
index 4a7502e3e..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  32,
-  512,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32.cu
similarity index 80%
rename from cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu
rename to cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32.cu
index 7d3e86f38..c21e6d1f4 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32.cu
@@ -25,13 +25,10 @@
 
 #include "search_single_cta_inst.cuh"
 
-#include "compute_distance.hpp"
-
 namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  8,
-  128,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(half,
+                             uint32_t,
+                             float,
+                             cuvs::neighbors::filtering::none_cagra_sample_filter);
 
 }  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim1024_t32.cu
deleted file mode 100644
index 6c13df91a..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim1024_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  32,
-  1024,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim512_t32.cu
deleted file mode 100644
index 12aa72a24..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim512_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  32,
-  512,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64.cu
similarity index 80%
rename from cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim256_t16.cu
rename to cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64.cu
index cfae9e367..b96ed0b22 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim256_t16.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64.cu
@@ -25,13 +25,10 @@
 
 #include "search_single_cta_inst.cuh"
 
-#include "compute_distance.hpp"
-
 namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  16,
-  256,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(half,
+                             uint64_t,
+                             float,
+                             cuvs::neighbors::filtering::none_cagra_sample_filter);
 
 }  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu
deleted file mode 100644
index 84a173d6d..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  32,
-  1024,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu
deleted file mode 100644
index d9c5198eb..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  16,
-  256,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu
deleted file mode 100644
index 3ba8f4e4d..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  32,
-  512,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh
index a4581d15e..26ca7b672 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh
@@ -21,31 +21,27 @@
 
 namespace cuvs::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATASET_DESC_T, SAMPLE_FILTER_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATASET_DESC_T, SAMPLE_FILTER_T>(      \
-    DATASET_DESC_T dataset_desc,                                                                  \
-    raft::device_matrix_view<const typename DATASET_DESC_T::INDEX_T, int64_t, raft::row_major>    \
-      graph,                                                                                      \
-    typename DATASET_DESC_T::INDEX_T* const topk_indices_ptr,                                     \
-    typename DATASET_DESC_T::DISTANCE_T* const topk_distances_ptr,                                \
-    const typename DATASET_DESC_T::DATA_T* const queries_ptr,                                     \
-    const uint32_t num_queries,                                                                   \
-    const typename DATASET_DESC_T::INDEX_T* dev_seed_ptr,                                         \
-    uint32_t* const num_executed_iterations,                                                      \
-    const search_params& ps,                                                                      \
-    uint32_t topk,                                                                                \
-    uint32_t num_itopk_candidates,                                                                \
-    uint32_t block_size,                                                                          \
-    uint32_t smem_size,                                                                           \
-    int64_t hash_bitlen,                                                                          \
-    typename DATASET_DESC_T::INDEX_T* hashmap_ptr,                                                \
-    size_t small_hash_bitlen,                                                                     \
-    size_t small_hash_reset_interval,                                                             \
-    uint32_t num_seeds,                                                                           \
-    SAMPLE_FILTER_T sample_filter,                                                                \
-    cuvs::distance::DistanceType metric,                                                          \
+#define instantiate_kernel_selection(DataT, IndexT, DistanceT, SampleFilterT) \
+  template void select_and_run<DataT, IndexT, DistanceT, SampleFilterT>(      \
+    const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* dataset_desc,  \
+    raft::device_matrix_view<const IndexT, int64_t, raft::row_major> graph,   \
+    IndexT* topk_indices_ptr,                                                 \
+    DistanceT* topk_distances_ptr,                                            \
+    const DataT* queries_ptr,                                                 \
+    uint32_t num_queries,                                                     \
+    const IndexT* dev_seed_ptr,                                               \
+    uint32_t* num_executed_iterations,                                        \
+    const search_params& ps,                                                  \
+    uint32_t topk,                                                            \
+    uint32_t num_itopk_candidates,                                            \
+    uint32_t block_size,                                                      \
+    uint32_t smem_size,                                                       \
+    int64_t hash_bitlen,                                                      \
+    IndexT* hashmap_ptr,                                                      \
+    size_t small_hash_bitlen,                                                 \
+    size_t small_hash_reset_interval,                                         \
+    uint32_t num_seeds,                                                       \
+    SampleFilterT sample_filter,                                              \
     cudaStream_t stream);
 
-#define COMMA ,
-
 }  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu
new file mode 100644
index 000000000..56a0d8ba9
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by search_single_cta_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python search_single_cta_00_generate.py
+ *
+ */
+
+#include "search_single_cta_inst.cuh"
+
+namespace cuvs::neighbors::cagra::detail::single_cta_search {
+instantiate_kernel_selection(int8_t,    // NOTE(review): presumably DataT -- confirm vs. macro
+                             uint32_t,  // presumably IndexT -- TODO confirm
+                             float,     // presumably DistanceT -- TODO confirm
+                             cuvs::neighbors::filtering::none_cagra_sample_filter);
+
+}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu
deleted file mode 100644
index ad2ca16fc..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  32,
-  1024,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<int8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu
deleted file mode 100644
index 6130a84bc..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  8,
-  128,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<int8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu
deleted file mode 100644
index 1e7bee57c..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  16,
-  256,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<int8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu
deleted file mode 100644
index 7f789e3d0..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  32,
-  512,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<int8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh
deleted file mode 100644
index 10dda0389..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh
+++ /dev/null
@@ -1,588 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <cuvs/neighbors/common.hpp>
-#include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
-
-#include <cuda_fp16.h>
-
-namespace cuvs::neighbors::cagra::detail {
-namespace single_cta_search {
-
-#ifdef _CUVS_EXPLICIT_INSTANTIATE_ONLY
-
-template <unsigned TEAM_SIZE,
-          unsigned DATASET_BLOCK_DIM,
-          typename DATASET_DESCRIPTOR_T,
-          typename SAMPLE_FILTER_T>
-void select_and_run(
-  DATASET_DESCRIPTOR_T dataset_desc,
-  raft::device_matrix_view<const typename DATASET_DESCRIPTOR_T::INDEX_T, int64_t, raft::row_major>
-    graph,
-  typename DATASET_DESCRIPTOR_T::INDEX_T* const topk_indices_ptr,       // [num_queries, topk]
-  typename DATASET_DESCRIPTOR_T::DISTANCE_T* const topk_distances_ptr,  // [num_queries, topk]
-  const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
-  const uint32_t num_queries,
-  const typename DATASET_DESCRIPTOR_T::INDEX_T* dev_seed_ptr,  // [num_queries, num_seeds]
-  uint32_t* const num_executed_iterations,                     // [num_queries,]
-  const search_params& ps,
-  uint32_t topk,
-  uint32_t num_itopk_candidates,
-  uint32_t block_size,  //
-  uint32_t smem_size,
-  int64_t hash_bitlen,
-  typename DATASET_DESCRIPTOR_T::INDEX_T* hashmap_ptr,
-  size_t small_hash_bitlen,
-  size_t small_hash_reset_interval,
-  uint32_t num_seeds,
-  SAMPLE_FILTER_T sample_filter,
-  cuvs::distance::DistanceType metric,
-  cudaStream_t stream) RAFT_EXPLICIT;
-
-#endif  // CUVS_EXPLICIT_INSTANTIATE_ONLY
-
-#define instantiate_single_cta_select_and_run(                                                  \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                     \
-  extern template void select_and_run<                                                          \
-    TEAM_SIZE,                                                                                  \
-    MAX_DATASET_DIM,                                                                            \
-    cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<DATA_T, INDEX_T, DISTANCE_T>, \
-    SAMPLE_FILTER_T>(                                                                           \
-    cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<DATA_T, INDEX_T, DISTANCE_T>  \
-      dataset,                                                                                  \
-    raft::device_matrix_view<const INDEX_T, int64_t, raft::row_major> graph,                    \
-    INDEX_T* const topk_indices_ptr,                                                            \
-    DISTANCE_T* const topk_distances_ptr,                                                       \
-    const DATA_T* const queries_ptr,                                                            \
-    const uint32_t num_queries,                                                                 \
-    const INDEX_T* dev_seed_ptr,                                                                \
-    uint32_t* const num_executed_iterations,                                                    \
-    const search_params& ps,                                                                    \
-    uint32_t topk,                                                                              \
-    uint32_t num_itopk_candidates,                                                              \
-    uint32_t block_size,                                                                        \
-    uint32_t smem_size,                                                                         \
-    int64_t hash_bitlen,                                                                        \
-    INDEX_T* hashmap_ptr,                                                                       \
-    size_t small_hash_bitlen,                                                                   \
-    size_t small_hash_reset_interval,                                                           \
-    uint32_t num_seeds,                                                                         \
-    SAMPLE_FILTER_T sample_filter,                                                              \
-    cuvs::distance::DistanceType metric,                                                        \
-    cudaStream_t stream);
-
-instantiate_single_cta_select_and_run(
-  32, 1024, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  8, 128, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  16, 256, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  32, 512, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  32, 1024, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  8, 128, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  16, 256, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  32, 512, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  32, 1024, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  8, 128, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  16, 256, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  32, 512, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  32, 1024, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  8, 128, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  16, 256, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  32, 512, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-#undef instantiate_single_cta_select_and_run
-
-#define instantiate_q_single_cta_select_and_run(TEAM_SIZE,                                      \
-                                                MAX_DATASET_DIM,                                \
-                                                CODE_BOOK_T,                                    \
-                                                PQ_BITS,                                        \
-                                                PQ_CODE_BOOK_DIM,                               \
-                                                DATA_T,                                         \
-                                                INDEX_T,                                        \
-                                                DISTANCE_T,                                     \
-                                                SAMPLE_FILTER_T)                                \
-  extern template void                                                                          \
-  select_and_run<TEAM_SIZE,                                                                     \
-                 MAX_DATASET_DIM,                                                               \
-                 cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<DATA_T,           \
-                                                                              CODE_BOOK_T,      \
-                                                                              PQ_BITS,          \
-                                                                              PQ_CODE_BOOK_DIM, \
-                                                                              DISTANCE_T,       \
-                                                                              INDEX_T>,         \
-                 SAMPLE_FILTER_T>(                                                              \
-    cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<DATA_T,                        \
-                                                                 CODE_BOOK_T,                   \
-                                                                 PQ_BITS,                       \
-                                                                 PQ_CODE_BOOK_DIM,              \
-                                                                 DISTANCE_T,                    \
-                                                                 INDEX_T> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, raft::row_major> graph,                    \
-    INDEX_T* const topk_indices_ptr,                                                            \
-    DISTANCE_T* const topk_distances_ptr,                                                       \
-    const DATA_T* const queries_ptr,                                                            \
-    const uint32_t num_queries,                                                                 \
-    const INDEX_T* dev_seed_ptr,                                                                \
-    uint32_t* const num_executed_iterations,                                                    \
-    const search_params& ps,                                                                    \
-    uint32_t topk,                                                                              \
-    uint32_t num_itopk_candidates,                                                              \
-    uint32_t block_size,                                                                        \
-    uint32_t smem_size,                                                                         \
-    int64_t hash_bitlen,                                                                        \
-    INDEX_T* hashmap_ptr,                                                                       \
-    size_t small_hash_bitlen,                                                                   \
-    size_t small_hash_reset_interval,                                                           \
-    uint32_t num_seeds,                                                                         \
-    SAMPLE_FILTER_T sample_filter,                                                              \
-    cuvs::distance::DistanceType metric,                                                        \
-    cudaStream_t stream);
-
-instantiate_q_single_cta_select_and_run(
-  8, 128, half, 8, 2, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  16, 256, half, 8, 2, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  32, 512, half, 8, 2, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        2,
-                                        half,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  8, 128, half, 8, 4, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  16, 256, half, 8, 4, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  32, 512, half, 8, 4, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        4,
-                                        half,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-instantiate_q_single_cta_select_and_run(
-  8, 128, half, 8, 2, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(16,
-                                        256,
-                                        half,
-                                        8,
-                                        2,
-                                        float,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        512,
-                                        half,
-                                        8,
-                                        2,
-                                        float,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        2,
-                                        float,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  8, 128, half, 8, 4, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(16,
-                                        256,
-                                        half,
-                                        8,
-                                        4,
-                                        float,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        512,
-                                        half,
-                                        8,
-                                        4,
-                                        float,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        4,
-                                        float,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-instantiate_q_single_cta_select_and_run(
-  8, 128, half, 8, 2, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  16, 256, half, 8, 2, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  32, 512, half, 8, 2, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  32, 1024, half, 8, 2, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  8, 128, half, 8, 4, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  16, 256, half, 8, 4, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  32, 512, half, 8, 4, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  32, 1024, half, 8, 4, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-instantiate_q_single_cta_select_and_run(
-  8, 128, half, 8, 2, float, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  16, 256, half, 8, 2, float, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  32, 512, half, 8, 2, float, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        2,
-                                        float,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  8, 128, half, 8, 4, float, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  16, 256, half, 8, 4, float, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  32, 512, half, 8, 4, float, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        4,
-                                        float,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-instantiate_q_single_cta_select_and_run(8,
-                                        128,
-                                        half,
-                                        8,
-                                        2,
-                                        uint8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(16,
-                                        256,
-                                        half,
-                                        8,
-                                        2,
-                                        uint8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        512,
-                                        half,
-                                        8,
-                                        2,
-                                        uint8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        2,
-                                        uint8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(8,
-                                        128,
-                                        half,
-                                        8,
-                                        4,
-                                        uint8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(16,
-                                        256,
-                                        half,
-                                        8,
-                                        4,
-                                        uint8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        512,
-                                        half,
-                                        8,
-                                        4,
-                                        uint8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        4,
-                                        uint8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-instantiate_q_single_cta_select_and_run(8,
-                                        128,
-                                        half,
-                                        8,
-                                        2,
-                                        int8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(16,
-                                        256,
-                                        half,
-                                        8,
-                                        2,
-                                        int8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        512,
-                                        half,
-                                        8,
-                                        2,
-                                        int8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        2,
-                                        int8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(8,
-                                        128,
-                                        half,
-                                        8,
-                                        4,
-                                        int8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(16,
-                                        256,
-                                        half,
-                                        8,
-                                        4,
-                                        int8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        512,
-                                        half,
-                                        8,
-                                        4,
-                                        int8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        4,
-                                        int8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-instantiate_q_single_cta_select_and_run(8,
-                                        128,
-                                        half,
-                                        8,
-                                        2,
-                                        uint8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(16,
-                                        256,
-                                        half,
-                                        8,
-                                        2,
-                                        uint8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        512,
-                                        half,
-                                        8,
-                                        2,
-                                        uint8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        2,
-                                        uint8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(8,
-                                        128,
-                                        half,
-                                        8,
-                                        4,
-                                        uint8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(16,
-                                        256,
-                                        half,
-                                        8,
-                                        4,
-                                        uint8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        512,
-                                        half,
-                                        8,
-                                        4,
-                                        uint8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        4,
-                                        uint8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-instantiate_q_single_cta_select_and_run(
-  8, 128, half, 8, 2, int8_t, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(16,
-                                        256,
-                                        half,
-                                        8,
-                                        2,
-                                        int8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        512,
-                                        half,
-                                        8,
-                                        2,
-                                        int8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        2,
-                                        int8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  8, 128, half, 8, 4, int8_t, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(16,
-                                        256,
-                                        half,
-                                        8,
-                                        4,
-                                        int8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        512,
-                                        half,
-                                        8,
-                                        4,
-                                        int8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        4,
-                                        int8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-#undef instantiate_q_single_cta_select_and_run
-
-}  // namespace single_cta_search
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
index a101cdc1f..d10313c5b 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
@@ -15,13 +15,15 @@
  */
 #pragma once
 
+#include "search_single_cta_kernel.cuh"
+
 #include "bitonic.hpp"
-#include "compute_distance.hpp"
+#include "compute_distance-ext.cuh"
 #include "device_common.hpp"
 #include "hashmap.hpp"
 #include "search_plan.cuh"
 #include "topk_by_radix.cuh"
-#include "topk_for_cagra/topk_core.cuh"  // TODO replace with raft topk
+#include "topk_for_cagra/topk.h"  // TODO replace with raft topk
 #include "utils.hpp"
 
 #include <cuvs/distance/distance.hpp>
@@ -56,12 +58,11 @@ namespace single_cta_search {
 // #define _CLK_BREAKDOWN
 
 template <unsigned TOPK_BY_BITONIC_SORT, class INDEX_T>
-__device__ void pickup_next_parents(std::uint32_t* const terminate_flag,
-                                    INDEX_T* const next_parent_indices,
-                                    INDEX_T* const internal_topk_indices,
-                                    const std::size_t internal_topk_size,
-                                    const std::size_t dataset_size,
-                                    const std::uint32_t search_width)
+RAFT_DEVICE_INLINE_FUNCTION void pickup_next_parents(std::uint32_t* const terminate_flag,
+                                                     INDEX_T* const next_parent_indices,
+                                                     INDEX_T* const internal_topk_indices,
+                                                     const std::size_t internal_topk_size,
+                                                     const std::uint32_t search_width)
 {
   constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
   // if (threadIdx.x >= 32) return;
@@ -99,11 +100,12 @@ __device__ void pickup_next_parents(std::uint32_t* const terminate_flag,
 }
 
 template <unsigned MAX_CANDIDATES, class IdxT = void>
-__device__ inline void topk_by_bitonic_sort_1st(float* candidate_distances,  // [num_candidates]
-                                                IdxT* candidate_indices,     // [num_candidates]
-                                                const std::uint32_t num_candidates,
-                                                const std::uint32_t num_itopk,
-                                                unsigned MULTI_WARPS = 0)
+RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_1st(
+  float* candidate_distances,  // [num_candidates]
+  IdxT* candidate_indices,     // [num_candidates]
+  const std::uint32_t num_candidates,
+  const std::uint32_t num_itopk,
+  unsigned MULTI_WARPS = 0)
 {
   const unsigned lane_id = threadIdx.x % 32;
   const unsigned warp_id = threadIdx.x / 32;
@@ -202,15 +204,16 @@ __device__ inline void topk_by_bitonic_sort_1st(float* candidate_distances,  //
 }
 
 template <unsigned MAX_ITOPK, class IdxT = void>
-__device__ inline void topk_by_bitonic_sort_2nd(float* itopk_distances,  // [num_itopk]
-                                                IdxT* itopk_indices,     // [num_itopk]
-                                                const std::uint32_t num_itopk,
-                                                float* candidate_distances,  // [num_candidates]
-                                                IdxT* candidate_indices,     // [num_candidates]
-                                                const std::uint32_t num_candidates,
-                                                std::uint32_t* work_buf,
-                                                const bool first,
-                                                unsigned MULTI_WARPS = 0)
+RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_2nd(
+  float* itopk_distances,  // [num_itopk]
+  IdxT* itopk_indices,     // [num_itopk]
+  const std::uint32_t num_itopk,
+  float* candidate_distances,  // [num_candidates]
+  IdxT* candidate_indices,     // [num_candidates]
+  const std::uint32_t num_candidates,
+  std::uint32_t* work_buf,
+  const bool first,
+  unsigned MULTI_WARPS = 0)
 {
   const unsigned lane_id = threadIdx.x % 32;
   const unsigned warp_id = threadIdx.x / 32;
@@ -410,16 +413,17 @@ __device__ inline void topk_by_bitonic_sort_2nd(float* itopk_distances,  // [num
 template <unsigned MAX_ITOPK,
           unsigned MAX_CANDIDATES,
           class IdxT>
-__device__ void topk_by_bitonic_sort(float* itopk_distances,  // [num_itopk]
-                                     IdxT* itopk_indices,     // [num_itopk]
-                                     const std::uint32_t num_itopk,
-                                     float* candidate_distances,  // [num_candidates]
-                                     IdxT* candidate_indices,     // [num_candidates]
-                                     const std::uint32_t num_candidates,
-                                     std::uint32_t* work_buf,
-                                     const bool first,
-                                     const unsigned MULTI_WARPS_1,
-                                     const unsigned MULTI_WARPS_2)
+RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort(
+  float* itopk_distances,  // [num_itopk]
+  IdxT* itopk_indices,     // [num_itopk]
+  const std::uint32_t num_itopk,
+  float* candidate_distances,  // [num_candidates]
+  IdxT* candidate_indices,     // [num_candidates]
+  const std::uint32_t num_candidates,
+  std::uint32_t* work_buf,
+  const bool first,
+  const unsigned MULTI_WARPS_1,
+  const unsigned MULTI_WARPS_2)
 {
   // The results in candidate_distances/indices are sorted by bitonic sort.
   topk_by_bitonic_sort_1st<MAX_CANDIDATES, IdxT>(
@@ -439,11 +443,11 @@ __device__ void topk_by_bitonic_sort(float* itopk_distances,  // [num_itopk]
 }
 
 template <class INDEX_T>
-__device__ inline void hashmap_restore(INDEX_T* const hashmap_ptr,
-                                       const size_t hashmap_bitlen,
-                                       const INDEX_T* itopk_indices,
-                                       const uint32_t itopk_size,
-                                       const uint32_t first_tid = 0)
+RAFT_DEVICE_INLINE_FUNCTION void hashmap_restore(INDEX_T* const hashmap_ptr,
+                                                 const size_t hashmap_bitlen,
+                                                 const INDEX_T* itopk_indices,
+                                                 const uint32_t itopk_size,
+                                                 const uint32_t first_tid = 0)
 {
   constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
   if (threadIdx.x < first_tid) return;
@@ -454,18 +458,16 @@ __device__ inline void hashmap_restore(INDEX_T* const hashmap_ptr,
 }
 
 // One query one thread block
-template <uint32_t TEAM_SIZE,
-          uint32_t DATASET_BLOCK_DIM,
-          unsigned MAX_ITOPK,
+template <unsigned MAX_ITOPK,
           unsigned MAX_CANDIDATES,
           unsigned TOPK_BY_BITONIC_SORT,
           class DATASET_DESCRIPTOR_T,
           class SAMPLE_FILTER_T>
-__launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
+RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel(
   typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr,       // [num_queries, top_k]
   typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr,  // [num_queries, top_k]
   const std::uint32_t top_k,
-  DATASET_DESCRIPTOR_T dataset_desc,
+  const DATASET_DESCRIPTOR_T* dataset_desc,
   const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
   const typename DATASET_DESCRIPTOR_T::INDEX_T* const knn_graph,   // [dataset_size, graph_degree]
   const std::uint32_t graph_degree,
@@ -483,15 +485,13 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
   const std::uint32_t hash_bitlen,
   const std::uint32_t small_hash_bitlen,
   const std::uint32_t small_hash_reset_interval,
-  SAMPLE_FILTER_T sample_filter,
-  cuvs::distance::DistanceType metric)
+  SAMPLE_FILTER_T sample_filter)
 {
   using LOAD_T = device::LOAD_128BIT_T;
 
   using DATA_T     = typename DATASET_DESCRIPTOR_T::DATA_T;
   using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
   using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
-  using QUERY_T    = typename DATASET_DESCRIPTOR_T::QUERY_T;
 
   const auto query_id = blockIdx.y;
 
@@ -512,7 +512,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
 #endif
   _CLK_START();
 
-  extern __shared__ std::uint32_t smem[];
+  extern __shared__ uint8_t smem[];
 
   // Layout of result_buffer
   // +----------------------+------------------------------+---------+
@@ -520,37 +520,28 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
   // | <internal_topk_size> | <search_width * graph_degree> | upto 32 |
   // +----------------------+------------------------------+---------+
   // |<---             result_buffer_size              --->|
-  std::uint32_t result_buffer_size    = internal_topk + (search_width * graph_degree);
-  std::uint32_t result_buffer_size_32 = result_buffer_size;
-  if (result_buffer_size % 32) { result_buffer_size_32 += 32 - (result_buffer_size % 32); }
-  const auto small_hash_size = hashmap::get_size(small_hash_bitlen);
-
-  const auto query_smem_buffer_length =
-    raft::ceildiv<uint32_t>(dataset_desc.dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM;
-  auto query_buffer          = reinterpret_cast<QUERY_T*>(smem);
-  auto result_indices_buffer = reinterpret_cast<INDEX_T*>(query_buffer + query_smem_buffer_length);
-  auto result_distances_buffer =
-    reinterpret_cast<DISTANCE_T*>(result_indices_buffer + result_buffer_size_32);
-  auto visited_hash_buffer =
-    reinterpret_cast<INDEX_T*>(result_distances_buffer + result_buffer_size_32);
-  auto parent_list_buffer = reinterpret_cast<INDEX_T*>(visited_hash_buffer + small_hash_size);
-  auto distance_work_buffer_ptr =
-    reinterpret_cast<std::uint8_t*>(parent_list_buffer + search_width);
-  auto topk_ws        = reinterpret_cast<std::uint32_t*>(distance_work_buffer_ptr +
-                                                  DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte);
-  auto terminate_flag = reinterpret_cast<std::uint32_t*>(topk_ws + 3);
-  auto smem_work_ptr  = reinterpret_cast<std::uint32_t*>(terminate_flag + 1);
+  const auto result_buffer_size    = internal_topk + (search_width * graph_degree);
+  const auto result_buffer_size_32 = raft::round_up_safe<uint32_t>(result_buffer_size, 32);
+  const auto small_hash_size       = hashmap::get_size(small_hash_bitlen);
 
   // Set smem working buffer for the distance calculation
-  dataset_desc.set_smem_ptr(distance_work_buffer_ptr);
+  dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id);
+
+  auto* __restrict__ result_indices_buffer =
+    reinterpret_cast<INDEX_T*>(smem + dataset_desc->smem_ws_size_in_bytes());
+  auto* __restrict__ result_distances_buffer =
+    reinterpret_cast<DISTANCE_T*>(result_indices_buffer + result_buffer_size_32);
+  auto* __restrict__ visited_hash_buffer =
+    reinterpret_cast<INDEX_T*>(result_distances_buffer + result_buffer_size_32);
+  auto* __restrict__ parent_list_buffer =
+    reinterpret_cast<INDEX_T*>(visited_hash_buffer + small_hash_size);
+  auto* __restrict__ topk_ws = reinterpret_cast<std::uint32_t*>(parent_list_buffer + search_width);
+  auto* terminate_flag       = reinterpret_cast<std::uint32_t*>(topk_ws + 3);
+  auto* __restrict__ smem_work_ptr = reinterpret_cast<std::uint32_t*>(terminate_flag + 1);
 
   // A flag for filtering.
   auto filter_flag = terminate_flag;
 
-  const DATA_T* const query_ptr = queries_ptr + query_id * dataset_desc.dim;
-  dataset_desc.template copy_query<DATASET_BLOCK_DIM>(
-    query_ptr, query_buffer, query_smem_buffer_length);
-
   if (threadIdx.x == 0) {
     terminate_flag[0] = 0;
     topk_ws[0]        = ~0u;
@@ -570,18 +561,16 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
   // compute distance to randomly selecting nodes
   _CLK_START();
   const INDEX_T* const local_seed_ptr = seed_ptr ? seed_ptr + (num_seeds * query_id) : nullptr;
-  device::compute_distance_to_random_nodes<TEAM_SIZE, DATASET_BLOCK_DIM>(result_indices_buffer,
-                                                                         result_distances_buffer,
-                                                                         query_buffer,
-                                                                         dataset_desc,
-                                                                         result_buffer_size,
-                                                                         num_distilation,
-                                                                         rand_xor_mask,
-                                                                         local_seed_ptr,
-                                                                         num_seeds,
-                                                                         local_visited_hashmap_ptr,
-                                                                         hash_bitlen,
-                                                                         metric);
+  device::compute_distance_to_random_nodes(result_indices_buffer,
+                                           result_distances_buffer,
+                                           *dataset_desc,
+                                           result_buffer_size,
+                                           num_distilation,
+                                           rand_xor_mask,
+                                           local_seed_ptr,
+                                           num_seeds,
+                                           local_visited_hashmap_ptr,
+                                           hash_bitlen);
   __syncthreads();
   _CLK_REC(clk_compute_1st_distance);
 
@@ -666,7 +655,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
         nullptr,
         topk_ws,
         true,
-        reinterpret_cast<std::uint32_t*>(smem_work_ptr));
+        smem_work_ptr);
       _CLK_REC(clk_topk);
 
       // reset small-hash table
@@ -683,12 +672,8 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
     // pick up next parents
     if (threadIdx.x < 32) {
       _CLK_START();
-      pickup_next_parents<TOPK_BY_BITONIC_SORT, INDEX_T>(terminate_flag,
-                                                         parent_list_buffer,
-                                                         result_indices_buffer,
-                                                         internal_topk,
-                                                         dataset_desc.size,
-                                                         search_width);
+      pickup_next_parents<TOPK_BY_BITONIC_SORT, INDEX_T>(
+        terminate_flag, parent_list_buffer, result_indices_buffer, internal_topk, search_width);
       _CLK_REC(clk_pickup_parents);
     }
 
@@ -706,20 +691,16 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
 
     // compute the norms between child nodes and query node
     _CLK_START();
-    constexpr unsigned max_n_frags = 8;
-    device::compute_distance_to_child_nodes<TEAM_SIZE, DATASET_BLOCK_DIM, max_n_frags>(
-      result_indices_buffer + internal_topk,
-      result_distances_buffer + internal_topk,
-      query_buffer,
-      dataset_desc,
-      knn_graph,
-      graph_degree,
-      local_visited_hashmap_ptr,
-      hash_bitlen,
-      parent_list_buffer,
-      result_indices_buffer,
-      search_width,
-      metric);
+    device::compute_distance_to_child_nodes(result_indices_buffer + internal_topk,
+                                            result_distances_buffer + internal_topk,
+                                            *dataset_desc,
+                                            knn_graph,
+                                            graph_degree,
+                                            local_visited_hashmap_ptr,
+                                            hash_bitlen,
+                                            parent_list_buffer,
+                                            result_indices_buffer,
+                                            search_width);
     __syncthreads();
     _CLK_REC(clk_compute_distance);
 
@@ -815,50 +796,33 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
 #endif
 }
 
-template <uint32_t TEAM_SIZE,
-          uint32_t DATASET_BLOCK_DIM,
-          typename DATASET_DESCRIPTOR_T,
-          typename SAMPLE_FILTER_T>
+template <typename DATASET_DESCRIPTOR_T, typename SAMPLE_FILTER_T>
 struct search_kernel_config {
-  using kernel_t = decltype(&search_kernel<TEAM_SIZE,
-                                           DATASET_BLOCK_DIM,
-                                           64,
-                                           64,
-                                           0,
-                                           DATASET_DESCRIPTOR_T,
-                                           SAMPLE_FILTER_T>);
+  using kernel_t = decltype(&search_kernel<64, 64, 0, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>);
 
   template <unsigned MAX_CANDIDATES, unsigned USE_BITONIC_SORT>
   static auto choose_search_kernel(unsigned itopk_size) -> kernel_t
   {
     if (itopk_size <= 64) {
-      return search_kernel<TEAM_SIZE,
-                           DATASET_BLOCK_DIM,
-                           64,
+      return search_kernel<64,
                            MAX_CANDIDATES,
                            USE_BITONIC_SORT,
                            DATASET_DESCRIPTOR_T,
                            SAMPLE_FILTER_T>;
     } else if (itopk_size <= 128) {
-      return search_kernel<TEAM_SIZE,
-                           DATASET_BLOCK_DIM,
-                           128,
+      return search_kernel<128,
                            MAX_CANDIDATES,
                            USE_BITONIC_SORT,
                            DATASET_DESCRIPTOR_T,
                            SAMPLE_FILTER_T>;
     } else if (itopk_size <= 256) {
-      return search_kernel<TEAM_SIZE,
-                           DATASET_BLOCK_DIM,
-                           256,
+      return search_kernel<256,
                            MAX_CANDIDATES,
                            USE_BITONIC_SORT,
                            DATASET_DESCRIPTOR_T,
                            SAMPLE_FILTER_T>;
     } else if (itopk_size <= 512) {
-      return search_kernel<TEAM_SIZE,
-                           DATASET_BLOCK_DIM,
-                           512,
+      return search_kernel<512,
                            MAX_CANDIDATES,
                            USE_BITONIC_SORT,
                            DATASET_DESCRIPTOR_T,
@@ -882,21 +846,9 @@ struct search_kernel_config {
       // Radix-based topk is used
       constexpr unsigned max_candidates = 32;  // to avoid build failure
       if (itopk_size <= 256) {
-        return search_kernel<TEAM_SIZE,
-                             DATASET_BLOCK_DIM,
-                             256,
-                             max_candidates,
-                             0,
-                             DATASET_DESCRIPTOR_T,
-                             SAMPLE_FILTER_T>;
+        return search_kernel<256, max_candidates, 0, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>;
       } else if (itopk_size <= 512) {
-        return search_kernel<TEAM_SIZE,
-                             DATASET_BLOCK_DIM,
-                             512,
-                             max_candidates,
-                             0,
-                             DATASET_DESCRIPTOR_T,
-                             SAMPLE_FILTER_T>;
+        return search_kernel<512, max_candidates, 0, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>;
       }
     }
     THROW("No kernel for parametels itopk_size %u, num_itopk_candidates %u",
@@ -905,40 +857,35 @@ struct search_kernel_config {
   }
 };
 
-template <unsigned TEAM_SIZE,
-          unsigned DATASET_BLOCK_DIM,
-          typename DATASET_DESCRIPTOR_T,
-          typename SAMPLE_FILTER_T>
-void select_and_run(
-  DATASET_DESCRIPTOR_T dataset_desc,
-  raft::device_matrix_view<const typename DATASET_DESCRIPTOR_T::INDEX_T, int64_t, raft::row_major>
-    graph,
-  typename DATASET_DESCRIPTOR_T::INDEX_T* const topk_indices_ptr,       // [num_queries, topk]
-  typename DATASET_DESCRIPTOR_T::DISTANCE_T* const topk_distances_ptr,  // [num_queries, topk]
-  const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
-  const uint32_t num_queries,
-  const typename DATASET_DESCRIPTOR_T::INDEX_T* dev_seed_ptr,  // [num_queries, num_seeds]
-  uint32_t* const num_executed_iterations,                     // [num_queries,]
-  const search_params& ps,
-  uint32_t topk,
-  uint32_t num_itopk_candidates,
-  uint32_t block_size,  //
-  uint32_t smem_size,
-  int64_t hash_bitlen,
-  typename DATASET_DESCRIPTOR_T::INDEX_T* hashmap_ptr,
-  size_t small_hash_bitlen,
-  size_t small_hash_reset_interval,
-  uint32_t num_seeds,
-  SAMPLE_FILTER_T sample_filter,
-  cuvs::distance::DistanceType metric,
-  cudaStream_t stream)
+template <typename DataT, typename IndexT, typename DistanceT, typename SampleFilterT>
+void select_and_run(const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* dataset_desc,
+                    raft::device_matrix_view<const IndexT, int64_t, raft::row_major> graph,
+                    IndexT* topk_indices_ptr,       // [num_queries, topk]
+                    DistanceT* topk_distances_ptr,  // [num_queries, topk]
+                    const DataT* queries_ptr,       // [num_queries, dataset_dim]
+                    uint32_t num_queries,
+                    const IndexT* dev_seed_ptr,         // [num_queries, num_seeds]
+                    uint32_t* num_executed_iterations,  // [num_queries,]
+                    const search_params& ps,
+                    uint32_t topk,
+                    uint32_t num_itopk_candidates,
+                    uint32_t block_size,  //
+                    uint32_t smem_size,
+                    int64_t hash_bitlen,
+                    IndexT* hashmap_ptr,
+                    size_t small_hash_bitlen,
+                    size_t small_hash_reset_interval,
+                    uint32_t num_seeds,
+                    SampleFilterT sample_filter,
+                    cudaStream_t stream)
 {
   auto kernel =
-    search_kernel_config<TEAM_SIZE, DATASET_BLOCK_DIM, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::
-      choose_itopk_and_mx_candidates(ps.itopk_size, num_itopk_candidates, block_size);
-  RAFT_CUDA_TRY(cudaFuncSetAttribute(kernel,
-                                     cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                     smem_size + DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte));
+    search_kernel_config<dataset_descriptor_base_t<DataT, IndexT, DistanceT>,
+                         SampleFilterT>::choose_itopk_and_mx_candidates(ps.itopk_size,
+                                                                        num_itopk_candidates,
+                                                                        block_size);
+  RAFT_CUDA_TRY(
+    cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
   dim3 thread_dims(block_size, 1, 1);
   dim3 block_dims(1, num_queries, 1);
   RAFT_LOG_DEBUG(
@@ -963,9 +910,9 @@ void select_and_run(
                                                          hash_bitlen,
                                                          small_hash_bitlen,
                                                          small_hash_reset_interval,
-                                                         sample_filter,
-                                                         metric);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
+                                                         sample_filter);
+  // RAFT_CUDA_TRY(cudaPeekAtLastError());
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
 }
 }  // namespace single_cta_search
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh
index 1ccec9219..7b7f44db7 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,10 +15,32 @@
  */
 #pragma once
 
-#ifndef _CUVS_EXPLICIT_INSTANTIATE_ONLY
-#include "search_single_cta_kernel-inl.cuh"
-#endif
+#include "compute_distance-ext.cuh"
 
-#ifdef RAFT_COMPILED
-#include "search_single_cta_kernel-ext.cuh"
-#endif
+#include <cuvs/neighbors/cagra.hpp>
+
+namespace cuvs::neighbors::cagra::detail::single_cta_search {
+// Declaration only; the definition lives outside this header.
+template <typename DataT, typename IndexT, typename DistanceT, typename SampleFilterT>
+void select_and_run(const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* dataset_desc,
+                    raft::device_matrix_view<const IndexT, int64_t, raft::row_major> graph,
+                    IndexT* topk_indices_ptr,       // [num_queries, topk]
+                    DistanceT* topk_distances_ptr,  // [num_queries, topk]
+                    const DataT* queries_ptr,       // [num_queries, dataset_dim]
+                    uint32_t num_queries,
+                    const IndexT* dev_seed_ptr,         // [num_queries, num_seeds]
+                    uint32_t* num_executed_iterations,  // [num_queries,]
+                    const search_params& ps,
+                    uint32_t topk,
+                    uint32_t num_itopk_candidates,
+                    uint32_t block_size,  // threads per block (thread_dims.x in the definition)
+                    uint32_t smem_size,   // max dynamic shared memory size set on the kernel
+                    int64_t hash_bitlen,
+                    IndexT* hashmap_ptr,
+                    size_t small_hash_bitlen,
+                    size_t small_hash_reset_interval,
+                    uint32_t num_seeds,
+                    SampleFilterT sample_filter,
+                    cudaStream_t stream);
+
+}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu
new file mode 100644
index 000000000..ee6427170
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by search_single_cta_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python search_single_cta_00_generate.py
+ *
+ */
+
+#include "search_single_cta_inst.cuh"
+
+namespace cuvs::neighbors::cagra::detail::single_cta_search {
+instantiate_kernel_selection(uint8_t,   // presumably DataT; confirm against the macro definition
+                             uint32_t,  // presumably IndexT
+                             float,     // presumably DistanceT
+                             cuvs::neighbors::filtering::none_cagra_sample_filter);
+
+}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu
deleted file mode 100644
index 35e04ea6a..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  32,
-  1024,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<uint8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu
deleted file mode 100644
index 614e6ca01..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  8,
-  128,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<uint8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu
deleted file mode 100644
index 005afb566..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  16,
-  256,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<uint8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu
deleted file mode 100644
index af30b2e24..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  32,
-  512,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<uint8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/topk_by_radix.cuh b/cpp/src/neighbors/detail/cagra/topk_by_radix.cuh
index 67173026b..b6f97cb26 100644
--- a/cpp/src/neighbors/detail/cagra/topk_by_radix.cuh
+++ b/cpp/src/neighbors/detail/cagra/topk_by_radix.cuh
@@ -32,17 +32,17 @@ struct topk_by_radix_sort : topk_by_radix_sort_base<MAX_INTERNAL_TOPK> {};
 template <unsigned MAX_INTERNAL_TOPK, class IdxT>
 struct topk_by_radix_sort<MAX_INTERNAL_TOPK, IdxT, std::enable_if_t<((MAX_INTERNAL_TOPK <= 64))>>
   : topk_by_radix_sort_base<MAX_INTERNAL_TOPK> {
-  __device__ void operator()(uint32_t topk,
-                             uint32_t batch_size,
-                             uint32_t len_x,
-                             const uint32_t* _x,
-                             const IdxT* _in_vals,
-                             uint32_t* _y,
-                             IdxT* _out_vals,
-                             uint32_t* work,
-                             uint32_t* _hints,
-                             bool sort,
-                             uint32_t* _smem)
+  RAFT_DEVICE_INLINE_FUNCTION void operator()(uint32_t topk,
+                                              uint32_t batch_size,
+                                              uint32_t len_x,
+                                              const uint32_t* _x,
+                                              const IdxT* _in_vals,
+                                              uint32_t* _y,
+                                              IdxT* _out_vals,
+                                              uint32_t* work,
+                                              uint32_t* _hints,
+                                              bool sort,
+                                              uint32_t* _smem)
   {
     std::uint8_t* const state = reinterpret_cast<std::uint8_t*>(work);
     topk_cta_11_core<topk_by_radix_sort_base<MAX_INTERNAL_TOPK>::state_bit_lenght,
@@ -60,17 +60,17 @@ struct topk_by_radix_sort<MAX_INTERNAL_TOPK, IdxT, std::enable_if_t<((MAX_INTERN
     IdxT,                                                                            \
     std::enable_if_t<((MAX_INTERNAL_TOPK <= V) && (2 * MAX_INTERNAL_TOPK > V))>>     \
     : topk_by_radix_sort_base<MAX_INTERNAL_TOPK> {                                   \
-    __device__ void operator()(uint32_t topk,                                        \
-                               uint32_t batch_size,                                  \
-                               uint32_t len_x,                                       \
-                               const uint32_t* _x,                                   \
-                               const IdxT* _in_vals,                                 \
-                               uint32_t* _y,                                         \
-                               IdxT* _out_vals,                                      \
-                               uint32_t* work,                                       \
-                               uint32_t* _hints,                                     \
-                               bool sort,                                            \
-                               uint32_t* _smem)                                      \
+    RAFT_DEVICE_INLINE_FUNCTION void operator()(uint32_t topk,                       \
+                                                uint32_t batch_size,                 \
+                                                uint32_t len_x,                      \
+                                                const uint32_t* _x,                  \
+                                                const IdxT* _in_vals,                \
+                                                uint32_t* _y,                        \
+                                                IdxT* _out_vals,                     \
+                                                uint32_t* work,                      \
+                                                uint32_t* _hints,                    \
+                                                bool sort,                           \
+                                                uint32_t* _smem)                     \
     {                                                                                \
       assert(blockDim.x >= V / 4);                                                   \
       std::uint8_t* state = (std::uint8_t*)work;                                     \
diff --git a/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk.cu b/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk.cu
new file mode 100644
index 000000000..72ff2cb85
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk.cu
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "topk_core.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+// Byte size of the state workspace consumed by _cuann_find_topk; 1 when no state is needed.
+size_t _cuann_find_topk_bufferSize(uint32_t topK,  // not referenced in the body
+                                   uint32_t sizeBatch,
+                                   uint32_t numElements,
+                                   cudaDataType_t sampleDtype)  // not referenced in the body
+{
+  constexpr int numThreads  = NUM_THREADS;
+  constexpr int stateBitLen = STATE_BIT_LENGTH;
+  static_assert(stateBitLen == 0 || stateBitLen == 8, "STATE_BIT_LENGTH must be 0 or 8");
+
+  size_t workspaceSize = 1;  // returned as-is when stateBitLen == 0
+  // state: get_state_size(numElements) bytes per batch entry, passed through _cuann_aligned
+  if (stateBitLen == 8) {
+    workspaceSize = _cuann_aligned(
+      sizeof(uint8_t) * get_state_size<stateBitLen, numThreads>(numElements) * sizeBatch);
+  }
+
+  return workspaceSize;
+}
+
+// Launches the kern_topk_cta_11 instantiation matching topK (at most 1024) and the vector length
+// derived from ldIK, using one thread block of NUM_THREADS threads per batch entry. Float keys
+// are passed to the kernel as raw 32-bit words. Fails via RAFT_FAIL / RAFT_EXPECTS otherwise.
+template <class ValT>
+void _cuann_find_topk(uint32_t topK,
+                      uint32_t sizeBatch,
+                      uint32_t numElements,
+                      const float* inputKeys,  // [sizeBatch, ldIK,]
+                      uint32_t ldIK,           // (*) ldIK >= numElements
+                      const ValT* inputVals,   // [sizeBatch, ldIV,]
+                      uint32_t ldIV,           // (*) ldIV >= numElements
+                      float* outputKeys,       // [sizeBatch, ldOK,]
+                      uint32_t ldOK,           // (*) ldOK >= topK
+                      ValT* outputVals,        // [sizeBatch, ldOV,]
+                      uint32_t ldOV,           // (*) ldOV >= topK
+                      void* workspace,         // state buffer (only if STATE_BIT_LENGTH == 8)
+                      bool sort,               // forwarded to the kernel
+                      uint32_t* hints,         // forwarded to the kernel
+                      cudaStream_t stream)     // stream the kernel is launched on
+{
+  assert(ldIK >= numElements);
+  assert(ldIV >= numElements);
+  assert(ldOK >= topK);
+  assert(ldOV >= topK);
+
+  constexpr int numThreads  = NUM_THREADS;
+  constexpr int stateBitLen = STATE_BIT_LENGTH;
+  static_assert(stateBitLen == 0 || stateBitLen == 8, "STATE_BIT_LENGTH must be 0 or 8");
+  uint8_t* const state = (stateBitLen == 8) ? static_cast<uint8_t*>(workspace) : nullptr;
+
+  dim3 threads(numThreads, 1, 1);
+  dim3 blocks(sizeBatch, 1, 1);  // one thread block per batch entry
+
+  void (*cta_kernel)(uint32_t,
+                     uint32_t,
+                     uint32_t,
+                     const uint32_t*,
+                     uint32_t,
+                     const ValT*,
+                     uint32_t,
+                     uint32_t*,
+                     uint32_t,
+                     ValT*,
+                     uint32_t,
+                     uint8_t*,
+                     uint32_t*,
+                     bool) = nullptr;
+
+  // V:vecLen, K:maxTopk, T:numSortThreads (T <= numThreads, K a multiple of T, K / T <= 4)
+#define SET_KERNEL_VKT(V, K, T, ValT)                          \
+  do {                                                         \
+    assert(numThreads >= T);                                   \
+    assert((K % T) == 0);                                      \
+    assert((K / T) <= 4);                                      \
+    cta_kernel = kern_topk_cta_11<stateBitLen, V, K, T, ValT>; \
+  } while (0)
+
+  // V: vecLen. Selects the smallest maxTopk bucket that holds topK; fails above 1024.
+#define SET_KERNEL_V(V, ValT)                                \
+  do {                                                       \
+    if (topK <= 32) {                                        \
+      SET_KERNEL_VKT(V, 32, 32, ValT);                       \
+    } else if (topK <= 64) {                                 \
+      SET_KERNEL_VKT(V, 64, 32, ValT);                       \
+    } else if (topK <= 96) {                                 \
+      SET_KERNEL_VKT(V, 96, 32, ValT);                       \
+    } else if (topK <= 128) {                                \
+      SET_KERNEL_VKT(V, 128, 32, ValT);                      \
+    } else if (topK <= 192) {                                \
+      SET_KERNEL_VKT(V, 192, 64, ValT);                      \
+    } else if (topK <= 256) {                                \
+      SET_KERNEL_VKT(V, 256, 64, ValT);                      \
+    } else if (topK <= 384) {                                \
+      SET_KERNEL_VKT(V, 384, 128, ValT);                     \
+    } else if (topK <= 512) {                                \
+      SET_KERNEL_VKT(V, 512, 128, ValT);                     \
+    } else if (topK <= 768) {                                \
+      SET_KERNEL_VKT(V, 768, 256, ValT);                     \
+    } else if (topK <= 1024) {                               \
+      SET_KERNEL_VKT(V, 1024, 256, ValT);                    \
+    } \
+        /* else if (topK <= 1536) { SET_KERNEL_VKT(V, 1536, 512); } */ \
+        /* else if (topK <= 2048) { SET_KERNEL_VKT(V, 2048, 512); } */ \
+        /* else if (topK <= 3072) { SET_KERNEL_VKT(V, 3072, 1024); } */ \
+        /* else if (topK <= 4096) { SET_KERNEL_VKT(V, 4096, 1024); } */ \
+        else {                                                      \
+      RAFT_FAIL("topk must be lower than or equal to 1024"); \
+    }                                                        \
+  } while (0)
+
+  const int _vecLen = _get_vecLen(ldIK, 2);
+  if (_vecLen == 2) {
+    SET_KERNEL_V(2, ValT);
+  } else if (_vecLen == 1) {
+    SET_KERNEL_V(1, ValT);
+  }
+  RAFT_EXPECTS(cta_kernel != nullptr, "unsupported vector length: %d", _vecLen);
+
+  cta_kernel<<<blocks, threads, 0, stream>>>(topK,
+                                             sizeBatch,
+                                             numElements,
+                                             reinterpret_cast<const uint32_t*>(inputKeys),
+                                             ldIK,
+                                             inputVals,
+                                             ldIV,
+                                             reinterpret_cast<uint32_t*>(outputKeys),
+                                             ldOK,
+                                             outputVals,
+                                             ldOV,
+                                             state,
+                                             hints,
+                                             sort);
+}
+
+template void _cuann_find_topk<uint32_t>(uint32_t topK,
+                                         uint32_t sizeBatch,
+                                         uint32_t numElements,
+                                         const float* inputKeys,     // [sizeBatch, ldIK,]
+                                         uint32_t ldIK,              // (*) ldIK >= numElements
+                                         const uint32_t* inputVals,  // [sizeBatch, ldIV,]
+                                         uint32_t ldIV,              // (*) ldIV >= numElements
+                                         float* outputKeys,          // [sizeBatch, ldOK,]
+                                         uint32_t ldOK,              // (*) ldOK >= topK
+                                         uint32_t* outputVals,       // [sizeBatch, ldOV,]
+                                         uint32_t ldOV,              // (*) ldOV >= topK
+                                         void* workspace,
+                                         bool sort,
+                                         uint32_t* hint,        // named `hints` in the template
+                                         cudaStream_t stream);  // sole instantiation in this file
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh b/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh
index cbf99a556..65f9cfade 100644
--- a/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh
@@ -14,10 +14,15 @@
  * limitations under the License.
  */
 #pragma once
+
+#include "../utils.hpp"
 #include "topk.h"
 
 #include <cub/cub.cuh>
 
+#include <raft/core/detail/macros.hpp>
+#include <raft/core/error.hpp>
+
 #include <assert.h>
 #include <float.h>
 #include <stdint.h>
@@ -25,7 +30,7 @@
 
 namespace cuvs::neighbors::cagra::detail {
 //
-__device__ inline uint32_t convert(uint32_t x)
+RAFT_DEVICE_INLINE_FUNCTION constexpr uint32_t convert(uint32_t x)
 {
   if (x & 0x80000000) {
     return x ^ 0xffffffff;
@@ -35,7 +40,7 @@ __device__ inline uint32_t convert(uint32_t x)
 }
 
 //
-__device__ inline uint16_t convert(uint16_t x)
+RAFT_DEVICE_INLINE_FUNCTION constexpr uint16_t convert(uint16_t x)
 {
   if (x & 0x8000) {
     return x ^ 0xffff;
@@ -62,7 +67,7 @@ struct u16_vector {
 
 //
 template <int vecLen>
-__device__ inline void load_u32_vector(struct u32_vector& vec, const uint32_t* x, int i)
+RAFT_DEVICE_INLINE_FUNCTION void load_u32_vector(struct u32_vector& vec, const uint32_t* x, int i)
 {
   if (vecLen == 1) {
     vec.x1 = ((uint1*)(x + i))[0];
@@ -77,7 +82,7 @@ __device__ inline void load_u32_vector(struct u32_vector& vec, const uint32_t* x
 
 //
 template <int vecLen>
-__device__ inline void load_u16_vector(struct u16_vector& vec, const uint16_t* x, int i)
+RAFT_DEVICE_INLINE_FUNCTION void load_u16_vector(struct u16_vector& vec, const uint16_t* x, int i)
 {
   if (vecLen == 1) {
     vec.x1 = ((ushort1*)(x + i))[0];
@@ -92,7 +97,7 @@ __device__ inline void load_u16_vector(struct u16_vector& vec, const uint16_t* x
 
 //
 template <int vecLen>
-__device__ inline uint32_t get_element_from_u32_vector(struct u32_vector& vec, int i)
+RAFT_DEVICE_INLINE_FUNCTION uint32_t get_element_from_u32_vector(struct u32_vector& vec, int i)
 {
   uint32_t xi;
   if (vecLen == 1) {
@@ -134,7 +139,7 @@ __device__ inline uint32_t get_element_from_u32_vector(struct u32_vector& vec, i
 
 //
 template <int vecLen>
-__device__ inline uint16_t get_element_from_u16_vector(struct u16_vector& vec, int i)
+RAFT_DEVICE_INLINE_FUNCTION uint16_t get_element_from_u16_vector(struct u16_vector& vec, int i)
 {
   uint16_t xi;
   if (vecLen == 1) {
@@ -175,7 +180,7 @@ __device__ inline uint16_t get_element_from_u16_vector(struct u16_vector& vec, i
 }
 
 template <typename T>
-__device__ inline void block_scan(const T input, T& output)
+RAFT_DEVICE_INLINE_FUNCTION void block_scan(const T input, T& output)
 {
   switch (blockDim.x) {
     case 32: {
@@ -214,19 +219,19 @@ __device__ inline void block_scan(const T input, T& output)
 
 //
 template <typename T, int stateBitLen, int vecLen>
-__device__ inline void update_histogram(int itr,
-                                        uint32_t thread_id,
-                                        uint32_t num_threads,
-                                        uint32_t hint,
-                                        uint32_t threshold,
-                                        uint32_t& num_bins,
-                                        uint32_t& shift,
-                                        const T* x,  // [nx,]
-                                        uint32_t nx,
-                                        uint32_t* hist,  // [num_bins]
-                                        uint8_t* state,
-                                        uint32_t* output,  // [topk]
-                                        uint32_t* output_count)
+RAFT_DEVICE_INLINE_FUNCTION void update_histogram(int itr,
+                                                  uint32_t thread_id,
+                                                  uint32_t num_threads,
+                                                  uint32_t hint,
+                                                  uint32_t threshold,
+                                                  uint32_t& num_bins,
+                                                  uint32_t& shift,
+                                                  const T* x,  // [nx,]
+                                                  uint32_t nx,
+                                                  uint32_t* hist,  // [num_bins]
+                                                  uint8_t* state,
+                                                  uint32_t* output,  // [topk]
+                                                  uint32_t* output_count)
 {
   if (sizeof(T) == 4) {
     // 32-bit (uint32_t)
@@ -324,15 +329,16 @@ __device__ inline void update_histogram(int itr,
 }
 
 template <unsigned blockDim_x>
-__device__ inline void select_best_index_for_next_threshold_core(uint32_t& my_index,
-                                                                 uint32_t& my_csum,
-                                                                 const unsigned num_bins,
-                                                                 const uint32_t* const hist,
-                                                                 const uint32_t nx_below_threshold,
-                                                                 const uint32_t max_threshold,
-                                                                 const uint32_t threshold,
-                                                                 const uint32_t shift,
-                                                                 const uint32_t topk)
+RAFT_DEVICE_INLINE_FUNCTION void select_best_index_for_next_threshold_core(
+  uint32_t& my_index,
+  uint32_t& my_csum,
+  const unsigned num_bins,
+  const uint32_t* const hist,
+  const uint32_t nx_below_threshold,
+  const uint32_t max_threshold,
+  const uint32_t threshold,
+  const uint32_t shift,
+  const uint32_t topk)
 {
   typedef cub::BlockScan<uint32_t, blockDim_x> BlockScanT;
   __shared__ typename BlockScanT::TempStorage temp_storage;
@@ -370,7 +376,7 @@ __device__ inline void select_best_index_for_next_threshold_core(uint32_t& my_in
 }
 
 //
-__device__ inline void select_best_index_for_next_threshold(
+RAFT_DEVICE_INLINE_FUNCTION void select_best_index_for_next_threshold(
   const uint32_t topk,
   const uint32_t threshold,
   const uint32_t max_threshold,
@@ -469,17 +475,17 @@ __device__ inline void select_best_index_for_next_threshold(
 
 //
 template <typename T, int stateBitLen, int vecLen>
-__device__ inline void output_index_below_threshold(const uint32_t topk,
-                                                    const uint32_t thread_id,
-                                                    const uint32_t num_threads,
-                                                    const uint32_t threshold,
-                                                    const uint32_t nx_below_threshold,
-                                                    const T* const x,  // [nx,]
-                                                    const uint32_t nx,
-                                                    const uint8_t* state,
-                                                    uint32_t* const output,  // [topk]
-                                                    uint32_t* const output_count,
-                                                    uint32_t* const output_count_eq)
+RAFT_DEVICE_INLINE_FUNCTION void output_index_below_threshold(const uint32_t topk,
+                                                              const uint32_t thread_id,
+                                                              const uint32_t num_threads,
+                                                              const uint32_t threshold,
+                                                              const uint32_t nx_below_threshold,
+                                                              const T* const x,  // [nx,]
+                                                              const uint32_t nx,
+                                                              const uint8_t* state,
+                                                              uint32_t* const output,  // [topk]
+                                                              uint32_t* const output_count,
+                                                              uint32_t* const output_count_eq)
 {
   int ii = 0;
   for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) {
@@ -530,7 +536,7 @@ __device__ inline void output_index_below_threshold(const uint32_t topk,
 
 //
 template <typename T>
-__device__ inline void swap(T& val1, T& val2)
+RAFT_DEVICE_INLINE_FUNCTION constexpr void swap(T& val1, T& val2)
 {
   const T val0 = val1;
   val1         = val2;
@@ -539,7 +545,7 @@ __device__ inline void swap(T& val1, T& val2)
 
 //
 template <typename K>
-__device__ inline bool swap_if_needed(K& key1, K& key2)
+RAFT_DEVICE_INLINE_FUNCTION constexpr bool swap_if_needed(K& key1, K& key2)
 {
   if (key1 > key2) {
     swap<K>(key1, key2);
@@ -550,7 +556,7 @@ __device__ inline bool swap_if_needed(K& key1, K& key2)
 
 //
 template <typename K, typename V>
-__device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2)
+RAFT_DEVICE_INLINE_FUNCTION constexpr bool swap_if_needed(K& key1, K& key2, V& val1, V& val2)
 {
   if (key1 > key2) {
     swap<K>(key1, key2);
@@ -562,7 +568,8 @@ __device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2)
 
 //
 template <typename K, typename V>
-__device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2, bool ascending)
+RAFT_DEVICE_INLINE_FUNCTION constexpr bool swap_if_needed(
+  K& key1, K& key2, V& val1, V& val2, bool ascending)
 {
   if (key1 == key2) { return false; }
   if ((key1 > key2) == ascending) {
@@ -575,20 +582,20 @@ __device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2, bool a
 
 //
 template <typename T>
-__device__ inline T max_value_of();
+RAFT_DEVICE_INLINE_FUNCTION T max_value_of();
 template <>
-__device__ inline float max_value_of<float>()
+RAFT_DEVICE_INLINE_FUNCTION float max_value_of<float>()
 {
   return FLT_MAX;
 }
 template <>
-__device__ inline uint32_t max_value_of<uint32_t>()
+RAFT_DEVICE_INLINE_FUNCTION uint32_t max_value_of<uint32_t>()
 {
   return ~0u;
 }
 
 template <int stateBitLen, unsigned BLOCK_SIZE = 0>
-__device__ __host__ inline uint32_t get_state_size(uint32_t len_x)
+RAFT_INLINE_FUNCTION constexpr uint32_t get_state_size(uint32_t len_x)
 {
 #ifdef __CUDA_ARCH__
   const uint32_t num_threads = blockDim.x;
@@ -605,16 +612,16 @@ __device__ __host__ inline uint32_t get_state_size(uint32_t len_x)
 
 //
 template <int stateBitLen, int vecLen, int maxTopk, int numSortThreads, class ValT>
-__device__ inline void topk_cta_11_core(uint32_t topk,
-                                        uint32_t len_x,
-                                        const uint32_t* _x,    // [size_batch, ld_x,]
-                                        const ValT* _in_vals,  // [size_batch, ld_iv,]
-                                        uint32_t* _y,          // [size_batch, ld_y,]
-                                        ValT* _out_vals,       // [size_batch, ld_ov,]
-                                        uint8_t* _state,       // [size_batch, ...,]
-                                        uint32_t* _hint,
-                                        bool sort,
-                                        uint32_t* _smem)
+RAFT_DEVICE_INLINE_FUNCTION void topk_cta_11_core(uint32_t topk,
+                                                  uint32_t len_x,
+                                                  const uint32_t* _x,    // [size_batch, ld_x,]
+                                                  const ValT* _in_vals,  // [size_batch, ld_iv,]
+                                                  uint32_t* _y,          // [size_batch, ld_y,]
+                                                  ValT* _out_vals,       // [size_batch, ld_ov,]
+                                                  uint8_t* _state,       // [size_batch, ...,]
+                                                  uint32_t* _hint,
+                                                  bool sort,
+                                                  uint32_t* _smem)
 {
   uint32_t* const smem_out_vals = _smem;
   uint32_t* const hist          = &(_smem[2 * maxTopk]);
@@ -904,137 +911,4 @@ __launch_bounds__(1024, 1) RAFT_KERNEL
     _smem);
 }
 
-//
-size_t inline _cuann_find_topk_bufferSize(uint32_t topK,
-                                          uint32_t sizeBatch,
-                                          uint32_t numElements,
-                                          cudaDataType_t sampleDtype)
-{
-  constexpr int numThreads  = NUM_THREADS;
-  constexpr int stateBitLen = STATE_BIT_LENGTH;
-  assert(stateBitLen == 0 || stateBitLen == 8);
-
-  size_t workspaceSize = 1;
-  // state
-  if (stateBitLen == 8) {
-    workspaceSize = _cuann_aligned(
-      sizeof(uint8_t) * get_state_size<stateBitLen, numThreads>(numElements) * sizeBatch);
-  }
-
-  return workspaceSize;
-}
-
-template <class ValT>
-inline void _cuann_find_topk(uint32_t topK,
-                             uint32_t sizeBatch,
-                             uint32_t numElements,
-                             const float* inputKeys,  // [sizeBatch, ldIK,]
-                             uint32_t ldIK,           // (*) ldIK >= numElements
-                             const ValT* inputVals,   // [sizeBatch, ldIV,]
-                             uint32_t ldIV,           // (*) ldIV >= numElements
-                             float* outputKeys,       // [sizeBatch, ldOK,]
-                             uint32_t ldOK,           // (*) ldOK >= topK
-                             ValT* outputVals,        // [sizeBatch, ldOV,]
-                             uint32_t ldOV,           // (*) ldOV >= topK
-                             void* workspace,
-                             bool sort,
-                             uint32_t* hints,
-                             cudaStream_t stream)
-{
-  assert(ldIK >= numElements);
-  assert(ldIV >= numElements);
-  assert(ldOK >= topK);
-  assert(ldOV >= topK);
-
-  constexpr int numThreads  = NUM_THREADS;
-  constexpr int stateBitLen = STATE_BIT_LENGTH;
-  assert(stateBitLen == 0 || stateBitLen == 8);
-
-  uint8_t* state = NULL;
-  if (stateBitLen == 8) { state = (uint8_t*)workspace; }
-
-  dim3 threads(numThreads, 1, 1);
-  dim3 blocks(sizeBatch, 1, 1);
-
-  void (*cta_kernel)(uint32_t,
-                     uint32_t,
-                     uint32_t,
-                     const uint32_t*,
-                     uint32_t,
-                     const ValT*,
-                     uint32_t,
-                     uint32_t*,
-                     uint32_t,
-                     ValT*,
-                     uint32_t,
-                     uint8_t*,
-                     uint32_t*,
-                     bool) = nullptr;
-
-  // V:vecLen, K:maxTopk, T:numSortThreads
-#define SET_KERNEL_VKT(V, K, T, ValT)                          \
-  do {                                                         \
-    assert(numThreads >= T);                                   \
-    assert((K % T) == 0);                                      \
-    assert((K / T) <= 4);                                      \
-    cta_kernel = kern_topk_cta_11<stateBitLen, V, K, T, ValT>; \
-  } while (0)
-
-  // V: vecLen
-#define SET_KERNEL_V(V, ValT)                                \
-  do {                                                       \
-    if (topK <= 32) {                                        \
-      SET_KERNEL_VKT(V, 32, 32, ValT);                       \
-    } else if (topK <= 64) {                                 \
-      SET_KERNEL_VKT(V, 64, 32, ValT);                       \
-    } else if (topK <= 96) {                                 \
-      SET_KERNEL_VKT(V, 96, 32, ValT);                       \
-    } else if (topK <= 128) {                                \
-      SET_KERNEL_VKT(V, 128, 32, ValT);                      \
-    } else if (topK <= 192) {                                \
-      SET_KERNEL_VKT(V, 192, 64, ValT);                      \
-    } else if (topK <= 256) {                                \
-      SET_KERNEL_VKT(V, 256, 64, ValT);                      \
-    } else if (topK <= 384) {                                \
-      SET_KERNEL_VKT(V, 384, 128, ValT);                     \
-    } else if (topK <= 512) {                                \
-      SET_KERNEL_VKT(V, 512, 128, ValT);                     \
-    } else if (topK <= 768) {                                \
-      SET_KERNEL_VKT(V, 768, 256, ValT);                     \
-    } else if (topK <= 1024) {                               \
-      SET_KERNEL_VKT(V, 1024, 256, ValT);                    \
-    } \
-        /* else if (topK <= 1536) { SET_KERNEL_VKT(V, 1536, 512); } */ \
-        /* else if (topK <= 2048) { SET_KERNEL_VKT(V, 2048, 512); } */ \
-        /* else if (topK <= 3072) { SET_KERNEL_VKT(V, 3072, 1024); } */ \
-        /* else if (topK <= 4096) { SET_KERNEL_VKT(V, 4096, 1024); } */ \
-        else {                                                      \
-      RAFT_FAIL("topk must be lower than or equal to 1024"); \
-    }                                                        \
-  } while (0)
-
-  int _vecLen = _get_vecLen(ldIK, 2);
-  if (_vecLen == 2) {
-    SET_KERNEL_V(2, ValT);
-  } else if (_vecLen == 1) {
-    SET_KERNEL_V(1, ValT);
-  }
-
-  cta_kernel<<<blocks, threads, 0, stream>>>(topK,
-                                             sizeBatch,
-                                             numElements,
-                                             (const uint32_t*)inputKeys,
-                                             ldIK,
-                                             inputVals,
-                                             ldIV,
-                                             (uint32_t*)outputKeys,
-                                             ldOK,
-                                             outputVals,
-                                             ldOV,
-                                             state,
-                                             hints,
-                                             sort);
-
-  return;
-}
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp
index 8ce20ec5c..0f8309328 100644
--- a/cpp/src/neighbors/detail/cagra/utils.hpp
+++ b/cpp/src/neighbors/detail/cagra/utils.hpp
@@ -125,24 +125,24 @@ union fp_conv {
   FP_T fp;
 };
 template <class T>
-_RAFT_HOST_DEVICE inline T get_max_value();
+_RAFT_HOST_DEVICE constexpr inline T get_max_value();
 template <>
-_RAFT_HOST_DEVICE inline float get_max_value<float>()
+_RAFT_HOST_DEVICE constexpr inline float get_max_value<float>()
 {
   return FLT_MAX;
 };
 template <>
-_RAFT_HOST_DEVICE inline half get_max_value<half>()
+_RAFT_HOST_DEVICE constexpr inline half get_max_value<half>()
 {
   return fp_conv<std::uint16_t, half>{.bs = 0x7aff}.fp;
 };
 template <>
-_RAFT_HOST_DEVICE inline std::uint32_t get_max_value<std::uint32_t>()
+_RAFT_HOST_DEVICE constexpr inline std::uint32_t get_max_value<std::uint32_t>()
 {
   return 0xffffffffu;
 };
 template <>
-_RAFT_HOST_DEVICE inline std::uint64_t get_max_value<std::uint64_t>()
+_RAFT_HOST_DEVICE constexpr inline std::uint64_t get_max_value<std::uint64_t>()
 {
   return 0xfffffffffffffffflu;
 };
diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh
index 9d2f9c175..4ce0849fd 100644
--- a/cpp/test/neighbors/ann_cagra.cuh
+++ b/cpp/test/neighbors/ann_cagra.cuh
@@ -706,7 +706,7 @@ inline std::vector<AnnCagraInputs> generate_inputs()
     {graph_build_algo::IVF_PQ, graph_build_algo::NN_DESCENT},
     {search_algo::AUTO},
     {10},
-    {0, 4, 8, 16, 32},  // team_size
+    {0, 8, 16, 32},  // team_size
     {64},
     {1},
     {cuvs::distance::DistanceType::L2Expanded},