diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index fec1248bb..d8d554648 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -199,6 +199,96 @@ endif() # ################################################################################################## # * cuvs --------------------------------------------------------------------- +add_library( + cuvs-cagra-search STATIC + src/neighbors/cagra_search_float.cu + src/neighbors/cagra_search_int8.cu + src/neighbors/cagra_search_uint8.cu + src/neighbors/detail/cagra/compute_distance.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu + 
src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu + 
src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu + src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu + src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu + src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu + src/neighbors/detail/cagra/search_multi_cta_float_uint64.cu + src/neighbors/detail/cagra/search_multi_cta_half_uint64.cu + src/neighbors/detail/cagra/search_single_cta_float_uint32.cu + src/neighbors/detail/cagra/search_single_cta_half_uint32.cu + src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu + src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu + 
src/neighbors/detail/cagra/search_single_cta_float_uint64.cu + src/neighbors/detail/cagra/search_single_cta_half_uint64.cu +) + +file(GLOB_RECURSE compute_distance_sources "src/neighbors/detail/cagra/compute_distance_*.cu") +set_source_files_properties(${compute_distance_sources} PROPERTIES COMPILE_FLAGS -maxrregcount=64) + +set_target_properties( + cuvs-cagra-search + PROPERTIES BUILD_RPATH "\$ORIGIN" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + CUDA_SEPARABLE_COMPILATION ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + POSITION_INDEPENDENT_CODE ON +) +target_link_libraries(cuvs-cagra-search PRIVATE raft::raft) +target_include_directories( + cuvs-cagra-search PRIVATE "$" +) +target_compile_options( + cuvs-cagra-search PRIVATE "$<$:${CUVS_CXX_FLAGS}>" + "$<$:${CUVS_CUDA_FLAGS}>" +) add_library( cuvs SHARED @@ -266,109 +356,11 @@ add_library( src/neighbors/cagra_extend_int8.cu src/neighbors/cagra_extend_uint8.cu src/neighbors/cagra_optimize.cu - src/neighbors/cagra_search_float.cu - src/neighbors/cagra_search_int8.cu - src/neighbors/cagra_search_uint8.cu src/neighbors/cagra_serialize_float.cu src/neighbors/cagra_serialize_int8.cu src/neighbors/cagra_serialize_uint8.cu src/neighbors/detail/cagra/cagra_build.cpp - src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu - 
src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_4subd_half.cu - 
src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu - 
src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu - src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu - src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu - src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu - src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu - src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu - src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu - src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu - src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu - src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu - src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu - src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu - 
src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu - src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu - src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu - src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu - src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu - src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu - src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu - src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu - src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu - src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu - src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu - src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu - src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu - src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu - src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu - src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu - src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu - src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu - src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu - src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu + src/neighbors/detail/cagra/topk_for_cagra/topk.cu $<$:src/neighbors/hnsw.cpp> src/neighbors/ivf_flat_index.cpp src/neighbors/ivf_flat/ivf_flat_build_extend_float_int64_t.cu @@ -463,7 +455,7 @@ if(NOT BUILD_CPU_ONLY) target_link_libraries( cuvs PUBLIC rmm::rmm raft::raft ${CUVS_CTK_MATH_DEPENDENCIES} - PRIVATE nvidia::cutlass::cutlass $ + PRIVATE nvidia::cutlass::cutlass $ cuvs-cagra-search ) endif() @@ -539,7 +531,7 @@ target_compile_options( "$<$:${CUVS_CUDA_FLAGS}>" ) # ensure CUDA symbols aren't relocated to the middle 
of the debug build binaries -target_link_options(cuvs PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") +target_link_options(cuvs PRIVATE $) # ################################################################################################## # * cuvs_c ------------------------------------------------------------------------------- diff --git a/cpp/include/cuvs/neighbors/common.hpp b/cpp/include/cuvs/neighbors/common.hpp index 414438067..8218b5f52 100644 --- a/cpp/include/cuvs/neighbors/common.hpp +++ b/cpp/include/cuvs/neighbors/common.hpp @@ -172,6 +172,22 @@ struct owning_dataset : public strided_dataset { }; }; +template +struct is_strided_dataset : std::false_type {}; + +template +struct is_strided_dataset> : std::true_type {}; + +template +struct is_strided_dataset> : std::true_type {}; + +template +struct is_strided_dataset> + : std::true_type {}; + +template +inline constexpr bool is_strided_dataset_v = is_strided_dataset::value; + /** * @brief Contstruct a strided matrix from any mdarray or mdspan. * @@ -284,23 +300,25 @@ auto make_aligned_dataset(const raft::resources& res, const SrcT& src, uint32_t */ template struct vpq_dataset : public dataset { + using index_type = IdxT; + using math_type = MathT; /** Vector Quantization codebook - "coarse cluster centers". */ - raft::device_matrix vq_code_book; + raft::device_matrix vq_code_book; /** Product Quantization codebook - "fine cluster centers". */ - raft::device_matrix pq_code_book; + raft::device_matrix pq_code_book; /** Compressed dataset. 
*/ - raft::device_matrix data; + raft::device_matrix data; - vpq_dataset(raft::device_matrix&& vq_code_book, - raft::device_matrix&& pq_code_book, - raft::device_matrix&& data) + vpq_dataset(raft::device_matrix&& vq_code_book, + raft::device_matrix&& pq_code_book, + raft::device_matrix&& data) : vq_code_book{std::move(vq_code_book)}, pq_code_book{std::move(pq_code_book)}, data{std::move(data)} { } - [[nodiscard]] auto n_rows() const noexcept -> IdxT final { return data.extent(0); } + [[nodiscard]] auto n_rows() const noexcept -> index_type final { return data.extent(0); } [[nodiscard]] auto dim() const noexcept -> uint32_t final { return vq_code_book.extent(1); } [[nodiscard]] auto is_owning() const noexcept -> bool final { return true; } @@ -354,6 +372,15 @@ struct vpq_dataset : public dataset { } }; +template +struct is_vpq_dataset : std::false_type {}; + +template +struct is_vpq_dataset> : std::true_type {}; + +template +inline constexpr bool is_vpq_dataset_v = is_vpq_dataset::value; + namespace filtering { /* A filter that filters nothing. This is the default behavior. 
*/ diff --git a/cpp/src/neighbors/detail/ann_utils.cuh b/cpp/src/neighbors/detail/ann_utils.cuh index 1db2dca64..29f790ec5 100644 --- a/cpp/src/neighbors/detail/ann_utils.cuh +++ b/cpp/src/neighbors/detail/ann_utils.cuh @@ -224,7 +224,7 @@ inline void memzero(T* ptr, IdxT n_elems, rmm::cuda_stream_view stream) } template -RAFT_KERNEL outer_add_kernel(const T* a, IdxT len_a, const T* b, IdxT len_b, T* c) +static __global__ void outer_add_kernel(const T* a, IdxT len_a, const T* b, IdxT len_b, T* c) { IdxT gid = threadIdx.x + blockDim.x * static_cast(blockIdx.x); IdxT i = gid / len_b; @@ -234,12 +234,12 @@ RAFT_KERNEL outer_add_kernel(const T* a, IdxT len_a, const T* b, IdxT len_b, T* } template -RAFT_KERNEL block_copy_kernel(const IdxT* in_offsets, - const IdxT* out_offsets, - IdxT n_blocks, - const T* in_data, - T* out_data, - IdxT n_mult) +static __global__ void block_copy_kernel(const IdxT* in_offsets, + const IdxT* out_offsets, + IdxT n_blocks, + const T* in_data, + T* out_data, + IdxT n_mult) { IdxT i = static_cast(blockDim.x) * static_cast(blockIdx.x) + threadIdx.x; // find the source offset using the binary search. 
@@ -317,7 +317,7 @@ void outer_add(const T* a, IdxT len_a, const T* b, IdxT len_b, T* c, rmm::cuda_s } template -RAFT_KERNEL copy_selected_kernel( +static __global__ void copy_selected_kernel( IdxT n_rows, IdxT n_cols, const S* src, const LabelT* row_ids, IdxT ld_src, T* dst, IdxT ld_dst) { IdxT gid = threadIdx.x + blockDim.x * static_cast(blockIdx.x); diff --git a/cpp/src/neighbors/detail/cagra/bitonic.hpp b/cpp/src/neighbors/detail/cagra/bitonic.hpp index 26195bd9c..ed609d6fd 100644 --- a/cpp/src/neighbors/detail/cagra/bitonic.hpp +++ b/cpp/src/neighbors/detail/cagra/bitonic.hpp @@ -26,7 +26,7 @@ namespace bitonic { namespace detail { template -_RAFT_DEVICE inline void swap_if_needed(K& k0, V& v0, K& k1, V& v1, const bool asc) +RAFT_DEVICE_INLINE_FUNCTION void swap_if_needed(K& k0, V& v0, K& k1, V& v1, const bool asc) { if ((k0 != k1) && ((k0 < k1) != asc)) { const auto tmp_k = k0; @@ -39,7 +39,10 @@ _RAFT_DEVICE inline void swap_if_needed(K& k0, V& v0, K& k1, V& v1, const bool a } template -_RAFT_DEVICE inline void swap_if_needed(K& k0, V& v0, const unsigned lane_offset, const bool asc) +RAFT_DEVICE_INLINE_FUNCTION void swap_if_needed(K& k0, + V& v0, + const unsigned lane_offset, + const bool asc) { auto k1 = __shfl_xor_sync(~0u, k0, lane_offset); auto v1 = __shfl_xor_sync(~0u, v0, lane_offset); @@ -51,7 +54,10 @@ _RAFT_DEVICE inline void swap_if_needed(K& k0, V& v0, const unsigned lane_offset template struct warp_merge_core { - _RAFT_DEVICE inline void operator()(K k[N], V v[N], const std::uint32_t range, const bool asc) + RAFT_DEVICE_INLINE_FUNCTION void operator()(K k[N], + V v[N], + const std::uint32_t range, + const bool asc) { const auto lane_id = threadIdx.x % warp_size; @@ -93,7 +99,10 @@ struct warp_merge_core { template struct warp_merge_core { - _RAFT_DEVICE inline void operator()(K k[6], V v[6], const std::uint32_t range, const bool asc) + RAFT_DEVICE_INLINE_FUNCTION void operator()(K k[6], + V v[6], + const std::uint32_t range, + const bool asc) { 
constexpr unsigned N = 6; const auto lane_id = threadIdx.x % warp_size; @@ -141,7 +150,10 @@ struct warp_merge_core { template struct warp_merge_core { - _RAFT_DEVICE inline void operator()(K k[3], V v[3], const std::uint32_t range, const bool asc) + RAFT_DEVICE_INLINE_FUNCTION void operator()(K k[3], + V v[3], + const std::uint32_t range, + const bool asc) { constexpr unsigned N = 3; const auto lane_id = threadIdx.x % warp_size; @@ -171,7 +183,10 @@ struct warp_merge_core { template struct warp_merge_core { - _RAFT_DEVICE inline void operator()(K k[2], V v[2], const std::uint32_t range, const bool asc) + RAFT_DEVICE_INLINE_FUNCTION void operator()(K k[2], + V v[2], + const std::uint32_t range, + const bool asc) { constexpr unsigned N = 2; const auto lane_id = threadIdx.x % warp_size; @@ -197,7 +212,10 @@ struct warp_merge_core { template struct warp_merge_core { - _RAFT_DEVICE inline void operator()(K k[1], V v[1], const std::uint32_t range, const bool asc) + RAFT_DEVICE_INLINE_FUNCTION void operator()(K k[1], + V v[1], + const std::uint32_t range, + const bool asc) { const auto lane_id = threadIdx.x % warp_size; const std::uint32_t b = range; @@ -211,14 +229,15 @@ struct warp_merge_core { } // namespace detail template -__device__ void warp_merge(K k[N], V v[N], unsigned range, const bool asc = true) +RAFT_DEVICE_INLINE_FUNCTION void warp_merge(K k[N], V v[N], unsigned range, const bool asc = true) { detail::warp_merge_core{}(k, v, range, asc); } template -__device__ void warp_sort(K k[N], V v[N], const bool asc = true) +RAFT_DEVICE_INLINE_FUNCTION void warp_sort(K k[N], V v[N], const bool asc = true) { +#pragma unroll for (std::uint32_t range = 1; range <= warp_size; range <<= 1) { warp_merge(k, v, range, asc); } diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index cfb5f7919..6dc601f32 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh 
@@ -16,7 +16,6 @@ #pragma once -#include "compute_distance_vpq.cuh" #include "factory.cuh" #include "search_plan.cuh" #include "search_single_cta_inst.cuh" @@ -85,29 +84,22 @@ inline return filter; } -template -void search_main_core( - raft::resources const& res, - search_params params, - DatasetDescriptorT dataset_desc, - raft::device_matrix_view - graph, - raft::device_matrix_view - queries, - raft::device_matrix_view - neighbors, - raft::device_matrix_view - distances, - CagraSampleFilterT sample_filter = CagraSampleFilterT(), - cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded) +template +void search_main_core(raft::resources const& res, + search_params params, + const dataset_descriptor_host& dataset_desc, + raft::device_matrix_view graph, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances, + CagraSampleFilterT sample_filter = CagraSampleFilterT()) { RAFT_LOG_DEBUG("# dataset size = %lu, dim = %lu\n", - static_cast(dataset_desc.size), - static_cast(dataset_desc.dim)); + static_cast(graph.extent(0)), + static_cast(queries.extent(1))); RAFT_LOG_DEBUG("# query size = %lu, dim = %lu\n", static_cast(queries.extent(0)), static_cast(queries.extent(1))); - RAFT_EXPECTS(queries.extent(1) == dataset_desc.dim, "Queries and index dim must match"); const uint32_t topk = neighbors.extent(1); cudaDeviceProp deviceProp = raft::resource::get_device_properties(res); @@ -119,12 +111,12 @@ void search_main_core( "cagra::search(max_queries = %u, k = %u, dim = %zu)", params.max_queries, topk, - dataset_desc.dim); + queries.extent(1)); using CagraSampleFilterT_s = typename CagraSampleFilterT_Selector::type; - std::unique_ptr> plan = - factory::create( - res, params, dataset_desc.dim, graph.extent(1), topk, metric); + std::unique_ptr> plan = + factory::create( + res, params, dataset_desc, queries.extent(1), graph.extent(1), topk); plan->check(topk); @@ -134,21 +126,17 @@ void search_main_core( 
for (unsigned qid = 0; qid < queries.extent(0); qid += max_queries) { const uint32_t n_queries = std::min(max_queries, queries.extent(0) - qid); - auto _topk_indices_ptr = - reinterpret_cast(neighbors.data_handle()) + - (topk * qid); + auto _topk_indices_ptr = reinterpret_cast(neighbors.data_handle()) + (topk * qid); auto _topk_distances_ptr = distances.data_handle() + (topk * qid); // todo(tfeher): one could keep distances optional and pass nullptr const auto* _query_ptr = queries.data_handle() + (query_dim * qid); const auto* _seed_ptr = plan->num_seeds > 0 - ? reinterpret_cast(plan->dev_seed.data()) + - (plan->num_seeds * qid) + ? reinterpret_cast(plan->dev_seed.data()) + (plan->num_seeds * qid) : nullptr; uint32_t* _num_executed_iterations = nullptr; (*plan)(res, - dataset_desc, graph, _topk_indices_ptr, _topk_distances_ptr, @@ -161,77 +149,6 @@ void search_main_core( } } -template -void launch_vpq_search_main_core( - raft::resources const& res, - const vpq_dataset* vpq_dset, - search_params params, - raft::device_matrix_view graph, - raft::device_matrix_view queries, - raft::device_matrix_view neighbors, - raft::device_matrix_view distances, - CagraSampleFilterT sample_filter, - const cuvs::distance::DistanceType metric) -{ - RAFT_EXPECTS(vpq_dset->pq_bits() == 8, "Only pq_bits = 8 is supported for now"); - RAFT_EXPECTS(vpq_dset->pq_len() == 2 || vpq_dset->pq_len() == 4, - "Only pq_len 2 or 4 is supported for now"); - RAFT_EXPECTS(vpq_dset->dim() % vpq_dset->pq_dim() == 0, - "dim must be a multiple of pq_dim at the moment"); - - const float vq_scale = 1.0f; - const float pq_scale = 1.0f; - - if (vpq_dset->pq_bits() == 8) { - if (vpq_dset->pq_len() == 2) { - using dataset_desc_t = cagra_q_dataset_descriptor_t; - dataset_desc_t dataset_desc(vpq_dset->data.data_handle(), - vpq_dset->encoded_row_length(), - vpq_dset->pq_dim(), - vpq_dset->vq_code_book.data_handle(), - vq_scale, - vpq_dset->pq_code_book.data_handle(), - pq_scale, - size_t(vpq_dset->n_rows()), - 
vpq_dset->dim()); - search_main_core( - res, params, dataset_desc, graph, queries, neighbors, distances, sample_filter, metric); - } else if (vpq_dset->pq_len() == 4) { - using dataset_desc_t = cagra_q_dataset_descriptor_t; - dataset_desc_t dataset_desc(vpq_dset->data.data_handle(), - vpq_dset->encoded_row_length(), - vpq_dset->pq_dim(), - vpq_dset->vq_code_book.data_handle(), - vq_scale, - vpq_dset->pq_code_book.data_handle(), - pq_scale, - size_t(vpq_dset->n_rows()), - vpq_dset->dim()); - search_main_core( - res, params, dataset_desc, graph, queries, neighbors, distances, sample_filter, metric); - } else { - RAFT_FAIL("Subspace dimension must be 2 or 4"); - } - } else { - RAFT_FAIL("Only 8-bit PQ is supported now"); - } -} - /** * @brief Search ANN using the constructed index. * @@ -264,6 +181,7 @@ void search_main(raft::resources const& res, raft::device_matrix_view distances, CagraSampleFilterT sample_filter = CagraSampleFilterT()) { + auto stream = raft::resource::get_cuda_stream(res); const auto& graph = index.graph(); auto graph_internal = raft::make_device_matrix_view( reinterpret_cast(graph.data_handle()), graph.extent(0), graph.extent(1)); @@ -273,39 +191,21 @@ void search_main(raft::resources const& res, // Dispatch search parameters based on the dataset kind. if (auto* strided_dset = dynamic_cast*>(&index.data()); strided_dset != nullptr) { - // Set TEAM_SIZE and DATASET_BLOCK_SIZE to zero tentatively since these parameters cannot be - // determined here. They are set just before kernel launch. 
- using dataset_desc_t = standard_dataset_descriptor_t; // Search using a plain (strided) row-major dataset - const dataset_desc_t dataset_desc(strided_dset->view().data_handle(), - strided_dset->n_rows(), - strided_dset->dim(), - strided_dset->stride()); - search_main_core(res, - params, - dataset_desc, - graph_internal, - queries, - neighbors, - distances, - sample_filter, - index.metric()); + auto& desc = dataset_descriptor_init_with_cache( + res, params, *strided_dset, index.metric()); + search_main_core( + res, params, desc, graph_internal, queries, neighbors, distances, sample_filter); } else if (auto* vpq_dset = dynamic_cast*>(&index.data()); vpq_dset != nullptr) { // Search using a compressed dataset RAFT_FAIL("FP32 VPQ dataset support is coming soon"); } else if (auto* vpq_dset = dynamic_cast*>(&index.data()); vpq_dset != nullptr) { - launch_vpq_search_main_core( - res, - vpq_dset, - params, - graph_internal, - queries, - neighbors, - distances, - sample_filter, - index.metric()); + auto& desc = dataset_descriptor_init_with_cache( + res, params, *vpq_dset, index.metric()); + search_main_core( + res, params, desc, graph_internal, queries, neighbors, distances, sample_filter); } else if (auto* empty_dset = dynamic_cast*>(&index.data()); empty_dset != nullptr) { // Forgot to add a dataset. diff --git a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh new file mode 100644 index 000000000..8407ef055 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh @@ -0,0 +1,511 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#pragma once + +#include "compute_distance_standard.hpp" +#include "compute_distance_vpq.hpp" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; + +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern 
template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; + +extern template struct instance_selector< + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, 
+ vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec>; + +using descriptor_instances = instance_selector< + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec>; + +template +auto dataset_descriptor_init(const cagra::search_params& params, + const DatasetT& dataset, + cuvs::distance::DistanceType metric, + rmm::cuda_stream_view stream) + -> dataset_descriptor_host +{ + auto [init, priority] = + descriptor_instances::select(params, dataset, metric); + if (init == nullptr || priority < 0) { + RAFT_FAIL("No dataset descriptor 
instance compiled for this parameter combination."); + } + return init(params, dataset, metric, stream); +} + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.cu b/cpp/src/neighbors/detail/cagra/compute_distance.cu new file mode 100644 index 000000000..45316e59b --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance.cu @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance-ext.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; + +template struct instance_selector< + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index 2b0c750ff..4bed275ab 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -20,303 +20,363 @@ #include "utils.hpp" #include +#include +#include +#include #include // TODO: This 
shouldn't be invoking spatial/knn #include "../ann_utils.cuh" +#include #include +#include +#include #include namespace cuvs::neighbors::cagra::detail { -namespace device { -// using LOAD_256BIT_T = ulonglong4; -using LOAD_128BIT_T = uint4; -using LOAD_64BIT_T = uint64_t; - -template -_RAFT_DEVICE constexpr unsigned get_vlen() -{ - return utils::size_of() / utils::size_of(); -} - -template -_RAFT_DEVICE void compute_distance_to_random_nodes( - INDEX_T* const result_indices_ptr, // [num_pickup] - DISTANCE_T* const result_distances_ptr, // [num_pickup] - const typename DATASET_DESCRIPTOR_T::QUERY_T* const query_buffer, - const DATASET_DESCRIPTOR_T& dataset_desc, - const std::size_t num_pickup, - const unsigned num_distilation, - const uint64_t rand_xor_mask, - const INDEX_T* const seed_ptr, // [num_seeds] - const uint32_t num_seeds, - INDEX_T* const visited_hash_ptr, - const uint32_t hash_bitlen, - const cuvs::distance::DistanceType metric, - const uint32_t block_id = 0, - const uint32_t num_blocks = 1) -{ - uint32_t max_i = num_pickup; - if (max_i % (32 / TEAM_SIZE)) { max_i += (32 / TEAM_SIZE) - (max_i % (32 / TEAM_SIZE)); } - - for (uint32_t i = threadIdx.x / TEAM_SIZE; i < max_i; i += blockDim.x / TEAM_SIZE) { - const bool valid_i = (i < num_pickup); - - INDEX_T best_index_team_local; - DISTANCE_T best_norm2_team_local = utils::get_max_value(); - for (uint32_t j = 0; j < num_distilation; j++) { - // Select a node randomly and compute the distance to it - INDEX_T seed_index; - if (valid_i) { - // uint32_t gid = i + (num_pickup * (j + (num_distilation * block_id))); - uint32_t gid = block_id + (num_blocks * (i + (num_pickup * j))); - if (seed_ptr && (gid < num_seeds)) { - seed_index = seed_ptr[gid]; - } else { - seed_index = device::xorshift64(gid ^ rand_xor_mask) % dataset_desc.size; - } - } - - DISTANCE_T norm2; - switch (metric) { - case cuvs::distance::DistanceType::L2Expanded: - norm2 = - dataset_desc.template compute_similarity( - query_buffer, seed_index, 
valid_i); - break; - case cuvs::distance::DistanceType::InnerProduct: - norm2 = - dataset_desc.template compute_similarity( - query_buffer, seed_index, valid_i); - break; - default: break; - } - - if (valid_i && (norm2 < best_norm2_team_local)) { - best_norm2_team_local = norm2; - best_index_team_local = seed_index; +/** + * @brief Dataset and distance description. + * + * This is the base type for the dataset/distance descriptors. + * The actual implementations are hidden in `compute_distance_***-impl.cuh` files, which should be + * included only in `compute_distance_***.cu` files to enforce separable compilation. + * + * [Note: manual dispatch] + * The descriptor type hierarchy declared here resembles the usual C++ inheritance: the search + * kernels take a pointer to the base type as an argument, but the actual implementation types are + * passed by the host. The kernels only ever need two functions `setup_workspace` and + * `compute_distance`; the choice of the implementation happens at runtime. + * + * However, for performance reasons, we don't use the C++ virtual dispatch mechanics here. + * The extra pointer-chasing and register usage overheads associated with virtual tables turn out to + * cause a significant slowdown in the performance-critical `compute_distance`. + * Instead, we manually dispatch the two polymorphic functions and store them as fields in the + * descriptor structure. + * + * [Note: initialization/dispatch] + * The host doesn't know the addresses of the device symbols. That means we either need to resolve + * the device functions and store them in the descriptor directly on the device, or use + * `cudaMemcpyFromSymbolAsync` to fetch them (note, there is the same problem with classes: if an object + * is created on the host, its pointer to the vtable would be invalid on device).
+ * We take the first approach: there's an `***_init_kernel` for each descriptor instance that is + * called before the search kernel; all it does is call a (placement) new with an appropriate type + * and arguments in a single GPU thread. + * + */ +template +struct alignas(device::LOAD_128BIT_T) dataset_descriptor_base_t { + using base_type = dataset_descriptor_base_t; + using LOAD_T = device::LOAD_128BIT_T; + using DATA_T = DataT; + using INDEX_T = IndexT; + using DISTANCE_T = DistanceT; + + /** + * @brief "polymorphic" `compute_distance` arguments. + * + * This is a tightly-packed POD struct holding the arguments of `compute_distance`. + * **Important** this structure is passed by value to `compute_distance`; it's important it + * remains small. + * + * [Note: arguments layout] + * The descriptor implementations require different sets of arguments (with a couple of arguments + * overlapping). At the same time the `compute_distance` is defined such that it accepts the + * `args_t` by value. That means the layout of the struct must be identical for all descriptor + * implementations. We work around this requirement by defining generic fields in this struct and + * assigning the meaning to them on the implementation side. + */ + struct alignas(LOAD_T) args_t { + void* extra_ptr1; + void* extra_ptr2; + /** Pointer to the workspace in the shared memory (filled in every copy by a thread block). */ + uint32_t smem_ws_ptr; + /** Dimensionality of the data/queries. */ + uint32_t dim; + uint32_t extra_word1; + uint32_t extra_word2; + + /** + * Load this struct from shared memory. + * + * NB: until `compute_distance` is called, the arguments struct is stored in the shared memory + * as a member of the descriptor struct. This helper function saves a few instructions by + * forcing the compiler to assume it is indeed in the shared memory address space.
+ */ + RAFT_DEVICE_INLINE_FUNCTION auto load() const -> args_t + { + constexpr int kCount = sizeof(*this) / sizeof(LOAD_T); + using blob_type = LOAD_T[kCount]; + args_t r; + auto& src = reinterpret_cast(*this); + auto& dst = reinterpret_cast(r); +#pragma unroll + for (int i = 0; i < kCount; i++) { + device::lds(dst[i], src + i); } + return r; } - - const unsigned lane_id = threadIdx.x % TEAM_SIZE; - if (valid_i && lane_id == 0) { - if (hashmap::insert(visited_hash_ptr, hash_bitlen, best_index_team_local)) { - result_distances_ptr[i] = best_norm2_team_local; - result_indices_ptr[i] = best_index_team_local; - } else { - result_distances_ptr[i] = utils::get_max_value(); - result_indices_ptr[i] = utils::get_max_value(); - } + }; + + /** Shared memory usage and team_size packed into a single uint32_t to save on memory requests. */ + struct smem_and_team_size_t { + uint32_t value; + RAFT_INLINE_FUNCTION constexpr smem_and_team_size_t(uint32_t smem_size_bytes, + uint32_t team_size_bitshift) + : value{(team_size_bitshift << 24) | smem_size_bytes} + { } - } -} - -template -_RAFT_DEVICE void compute_distance_to_child_nodes( - INDEX_T* const result_child_indices_ptr, - DISTANCE_T* const result_child_distances_ptr, - // query - const typename DATASET_DESCRIPTOR_T::QUERY_T* const query_buffer, - // [dataset_dim, dataset_size] - const DATASET_DESCRIPTOR_T& dataset_desc, - // [knn_k, dataset_size] - const INDEX_T* const knn_graph, - const std::uint32_t knn_k, - // hashmap - INDEX_T* const visited_hashmap_ptr, - const std::uint32_t hash_bitlen, - const INDEX_T* const parent_indices, - const INDEX_T* const internal_topk_list, - const std::uint32_t search_width, - const cuvs::distance::DistanceType metric) -{ - constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask::value; - const INDEX_T invalid_index = utils::get_max_value(); - - // Read child indices of parents from knn graph and check if the distance - // computaiton is necessary. 
- for (uint32_t i = threadIdx.x; i < knn_k * search_width; i += blockDim.x) { - const INDEX_T smem_parent_id = parent_indices[i / knn_k]; - INDEX_T child_id = invalid_index; - if (smem_parent_id != invalid_index) { - const auto parent_id = internal_topk_list[smem_parent_id] & ~index_msb_1_mask; - child_id = knn_graph[(i % knn_k) + (static_cast(knn_k) * parent_id)]; + /** Total dynamic shared memory required by the descriptor. */ + RAFT_INLINE_FUNCTION constexpr auto smem_ws_size_in_bytes() const noexcept -> uint32_t + { + return value & 0xffffffu; } - if (child_id != invalid_index) { - if (hashmap::insert(visited_hashmap_ptr, hash_bitlen, child_id) == 0) { - child_id = invalid_index; - } + RAFT_INLINE_FUNCTION constexpr auto team_size_bitshift() const noexcept -> uint32_t + { + return (value >> 24) & 0xffu; + } + /** How many threads are involved in computing a single distance. */ + RAFT_INLINE_FUNCTION constexpr auto team_size() const noexcept -> uint32_t + { + return 1u << team_size_bitshift(); } - result_child_indices_ptr[i] = child_id; + }; + static_assert(sizeof(smem_and_team_size_t) == sizeof(uint32_t)); + + using setup_workspace_type = const base_type*(const base_type*, void*, const DATA_T*, uint32_t); + using compute_distance_type = DISTANCE_T(const args_t, const INDEX_T); + + args_t args; + + /** Copy the descriptor and the query into shared memory and do any other work, such as + * initializing the codebook. */ + setup_workspace_type* setup_workspace_impl; + /** Compute the distance from the query vector (stored in the smem_workspace) and a dataset vector + * given by the dataset_index. */ + compute_distance_type* compute_distance_impl; + /** A placeholder for an implementation-specific pointer. */ + void* extra_ptr3; + smem_and_team_size_t smem_and_team_size; + + /** Number of records in the database. 
*/ + INDEX_T size; + + RAFT_INLINE_FUNCTION dataset_descriptor_base_t(setup_workspace_type* setup_workspace_impl, + compute_distance_type* compute_distance_impl, + INDEX_T size, + uint32_t dim, + uint32_t team_size_bitshift, + uint32_t smem_ws_size_in_bytes) + : setup_workspace_impl(setup_workspace_impl), + compute_distance_impl(compute_distance_impl), + size(size), + smem_and_team_size(smem_ws_size_in_bytes, team_size_bitshift), + args{nullptr, nullptr, 0, dim, 0, 0} + { } - __syncthreads(); - // Compute the distance to child nodes - std::uint32_t max_i = knn_k * search_width; - if (max_i % (32 / TEAM_SIZE)) { max_i += (32 / TEAM_SIZE) - (max_i % (32 / TEAM_SIZE)); } - for (std::uint32_t tid = threadIdx.x; tid < max_i * TEAM_SIZE; tid += blockDim.x) { - const auto i = tid / TEAM_SIZE; - const bool valid_i = (i < (knn_k * search_width)); - INDEX_T child_id = invalid_index; - if (valid_i) { child_id = result_child_indices_ptr[i]; } + /** Total dynamic shared memory required by the descriptor. 
*/ + RAFT_INLINE_FUNCTION constexpr auto smem_ws_size_in_bytes() const noexcept -> uint32_t + { + return smem_and_team_size.smem_ws_size_in_bytes(); + } + RAFT_INLINE_FUNCTION constexpr auto team_size_bitshift() const noexcept -> uint32_t + { + return smem_and_team_size.team_size_bitshift(); + } + RAFT_DEVICE_INLINE_FUNCTION constexpr auto team_size_bitshift_from_smem() const noexcept + -> uint32_t + { + uint32_t sts; + raft::lds(sts, reinterpret_cast(&smem_and_team_size)); + return reinterpret_cast(sts).team_size_bitshift(); + } - DISTANCE_T norm2; - switch (metric) { - case cuvs::distance::DistanceType::L2Expanded: - norm2 = dataset_desc.template compute_similarity( - query_buffer, child_id, child_id != invalid_index); - break; - case cuvs::distance::DistanceType::InnerProduct: - norm2 = - dataset_desc.template compute_similarity( - query_buffer, child_id, child_id != invalid_index); - break; - default: break; - } + /** How many threads are involved in computing a single distance. */ + RAFT_INLINE_FUNCTION constexpr auto team_size() const noexcept -> uint32_t + { + return smem_and_team_size.team_size(); + } - // Store the distance - const unsigned lane_id = threadIdx.x % TEAM_SIZE; - if (valid_i && lane_id == 0) { - if (child_id != invalid_index) { - result_child_distances_ptr[i] = norm2; - } else { - result_child_distances_ptr[i] = utils::get_max_value(); - } - } + RAFT_DEVICE_INLINE_FUNCTION auto setup_workspace(void* smem_ptr, + const DATA_T* queries_ptr, + uint32_t query_id) const -> const base_type* + { + return setup_workspace_impl(this, smem_ptr, queries_ptr, query_id); } -} -} // namespace device + RAFT_DEVICE_INLINE_FUNCTION auto compute_distance(INDEX_T dataset_index, bool valid) const + -> DISTANCE_T + { + auto per_thread_distances = valid ? 
compute_distance_impl(args.load(), dataset_index) : 0; + return device::team_sum(per_thread_distances, team_size_bitshift_from_smem()); + } +}; -template -struct dataset_descriptor_base_t { - using INDEX_T = INDEX_T_; - using QUERY_T = QUERY_T_; - using DISTANCE_T = DISTANCE_T_; +/** + * @brief Hosting a device descriptor. + * + * The dataset descriptor is initialized on the device side and stays there. + * The host struct manages the lifetime of the associated device pointer and a couple parameters + * affecting the search kernel launch config. + * + */ +template +struct dataset_descriptor_host { + using dev_descriptor_t = dataset_descriptor_base_t; + uint32_t smem_ws_size_in_bytes = 0; + uint32_t team_size = 0; + + template + dataset_descriptor_host(const DescriptorImpl& dd_host, rmm::cuda_stream_view stream) + : dev_ptr_{[stream]() { + dev_descriptor_t* p; + RAFT_CUDA_TRY(cudaMallocAsync(&p, sizeof(DescriptorImpl), stream)); + return p; + }(), + [stream](dev_descriptor_t* p) { RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(p, stream)); }}, + smem_ws_size_in_bytes{dd_host.smem_ws_size_in_bytes()}, + team_size{dd_host.team_size()} + { + } - const INDEX_T size; - const std::uint32_t dim; + [[nodiscard]] auto dev_ptr() const -> const dev_descriptor_t* { return dev_ptr_.get(); } + [[nodiscard]] auto dev_ptr() -> dev_descriptor_t* { return dev_ptr_.get(); } - dataset_descriptor_base_t(const INDEX_T size, const std::uint32_t dim) : size(size), dim(dim) {} + private: + std::unique_ptr> dev_ptr_; }; -template -struct standard_dataset_descriptor_t - : public dataset_descriptor_base_t { - using LOAD_T = device::LOAD_128BIT_T; - using DATA_T = DATA_T_; - using QUERY_T = typename dataset_descriptor_base_t::QUERY_T; - - const DATA_T* const ptr; - const std::size_t ld; - using dataset_descriptor_base_t::size; - using dataset_descriptor_base_t::dim; - - standard_dataset_descriptor_t(const DATA_T* const ptr, - const std::size_t size, - const std::uint32_t dim, - const std::size_t ld) - : 
dataset_descriptor_base_t(size, dim), ptr(ptr), ld(ld) +/** + * @brief The signature for descriptor initialization. + * + * There is an init function associated with every descriptor implementation. It's responsible for + * initializing the device-side descriptor instance (calling the init kernel). + * + */ +template +using init_desc_type = + dataset_descriptor_host (*)(const cagra::search_params&, + const DatasetT&, + cuvs::distance::DistanceType, + rmm::cuda_stream_view); + +/** + * @brief Descriptor instance specification. + * + * This type provides a decentralized way for selecting a descriptor instance best suitable for the + * given dataset and distance metric. + * There is a spec for every descriptor (described in the interface files + * `compute_distance_***.hpp`). + * + * The `instance_spec` implementation must have the following static member template functions: + * * constexpr bool accepts_dataset() + * - tells whether the spec is compatible with the dataset type, executed at compile time. + * * double priority(..) + * - tells how to select a single spec out of possibly several compatible specs + * * init_desc_type init + * - (see `init_desc_type` above) the function to initialize the descriptor. + */ +template +struct instance_spec { + using data_type = DataT; + using index_type = IndexT; + using distance_type = DistanceT; + using host_type = dataset_descriptor_host; + /** Use this to constrain the input dataset type. 
*/ + template + constexpr static inline bool accepts_dataset() { + return false; } +}; - static const std::uint32_t smem_buffer_size_in_byte = 0; - __device__ void set_smem_ptr(void* const){}; - - template - __device__ void copy_query(const DATA_T* const dmem_query_ptr, - QUERY_T* const smem_query_ptr, - const std::uint32_t query_smem_buffer_length) - { - for (unsigned i = threadIdx.x; i < query_smem_buffer_length; i += blockDim.x) { - unsigned j = device::swizzling(i); - if (i < dim) { - smem_query_ptr[j] = - cuvs::spatial::knn::detail::utils::mapping{}(dmem_query_ptr[i]); - } else { - smem_query_ptr[j] = 0.0; - } - } +/** Whether the descriptor is compatible with the dataset and arguments at the type level + * (compile-time check). + */ +template +constexpr bool spec_sound = std::is_same_v && + std::is_same_v && + std::is_same_v && + InstanceSpec::template accepts_dataset(); + +/** + * @brief Get the init function and the priority of the descriptor given by the InstanceSpec. + * + * @return (init function, priority) + */ +template +constexpr auto spec_match(const cagra::search_params& params, + const DatasetT& dataset, + cuvs::distance::DistanceType metric) + -> std::tuple, double> +{ + if constexpr (spec_sound) { + return std::make_tuple(InstanceSpec::template init, + InstanceSpec::template priority(params, dataset, metric)); } + return std::make_tuple(nullptr, -1.0); +} - template - std::enable_if_t __device__ - dist_op(T a, T b) const +/** + * @brief Select the best matching descriptor instance from the given type-level list. + * + * This is a helper struct that goes through the given list of specs (given as template arguments), + * filters it (partially at compile time and partially at runtime), and selects the descriptor with + * the highest priority.
+ * + * There is a single point in the codebase, where all specs are brought together; it's in the + * `neighbors/detail/cagra/compute_distance-ext.cuh`, which is generated by + * `neighbors/detail/cagra/compute_distance_00_generate.py`. + * Hence, `compute_distance_00_generate.py` is the only place you need to manually change to modify + * or extend the list of supported dataset descriptors. + * The logic of selecting the descriptor is fully defined in this file, whereas the priorities of + * specific implementations are defined next to the implementations. + */ +template +struct instance_selector { + template + static auto select(const cagra::search_params&, const DatasetT&, cuvs::distance::DistanceType) + -> std::tuple, double> { - T diff = a - b; - return diff * diff; + return std::make_tuple(nullptr, -1.0); } +}; - template - std::enable_if_t __device__ - dist_op(T a, T b) const +template +struct instance_selector { + template + static auto select(const cagra::search_params& params, + const DatasetT& dataset, + cuvs::distance::DistanceType metric) + -> std::enable_if_t, + std::tuple, double>> { - return -a * b; + auto s0 = spec_match(params, dataset, metric); + auto ss = instance_selector::template select( + params, dataset, metric); + return std::get<1>(s0) >= std::get<1>(ss) ?
s0 : ss; } - template - __device__ DISTANCE_T compute_similarity(const QUERY_T* const query_ptr, - const INDEX_T dataset_i, - const bool valid) const + template + static auto select(const cagra::search_params& params, + const DatasetT& dataset, + cuvs::distance::DistanceType metric) + -> std::enable_if_t, + std::tuple, double>> { - const auto dataset_ptr = ptr + dataset_i * ld; - const unsigned lane_id = threadIdx.x % TEAM_SIZE; - constexpr unsigned vlen = device::get_vlen(); - // #include (DATASET_BLOCK_DIM, TEAM_SIZE * vlen); - raft::TxN_t dl_buff[reg_nelem]; - - DISTANCE_T norm2 = 0; - if (valid) { - for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += DATASET_BLOCK_DIM) { -#pragma unroll - for (uint32_t e = 0; e < reg_nelem; e++) { - const uint32_t k = (lane_id + (TEAM_SIZE * e)) * vlen + elem_offset; - if (k >= dim) break; - dl_buff[e].load(dataset_ptr, k); - } -#pragma unroll - for (uint32_t e = 0; e < reg_nelem; e++) { - const uint32_t k = (lane_id + (TEAM_SIZE * e)) * vlen + elem_offset; - if (k >= dim) break; -#pragma unroll - for (uint32_t v = 0; v < vlen; v++) { - const uint32_t kv = k + v; - // Note this loop can go above the dataset_dim for padded arrays. This is not a problem - // because: - // - Above the last element (dataset_dim-1), the query array is filled with zeros. - // - The data buffer has to be also padded with zeros. 
- DISTANCE_T d = query_ptr[device::swizzling(kv)]; - norm2 += dist_op( - d, cuvs::spatial::knn::detail::utils::mapping{}(dl_buff[e].val.data[v])); - } - } - } - } - for (uint32_t offset = TEAM_SIZE / 2; offset > 0; offset >>= 1) { - norm2 += __shfl_xor_sync(0xffffffff, norm2, offset); - } - return norm2; + return instance_selector::template select( + params, dataset, metric); } }; diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py new file mode 100644 index 000000000..52a15e2a1 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py @@ -0,0 +1,162 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import glob + +template = """/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +{includes} + +namespace cuvs::neighbors::cagra::detail {{ + +using namespace cuvs::distance; +{content} + +}} // namespace cuvs::neighbors::cagra::detail +""" + +mxdim_team = [(128, 8), (256, 16), (512, 32)] +#mxdim_team = [(64, 8), (128, 16), (256, 32)] +#mxdim_team = [(32, 8), (64, 16), (128, 32)] + +pq_bits = [8] +pq_lens = [2, 4] + +# rblock = [(256, 4), (512, 2), (1024, 1)] +# rcandidates = [32] +# rsize = [256, 512] +code_book_types = ["half"] + +search_types = dict( + float_uint32=("float", "uint32_t", "float"), # data_t, idx_t, distance_t + half_uint32=("half", "uint32_t", "float"), + int8_uint32=("int8_t", "uint32_t", "float"), + uint8_uint32=("uint8_t", "uint32_t", "float"), + # float_uint64=("float", "uint64_t", "float"), + # half_uint64=("half", "uint64_t", "float"), +) + +metric_prefix = 'DistanceType::' + +specs = [] +descs = [] +cmake_list = [] + + + + +# Cleanup first +for f in glob.glob("compute_distance_standard_*.cu"): + os.remove(f) +for f in glob.glob("compute_distance_vpq_*.cu"): + os.remove(f) + +# Generate new files +for type_path, (data_t, idx_t, distance_t) in search_types.items(): + for (mxdim, team) in mxdim_team: + # CAGRA + for metric in ['L2Expanded', 'InnerProduct']: + path = f"compute_distance_standard_{metric}_{type_path}_dim{mxdim}_t{team}.cu" + includes = '#include "compute_distance_standard-impl.cuh"' + params = f"{metric_prefix}{metric}, {team}, {mxdim}, {data_t}, {idx_t}, {distance_t}" + spec = f"standard_descriptor_spec<{params}>" + content = f"""template struct {spec};""" + specs.append(spec) + with open(path, "w") as f: + f.write(template.format(includes=includes, content=content)) + cmake_list.append(f" src/neighbors/detail/cagra/{path}") + + # CAGRA-Q + for code_book_t in code_book_types: + for pq_len in pq_lens: + for pq_bit in 
pq_bits: + for metric in ['L2Expanded']: + path = f"compute_distance_vpq_{metric}_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{pq_len}subd_{code_book_t}.cu" + includes = '#include "compute_distance_vpq-impl.cuh"' + params = f"{metric_prefix}{metric}, {team}, {mxdim}, {pq_bit}, {pq_len}, {code_book_t}, {data_t}, {idx_t}, {distance_t}" + spec = f"vpq_descriptor_spec<{params}>" + content = f"""template struct {spec};""" + specs.append(spec) + with open(path, "w") as f: + f.write(template.format(includes=includes, content=content)) + cmake_list.append(f" src/neighbors/detail/cagra/{path}") + +with open("compute_distance-ext.cuh", "w") as f: + includes = ''' +#pragma once + +#include "compute_distance_standard.hpp" +#include "compute_distance_vpq.hpp" +''' + newline = "\n" + contents = f''' +{newline.join(map(lambda s: "extern template struct " + s + ";", specs))} + +extern template struct + instance_selector<{("," + newline + " ").join(specs)}>; + +using descriptor_instances = + instance_selector<{("," + newline + " ").join(specs)}>; + +template +auto dataset_descriptor_init(const cagra::search_params& params, + const DatasetT& dataset, + cuvs::distance::DistanceType metric, + rmm::cuda_stream_view stream) + -> dataset_descriptor_host +{{ + auto [init, priority] = descriptor_instances::select(params, dataset, metric); + if (init == nullptr || priority < 0) {{ + RAFT_FAIL("No dataset descriptor instance compiled for this parameter combination."); + }} + return init(params, dataset, metric, stream); +}} +''' + f.write(template.format(includes=includes, content=contents)) + + +with open("compute_distance.cu", "w") as f: + includes = '#include "compute_distance-ext.cuh"' + newline = "\n" + contents = f''' +template struct instance_selector<{("," + newline + " ").join(specs)}>; +''' + f.write(template.format(includes=includes, content=contents)) + +cmake_list.sort() +for path in cmake_list: + print(path) diff --git 
a/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh new file mode 100644 index 000000000..b0205508a --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "compute_distance_standard.hpp" + +#include +#include +#include + +#include + +namespace cuvs::neighbors::cagra::detail { +namespace { +template +RAFT_DEVICE_INLINE_FUNCTION constexpr auto dist_op(T a, T b) + -> std::enable_if_t +{ + T diff = a - b; + return diff * diff; +} + +template +RAFT_DEVICE_INLINE_FUNCTION constexpr auto dist_op(T a, T b) + -> std::enable_if_t +{ + return -a * b; +} +} // namespace + +template +struct standard_dataset_descriptor_t : public dataset_descriptor_base_t { + using base_type = dataset_descriptor_base_t; + using QUERY_T = float; + using base_type::args; + using base_type::smem_ws_size_in_bytes; + using typename base_type::args_t; + using typename base_type::compute_distance_type; + using typename base_type::DATA_T; + using typename base_type::DISTANCE_T; + using typename base_type::INDEX_T; + using typename base_type::LOAD_T; + using typename base_type::setup_workspace_type; + constexpr static inline auto kMetric = Metric; + constexpr static inline auto kTeamSize = TeamSize; + constexpr static inline auto 
kDatasetBlockDim = DatasetBlockDim; + + static constexpr RAFT_INLINE_FUNCTION auto ptr(const args_t& args) noexcept + -> const DATA_T* const& + { + return (const DATA_T* const&)(args.extra_ptr1); + } + static constexpr RAFT_INLINE_FUNCTION auto ptr(args_t& args) noexcept -> const DATA_T*& + { + return (const DATA_T*&)(args.extra_ptr1); + } + + static constexpr RAFT_INLINE_FUNCTION auto ld(const args_t& args) noexcept -> const uint32_t& + { + return args.extra_word1; + } + static constexpr RAFT_INLINE_FUNCTION auto ld(args_t& args) noexcept -> uint32_t& + { + return args.extra_word1; + } + + _RAFT_HOST_DEVICE standard_dataset_descriptor_t(setup_workspace_type* setup_workspace_impl, + compute_distance_type* compute_distance_impl, + const DATA_T* ptr, + INDEX_T size, + uint32_t dim, + uint32_t ld) + : base_type(setup_workspace_impl, + compute_distance_impl, + size, + dim, + raft::Pow2::Log2, + get_smem_ws_size_in_bytes(dim)) + { + standard_dataset_descriptor_t::ptr(args) = ptr; + standard_dataset_descriptor_t::ld(args) = ld; + static_assert(sizeof(*this) == sizeof(base_type)); + static_assert(alignof(standard_dataset_descriptor_t) == alignof(base_type)); + } + + private: + RAFT_INLINE_FUNCTION constexpr static auto get_smem_ws_size_in_bytes(uint32_t dim) -> uint32_t + { + return sizeof(standard_dataset_descriptor_t) + + raft::round_up_safe(dim, DatasetBlockDim) * sizeof(QUERY_T); + } +}; + +template +_RAFT_DEVICE __noinline__ auto setup_workspace_standard( + const DescriptorT* that, + void* smem_ptr, + const typename DescriptorT::DATA_T* queries_ptr, + uint32_t query_id) -> const DescriptorT* +{ + using DATA_T = typename DescriptorT::DATA_T; + using LOAD_T = typename DescriptorT::LOAD_T; + using base_type = typename DescriptorT::base_type; + using QUERY_T = typename DescriptorT::QUERY_T; + using word_type = uint32_t; + constexpr auto kTeamSize = DescriptorT::kTeamSize; + constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim; + auto* r = 
reinterpret_cast(smem_ptr); + auto* buf = reinterpret_cast(r + 1); + if (r != that) { + constexpr uint32_t kCount = sizeof(DescriptorT) / sizeof(word_type); + using blob_type = word_type[kCount]; + auto& src = reinterpret_cast(*that); + auto& dst = reinterpret_cast(*r); + for (uint32_t i = threadIdx.x; i < kCount; i += blockDim.x) { + dst[i] = src[i]; + } + const auto smem_ptr_offset = + reinterpret_cast(&(r->args.smem_ws_ptr)) - reinterpret_cast(r); + if (threadIdx.x == uint32_t(smem_ptr_offset / sizeof(word_type))) { + r->args.smem_ws_ptr = uint32_t(__cvta_generic_to_shared(buf)); + } + __syncthreads(); + } + + uint32_t dim = r->args.dim; + auto buf_len = raft::round_up_safe(dim, kDatasetBlockDim); + constexpr auto vlen = device::get_vlen(); + queries_ptr += dim * query_id; + for (unsigned i = threadIdx.x; i < buf_len; i += blockDim.x) { + unsigned j = device::swizzling(i); + if (i < dim) { + buf[j] = cuvs::spatial::knn::detail::utils::mapping{}(queries_ptr[i]); + } else { + buf[j] = 0.0; + } + } + + return const_cast(r); +} + +template +RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_standard_worker( + const typename DescriptorT::DATA_T* __restrict__ dataset_ptr, + uint32_t dim, + uint32_t query_smem_ptr) -> typename DescriptorT::DISTANCE_T +{ + using DATA_T = typename DescriptorT::DATA_T; + using DISTANCE_T = typename DescriptorT::DISTANCE_T; + using LOAD_T = typename DescriptorT::LOAD_T; + using QUERY_T = typename DescriptorT::QUERY_T; + constexpr auto kTeamSize = DescriptorT::kTeamSize; + constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim; + constexpr auto vlen = device::get_vlen(); + constexpr auto reg_nelem = + raft::div_rounding_up_unsafe(kDatasetBlockDim, kTeamSize * vlen); + + DISTANCE_T r = 0; + for (uint32_t elem_offset = (threadIdx.x % kTeamSize) * vlen; elem_offset < dim; + elem_offset += kDatasetBlockDim) { + DATA_T data[reg_nelem][vlen]; +#pragma unroll + for (uint32_t e = 0; e < reg_nelem; e++) { + const uint32_t k = e * (kTeamSize 
* vlen) + elem_offset; + if (k >= dim) break; + device::ldg_cg(reinterpret_cast(data[e]), + reinterpret_cast(dataset_ptr + k)); + } +#pragma unroll + for (uint32_t e = 0; e < reg_nelem; e++) { + const uint32_t k = e * (kTeamSize * vlen) + elem_offset; + if (k >= dim) break; +#pragma unroll + for (uint32_t v = 0; v < vlen; v++) { + // Note this loop can go above the dataset_dim for padded arrays. This is not a problem + // because: + // - Above the last element (dataset_dim-1), the query array is filled with zeros. + // - The data buffer has to be also padded with zeros. + DISTANCE_T d; + device::lds( + d, + query_smem_ptr + + sizeof(QUERY_T) * device::swizzling(k + v)); + r += dist_op( + d, cuvs::spatial::knn::detail::utils::mapping{}(data[e][v])); + } + } + } + return r; +} + +template +_RAFT_DEVICE __noinline__ auto compute_distance_standard( + const typename DescriptorT::args_t args, const typename DescriptorT::INDEX_T dataset_index) -> + typename DescriptorT::DISTANCE_T +{ + return compute_distance_standard_worker( + DescriptorT::ptr(args) + (static_cast(DescriptorT::ld(args)) * dataset_index), + args.dim, + args.smem_ws_ptr); +} + +template +RAFT_KERNEL __launch_bounds__(1, 1) + standard_dataset_descriptor_init_kernel(dataset_descriptor_base_t* out, + const DataT* ptr, + IndexT size, + uint32_t dim, + uint32_t ld) +{ + using desc_type = + standard_dataset_descriptor_t; + using base_type = typename desc_type::base_type; + new (out) desc_type(reinterpret_cast( + &setup_workspace_standard), + reinterpret_cast( + &compute_distance_standard), + ptr, + size, + dim, + ld); +} + +template +dataset_descriptor_host +standard_descriptor_spec::init_( + const cagra::search_params& params, + const DataT* ptr, + IndexT size, + uint32_t dim, + uint32_t ld, + rmm::cuda_stream_view stream) +{ + using desc_type = + standard_dataset_descriptor_t; + using base_type = typename desc_type::base_type; + desc_type dd_host{nullptr, nullptr, ptr, size, dim, ld}; + host_type 
result{dd_host, stream}; + + standard_dataset_descriptor_init_kernel + <<<1, 1, 0, stream>>>(result.dev_ptr(), ptr, size, dim, desc_type::ld(dd_host.args)); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + return result; +} + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_standard.hpp new file mode 100644 index 000000000..df1b77e86 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard.hpp @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "compute_distance.hpp" + +#include + +#include + +namespace cuvs::neighbors::cagra::detail { + +template +struct standard_descriptor_spec : public instance_spec { + using base_type = instance_spec; + using typename base_type::data_type; + using typename base_type::distance_type; + using typename base_type::host_type; + using typename base_type::index_type; + + template + constexpr static inline bool accepts_dataset() + { + return is_strided_dataset_v; + } + + template + static auto init(const cagra::search_params& params, + const DatasetT& dataset, + cuvs::distance::DistanceType metric, + rmm::cuda_stream_view stream) -> host_type + { + return init_(params, + dataset.view().data_handle(), + IndexT(dataset.n_rows()), + dataset.dim(), + dataset.stride(), + stream); + } + + template + static auto priority(const cagra::search_params& params, + const DatasetT& dataset, + cuvs::distance::DistanceType metric) -> double + { + // If explicit team_size is specified and doesn't match the instance, discard it + if (params.team_size != 0 && TeamSize != params.team_size) { return -1.0; } + if (Metric != metric) { return -1.0; } + // Otherwise, favor the closest dataset dimensionality. 
+ return 1.0 / (0.1 + std::abs(double(dataset.dim()) - double(DatasetBlockDim))); + } + + private: + static dataset_descriptor_host init_(const cagra::search_params& params, + const DataT* ptr, + IndexT size, + uint32_t dim, + uint32_t ld, + rmm::cuda_stream_view stream); +}; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu similarity index 51% rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim128_t8_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu index 1116eaaa4..af5e89a76 100644 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim128_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu @@ -15,22 +15,24 @@ */ /* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python q_search_multi_cta_00_generate.py + * > python compute_distance_00_generate.py * */ -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" +#include "compute_distance_standard-impl.cuh" -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); +namespace cuvs::neighbors::cagra::detail { -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search +using namespace cuvs::distance; +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git 
a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu similarity index 51% rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim128_t8_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu index 7e3ec363d..332eb6bf9 100644 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim128_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu @@ -15,22 +15,24 @@ */ /* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python q_search_multi_cta_00_generate.py + * > python compute_distance_00_generate.py * */ -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" +#include "compute_distance_standard-impl.cuh" -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); +namespace cuvs::neighbors::cagra::detail { -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search +using namespace cuvs::distance; +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu similarity index 51% rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim128_t8_8pq_2subd_half.cu rename to 
cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu index af60c776a..3e5c11240 100644 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim128_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu @@ -15,22 +15,24 @@ */ /* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python q_search_multi_cta_00_generate.py + * > python compute_distance_00_generate.py * */ -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" +#include "compute_distance_standard-impl.cuh" -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); +namespace cuvs::neighbors::cagra::detail { -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search +using namespace cuvs::distance; +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim128_t8.cu new file mode 100644 index 000000000..92ca114f7 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim128_t8.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu similarity index 51% rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim128_t8_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu index 5dd79a79b..cfad79f3a 100644 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim128_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu @@ -15,22 +15,24 @@ */ /* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python q_search_multi_cta_00_generate.py + * > python compute_distance_00_generate.py * */ -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" +#include "compute_distance_standard-impl.cuh" -namespace cuvs::neighbors::cagra::detail::multi_cta_search { 
-instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); +namespace cuvs::neighbors::cagra::detail { -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search +using namespace cuvs::distance; +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim512_t32.cu new file mode 100644 index 000000000..8c208044b --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim512_t32.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim128_t8.cu new file mode 100644 index 000000000..929df5bbe --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim128_t8.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim256_t16.cu new file mode 100644 index 000000000..3cc4a2c95 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim256_t16.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim512_t32.cu new file mode 100644 index 000000000..a87e866eb --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim512_t32.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim128_t8.cu new file mode 100644 index 000000000..650d9ecac --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim128_t8.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim256_t16.cu new file mode 100644 index 000000000..6f7f4b97f --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim256_t16.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim512_t32.cu new file mode 100644 index 000000000..e7b96ab49 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim512_t32.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim128_t8.cu new file mode 100644 index 000000000..b45cf3669 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim128_t8.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu new file mode 100644 index 000000000..7d1206c37 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu new file mode 100644 index 000000000..251316b2c --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu new file mode 100644 index 000000000..e3870df40 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu new file mode 100644 index 000000000..1253d7cd4 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu new file mode 100644 index 000000000..792532c2c --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu new file mode 100644 index 000000000..c9c960cf9 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu new file mode 100644 index 000000000..d7a12804b --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim512_t32.cu new file mode 100644 index 000000000..a4f06c283 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim512_t32.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim128_t8.cu new file mode 100644 index 000000000..199f05e49 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim128_t8.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim256_t16.cu new file mode 100644 index 000000000..0962ecd82 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim256_t16.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu new file mode 100644 index 000000000..9c7e4ab03 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh new file mode 100644 index 000000000..86c592502 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -0,0 +1,466 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "compute_distance_vpq.hpp" + +#include +#include +#include + +#include + +namespace cuvs::neighbors::cagra::detail { + +template +struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t { + using base_type = dataset_descriptor_base_t; + using CODE_BOOK_T = CodebookT; + using QUERY_T = half; + using base_type::args; + using base_type::extra_ptr3; + using typename base_type::args_t; + using typename base_type::compute_distance_type; + using typename base_type::DATA_T; + using typename base_type::DISTANCE_T; + using typename base_type::INDEX_T; + using typename base_type::LOAD_T; + using typename base_type::setup_workspace_type; + constexpr static inline auto kMetric = Metric; + constexpr static inline auto kTeamSize = TeamSize; + constexpr static inline auto kDatasetBlockDim = DatasetBlockDim; + constexpr static inline auto kPqBits = PQ_BITS; + constexpr static inline auto kPqLen = PQ_LEN; + + static_assert(std::is_same_v, "Only CODE_BOOK_T = `half` is supported now"); + + RAFT_INLINE_FUNCTION static constexpr auto encoded_dataset_ptr(args_t& args) noexcept + -> const uint8_t*& + { + return (const uint8_t*&)args.extra_ptr1; + } + RAFT_INLINE_FUNCTION static constexpr auto vq_code_book_ptr(args_t& args) noexcept + -> const CODE_BOOK_T*& + { + return (const CODE_BOOK_T*&)args.extra_ptr2; + } + RAFT_INLINE_FUNCTION constexpr auto pq_code_book_ptr() noexcept -> const CODE_BOOK_T*& + { + return (const CODE_BOOK_T*&)extra_ptr3; + } + RAFT_INLINE_FUNCTION static constexpr auto encoded_dataset_dim(args_t& args) noexcept -> uint32_t& + { + return args.extra_word1; + } + + RAFT_INLINE_FUNCTION static constexpr auto encoded_dataset_ptr(const args_t& args) noexcept + -> const uint8_t* const& + { + return (const uint8_t*&)args.extra_ptr1; + } + RAFT_INLINE_FUNCTION static constexpr auto vq_code_book_ptr(const args_t& args) noexcept + -> const CODE_BOOK_T* const& + { + return (const CODE_BOOK_T*&)args.extra_ptr2; + } + 
RAFT_INLINE_FUNCTION constexpr auto pq_code_book_ptr() const noexcept -> const CODE_BOOK_T* const& + { + return (const CODE_BOOK_T*&)extra_ptr3; + } + RAFT_INLINE_FUNCTION static constexpr auto encoded_dataset_dim(const args_t& args) noexcept + -> const uint32_t& + { + return args.extra_word1; + } + + static constexpr std::uint32_t kSMemCodeBookSizeInBytes = + (1 << PQ_BITS) * PQ_LEN * utils::size_of(); + + _RAFT_HOST_DEVICE cagra_q_dataset_descriptor_t(setup_workspace_type* setup_workspace_impl, + compute_distance_type* compute_distance_impl, + const std::uint8_t* encoded_dataset_ptr, + std::uint32_t encoded_dataset_dim, + const CODE_BOOK_T* vq_code_book_ptr, + const CODE_BOOK_T* pq_code_book_ptr, + IndexT size, + std::uint32_t dim) + : base_type(setup_workspace_impl, + compute_distance_impl, + size, + dim, + raft::Pow2::Log2, + get_smem_ws_size_in_bytes(dim)) + { + cagra_q_dataset_descriptor_t::encoded_dataset_ptr(args) = encoded_dataset_ptr; + cagra_q_dataset_descriptor_t::vq_code_book_ptr(args) = vq_code_book_ptr; + this->pq_code_book_ptr() = pq_code_book_ptr; + cagra_q_dataset_descriptor_t::encoded_dataset_dim(args) = encoded_dataset_dim; + static_assert(sizeof(*this) == sizeof(base_type)); + static_assert(alignof(cagra_q_dataset_descriptor_t) == alignof(base_type)); + } + + private: + RAFT_INLINE_FUNCTION constexpr static auto get_smem_ws_size_in_bytes(uint32_t dim) -> uint32_t + { + /* SMEM workspace layout: + 1. The descriptor itself + 2. Codebook (kSMemCodeBookSizeInBytes bytes) + 3. 
Queries (smem_query_buffer_length elems) + */ + return sizeof(cagra_q_dataset_descriptor_t) + kSMemCodeBookSizeInBytes + + raft::round_up_safe(dim, DatasetBlockDim) * sizeof(QUERY_T); + } +}; + +template +RAFT_DEVICE_INLINE_FUNCTION constexpr auto transpose(T x) -> T +{ + auto i = x % Block; + auto j = x / Block; + auto k = i % Stride; + auto l = i / Stride; + return j * Block + k * (Block / Stride) + l; +} + +template +_RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, + void* smem_ptr, + const typename DescriptorT::DATA_T* queries_ptr, + uint32_t query_id) -> const DescriptorT* +{ + using QUERY_T = typename DescriptorT::QUERY_T; + using CODE_BOOK_T = typename DescriptorT::CODE_BOOK_T; + using word_type = uint32_t; + constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim; + constexpr auto PQ_BITS = DescriptorT::kPqBits; + constexpr auto PQ_LEN = DescriptorT::kPqLen; + + auto* r = reinterpret_cast(smem_ptr); + + if (r != that) { + constexpr uint32_t kCount = sizeof(DescriptorT) / sizeof(word_type); + using blob_type = word_type[kCount]; + auto& src = reinterpret_cast(*that); + auto& dst = reinterpret_cast(*r); + for (uint32_t i = threadIdx.x; i < kCount; i += blockDim.x) { + dst[i] = src[i]; + } + + auto codebook_buf = uint32_t(__cvta_generic_to_shared(r + 1)); + const auto smem_ptr_offset = + reinterpret_cast(&(r->args.smem_ws_ptr)) - reinterpret_cast(r); + if (threadIdx.x == uint32_t(smem_ptr_offset / sizeof(word_type))) { + r->args.smem_ws_ptr = codebook_buf; + } + __syncthreads(); + + // Copy PQ table + for (unsigned i = threadIdx.x * 2; i < (1 << PQ_BITS) * PQ_LEN; i += blockDim.x * 2) { + half2 buf2; + buf2.x = r->pq_code_book_ptr()[i]; + buf2.y = r->pq_code_book_ptr()[i + 1]; + + // Change the order of PQ code book array to reduce the + // frequency of bank conflicts. 
+ constexpr auto num_elements_per_bank = 4 / utils::size_of(); + constexpr auto num_banks_per_subspace = PQ_LEN / num_elements_per_bank; + const auto j = i / num_elements_per_bank; + const auto smem_index = + (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS); + + device::sts(codebook_buf + smem_index * sizeof(half2), buf2); + } + } + + uint32_t dim = r->args.dim; + queries_ptr += dim * query_id; + + constexpr cuvs::spatial::knn::detail::utils::mapping mapping{}; + auto smem_query_ptr = + reinterpret_cast(reinterpret_cast(smem_ptr) + sizeof(DescriptorT) + + DescriptorT::kSMemCodeBookSizeInBytes); + for (unsigned i = threadIdx.x * 2; i < dim; i += blockDim.x * 2) { + half2 buf2{0, 0}; + if (i < dim) { buf2.x = mapping(queries_ptr[i]); } + if (i + 1 < dim) { buf2.y = mapping(queries_ptr[i + 1]); } + if constexpr ((PQ_BITS == 8) && (PQ_LEN % 2 == 0)) { + // Transpose the queries buffer to avoid bank conflicts in compute_distance. + constexpr uint32_t vlen = 4; // **** DO NOT CHANGE **** + constexpr auto kStride = vlen * PQ_LEN / 2; + reinterpret_cast(smem_query_ptr)[transpose(i / 2)] = + buf2; + } else { + (reinterpret_cast(smem_query_ptr + i))[0] = buf2; + } + } + + return const_cast(r); +} + +template +_RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( + const uint8_t* __restrict__ dataset_ptr, + const typename DescriptorT::CODE_BOOK_T* __restrict__ vq_code_book_ptr, + uint32_t dim, + uint32_t pq_codebook_ptr) -> typename DescriptorT::DISTANCE_T +{ + using DISTANCE_T = typename DescriptorT::DISTANCE_T; + using LOAD_T = typename DescriptorT::LOAD_T; + using QUERY_T = typename DescriptorT::QUERY_T; + using CODE_BOOK_T = typename DescriptorT::CODE_BOOK_T; + constexpr auto TeamSize = DescriptorT::kTeamSize; + constexpr auto DatasetBlockDim = DescriptorT::kDatasetBlockDim; + constexpr auto PQ_BITS = DescriptorT::kPqBits; + constexpr auto PQ_LEN = DescriptorT::kPqLen; + + const uint32_t query_ptr = pq_codebook_ptr + 
DescriptorT::kSMemCodeBookSizeInBytes; + static_assert(PQ_BITS == 8, "Only pq_bits == 8 is supported at the moment."); + constexpr uint32_t vlen = 4; // **** DO NOT CHANGE **** + constexpr uint32_t nelem = + raft::div_rounding_up_unsafe(DatasetBlockDim / PQ_LEN, TeamSize * vlen); + + constexpr auto kTeamMask = DescriptorT::kTeamSize - 1; + constexpr auto kTeamVLen = TeamSize * vlen; + + const auto n_subspace = raft::div_rounding_up_unsafe(dim, PQ_LEN); + const auto laneId = threadIdx.x & kTeamMask; + DISTANCE_T norm = 0; + for (uint32_t elem_offset = 0; elem_offset * PQ_LEN < dim; + elem_offset += DatasetBlockDim / PQ_LEN) { + // Loading PQ codes + uint32_t pq_codes[nelem]; +#pragma unroll + for (std::uint32_t e = 0; e < nelem; e++) { + const std::uint32_t k = e * kTeamVLen + elem_offset + laneId * vlen; + if (k >= n_subspace) break; + // Loading 4 x 8-bit PQ-codes using 32-bit load ops (from device memory) + device::ldg_cg(pq_codes[e], reinterpret_cast(dataset_ptr + 4 + k)); + } + // + if constexpr (PQ_LEN % 2 == 0) { + // **** Use half2 for distance computation **** +#pragma unroll + for (std::uint32_t e = 0; e < nelem; e++) { + const std::uint32_t k = e * kTeamVLen + elem_offset + laneId * vlen; + if (k >= n_subspace) break; + // Loading VQ code-book + half2 vq_vals[PQ_LEN][vlen / 2]; +#pragma unroll + for (std::uint32_t m = 0; m < PQ_LEN; m++) { + const uint32_t d = (vlen * m) + (PQ_LEN * k); + if (d >= dim) break; + device::ldg_ca(vq_vals[m], vq_code_book_ptr + d); + } + // Compute distance + std::uint32_t pq_code = pq_codes[e]; +#pragma unroll + for (std::uint32_t v = 0; v < vlen; v++) { + if (PQ_LEN * (v + k) >= dim) break; +#pragma unroll + for (std::uint32_t m = 0; m < PQ_LEN / 2; m++) { + constexpr auto kQueryBlock = DatasetBlockDim / (vlen * PQ_LEN); + const std::uint32_t d1 = m + (PQ_LEN / 2) * v; + const std::uint32_t d = + d1 * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId; + half2 q2, c2; + // Loading query vector from smem + 
device::lds(q2, query_ptr + sizeof(half2) * d); + // Loading PQ code book from smem + device::lds(c2, + pq_codebook_ptr + + sizeof(CODE_BOOK_T) * ((1 << PQ_BITS) * 2 * m + (2 * (pq_code & 0xff)))); + // L2 distance + auto dist = q2 - c2 - reinterpret_cast(vq_vals)[d1]; + dist = dist * dist; + norm += static_cast(dist.x + dist.y); + } + pq_code >>= 8; + } + } + } else { + // **** Use float for distance computation **** +#pragma unroll + for (std::uint32_t e = 0; e < nelem; e++) { + const std::uint32_t k = e * kTeamVLen + elem_offset + laneId * vlen; + if (k >= n_subspace) break; + // Loading VQ code-book + CODE_BOOK_T vq_vals[PQ_LEN][vlen]; +#pragma unroll + for (std::uint32_t m = 0; m < PQ_LEN; m++) { + const std::uint32_t d = (vlen * m) + (PQ_LEN * k); + if (d >= dim) break; + // Loading 4 x 8/16-bit VQ-values using 32/64-bit load ops (from L2$ or device memory) + device::ldg_ca(vq_vals[m], vq_code_book_ptr + d); + } + // Compute distance + std::uint32_t pq_code = pq_codes[e]; +#pragma unroll + for (std::uint32_t v = 0; v < vlen; v++) { + if (PQ_LEN * (v + k) >= dim) break; + CODE_BOOK_T pq_vals[PQ_LEN]; + device::lds(pq_vals, pq_codebook_ptr + sizeof(CODE_BOOK_T) * PQ_LEN * (pq_code & 0xff)); +#pragma unroll + for (std::uint32_t m = 0; m < PQ_LEN; m++) { + const std::uint32_t d1 = m + (PQ_LEN * v); + const std::uint32_t d = d1 + (PQ_LEN * k); + // if (d >= dataset_dim) break; + DISTANCE_T diff; + device::lds(diff, query_ptr + sizeof(QUERY_T) * d); + diff -= static_cast(pq_vals[m]); + diff -= + static_cast(reinterpret_cast(vq_vals)[d1]); + norm += diff * diff; + } + pq_code >>= 8; + } + } + } + } + return norm; +} + +template +_RAFT_DEVICE __noinline__ auto compute_distance_vpq( + const typename DescriptorT::args_t args, const typename DescriptorT::INDEX_T dataset_index) -> + typename DescriptorT::DISTANCE_T +{ + const auto* dataset_ptr = + DescriptorT::encoded_dataset_ptr(args) + + (static_cast(DescriptorT::encoded_dataset_dim(args)) * dataset_index); + uint32_t 
vq_code; + device::ldg_cg(vq_code, reinterpret_cast(dataset_ptr)); + return compute_distance_vpq_worker( + dataset_ptr /* advance dataset pointer by the size of vq_code */, + DescriptorT::vq_code_book_ptr(args) + args.dim * vq_code, + args.dim, + args.smem_ws_ptr); +} + +template +RAFT_KERNEL __launch_bounds__(1, 1) + vpq_dataset_descriptor_init_kernel(dataset_descriptor_base_t* out, + const std::uint8_t* encoded_dataset_ptr, + uint32_t encoded_dataset_dim, + const CodebookT* vq_code_book_ptr, + const CodebookT* pq_code_book_ptr, + IndexT size, + uint32_t dim) +{ + using desc_type = cagra_q_dataset_descriptor_t; + using base_type = typename desc_type::base_type; + new (out) desc_type( + reinterpret_cast(&setup_workspace_vpq), + reinterpret_cast(&compute_distance_vpq), + encoded_dataset_ptr, + encoded_dataset_dim, + vq_code_book_ptr, + pq_code_book_ptr, + size, + dim); +} + +template +dataset_descriptor_host +vpq_descriptor_spec::init_(const cagra::search_params& params, + const std::uint8_t* encoded_dataset_ptr, + uint32_t encoded_dataset_dim, + const CodebookT* vq_code_book_ptr, + const CodebookT* pq_code_book_ptr, + IndexT size, + uint32_t dim, + rmm::cuda_stream_view stream) +{ + using desc_type = cagra_q_dataset_descriptor_t; + using base_type = typename desc_type::base_type; + + desc_type dd_host{nullptr, + nullptr, + encoded_dataset_ptr, + encoded_dataset_dim, + vq_code_book_ptr, + pq_code_book_ptr, + size, + dim}; + host_type result{dd_host, stream}; + vpq_dataset_descriptor_init_kernel<<<1, 1, 0, stream>>>(result.dev_ptr(), + encoded_dataset_ptr, + encoded_dataset_dim, + vq_code_book_ptr, + pq_code_book_ptr, + size, + dim); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + return result; +} + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh deleted file mode 100644 index 68973662f..000000000 --- 
a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh +++ /dev/null @@ -1,231 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "compute_distance.hpp" - -#include -#include - -namespace cuvs::neighbors::cagra::detail { -template -struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t { - using LOAD_T = device::LOAD_128BIT_T; - using DATA_T = DATA_T_; - using CODE_BOOK_T = CODE_BOOK_T_; - using QUERY_T = typename dataset_descriptor_base_t::QUERY_T; - - static_assert(std::is_same_v, "Only CODE_BOOK_T = `half` is supported now"); - - const std::uint8_t* encoded_dataset_ptr; - const std::uint32_t encoded_dataset_dim; - const std::uint32_t n_subspace; - const CODE_BOOK_T* vq_code_book_ptr; - const float vq_scale; - const CODE_BOOK_T* pq_code_book_ptr; - const float pq_scale; - using dataset_descriptor_base_t::size; - using dataset_descriptor_base_t::dim; - - // Set on device - CODE_BOOK_T* smem_pq_code_book_ptr; - static const std::uint32_t smem_buffer_size_in_byte = - (1 << PQ_BITS) * PQ_LEN * utils::size_of(); - - __device__ void set_smem_ptr(void* const smem_ptr) - { - smem_pq_code_book_ptr = reinterpret_cast(smem_ptr); - - // Copy PQ table - for (unsigned i = threadIdx.x * 2; i < (1 << PQ_BITS) * PQ_LEN; i += blockDim.x * 2) { - half2 buf2; - buf2.x = pq_code_book_ptr[i]; - buf2.y = pq_code_book_ptr[i + 1]; - - // Change the order of PQ code book 
array to reduce the - // frequency of bank conflicts. - constexpr auto num_elements_per_bank = 4 / utils::size_of(); - constexpr auto num_banks_per_subspace = PQ_LEN / num_elements_per_bank; - const auto j = i / num_elements_per_bank; - const auto smem_index = - (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS); - reinterpret_cast(smem_pq_code_book_ptr)[smem_index] = buf2; - } - } - - cagra_q_dataset_descriptor_t(const std::uint8_t* encoded_dataset_ptr, - const std::uint32_t encoded_dataset_dim, - const std::uint32_t n_subspace, - const CODE_BOOK_T* const vq_code_book_ptr, - const float vq_scale, - const CODE_BOOK_T* const pq_code_book_ptr, - const float pq_scale, - const std::size_t size, - const std::uint32_t dim) - : dataset_descriptor_base_t(size, dim), - encoded_dataset_ptr(encoded_dataset_ptr), - encoded_dataset_dim(encoded_dataset_dim), - n_subspace(n_subspace), - vq_code_book_ptr(vq_code_book_ptr), - vq_scale(vq_scale), - pq_code_book_ptr(pq_code_book_ptr), - pq_scale(pq_scale) - { - } - - template - __device__ void copy_query(const DATA_T* const dmem_query_ptr, - QUERY_T* const smem_query_ptr, - const std::uint32_t query_smem_buffer_length) - { - constexpr cuvs::spatial::knn::detail::utils::mapping mapping{}; - for (unsigned i = threadIdx.x * 2; i < dim; i += blockDim.x * 2) { - half2 buf2{0, 0}; - if (i < dim) { buf2.x = mapping(dmem_query_ptr[i]); } - if (i + 1 < dim) { buf2.y = mapping(dmem_query_ptr[i + 1]); } - if ((PQ_BITS == 8) && (PQ_LEN % 2 == 0)) { - // Use swizzling in the condition to reduce bank conflicts in shared - // memory, which are likely to occur when pq_code_book_dim is large. 
- ((half2*)smem_query_ptr)[device::swizzling(i / 2)] = - buf2; - } else { - (reinterpret_cast(smem_query_ptr + i))[0] = buf2; - } - } - } - - template - __device__ DISTANCE_T compute_similarity(const QUERY_T* const query_ptr, - const INDEX_T node_id, - const bool valid) const - { - float norm = 0; - if (valid) { - const unsigned lane_id = threadIdx.x % TEAM_SIZE; - const uint32_t vq_code = *(reinterpret_cast( - encoded_dataset_ptr + (static_cast(encoded_dataset_dim) * node_id))); - if (PQ_BITS == 8) { - for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += DATASET_BLOCK_DIM) { - constexpr unsigned vlen = 4; // **** DO NOT CHANGE **** - constexpr unsigned nelem = - raft::div_rounding_up_unsafe(DATASET_BLOCK_DIM / PQ_LEN, TEAM_SIZE * vlen); - // Loading PQ codes - uint32_t pq_codes[nelem]; -#pragma unroll - for (std::uint32_t e = 0; e < nelem; e++) { - const std::uint32_t k = (lane_id + (TEAM_SIZE * e)) * vlen + elem_offset / PQ_LEN; - if (k >= n_subspace) break; - // Loading 4 x 8-bit PQ-codes using 32-bit load ops (from device memory) - pq_codes[e] = *(reinterpret_cast( - encoded_dataset_ptr + (static_cast(encoded_dataset_dim) * node_id) + - 4 + k)); - } - // - if constexpr (PQ_LEN % 2 == 0) { - // **** Use half2 for distance computation **** - half2 norm2{0, 0}; -#pragma unroll - for (std::uint32_t e = 0; e < nelem; e++) { - const std::uint32_t k = (lane_id + (TEAM_SIZE * e)) * vlen + elem_offset / PQ_LEN; - if (k >= n_subspace) break; - // Loading VQ code-book - raft::TxN_t vq_vals[PQ_LEN]; -#pragma unroll - for (std::uint32_t m = 0; m < PQ_LEN; m += 1) { - const uint32_t d = (vlen * m) + (PQ_LEN * k); - if (d >= dim) break; - vq_vals[m].load( - reinterpret_cast(vq_code_book_ptr + d + (dim * vq_code)), 0); - } - // Compute distance - std::uint32_t pq_code = pq_codes[e]; -#pragma unroll - for (std::uint32_t v = 0; v < vlen; v++) { - if (PQ_LEN * (v + k) >= dim) break; -#pragma unroll - for (std::uint32_t m = 0; m < PQ_LEN; m += 2) { - const 
std::uint32_t d1 = m + (PQ_LEN * v); - const std::uint32_t d = d1 + (PQ_LEN * k); - // Loading query vector in smem - half2 diff2 = (reinterpret_cast( - query_ptr))[device::swizzling(d / 2)]; - // Loading PQ code book in smem - diff2 -= *(reinterpret_cast( - smem_pq_code_book_ptr + (1 << PQ_BITS) * 2 * (m / 2) + (2 * (pq_code & 0xff)))); - diff2 -= vq_vals[d1 / vlen].val.data[(d1 % vlen) / 2]; - norm2 += diff2 * diff2; - } - pq_code >>= 8; - } - } - norm += static_cast(norm2.x + norm2.y); - } else { - // **** Use float for distance computation **** -#pragma unroll - for (std::uint32_t e = 0; e < nelem; e++) { - const std::uint32_t k = (lane_id + (TEAM_SIZE * e)) * vlen + elem_offset / PQ_LEN; - if (k >= n_subspace) break; - // Loading VQ code-book - raft::TxN_t vq_vals[PQ_LEN]; -#pragma unroll - for (std::uint32_t m = 0; m < PQ_LEN; m++) { - const std::uint32_t d = (vlen * m) + (PQ_LEN * k); - if (d >= dim) break; - // Loading 4 x 8/16-bit VQ-values using 32/64-bit load ops (from L2$ or device - // memory) - vq_vals[m].load( - reinterpret_cast(vq_code_book_ptr + d + (dim * vq_code)), 0); - } - // Compute distance - std::uint32_t pq_code = pq_codes[e]; -#pragma unroll - for (std::uint32_t v = 0; v < vlen; v++) { - if (PQ_LEN * (v + k) >= dim) break; - raft::TxN_t pq_vals; - pq_vals.load( - reinterpret_cast(smem_pq_code_book_ptr + PQ_LEN * (pq_code & 0xff)), - 0); // (from L1$ or smem) -#pragma unroll - for (std::uint32_t m = 0; m < PQ_LEN; m++) { - const std::uint32_t d1 = m + (PQ_LEN * v); - const std::uint32_t d = d1 + (PQ_LEN * k); - // if (d >= dataset_dim) break; - DISTANCE_T diff = query_ptr[d]; // (from smem) - diff -= pq_scale * static_cast(pq_vals.data[m]); - diff -= vq_scale * static_cast(vq_vals[d1 / vlen].val.data[d1 % vlen]); - norm += diff * diff; - } - pq_code >>= 8; - } - } - } - } - } - } - for (uint32_t offset = TEAM_SIZE / 2; offset > 0; offset >>= 1) { - norm += __shfl_xor_sync(0xffffffff, norm, offset); - } - return norm; - } -}; - -} // 
namespace cuvs::neighbors::cagra::detail \ No newline at end of file diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp new file mode 100644 index 000000000..378d2943e --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "compute_distance.hpp" + +#include + +#include + +namespace cuvs::neighbors::cagra::detail { + +template +struct vpq_descriptor_spec : public instance_spec { + using base_type = instance_spec; + using typename base_type::data_type; + using typename base_type::distance_type; + using typename base_type::host_type; + using typename base_type::index_type; + + template + constexpr static inline auto accepts_dataset() + -> std::enable_if_t, bool> + { + return std::is_same_v; + } + + template + constexpr static inline auto accepts_dataset() + -> std::enable_if_t, bool> + { + return false; + } + + template + static auto init(const cagra::search_params& params, + const DatasetT& dataset, + cuvs::distance::DistanceType metric, + rmm::cuda_stream_view stream) -> host_type + { + return init_(params, + dataset.data.data_handle(), + dataset.encoded_row_length(), + dataset.vq_code_book.data_handle(), + dataset.pq_code_book.data_handle(), + IndexT(dataset.n_rows()), + dataset.dim(), + stream); + } + + template 
+ static auto priority(const cagra::search_params& params, + const DatasetT& dataset, + cuvs::distance::DistanceType metric) -> double + { + // If explicit team_size is specified and doesn't match the instance, discard it + if (params.team_size != 0 && TeamSize != params.team_size) { return -1.0; } + if (cuvs::distance::DistanceType::L2Expanded != metric) { return -1.0; } + // Match codebook params + if (dataset.pq_bits() != PqBits) { return -1.0; } + if (dataset.pq_len() != PqLen) { return -1.0; } + // Otherwise, favor the closest dataset dimensionality. + return 1.0 / (0.1 + std::abs(double(dataset.dim()) - double(DatasetBlockDim))); + } + + private: + static dataset_descriptor_host init_( + const cagra::search_params& params, + const std::uint8_t* encoded_dataset_ptr, + uint32_t encoded_dataset_dim, + const CodebookT* vq_code_book_ptr, + const CodebookT* pq_code_book_ptr, + IndexT size, + uint32_t dim, + rmm::cuda_stream_view stream); +}; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu similarity index 50% rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu index 9ec7ce3dd..a56a5a9df 100644 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu @@ -15,22 +15,27 @@ */ /* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python q_search_multi_cta_00_generate.py + * > python 
compute_distance_00_generate.py * */ -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" +#include "compute_distance_vpq-impl.cuh" -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); +namespace cuvs::neighbors::cagra::detail { -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu similarity index 50% rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu index 292a1429a..f58a8c7df 100644 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu @@ -15,22 +15,27 @@ */ /* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python q_search_multi_cta_00_generate.py + * > python compute_distance_00_generate.py * */ -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" +#include "compute_distance_vpq-impl.cuh" -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 
COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); +namespace cuvs::neighbors::cagra::detail { -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu new file mode 100644 index 000000000..bdc072e61 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu new file mode 100644 index 000000000..301c8c55b --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu new file mode 100644 index 000000000..05ebeae2b --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu new file mode 100644 index 000000000..e343d938c --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu similarity index 50% rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu index 1a5ad50e3..5d950351f 100644 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu @@ -15,22 +15,27 @@ */ /* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python q_search_multi_cta_00_generate.py + * > python compute_distance_00_generate.py * */ -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" +#include "compute_distance_vpq-impl.cuh" -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); +namespace cuvs::neighbors::cagra::detail { -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // 
namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu similarity index 50% rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu index 0ab23d7eb..453e15df3 100644 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu @@ -15,22 +15,27 @@ */ /* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python q_search_multi_cta_00_generate.py + * > python compute_distance_00_generate.py * */ -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" +#include "compute_distance_vpq-impl.cuh" -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); +namespace cuvs::neighbors::cagra::detail { -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu new file mode 100644 index 000000000..c79cb74b6 --- /dev/null +++ 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu new file mode 100644 index 000000000..dee326d54 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu new file mode 100644 index 000000000..a1ef9ba92 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu new file mode 100644 index 000000000..f2f01c8d4 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu new file mode 100644 index 000000000..1afccb8fd --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu new file mode 100644 index 000000000..28ea523ee --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu new file mode 100644 index 000000000..eca36cc36 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu new file mode 100644 index 000000000..89aed8afc --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu new file mode 100644 index 000000000..ff646b22c --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu new file mode 100644 index 000000000..633a805c7 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu new file mode 100644 index 000000000..3a09161ea --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu new file mode 100644 index 000000000..85331d243 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu new file mode 100644 index 000000000..a7719074a --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu new file mode 100644 index 000000000..7dd028b82 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu new file mode 100644 index 000000000..78f37b135 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu new file mode 100644 index 000000000..d3eb20a05 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp index 192d81aa8..b7cb9c42d 100644 --- a/cpp/src/neighbors/detail/cagra/device_common.hpp +++ b/cpp/src/neighbors/detail/cagra/device_common.hpp @@ -15,10 +15,15 @@ */ #pragma once +#include "hashmap.hpp" #include "utils.hpp" +#include + // TODO: This shouldn't be invoking anything in detail APIs outside of cuvs/neighbors #include +#include +#include #include @@ -31,6 +36,16 @@ namespace device { // warpSize for compile time calculation constexpr unsigned warp_size = 32; +// using LOAD_256BIT_T = ulonglong4; +using LOAD_128BIT_T = uint4; +using LOAD_64BIT_T = uint64_t; + +template +RAFT_DEVICE_INLINE_FUNCTION constexpr unsigned get_vlen() +{ + return utils::size_of() / utils::size_of(); +} + /** Xorshift rondem number generator. * * See https://en.wikipedia.org/wiki/Xorshift#xorshift for reference. @@ -43,18 +58,299 @@ _RAFT_HOST_DEVICE inline uint64_t xorshift64(uint64_t u) return u * 0x2545F4914F6CDD1DULL; } -template -_RAFT_DEVICE inline T swizzling(T x) +template +RAFT_DEVICE_INLINE_FUNCTION constexpr auto swizzling(T x) -> T { // Address swizzling reduces bank conflicts in shared memory, but increases // the amount of operation instead. 
// return x; - if constexpr (X_MAX <= 1024) { - return (x) ^ ((x) >> 5); + if constexpr (Stride <= 32) { + return x; + } else if constexpr (Dim <= 1024) { + return x ^ (x >> 5); } else { - return (x) ^ (((x) >> 5) & 0x1f); + return x ^ ((x >> 5) & 0x1f); + } +} + +template +RAFT_DEVICE_INLINE_FUNCTION auto team_sum(T x) -> T +{ +#pragma unroll + for (uint32_t stride = TeamSize >> 1; stride > 0; stride >>= 1) { + x += raft::shfl_xor(x, stride, TeamSize); + } + return x; +} + +template +RAFT_DEVICE_INLINE_FUNCTION auto team_sum(T x, uint32_t team_size_bitshift) -> T +{ + switch (team_size_bitshift) { // no breaks: entering at case k runs the shuffle-adds for k..1 + case 5: x += raft::shfl_xor(x, 16); + case 4: x += raft::shfl_xor(x, 8); + case 3: x += raft::shfl_xor(x, 4); + case 2: x += raft::shfl_xor(x, 2); + case 1: x += raft::shfl_xor(x, 1); + default: return x; + } +} + +template +RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes( + IndexT* __restrict__ result_indices_ptr, // [num_pickup] + DistanceT* __restrict__ result_distances_ptr, // [num_pickup] + const DATASET_DESCRIPTOR_T& dataset_desc, + const uint32_t num_pickup, + const uint32_t num_distilation, + const uint64_t rand_xor_mask, + const IndexT* __restrict__ seed_ptr, // [num_seeds] + const uint32_t num_seeds, + IndexT* __restrict__ visited_hash_ptr, + const uint32_t hash_bitlen, + const uint32_t block_id = 0, + const uint32_t num_blocks = 1) +{ + const auto team_size_bits = dataset_desc.team_size_bitshift_from_smem(); + const auto max_i = raft::round_up_safe(num_pickup, warp_size >> team_size_bits); + const auto compute_distance = dataset_desc.compute_distance_impl; // NOTE(review): unused here? + + for (uint32_t i = threadIdx.x >> team_size_bits; i < max_i; i += (blockDim.x >> team_size_bits)) { + const bool valid_i = (i < num_pickup); + + IndexT best_index_team_local; // NOTE(review): uninitialized until a candidate wins below + DistanceT best_norm2_team_local = raft::upper_bound(); + for (uint32_t j = 0; j < num_distilation; j++) { + // Select a node randomly and compute the distance to it + IndexT seed_index; + if (valid_i) { + // 
uint32_t gid = i + (num_pickup * (j + (num_distilation * block_id))); + uint32_t gid = block_id + (num_blocks * (i + (num_pickup * j))); + if (seed_ptr && (gid < num_seeds)) { + seed_index = seed_ptr[gid]; + } else { + seed_index = device::xorshift64(gid ^ rand_xor_mask) % dataset_desc.size; + } + } + + const auto norm2 = dataset_desc.compute_distance(seed_index, valid_i); + + if (valid_i && (norm2 < best_norm2_team_local)) { + best_norm2_team_local = norm2; + best_index_team_local = seed_index; + } + } + + const unsigned lane_id = threadIdx.x & ((1u << team_size_bits) - 1u); + if (valid_i && lane_id == 0) { + if (hashmap::insert(visited_hash_ptr, hash_bitlen, best_index_team_local)) { + result_distances_ptr[i] = best_norm2_team_local; + result_indices_ptr[i] = best_index_team_local; + } else { + result_distances_ptr[i] = raft::upper_bound(); + result_indices_ptr[i] = raft::upper_bound(); + } + } } } +template +RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes( + IndexT* __restrict__ result_child_indices_ptr, + DistanceT* __restrict__ result_child_distances_ptr, + // [dataset_dim, dataset_size] + const DATASET_DESCRIPTOR_T& dataset_desc, + // [knn_k, dataset_size] + const IndexT* __restrict__ knn_graph, + const uint32_t knn_k, + // hashmap + IndexT* __restrict__ visited_hashmap_ptr, + const uint32_t hash_bitlen, + const IndexT* __restrict__ parent_indices, + const IndexT* __restrict__ internal_topk_list, + const uint32_t search_width) +{ + constexpr IndexT index_msb_1_mask = utils::gen_index_msb_1_mask::value; + constexpr IndexT invalid_index = raft::upper_bound(); + + // Read child indices of parents from knn graph and check if the distance + // computation is necessary. 
+ for (uint32_t i = threadIdx.x; i < knn_k * search_width; i += blockDim.x) { + const IndexT smem_parent_id = parent_indices[i / knn_k]; + IndexT child_id = invalid_index; + if (smem_parent_id != invalid_index) { + const auto parent_id = internal_topk_list[smem_parent_id] & ~index_msb_1_mask; + child_id = knn_graph[(i % knn_k) + (static_cast(knn_k) * parent_id)]; + } + if (child_id != invalid_index) { + if (hashmap::insert(visited_hashmap_ptr, hash_bitlen, child_id) == 0) { + child_id = invalid_index; + } + } + result_child_indices_ptr[i] = child_id; + } + __syncthreads(); + + // Compute the distance to child nodes + const auto team_size_bits = dataset_desc.team_size_bitshift_from_smem(); + const auto num_k = knn_k * search_width; + const auto max_i = raft::round_up_safe(num_k, warp_size >> team_size_bits); + const auto compute_distance = dataset_desc.compute_distance_impl; + const auto args = dataset_desc.args.load(); + const bool lead_lane = (threadIdx.x & ((1u << team_size_bits) - 1u)) == 0; + for (uint32_t i = threadIdx.x >> team_size_bits; i < max_i; i += blockDim.x >> team_size_bits) { + const bool valid_i = i < num_k; + const auto child_id = valid_i ? result_child_indices_ptr[i] : invalid_index; + + // We should be calling `dataset_desc.compute_distance(..)` here as follows: + // > const auto child_dist = dataset_desc.compute_distance(child_id, child_id != invalid_index); + // Instead, we manually inline this function for performance reasons. + // This allows us to move the fetching of the arguments from shared memory out of the loop. + const DistanceT child_dist = device::team_sum( + (child_id != invalid_index) ? compute_distance(args, child_id) + : (lead_lane ? 
raft::upper_bound() : 0), + team_size_bits); + + // Store the distance + if (valid_i && lead_lane) { result_child_distances_ptr[i] = child_dist; } + } +} + +RAFT_DEVICE_INLINE_FUNCTION void lds(float& x, uint32_t addr) +{ + asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x) : "r"(addr)); +} +RAFT_DEVICE_INLINE_FUNCTION void lds(half& x, uint32_t addr) +{ + asm volatile("ld.shared.u16 {%0}, [%1];" : "=h"(reinterpret_cast(x)) : "r"(addr)); +} +RAFT_DEVICE_INLINE_FUNCTION void lds(half2& x, uint32_t addr) +{ + asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(reinterpret_cast(x)) : "r"(addr)); +} +RAFT_DEVICE_INLINE_FUNCTION void lds(half (&x)[1], uint32_t addr) +{ + asm volatile("ld.shared.u16 {%0}, [%1];" : "=h"(*reinterpret_cast(x)) : "r"(addr)); +} +RAFT_DEVICE_INLINE_FUNCTION void lds(half (&x)[2], uint32_t addr) +{ + asm volatile("ld.shared.v2.u16 {%0, %1}, [%2];" + : "=h"(*reinterpret_cast(x)), "=h"(*reinterpret_cast(x + 1)) + : "r"(addr)); +} +RAFT_DEVICE_INLINE_FUNCTION void lds(half (&x)[4], uint32_t addr) +{ + asm volatile("ld.shared.v4.u16 {%0, %1, %2, %3}, [%4];" + : "=h"(*reinterpret_cast(x)), + "=h"(*reinterpret_cast(x + 1)), + "=h"(*reinterpret_cast(x + 2)), + "=h"(*reinterpret_cast(x + 3)) + : "r"(addr)); +} + +RAFT_DEVICE_INLINE_FUNCTION void lds(uint32_t& x, uint32_t addr) +{ + asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x) : "r"(addr)); +} + +RAFT_DEVICE_INLINE_FUNCTION void lds(uint32_t& x, const uint32_t* addr) +{ + lds(x, uint32_t(__cvta_generic_to_shared(addr))); +} + +RAFT_DEVICE_INLINE_FUNCTION void lds(uint4& x, uint32_t addr) +{ + asm volatile("ld.shared.v4.u32 {%0, %1, %2, %3}, [%4];" + : "=r"(x.x), "=r"(x.y), "=r"(x.z), "=r"(x.w) + : "r"(addr)); +} + +RAFT_DEVICE_INLINE_FUNCTION void lds(uint4& x, const uint4* addr) +{ + lds(x, uint32_t(__cvta_generic_to_shared(addr))); +} + +RAFT_DEVICE_INLINE_FUNCTION void sts(uint32_t addr, const half2& x) +{ + asm volatile("st.shared.v2.u16 [%0], {%1, %2};" + : + : "r"(addr), + 
"h"(reinterpret_cast(x.x)), + "h"(reinterpret_cast(x.y))); +} + +RAFT_DEVICE_INLINE_FUNCTION void ldg_cg(uint4& x, const uint4* addr) +{ + asm volatile("ld.global.cg.v4.u32 {%0, %1, %2, %3}, [%4];" + : "=r"(x.x), "=r"(x.y), "=r"(x.z), "=r"(x.w) + : "l"(addr)); +} + +RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(uint4& x, const uint4* addr) +{ + asm volatile("ld.global.ca.v4.u32 {%0, %1, %2, %3}, [%4];" + : "=r"(x.x), "=r"(x.y), "=r"(x.z), "=r"(x.w) + : "l"(addr)); +} + +RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(uint32_t& x, const uint32_t* addr) +{ + asm volatile("ld.global.ca.u32 %0, [%1];" : "=r"(x) : "l"(addr)); +} + +RAFT_DEVICE_INLINE_FUNCTION void ldg_cg(uint32_t& x, const uint32_t* addr) +{ + asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(x) : "l"(addr)); +} + +RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half& x, const half* addr) +{ + asm volatile("ld.global.ca.u16 {%0}, [%1];" + : "=h"(reinterpret_cast(x)) + : "l"(reinterpret_cast(addr))); +} +RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half (&x)[1], const half* addr) +{ + asm volatile("ld.global.ca.u16 {%0}, [%1];" + : "=h"(*reinterpret_cast(x)) + : "l"(reinterpret_cast(addr))); +} +RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half (&x)[2], const half* addr) +{ + asm volatile("ld.global.ca.v2.u16 {%0, %1}, [%2];" + : "=h"(*reinterpret_cast(x)), "=h"(*reinterpret_cast(x + 1)) + : "l"(reinterpret_cast(addr))); +} +RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half (&x)[4], const half* addr) +{ + asm volatile("ld.global.ca.v4.u16 {%0, %1, %2, %3}, [%4];" + : "=h"(*reinterpret_cast(x)), + "=h"(*reinterpret_cast(x + 1)), + "=h"(*reinterpret_cast(x + 2)), + "=h"(*reinterpret_cast(x + 3)) + : "l"(reinterpret_cast(addr))); +} + +RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half2& x, const half* addr) +{ + asm volatile("ld.global.ca.u32 %0, [%1];" + : "=r"(reinterpret_cast(x)) + : "l"(reinterpret_cast(addr))); +} +RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half2 (&x)[1], const half* addr) +{ + asm volatile("ld.global.ca.u32 %0, [%1];" + : 
"=r"(*reinterpret_cast(x)) + : "l"(reinterpret_cast(addr))); +} +RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half2 (&x)[2], const half* addr) +{ + asm volatile("ld.global.ca.v2.u32 {%0, %1}, [%2];" + : "=r"(*reinterpret_cast(x)), "=r"(*reinterpret_cast(x + 1)) + : "l"(reinterpret_cast(addr))); +} + } // namespace device } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/factory.cuh b/cpp/src/neighbors/detail/cagra/factory.cuh index 183d6051f..1c99f72f7 100644 --- a/cpp/src/neighbors/detail/cagra/factory.cuh +++ b/cpp/src/neighbors/detail/cagra/factory.cuh @@ -16,6 +16,7 @@ #pragma once +#include "compute_distance-ext.cuh" #include "search_multi_cta.cuh" #include "search_multi_kernel.cuh" #include "search_plan.cuh" @@ -25,71 +26,153 @@ namespace cuvs::neighbors::cagra::detail { -template class factory { - using T = typename DATASET_DESCRIPTOR_T::DATA_T; - using IdxT = typename DATASET_DESCRIPTOR_T::INDEX_T; - using DistanceT = typename DATASET_DESCRIPTOR_T::DISTANCE_T; - public: /** * Create a search structure for dataset with dim features. 
*/ - static std::unique_ptr> create( + static std::unique_ptr> create( raft::resources const& res, search_params const& params, + const dataset_descriptor_host& dataset_desc, int64_t dim, int64_t graph_degree, - uint32_t topk, - const cuvs::distance::DistanceType metric) + uint32_t topk) { - search_plan_impl_base plan(params, dim, graph_degree, topk, metric); - switch (plan.dataset_block_dim) { - case 128: - switch (plan.team_size) { - case 8: return dispatch_kernel<128, 8>(res, plan); break; - default: THROW("Incorrect team size %lu", plan.team_size); - } - break; - case 256: - switch (plan.team_size) { - case 16: return dispatch_kernel<256, 16>(res, plan); break; - default: THROW("Incorrect team size %lu", plan.team_size); - } - break; - case 512: - switch (plan.team_size) { - case 32: return dispatch_kernel<512, 32>(res, plan); break; - default: THROW("Incorrect team size %lu", plan.team_size); - } - break; - default: THROW("Incorrect dataset_block_dim (%lu)\n", plan.dataset_block_dim); - } - return std::unique_ptr>(); + search_plan_impl_base plan(params, dim, graph_degree, topk); + return dispatch_kernel(res, plan, dataset_desc); } private: - template - static std::unique_ptr> - dispatch_kernel(raft::resources const& res, search_plan_impl_base& plan) + static std::unique_ptr> + dispatch_kernel(raft::resources const& res, + search_plan_impl_base& plan, + const dataset_descriptor_host& dataset_desc) { if (plan.algo == search_algo::SINGLE_CTA) { - return std::unique_ptr>( - new single_cta_search:: - search( - res, plan, plan.dim, plan.graph_degree, plan.topk, plan.metric)); + return std::make_unique< + single_cta_search::search>( + res, plan, dataset_desc, plan.dim, plan.graph_degree, plan.topk); } else if (plan.algo == search_algo::MULTI_CTA) { - return std::unique_ptr>( - new multi_cta_search:: - search( - res, plan, plan.dim, plan.graph_degree, plan.topk, plan.metric)); + return std::make_unique< + multi_cta_search::search>( + res, plan, dataset_desc, plan.dim, 
plan.graph_degree, plan.topk); } else { - return std::unique_ptr>( - new multi_kernel_search:: - search( - res, plan, plan.dim, plan.graph_degree, plan.topk, plan.metric)); + return std::make_unique< + multi_kernel_search::search>( + res, plan, dataset_desc, plan.dim, plan.graph_degree, plan.topk); } } }; + +/* +Caching of dataset/distance descriptor initialization + (see `dataset_descriptor_init_with_cache` below). + */ +namespace descriptor_cache { + +/** + * The key for caching consists of a minimal set of fields that uniquely define the descriptor. + * The key field names are the same as of the descriptor and the contents are not relevant for + * caching. + */ +struct key { + uint64_t data_ptr; + uint64_t n_rows; + uint32_t dim; + uint32_t extra_val; // this one has different meanings for different descriptor types + uint32_t team_size; + uint32_t metric; +}; + +template +auto make_key(const cagra::search_params& params, + const DatasetT& dataset, + cuvs::distance::DistanceType metric) + -> std::enable_if_t, key> +{ + return key{reinterpret_cast(dataset.view().data_handle()), + uint64_t(dataset.n_rows()), + dataset.dim(), + dataset.stride(), + uint32_t(params.team_size), + uint32_t(metric)}; +} + +template +auto make_key(const cagra::search_params& params, + const DatasetT& dataset, + cuvs::distance::DistanceType metric) + -> std::enable_if_t, key> +{ + return key{reinterpret_cast(dataset.data.data_handle()), + uint64_t(dataset.n_rows()), + dataset.dim(), + uint32_t(reinterpret_cast(dataset.pq_code_book.data_handle()) >> 6), + uint32_t(params.team_size), + uint32_t(metric)}; +} + +inline auto operator==(const key& a, const key& b) -> bool +{ + return a.data_ptr == b.data_ptr && a.n_rows == b.n_rows && a.dim == b.dim && + a.extra_val == b.extra_val && a.team_size == b.team_size && a.metric == b.metric; +} + +struct key_hash { + inline auto operator()(const key& x) const noexcept -> std::size_t + { + return size_t{x.data_ptr} + size_t{x.n_rows} * size_t{x.dim} * 
size_t{x.extra_val} + + (size_t{x.team_size} ^ size_t{x.metric}); + } +}; + +template +struct store { + /** Number of descriptors to cache. */ + static constexpr size_t kDefaultSize = 100; + raft::cache::lru, + std::shared_ptr>> + value{kDefaultSize}; +}; + +} // namespace descriptor_cache + +/** + * Call `dataset_descriptor_init` with memoization. + * (NB: `dataset_descriptor_init` is a function in a generated header file + * `neighbors/detail/cagra/compute_distance-ext.cuh`). + * + * `dataset_descriptor_init` involves calling a CUDA kernel to resolve device symbols before the + * main search kernel runs. This adds an extra unwanted latency. + * Caching the descriptor helps to hide this latency for repeated searches. + * NOTE(review): result is presumably kept alive only by the cache entry; confirm eviction safety. + */ +template +auto dataset_descriptor_init_with_cache(const raft::resources& res, + const cagra::search_params& params, + const DatasetT& dataset, + cuvs::distance::DistanceType metric) + -> const dataset_descriptor_host& +{ + using desc_t = dataset_descriptor_host; + auto key = descriptor_cache::make_key(params, dataset, metric); + auto& cache = + raft::resource::get_custom_resource>(res) + ->value; + std::shared_ptr desc{nullptr}; + if (!cache.get(key, &desc)) { + desc = std::make_shared(std::move(dataset_descriptor_init( + params, dataset, metric, raft::resource::get_cuda_stream(res)))); + cache.set(key, desc); + } + return *desc; +} + }; // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 515be75df..9edbbf5c1 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -73,12 +73,12 @@ __device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2, bool a } template -RAFT_KERNEL kern_sort(const DATA_T* const dataset, // [dataset_chunk_size, dataset_dim] - const IdxT dataset_size, - const uint32_t dataset_dim, - IdxT* const knn_graph, // [graph_chunk_size, graph_degree] - 
const uint32_t graph_size, - const uint32_t graph_degree) +__global__ void kern_sort(const DATA_T* const dataset, // [dataset_chunk_size, dataset_dim] + const IdxT dataset_size, + const uint32_t dataset_dim, + IdxT* const knn_graph, // [graph_chunk_size, graph_degree] + const uint32_t graph_size, + const uint32_t graph_degree) { const IdxT srcNode = (blockDim.x * blockIdx.x + threadIdx.x) / raft::WarpSize; if (srcNode >= graph_size) { return; } @@ -129,15 +129,15 @@ RAFT_KERNEL kern_sort(const DATA_T* const dataset, // [dataset_chunk_size, data } template -RAFT_KERNEL kern_prune(const IdxT* const knn_graph, // [graph_chunk_size, graph_degree] - const uint32_t graph_size, - const uint32_t graph_degree, - const uint32_t degree, - const uint32_t batch_size, - const uint32_t batch_id, - uint8_t* const detour_count, // [graph_chunk_size, graph_degree] - uint32_t* const num_no_detour_edges, // [graph_size] - uint64_t* const stats) +__global__ void kern_prune(const IdxT* const knn_graph, // [graph_chunk_size, graph_degree] + const uint32_t graph_size, + const uint32_t graph_degree, + const uint32_t degree, + const uint32_t batch_size, + const uint32_t batch_id, + uint8_t* const detour_count, // [graph_chunk_size, graph_degree] + uint32_t* const num_no_detour_edges, // [graph_size] + uint64_t* const stats) { __shared__ uint32_t smem_num_detour[MAX_DEGREE]; uint64_t* const num_retain = stats; @@ -192,11 +192,11 @@ RAFT_KERNEL kern_prune(const IdxT* const knn_graph, // [graph_chunk_size, graph } template -RAFT_KERNEL kern_make_rev_graph(const IdxT* const dest_nodes, // [graph_size] - IdxT* const rev_graph, // [size, degree] - uint32_t* const rev_graph_count, // [graph_size] - const uint32_t graph_size, - const uint32_t degree) +__global__ void kern_make_rev_graph(const IdxT* const dest_nodes, // [graph_size] + IdxT* const rev_graph, // [size, degree] + uint32_t* const rev_graph_count, // [graph_size] + const uint32_t graph_size, + const uint32_t degree) { const uint32_t tid 
= threadIdx.x + (blockDim.x * blockIdx.x); const uint32_t tnum = blockDim.x * gridDim.x; @@ -221,16 +221,16 @@ __device__ __host__ LabelT get_root_label(IdxT i, const LabelT* label) } template -RAFT_KERNEL kern_mst_opt_update_graph(IdxT* mst_graph, // [graph_size, graph_degree] - const IdxT* candidate_edges, // [graph_size] - IdxT* outgoing_num_edges, // [graph_size] - IdxT* incoming_num_edges, // [graph_size] - const IdxT* outgoing_max_edges, // [graph_size] - const IdxT* incoming_max_edges, // [graph_size] - const IdxT* label, // [graph_size] - const uint32_t graph_size, - const uint32_t graph_degree, - uint64_t* stats) +__global__ void kern_mst_opt_update_graph(IdxT* mst_graph, // [graph_size, graph_degree] + const IdxT* candidate_edges, // [graph_size] + IdxT* outgoing_num_edges, // [graph_size] + IdxT* incoming_num_edges, // [graph_size] + const IdxT* outgoing_max_edges, // [graph_size] + const IdxT* incoming_max_edges, // [graph_size] + const IdxT* label, // [graph_size] + const uint32_t graph_size, + const uint32_t graph_degree, + uint64_t* stats) { const uint64_t i = threadIdx.x + (blockDim.x * blockIdx.x); if (i >= graph_size) return; @@ -310,11 +310,11 @@ RAFT_KERNEL kern_mst_opt_update_graph(IdxT* mst_graph, // [graph } template -RAFT_KERNEL kern_mst_opt_labeling(IdxT* label, // [graph_size] - const IdxT* mst_graph, // [graph_size, graph_degree] - const uint32_t graph_size, - const uint32_t graph_degree, - uint64_t* stats) +__global__ void kern_mst_opt_labeling(IdxT* label, // [graph_size] + const IdxT* mst_graph, // [graph_size, graph_degree] + const uint32_t graph_size, + const uint32_t graph_degree, + uint64_t* stats) { const uint64_t i = threadIdx.x + (blockDim.x * blockIdx.x); if (i >= graph_size) return; @@ -348,10 +348,10 @@ RAFT_KERNEL kern_mst_opt_labeling(IdxT* label, // [graph_size] } template -RAFT_KERNEL kern_mst_opt_cluster_size(IdxT* cluster_size, // [graph_size] - const IdxT* label, // [graph_size] - const uint32_t graph_size, - uint64_t* 
stats) +__global__ void kern_mst_opt_cluster_size(IdxT* cluster_size, // [graph_size] + const IdxT* label, // [graph_size] + const uint32_t graph_size, + uint64_t* stats) { const uint64_t i = threadIdx.x + (blockDim.x * blockIdx.x); if (i >= graph_size) return; @@ -375,14 +375,14 @@ RAFT_KERNEL kern_mst_opt_cluster_size(IdxT* cluster_size, // [graph_size] } template -RAFT_KERNEL kern_mst_opt_postprocessing(IdxT* outgoing_num_edges, // [graph_size] - IdxT* incoming_num_edges, // [graph_size] - IdxT* outgoing_max_edges, // [graph_size] - IdxT* incoming_max_edges, // [graph_size] - const IdxT* cluster_size, // [graph_size] - const uint32_t graph_size, - const uint32_t graph_degree, - uint64_t* stats) +__global__ void kern_mst_opt_postprocessing(IdxT* outgoing_num_edges, // [graph_size] + IdxT* incoming_num_edges, // [graph_size] + IdxT* outgoing_max_edges, // [graph_size] + IdxT* incoming_max_edges, // [graph_size] + const IdxT* cluster_size, // [graph_size] + const uint32_t graph_size, + const uint32_t graph_degree, + uint64_t* stats) { const uint64_t i = threadIdx.x + (blockDim.x * blockIdx.x); if (i >= graph_size) return; diff --git a/cpp/src/neighbors/detail/cagra/hashmap.hpp b/cpp/src/neighbors/detail/cagra/hashmap.hpp index dd6c6c844..2c62dda90 100644 --- a/cpp/src/neighbors/detail/cagra/hashmap.hpp +++ b/cpp/src/neighbors/detail/cagra/hashmap.hpp @@ -29,10 +29,12 @@ namespace cuvs::neighbors::cagra::detail { namespace hashmap { -_RAFT_HOST_DEVICE inline uint32_t get_size(const uint32_t bitlen) { return 1U << bitlen; } +RAFT_INLINE_FUNCTION uint32_t get_size(const uint32_t bitlen) { return 1U << bitlen; } template -_RAFT_DEVICE inline void init(IdxT* const table, const unsigned bitlen, unsigned FIRST_TID = 0) +RAFT_DEVICE_INLINE_FUNCTION void init(IdxT* const table, + const unsigned bitlen, + unsigned FIRST_TID = 0) { if (threadIdx.x < FIRST_TID) return; for (unsigned i = threadIdx.x - FIRST_TID; i < get_size(bitlen); i += blockDim.x - FIRST_TID) { @@ -41,7 
+43,9 @@ _RAFT_DEVICE inline void init(IdxT* const table, const unsigned bitlen, unsigned } template -_RAFT_DEVICE inline uint32_t insert(IdxT* const table, const uint32_t bitlen, const IdxT key) +RAFT_DEVICE_INLINE_FUNCTION uint32_t insert(IdxT* const table, + const uint32_t bitlen, + const IdxT key) { // Open addressing is used for collision resolution const uint32_t size = get_size(bitlen); @@ -68,7 +72,9 @@ _RAFT_DEVICE inline uint32_t insert(IdxT* const table, const uint32_t bitlen, co } template -_RAFT_DEVICE inline uint32_t insert(IdxT* const table, const uint32_t bitlen, const IdxT key) +RAFT_DEVICE_INLINE_FUNCTION uint32_t insert(IdxT* const table, + const uint32_t bitlen, + const IdxT key) { IdxT ret = 0; if (threadIdx.x % TEAM_SIZE == 0) { ret = insert(table, bitlen, key); } @@ -78,5 +84,17 @@ _RAFT_DEVICE inline uint32_t insert(IdxT* const table, const uint32_t bitlen, co return ret; } +template +RAFT_DEVICE_INLINE_FUNCTION uint32_t +insert(unsigned team_size, IdxT* const table, const uint32_t bitlen, const IdxT key) +{ + IdxT ret = 0; + if (threadIdx.x % team_size == 0) { ret = insert(table, bitlen, key); } + for (unsigned offset = 1; offset < team_size; offset *= 2) { + ret |= __shfl_xor_sync(0xffffffff, ret, offset); + } + return ret; +} + } // namespace hashmap } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_00_generate.py deleted file mode 100644 index 63171373f..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_00_generate.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -header = """/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -""" - -trailer = """ -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search -""" - -mxdim_team = [(128, 8), (256, 16), (512, 32), (1024, 32)] -pq_bits = [8] -subspace_dims = [2, 4] -# block = [(64, 16), (128, 8), (256, 4), (512, 2), (1024, 1)] -# mxelem = [64, 128, 256] -load_types = ["uint4"] -code_book_types = ["half"] -search_types = dict( - float_uint32=( - "float", - "uint32_t", - "float", - ), # data_t, vec_idx_t, distance_t - half_uint32=("half", "uint32_t", "float"), - int8_uint32=("int8_t", "uint32_t", "float"), - uint8_uint32=("uint8_t", "uint32_t", "float"), - float_uint64=("float", "uint64_t", "float"), - half_uint64=("half", "uint64_t", "float"), -) -# knn -for type_path, (data_t, idx_t, distance_t) in search_types.items(): - for (mxdim, team) in mxdim_team: - for code_book_t in code_book_types: - for subspace_dim in subspace_dims: - for pq_bit in pq_bits: - path = f"q_search_multi_cta_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{subspace_dim}subd_{code_book_t}.cu" - with open(path, "w") as f: - f.write(header) - f.write( - f"instantiate_kernel_selection(\n {team}, {mxdim}, cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<{data_t} COMMA {code_book_t} COMMA {pq_bit} COMMA {subspace_dim} COMMA {distance_t} COMMA {idx_t}>, cuvs::neighbors::filtering::none_cagra_sample_filter);\n" - ) - f.write(trailer) - # For pasting into CMakeLists.txt - print(f"src/neighbors/detail/cagra/{path}") diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 
index 5d94a501a..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index 56534dc05..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index 7ff962058..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index 3387a32a3..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index 2d3f2cb1d..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index 73dd8cd4b..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index b5e33602d..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index 32fe0d628..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index e2726ea26..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index b4ebd49c4..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index 72f198c92..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index dfb667a7f..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index c583569f6..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index fedfb5146..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index 2b6e8e3da..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index 4a97fb752..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index 675cd3c93..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index b42b3289c..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index 0db4296f1..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index 4a2610dc7..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index b1c15662e..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index 201f68fb5..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index 26744ed76..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index 1bce71bef..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index 694304f3c..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index e6a563731..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index 5c554af3f..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index 965b43c07..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index 97a4f8092..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index bdd1719b3..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index e39bc1e2d..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index 599cf327a..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index 621c5a249..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index cbed3ef8a..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index 7428bfd9e..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index 70efefdb0..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index 4039b8582..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index 022eb0e05..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index e48b2ed71..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index 64f08530f..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/q_search_single_cta_00_generate.py deleted file mode 100644 index bc5f506ac..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_00_generate.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -header = """/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -""" - -trailer = """ -} // namespace cuvs::neighbors::cagra::detail::single_cta_search -""" - -mxdim_team = [(128, 8), (256, 16), (512, 32), (1024, 32)] -# block = [(64, 16), (128, 8), (256, 4), (512, 2), (1024, 1)] -# itopk_candidates = [64, 128, 256] -# itopk_size = [64, 128, 256, 512] -# mxelem = [64, 128, 256] - -pq_bits = [8] -subspace_dims = [2, 4] - -# rblock = [(256, 4), (512, 2), (1024, 1)] -# rcandidates = [32] -# rsize = [256, 512] -code_book_types = ["half"] - -search_types = dict( - float_uint32=("float", "uint32_t", "float"), # data_t, idx_t, distance_t - half_uint32=("half", "uint32_t", "float"), - int8_uint32=("int8_t", "uint32_t", "float"), - uint8_uint32=("uint8_t", "uint32_t", "float"), - float_uint64=("float", "uint64_t", "float"), - half_uint64=("half", "uint64_t", "float"), -) - -# knn -for type_path, (data_t, idx_t, distance_t) in search_types.items(): - for (mxdim, team) in mxdim_team: - for code_book_t in code_book_types: - for subspace_dim in subspace_dims: - for pq_bit in pq_bits: - path = f"q_search_single_cta_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{subspace_dim}subd_{code_book_t}.cu" - with open(path, "w") as f: - f.write(header) - f.write( - f"instantiate_kernel_selection(\n 
{team}, {mxdim}, cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<{data_t} COMMA {code_book_t} COMMA {pq_bit} COMMA {subspace_dim} COMMA {distance_t} COMMA {idx_t}>, cuvs::neighbors::filtering::none_cagra_sample_filter);\n" - ) - - f.write(trailer) - # For pasting into CMakeLists.txt - print(f"src/neighbors/detail/cagra/{path}") diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index b40322741..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index 36273d0d4..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index ef483437a..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index d9ebb1b85..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index e86524ee0..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index 9f2b7fbc7..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index 1ce4f5520..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index 2d6f93ef0..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index 5f3267410..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index 631ac7938..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index ea8faee1c..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index 061b1a04e..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index 15610d853..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index f984b46f0..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index 45299f272..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index fcb91be8c..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index b594fedab..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index a82be6b55..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index d80fef52c..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index e2c3ef4f7..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index 98889811d..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index f5e9d12c9..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index 4f14910b4..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index 67d52f8d5..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index 1420918a1..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index eb0a72da3..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index 7a98b59a9..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index 7e07033c7..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index 857f32712..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index 3c00c5223..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index e5c4c7b69..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index 22359d71b..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index 37c783f19..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index 0a4049d79..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index 773f567c4..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index dfc176abd..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index 680c32655..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index e57881e82..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index 525004f2e..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index 7af2ef124..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index 0fd36c31b..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index d4cc5f449..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index aa58ac2b7..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index 189c3ed9c..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index 9dc9aaae3..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index 100110313..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index 8d4e0aeee..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index 4c7318735..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh index efbf9b56d..9bcccd9f9 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh @@ -16,12 +16,12 @@ #pragma once #include "bitonic.hpp" -#include "compute_distance.hpp" +#include "compute_distance-ext.cuh" #include "device_common.hpp" #include "hashmap.hpp" #include "search_multi_cta_kernel.cuh" #include "search_plan.cuh" -#include "topk_for_cagra/topk_core.cuh" // TODO replace with raft topk if possible +#include "topk_for_cagra/topk.h" // TODO replace with raft topk if possible #include "utils.hpp" #include @@ -51,48 +51,46 @@ namespace cuvs::neighbors::cagra::detail { namespace multi_cta_search { -template - -struct search : public search_plan_impl { - using DATA_T = typename DATASET_DESCRIPTOR_T::DATA_T; - using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; - using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; - - using search_plan_impl::max_queries; - using search_plan_impl::itopk_size; - using search_plan_impl::algo; - using search_plan_impl::team_size; - using search_plan_impl::search_width; - using search_plan_impl::min_iterations; - using search_plan_impl::max_iterations; - using search_plan_impl::thread_block_size; - using 
search_plan_impl::hashmap_mode; - using search_plan_impl::hashmap_min_bitlen; - using search_plan_impl::hashmap_max_fill_rate; - using search_plan_impl::num_random_samplings; - using search_plan_impl::rand_xor_mask; - - using search_plan_impl::dim; - using search_plan_impl::graph_degree; - using search_plan_impl::topk; - - using search_plan_impl::hash_bitlen; - - using search_plan_impl::small_hash_bitlen; - using search_plan_impl::small_hash_reset_interval; - using search_plan_impl::hashmap_size; - using search_plan_impl::dataset_size; - using search_plan_impl::result_buffer_size; - - using search_plan_impl::smem_size; - - using search_plan_impl::hashmap; - using search_plan_impl::num_executed_iterations; - using search_plan_impl::dev_seed; - using search_plan_impl::num_seeds; +template +struct search : public search_plan_impl { + using base_type = search_plan_impl; + using DATA_T = typename base_type::DATA_T; + using INDEX_T = typename base_type ::INDEX_T; + using DISTANCE_T = typename base_type::DISTANCE_T; + + using base_type::algo; + using base_type::hashmap_max_fill_rate; + using base_type::hashmap_min_bitlen; + using base_type::hashmap_mode; + using base_type::itopk_size; + using base_type::max_iterations; + using base_type::max_queries; + using base_type::min_iterations; + using base_type::num_random_samplings; + using base_type::rand_xor_mask; + using base_type::search_width; + using base_type::team_size; + using base_type::thread_block_size; + + using base_type::dim; + using base_type::graph_degree; + using base_type::topk; + + using base_type::hash_bitlen; + + using base_type::dataset_size; + using base_type::hashmap_size; + using base_type::result_buffer_size; + using base_type::small_hash_bitlen; + using base_type::small_hash_reset_interval; + + using base_type::smem_size; + + using base_type::dataset_desc; + using base_type::dev_seed; + using base_type::hashmap; + using base_type::num_executed_iterations; + using base_type::num_seeds; uint32_t 
num_cta_per_query; rmm::device_uvector intermediate_indices; @@ -102,12 +100,11 @@ struct search : public search_plan_impl { search(raft::resources const& res, search_params params, + const dataset_descriptor_host& dataset_desc, int64_t dim, int64_t graph_degree, - uint32_t topk, - cuvs::distance::DistanceType metric) - : search_plan_impl( - res, params, dim, graph_degree, topk, metric), + uint32_t topk) + : base_type(res, params, dataset_desc, dim, graph_degree, topk), intermediate_indices(0, raft::resource::get_cuda_stream(res)), intermediate_distances(0, raft::resource::get_cuda_stream(res)), topk_workspace(0, raft::resource::get_cuda_stream(res)) @@ -129,13 +126,9 @@ struct search : public search_plan_impl { // constexpr unsigned max_result_buffer_size = 256; RAFT_EXPECTS(result_buffer_size_32 <= 256, "Result buffer size cannot exceed 256"); - const auto query_smem_buffer_length = - raft::ceildiv(dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM; - - smem_size = sizeof(float) * query_smem_buffer_length + + smem_size = dataset_desc.smem_ws_size_in_bytes + (sizeof(INDEX_T) + sizeof(DISTANCE_T)) * result_buffer_size_32 + - sizeof(uint32_t) * search_width + sizeof(uint32_t) + - DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte; + sizeof(uint32_t) * search_width + sizeof(uint32_t); RAFT_LOG_DEBUG("# smem_size: %u", smem_size); // @@ -204,44 +197,37 @@ struct search : public search_plan_impl { ~search() {} - void operator()( - raft::resources const& res, - // raft::device_matrix_view dataset, - DATASET_DESCRIPTOR_T dataset_desc, - raft::device_matrix_view - graph, - typename DATASET_DESCRIPTOR_T::INDEX_T* const topk_indices_ptr, // [num_queries, topk] - typename DATASET_DESCRIPTOR_T::DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] - const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr, // [num_queries, dataset_dim] - const uint32_t num_queries, - const typename DATASET_DESCRIPTOR_T::INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] - uint32_t* const 
num_executed_iterations, // [num_queries,] - uint32_t topk, - SAMPLE_FILTER_T sample_filter) + void operator()(raft::resources const& res, + raft::device_matrix_view graph, + INDEX_T* const topk_indices_ptr, // [num_queries, topk] + DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] + const DATA_T* const queries_ptr, // [num_queries, dataset_dim] + const uint32_t num_queries, + const INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] + uint32_t* const num_executed_iterations, // [num_queries,] + uint32_t topk, + SAMPLE_FILTER_T sample_filter) { cudaStream_t stream = raft::resource::get_cuda_stream(res); - - select_and_run( - dataset_desc, - graph, - intermediate_indices.data(), - intermediate_distances.data(), - queries_ptr, - num_queries, - dev_seed_ptr, - num_executed_iterations, - *this, - topk, - thread_block_size, - result_buffer_size, - smem_size, - hash_bitlen, - hashmap.data(), - num_cta_per_query, - num_seeds, - sample_filter, - this->metric, - stream); + select_and_run(dataset_desc.dev_ptr(), + graph, + intermediate_indices.data(), + intermediate_distances.data(), + queries_ptr, + num_queries, + dev_seed_ptr, + num_executed_iterations, + *this, + topk, + thread_block_size, + result_buffer_size, + smem_size, + hash_bitlen, + hashmap.data(), + num_cta_per_query, + num_seeds, + sample_filter, + stream); RAFT_CUDA_TRY(cudaPeekAtLastError()); // Select the top-k results from the intermediate results diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py index cb63c0e03..3153a3a9f 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py @@ -39,8 +39,6 @@ #include "search_multi_cta_inst.cuh" -#include "compute_distance.hpp" - namespace cuvs::neighbors::cagra::detail::multi_cta_search { """ @@ -48,7 +46,6 @@ } // namespace cuvs::neighbors::cagra::detail::multi_cta_search """ 
-mxdim_team = [(128, 8), (256, 16), (512, 32), (1024, 32)] # block = [(64, 16), (128, 8), (256, 4), (512, 2), (1024, 1)] # mxelem = [64, 128, 256] load_types = ["uint4"] @@ -66,13 +63,12 @@ ) # knn for type_path, (data_t, idx_t, distance_t) in search_types.items(): - for (mxdim, team) in mxdim_team: - path = f"search_multi_cta_{type_path}_dim{mxdim}_t{team}.cu" - with open(path, "w") as f: - f.write(header) - f.write( - f"instantiate_kernel_selection(\n {team}, {mxdim}, cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<{data_t} COMMA {idx_t} COMMA {distance_t}>, cuvs::neighbors::filtering::none_cagra_sample_filter);\n" - ) - f.write(trailer) - # For pasting into CMakeLists.txt - print(f"src/neighbors/detail/cagra/{path}") + path = f"search_multi_cta_{type_path}.cu" + with open(path, "w") as f: + f.write(header) + f.write( + f"instantiate_kernel_selection(\n {data_t}, {idx_t}, {distance_t}, cuvs::neighbors::filtering::none_cagra_sample_filter);\n" + ) + f.write(trailer) + # For pasting into CMakeLists.txt + print(f"src/neighbors/detail/cagra/{path}") diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu rename to cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu index 2a14699f4..fae5a9387 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu @@ -25,13 +25,10 @@ #include "search_multi_cta_inst.cuh" -#include "compute_distance.hpp" - namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 8, - 128, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection(float, + uint32_t, + float, + 
cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu deleted file mode 100644 index 0bf4a192f..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 32, - 1024, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu deleted file mode 100644 index a77859b7d..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 16, - 256, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu deleted file mode 100644 index ab49fa9f2..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 32, - 512, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim128_t8.cu rename to cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64.cu index 157942dc5..88167b843 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim128_t8.cu +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64.cu @@ -25,13 +25,10 @@ #include "search_multi_cta_inst.cuh" -#include "compute_distance.hpp" - namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 8, - 128, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection(float, + uint64_t, + float, + cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu deleted file mode 100644 index c38eeb009..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 32, - 1024, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu deleted file mode 100644 index 3094ddaeb..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 8, - 128, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu deleted file mode 100644 index 91725d185..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 16, - 256, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu deleted file mode 100644 index 0f452a6fa..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 32, - 512, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu rename to cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu index ea38b60c0..9606d510f 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu @@ -25,13 +25,10 @@ #include "search_multi_cta_inst.cuh" -#include "compute_distance.hpp" - namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 8, - 128, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection(half, + uint32_t, + float, + cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim1024_t32.cu deleted file mode 100644 index cfe7a7aef..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim1024_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 32, - 1024, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim512_t32.cu deleted file mode 100644 index 292859382..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 32, - 512, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim256_t16.cu rename to cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64.cu index ee2400037..dafb89cc3 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim256_t16.cu +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64.cu @@ -25,13 +25,10 @@ #include "search_multi_cta_inst.cuh" -#include "compute_distance.hpp" - namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 16, - 256, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection(half, + uint64_t, + float, + cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu deleted file mode 100644 index 13044f12d..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu +++ /dev/null @@ 
-1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 32, - 1024, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu deleted file mode 100644 index 2ce6f292d..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 16, - 256, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu deleted file mode 100644 index 2d607eb8d..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 32, - 512, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_inst.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_inst.cuh index b1cfaf870..036a4e414 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_inst.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_inst.cuh @@ -21,30 +21,26 @@ namespace cuvs::neighbors::cagra::detail::multi_cta_search { -#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATASET_DESC_T, SAMPLE_FILTER_T) \ - template void select_and_run( \ - DATASET_DESC_T dataset_desc, \ - raft::device_matrix_view \ - graph, \ - typename DATASET_DESC_T::INDEX_T* const topk_indices_ptr, \ - typename DATASET_DESC_T::DISTANCE_T* const topk_distances_ptr, \ - const typename DATASET_DESC_T::DATA_T* const queries_ptr, \ - const uint32_t num_queries, \ - const typename DATASET_DESC_T::INDEX_T* dev_seed_ptr, \ - uint32_t* const num_executed_iterations, \ - const search_params& ps, \ - uint32_t topk, \ - uint32_t block_size, \ - uint32_t result_buffer_size, \ - uint32_t smem_size, \ - int64_t hash_bitlen, \ - typename DATASET_DESC_T::INDEX_T* hashmap_ptr, \ - uint32_t num_cta_per_query, \ - uint32_t num_seeds, \ - SAMPLE_FILTER_T sample_filter, \ - cuvs::distance::DistanceType metric, \ +#define instantiate_kernel_selection(DataT, IndexT, DistanceT, SampleFilterT) \ + template void select_and_run( \ + const dataset_descriptor_base_t* dataset_desc, \ + raft::device_matrix_view 
graph, \ + IndexT* topk_indices_ptr, \ + DistanceT* topk_distances_ptr, \ + const DataT* queries_ptr, \ + uint32_t num_queries, \ + const IndexT* dev_seed_ptr, \ + uint32_t* num_executed_iterations, \ + const search_params& ps, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + IndexT* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_seeds, \ + SampleFilterT sample_filter, \ cudaStream_t stream); -#define COMMA , - } // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu new file mode 100644 index 000000000..a3322c435 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include "search_multi_cta_inst.cuh" + +namespace cuvs::neighbors::cagra::detail::multi_cta_search { +instantiate_kernel_selection(int8_t, + uint32_t, + float, + cuvs::neighbors::filtering::none_cagra_sample_filter); + +} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu deleted file mode 100644 index c28adbf80..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 32, - 1024, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu deleted file mode 100644 index af5f13397..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 8, - 128, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu deleted file mode 100644 index bcc7b9b8c..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 16, - 256, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu deleted file mode 100644 index 916196c35..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 32, - 512, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh deleted file mode 100644 index e907568f5..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh +++ /dev/null @@ -1,405 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include "compute_distance_vpq.cuh" -#include // none_cagra_sample_filter -#include // RAFT_EXPLICIT - -#include - -namespace cuvs::neighbors::cagra::detail { -namespace multi_cta_search { - -#ifdef _CUVS_EXPLICIT_INSTANTIATE_ONLY - -template -void select_and_run( - DATASET_DESCRIPTOR_T dataset_desc, - raft::device_matrix_view - graph, - typename DATASET_DESCRIPTOR_T::INDEX_T* const topk_indices_ptr, // [num_queries, topk] - typename DATASET_DESCRIPTOR_T::DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] - const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr, // [num_queries, dataset_dim] - const uint32_t num_queries, - const typename DATASET_DESCRIPTOR_T::INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] - uint32_t* const num_executed_iterations, // [num_queries,] - const search_params& ps, - uint32_t topk, - // multi_cta_search (params struct) - uint32_t block_size, // - uint32_t result_buffer_size, - uint32_t smem_size, - int64_t hash_bitlen, - typename DATASET_DESCRIPTOR_T::INDEX_T* hashmap_ptr, - uint32_t num_cta_per_query, - uint32_t num_seeds, - SAMPLE_FILTER_T sample_filter, - cuvs::distance::DistanceType metric, - cudaStream_t stream) RAFT_EXPLICIT; -#endif // CUVS_EXPLICIT_INSTANTIATE_ONLY - -#define instantiate_kernel_selection( \ - TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T) \ - extern template void select_and_run< \ - TEAM_SIZE, \ - MAX_DATASET_DIM, \ - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, \ - SAMPLE_FILTER_T>( \ - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t \ - dataset_desc, \ - raft::device_matrix_view graph, \ - INDEX_T* const topk_indices_ptr, \ - DISTANCE_T* const topk_distances_ptr, \ - const DATA_T* const queries_ptr, \ - const uint32_t num_queries, \ - const INDEX_T* dev_seed_ptr, \ - uint32_t* const num_executed_iterations, \ - const search_params& ps, \ - uint32_t topk, \ - uint32_t block_size, \ - uint32_t result_buffer_size, 
\ - uint32_t smem_size, \ - int64_t hash_bitlen, \ - INDEX_T* hashmap_ptr, \ - uint32_t num_cta_per_query, \ - uint32_t num_seeds, \ - SAMPLE_FILTER_T sample_filter, \ - cuvs::distance::DistanceType metric, \ - cudaStream_t stream); - -instantiate_kernel_selection( - 32, 1024, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 8, 128, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 16, 256, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 32, 512, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 32, 1024, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 8, 128, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 16, 256, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 32, 512, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 32, 1024, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 8, 128, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 16, 256, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 32, 512, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 32, 1024, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 8, 128, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 16, 256, uint8_t, uint32_t, 
float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 32, 512, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); - -#undef instantiate_kernel_selection - -#define instantiate_q_kernel_selection(TEAM_SIZE, \ - MAX_DATASET_DIM, \ - CODE_BOOK_T, \ - PQ_BITS, \ - PQ_CODE_BOOK_DIM, \ - DATA_T, \ - INDEX_T, \ - DISTANCE_T, \ - SAMPLE_FILTER_T) \ - extern template void \ - select_and_run, \ - SAMPLE_FILTER_T>( \ - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t dataset_desc, \ - raft::device_matrix_view graph, \ - INDEX_T* const topk_indices_ptr, \ - DISTANCE_T* const topk_distances_ptr, \ - const DATA_T* const queries_ptr, \ - const uint32_t num_queries, \ - const INDEX_T* dev_seed_ptr, \ - uint32_t* const num_executed_iterations, \ - const search_params& ps, \ - uint32_t topk, \ - uint32_t block_size, \ - uint32_t result_buffer_size, \ - uint32_t smem_size, \ - int64_t hash_bitlen, \ - INDEX_T* hashmap_ptr, \ - uint32_t num_cta_per_query, \ - uint32_t num_seeds, \ - SAMPLE_FILTER_T sample_filter, \ - cuvs::distance::DistanceType metric, \ - cudaStream_t stream); - -instantiate_q_kernel_selection( - 8, 128, half, 8, 2, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection( - 16, 256, half, 8, 2, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection( - 32, 512, half, 8, 2, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 1024, - half, - 8, - 2, - half, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection( - 8, 128, half, 8, 4, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection( - 16, 256, half, 8, 4, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); 
-instantiate_q_kernel_selection( - 32, 512, half, 8, 4, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 1024, - half, - 8, - 4, - half, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -instantiate_q_kernel_selection( - 8, 128, half, 8, 2, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(16, - 256, - half, - 8, - 2, - float, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 512, - half, - 8, - 2, - float, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 1024, - half, - 8, - 2, - float, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection( - 8, 128, half, 8, 4, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(16, - 256, - half, - 8, - 4, - float, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 512, - half, - 8, - 4, - float, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 1024, - half, - 8, - 4, - float, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -instantiate_q_kernel_selection(8, - 128, - half, - 8, - 2, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(16, - 256, - half, - 8, - 2, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 512, - half, - 8, - 2, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 1024, - half, - 8, - 2, - uint8_t, - uint32_t, - float, - 
cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(8, - 128, - half, - 8, - 4, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(16, - 256, - half, - 8, - 4, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 512, - half, - 8, - 4, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 1024, - half, - 8, - 4, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -instantiate_q_kernel_selection(8, - 128, - half, - 8, - 2, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(16, - 256, - half, - 8, - 2, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 512, - half, - 8, - 2, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 1024, - half, - 8, - 2, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(8, - 128, - half, - 8, - 4, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(16, - 256, - half, - 8, - 4, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 512, - half, - 8, - 4, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 1024, - half, - 8, - 4, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -#undef instantiate_q_kernel_selection -} // namespace multi_cta_search -} // namespace cuvs::neighbors::cagra::detail diff --git 
a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh index 4d2030c6c..dd74ba44b 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh @@ -15,12 +15,14 @@ */ #pragma once +#include "search_multi_cta_kernel.cuh" + #include "bitonic.hpp" -#include "compute_distance.hpp" +#include "compute_distance-ext.cuh" #include "device_common.hpp" #include "hashmap.hpp" #include "search_plan.cuh" -#include "topk_for_cagra/topk_core.cuh" // TODO replace with raft topk if possible +#include "topk_for_cagra/topk.h" // TODO replace with raft topk if possible #include "utils.hpp" #include @@ -53,11 +55,12 @@ namespace multi_cta_search { // #define _CLK_BREAKDOWN template -__device__ void pickup_next_parents(INDEX_T* const next_parent_indices, // [search_width] - const uint32_t search_width, - INDEX_T* const itopk_indices, // [num_itopk] - const size_t num_itopk, - uint32_t* const terminate_flag) +RAFT_DEVICE_INLINE_FUNCTION void pickup_next_parents( + INDEX_T* const next_parent_indices, // [search_width] + const uint32_t search_width, + INDEX_T* const itopk_indices, // [num_itopk] + const size_t num_itopk, + uint32_t* const terminate_flag) { constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask::value; const unsigned warp_id = threadIdx.x / 32; @@ -93,10 +96,11 @@ __device__ void pickup_next_parents(INDEX_T* const next_parent_indices, // [sea } template -__device__ inline void topk_by_bitonic_sort(float* distances, // [num_elements] - INDEX_T* indices, // [num_elements] - const uint32_t num_elements, - const uint32_t num_itopk // num_itopk <= num_elements +RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort( + float* distances, // [num_elements] + INDEX_T* indices, // [num_elements] + const uint32_t num_elements, + const uint32_t num_itopk // num_itopk <= num_elements ) { const unsigned warp_id = threadIdx.x 
/ 32; @@ -130,17 +134,13 @@ __device__ inline void topk_by_bitonic_sort(float* distances, // [num_elements] // // multiple CTAs per single query // -template -__launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( +template +RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel( typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr, // [num_queries, num_cta_per_query, itopk_size] typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr, // [num_queries, num_cta_per_query, itopk_size] - DATASET_DESCRIPTOR_T dataset_desc, + const DATASET_DESCRIPTOR_T* dataset_desc, const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr, // [num_queries, dataset_dim] const typename DATASET_DESCRIPTOR_T::INDEX_T* const knn_graph, // [dataset_size, graph_degree] const uint32_t graph_degree, @@ -156,13 +156,11 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( const uint32_t min_iteration, const uint32_t max_iteration, uint32_t* const num_executed_iterations, /* stats */ - SAMPLE_FILTER_T sample_filter, - const cuvs::distance::DistanceType metric) + SAMPLE_FILTER_T sample_filter) { using DATA_T = typename DATASET_DESCRIPTOR_T::DATA_T; using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; - using QUERY_T = typename DATASET_DESCRIPTOR_T::QUERY_T; const auto num_queries = gridDim.y; const auto query_id = blockIdx.y; @@ -184,7 +182,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( #endif _CLK_START(); - extern __shared__ uint32_t smem[]; + extern __shared__ uint8_t smem[]; // Layout of result_buffer // +----------------+------------------------------+---------+ @@ -192,26 +190,21 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( // | | | upto 32 | // +----------------+------------------------------+---------+ // |<--- result_buffer_size --->| - uint32_t result_buffer_size = itopk_size + (search_width * graph_degree); - uint32_t result_buffer_size_32 = result_buffer_size; - if 
(result_buffer_size % 32) { result_buffer_size_32 += 32 - (result_buffer_size % 32); } + const auto result_buffer_size = itopk_size + (search_width * graph_degree); + const auto result_buffer_size_32 = raft::round_up_safe(result_buffer_size, 32); assert(result_buffer_size_32 <= MAX_ELEMENTS); - const auto query_smem_buffer_length = - raft::ceildiv(dataset_desc.dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM; - auto query_buffer = reinterpret_cast(smem); - auto result_indices_buffer = reinterpret_cast(query_buffer + query_smem_buffer_length); - auto result_distances_buffer = + // Set smem working buffer for the distance calculation + dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id); + + auto* __restrict__ result_indices_buffer = + reinterpret_cast(smem + dataset_desc->smem_ws_size_in_bytes()); + auto* __restrict__ result_distances_buffer = reinterpret_cast(result_indices_buffer + result_buffer_size_32); - auto parent_indices_buffer = + auto* __restrict__ parent_indices_buffer = reinterpret_cast(result_distances_buffer + result_buffer_size_32); - auto distance_work_buffer_ptr = - reinterpret_cast(parent_indices_buffer + search_width); - auto terminate_flag = reinterpret_cast(distance_work_buffer_ptr + - DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte); - - // Set smem working buffer for the distance calculation - dataset_desc.set_smem_ptr(distance_work_buffer_ptr); + auto* __restrict__ terminate_flag = + reinterpret_cast(parent_indices_buffer + search_width); #if 0 /* debug */ @@ -220,9 +213,6 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( result_distances_buffer[i] = utils::get_max_value(); } #endif - const DATA_T* const query_ptr = queries_ptr + (dataset_desc.dim * query_id); - dataset_desc.template copy_query( - query_ptr, query_buffer, query_smem_buffer_length); if (threadIdx.x == 0) { terminate_flag[0] = 0; } INDEX_T* const local_visited_hashmap_ptr = @@ -236,20 +226,18 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( 
uint32_t block_id = cta_id + (num_cta_per_query * query_id); uint32_t num_blocks = num_cta_per_query * num_queries; - device::compute_distance_to_random_nodes(result_indices_buffer, - result_distances_buffer, - query_buffer, - dataset_desc, - result_buffer_size, - num_distilation, - rand_xor_mask, - local_seed_ptr, - num_seeds, - local_visited_hashmap_ptr, - hash_bitlen, - metric, - block_id, - num_blocks); + device::compute_distance_to_random_nodes(result_indices_buffer, + result_distances_buffer, + *dataset_desc, + result_buffer_size, + num_distilation, + rand_xor_mask, + local_seed_ptr, + num_seeds, + local_visited_hashmap_ptr, + hash_bitlen, + block_id, + num_blocks); __syncthreads(); _CLK_REC(clk_compute_1st_distance); @@ -279,21 +267,16 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( // compute the norms between child nodes and query node _CLK_START(); - // constexpr unsigned max_n_frags = 16; - constexpr unsigned max_n_frags = 0; - device::compute_distance_to_child_nodes( - result_indices_buffer + itopk_size, - result_distances_buffer + itopk_size, - query_buffer, - dataset_desc, - knn_graph, - graph_degree, - local_visited_hashmap_ptr, - hash_bitlen, - parent_indices_buffer, - result_indices_buffer, - search_width, - metric); + device::compute_distance_to_child_nodes(result_indices_buffer + itopk_size, + result_distances_buffer + itopk_size, + *dataset_desc, + knn_graph, + graph_degree, + local_visited_hashmap_ptr, + hash_bitlen, + parent_indices_buffer, + result_indices_buffer, + search_width); _CLK_REC(clk_compute_distance); __syncthreads(); @@ -409,84 +392,58 @@ void set_value_batch(T* const dev_ptr, <<>>(dev_ptr, ld, val, count, batch_size); } -template +template struct search_kernel_config { // Search kernel function type. Note that the actual values for the template value // parameters do not matter, because they are not part of the function signature. The // second to fourth value parameters will be selected by the choose_* functions below. 
- using kernel_t = decltype(&search_kernel); + using kernel_t = decltype(&search_kernel<128, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>); static auto choose_buffer_size(unsigned result_buffer_size, unsigned block_size) -> kernel_t { if (result_buffer_size <= 64) { - return search_kernel; + return search_kernel<64, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>; } else if (result_buffer_size <= 128) { - return search_kernel; + return search_kernel<128, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>; } else if (result_buffer_size <= 256) { - return search_kernel; + return search_kernel<256, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>; } THROW("Result buffer size %u larger than max buffer size %u", result_buffer_size, 256); } }; -template -void select_and_run( - DATASET_DESCRIPTOR_T dataset_desc, - raft::device_matrix_view - graph, - typename DATASET_DESCRIPTOR_T::INDEX_T* const topk_indices_ptr, // [num_queries, topk] - typename DATASET_DESCRIPTOR_T::DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] - const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr, // [num_queries, dataset_dim] - const uint32_t num_queries, - const typename DATASET_DESCRIPTOR_T::INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] - uint32_t* const num_executed_iterations, // [num_queries,] - const search_params& ps, - uint32_t topk, - // multi_cta_search (params struct) - uint32_t block_size, // - uint32_t result_buffer_size, - uint32_t smem_size, - int64_t hash_bitlen, - typename DATASET_DESCRIPTOR_T::INDEX_T* hashmap_ptr, - uint32_t num_cta_per_query, - uint32_t num_seeds, - SAMPLE_FILTER_T sample_filter, - cuvs::distance::DistanceType metric, - cudaStream_t stream) +template +void select_and_run(const dataset_descriptor_base_t* dataset_desc, + raft::device_matrix_view graph, + IndexT* topk_indices_ptr, // [num_queries, topk] + DistanceT* topk_distances_ptr, // [num_queries, topk] + const DataT* queries_ptr, // [num_queries, dataset_dim] + uint32_t num_queries, + const IndexT* dev_seed_ptr, // 
[num_queries, num_seeds] + uint32_t* num_executed_iterations, // [num_queries,] + const search_params& ps, + uint32_t topk, + // multi_cta_search (params struct) + uint32_t block_size, // + uint32_t result_buffer_size, + uint32_t smem_size, + int64_t hash_bitlen, + IndexT* hashmap_ptr, + uint32_t num_cta_per_query, + uint32_t num_seeds, + SampleFilterT sample_filter, + cudaStream_t stream) { auto kernel = - search_kernel_config:: - choose_buffer_size(result_buffer_size, block_size); + search_kernel_config, + SampleFilterT>::choose_buffer_size(result_buffer_size, block_size); - RAFT_CUDA_TRY(cudaFuncSetAttribute(kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - smem_size + DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte)); + RAFT_CUDA_TRY( + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); // Initialize hash table const uint32_t hash_size = hashmap::get_size(hash_bitlen); - set_value_batch(hashmap_ptr, - hash_size, - utils::get_max_value(), - hash_size, - num_queries, - stream); + set_value_batch( + hashmap_ptr, hash_size, utils::get_max_value(), hash_size, num_queries, stream); dim3 block_dims(block_size, 1, 1); dim3 grid_dims(num_cta_per_query, num_queries, 1); @@ -513,8 +470,7 @@ void select_and_run( ps.min_iterations, ps.max_iterations, num_executed_iterations, - sample_filter, - metric); + sample_filter); } } // namespace multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh index 673fc5473..1ef35f947 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,10 +15,32 @@ */ #pragma once -#ifndef _CUVS_EXPLICIT_INSTANTIATE_ONLY -#include "search_multi_cta_kernel-inl.cuh" -#endif +#include "compute_distance-ext.cuh" -#ifdef RAFT_COMPILED -#include "search_multi_cta_kernel-ext.cuh" -#endif +#include + +namespace cuvs::neighbors::cagra::detail::multi_cta_search { + +template +void select_and_run(const dataset_descriptor_base_t* dataset_desc, + raft::device_matrix_view graph, + IndexT* topk_indices_ptr, // [num_queries, topk] + DistanceT* topk_distances_ptr, // [num_queries, topk] + const DataT* queries_ptr, // [num_queries, dataset_dim] + uint32_t num_queries, + const IndexT* dev_seed_ptr, // [num_queries, num_seeds] + uint32_t* num_executed_iterations, // [num_queries,] + const search_params& ps, + uint32_t topk, + // multi_cta_search (params struct) + uint32_t block_size, // + uint32_t result_buffer_size, + uint32_t smem_size, + int64_t hash_bitlen, + IndexT* hashmap_ptr, + uint32_t num_cta_per_query, + uint32_t num_seeds, + SampleFilterT sample_filter, + cudaStream_t stream); + +} diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu new file mode 100644 index 000000000..51fc6526f --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include "search_multi_cta_inst.cuh" + +namespace cuvs::neighbors::cagra::detail::multi_cta_search { +instantiate_kernel_selection(uint8_t, + uint32_t, + float, + cuvs::neighbors::filtering::none_cagra_sample_filter); + +} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu deleted file mode 100644 index 3fa12d933..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 32, - 1024, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu deleted file mode 100644 index e2f25a1c2..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 8, - 128, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu deleted file mode 100644 index 4cd206d8c..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 16, - 256, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu deleted file mode 100644 index 56989a1d5..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 32, - 512, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh index bc1266fb4..7b3ecabf3 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh @@ -15,12 +15,11 @@ */ #pragma once -#include "compute_distance.hpp" -#include "compute_distance_vpq.cuh" +#include "compute_distance-ext.cuh" #include "device_common.hpp" #include "hashmap.hpp" #include "search_plan.cuh" -#include "topk_for_cagra/topk_core.cuh" //todo replace with raft kernel +#include "topk_for_cagra/topk.h" //todo replace with raft kernel #include "utils.hpp" #include @@ -93,9 +92,9 @@ void get_value(T* const host_ptr, const T* const dev_ptr, cudaStream_t cuda_stre } // MAX_DATASET_DIM : must equal to or greater than dataset_dim -template +template RAFT_KERNEL random_pickup_kernel( - const DATASET_DESCRIPTOR_T dataset_desc, + const DATASET_DESCRIPTOR_T* dataset_desc, const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr, // [num_queries, dataset_dim] const std::size_t num_pickup, const unsigned num_distilation, @@ -106,30 +105,19 @@ RAFT_KERNEL random_pickup_kernel( typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr, // [num_queries, ldr] const std::uint32_t ldr, // (*) ldr >= num_pickup typename DATASET_DESCRIPTOR_T::INDEX_T* const visited_hashmap_ptr, // [num_queries, 1 << bitlen] - const 
std::uint32_t hash_bitlen, - const cuvs::distance::DistanceType metric) + const std::uint32_t hash_bitlen) { using DATA_T = typename DATASET_DESCRIPTOR_T::DATA_T; using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; + const auto team_size_bits = dataset_desc->team_size_bitshift(); const auto ldb = hashmap::get_size(hash_bitlen); - const auto global_team_index = (blockIdx.x * blockDim.x + threadIdx.x) / TEAM_SIZE; + const auto global_team_index = (blockIdx.x * blockDim.x + threadIdx.x) >> team_size_bits; const uint32_t query_id = blockIdx.y; if (global_team_index >= num_pickup) { return; } - // Load a query - extern __shared__ float query_buffer[]; - const auto query_smem_buffer_length = - raft::ceildiv(dataset_desc.dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM; - for (uint32_t i = threadIdx.x; i < query_smem_buffer_length; i += blockDim.x) { - unsigned j = device::swizzling(i); - if (i < dataset_desc.dim) { - query_buffer[j] = cuvs::spatial::knn::detail::utils::mapping{}( - (queries_ptr + query_id * dataset_desc.dim)[i]); - } else { - query_buffer[j] = 0.0; - } - } + extern __shared__ uint8_t smem[]; + dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id); __syncthreads(); INDEX_T best_index_team_local; @@ -141,27 +129,10 @@ RAFT_KERNEL random_pickup_kernel( } else { // Chose a seed node randomly seed_index = - device::xorshift64((global_team_index ^ rand_xor_mask) * (i + 1)) % dataset_desc.size; - } - - DISTANCE_T norm2; - switch (metric) { - case cuvs::distance::DistanceType::L2Expanded: - norm2 = dataset_desc.template compute_similarity( - query_buffer, seed_index, true); - break; - case cuvs::distance::DistanceType::InnerProduct: - norm2 = - dataset_desc.template compute_similarity( - query_buffer, seed_index, true); - break; - default: break; + device::xorshift64((global_team_index ^ rand_xor_mask) * (i + 1)) % dataset_desc->size; } + DISTANCE_T norm2 = 
dataset_desc->compute_distance(seed_index, true); if (norm2 < best_norm2_team_local) { best_norm2_team_local = norm2; best_index_team_local = seed_index; @@ -169,7 +140,7 @@ RAFT_KERNEL random_pickup_kernel( } const auto store_gmem_index = global_team_index + (ldr * query_id); - if (threadIdx.x % TEAM_SIZE == 0) { + if ((threadIdx.x & ((1u << team_size_bits) - 1u)) == 0) { if (hashmap::insert( visited_hashmap_ptr + (ldb * query_id), hash_bitlen, best_index_team_local)) { result_distances_ptr[store_gmem_index] = best_norm2_team_local; @@ -182,47 +153,40 @@ RAFT_KERNEL random_pickup_kernel( } // MAX_DATASET_DIM : must be equal to or greater than dataset_dim -template -void random_pickup( - const DATASET_DESCRIPTOR_T dataset_desc, - const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr, // [num_queries, dataset_dim] - const std::size_t num_queries, - const std::size_t num_pickup, - const unsigned num_distilation, - const uint64_t rand_xor_mask, - const typename DATASET_DESCRIPTOR_T::INDEX_T* seed_ptr, // [num_queries, num_seeds] - const uint32_t num_seeds, - typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr, // [num_queries, ldr] - typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr, // [num_queries, ldr] - const std::size_t ldr, // (*) ldr >= num_pickup - typename DATASET_DESCRIPTOR_T::INDEX_T* const visited_hashmap_ptr, // [num_queries, 1 << bitlen] - const std::uint32_t hash_bitlen, - const cuvs::distance::DistanceType metric, - cudaStream_t const cuda_stream = 0) +template +void random_pickup(const dataset_descriptor_host& dataset_desc, + const DataT* queries_ptr, // [num_queries, dataset_dim] + std::size_t num_queries, + std::size_t num_pickup, + unsigned num_distilation, + uint64_t rand_xor_mask, + const IndexT* seed_ptr, // [num_queries, num_seeds] + uint32_t num_seeds, + IndexT* result_indices_ptr, // [num_queries, ldr] + DistanceT* result_distances_ptr, // [num_queries, ldr] + std::size_t ldr, // (*) ldr >= num_pickup 
+ IndexT* visited_hashmap_ptr, // [num_queries, 1 << bitlen] + std::uint32_t hash_bitlen, + cudaStream_t cuda_stream) { const auto block_size = 256u; - const auto num_teams_per_threadblock = block_size / TEAM_SIZE; + const auto num_teams_per_threadblock = block_size / dataset_desc.team_size; const dim3 grid_size((num_pickup + num_teams_per_threadblock - 1) / num_teams_per_threadblock, num_queries); - const auto query_smem_buffer_length = - raft::ceildiv(dataset_desc.dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM; - const auto smem_size = query_smem_buffer_length * sizeof(float); - - random_pickup_kernel - <<>>(dataset_desc, - queries_ptr, - num_pickup, - num_distilation, - rand_xor_mask, - seed_ptr, - num_seeds, - result_indices_ptr, - result_distances_ptr, - ldr, - visited_hashmap_ptr, - hash_bitlen, - metric); + random_pickup_kernel<<>>( + dataset_desc.dev_ptr(), + queries_ptr, + num_pickup, + num_distilation, + rand_xor_mask, + seed_ptr, + num_seeds, + result_indices_ptr, + result_distances_ptr, + ldr, + visited_hashmap_ptr, + hash_bitlen); } template @@ -325,9 +289,7 @@ void pickup_next_parents(INDEX_T* const parent_candidates_ptr, // [num_queries, terminate_flag); } -template RAFT_KERNEL compute_distance_to_child_nodes_kernel( const typename DATASET_DESCRIPTOR_T::INDEX_T* const @@ -338,7 +300,7 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel( parent_distance_ptr, // [num_queries, search_width] const std::size_t lds, const std::uint32_t search_width, - const DATASET_DESCRIPTOR_T dataset_desc, + const DATASET_DESCRIPTOR_T* dataset_desc, const typename DATASET_DESCRIPTOR_T::INDEX_T* const neighbor_graph_ptr, // [dataset_size, graph_degree] const std::uint32_t graph_degree, @@ -349,29 +311,22 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel( typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr, // [num_queries, ldd] typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr, // [num_queries, ldd] const std::uint32_t ldd, // (*) ldd 
>= search_width * graph_degree - SAMPLE_FILTER_T sample_filter, - const cuvs::distance::DistanceType metric) + SAMPLE_FILTER_T sample_filter) { using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; + const auto team_size_bits = dataset_desc->team_size_bitshift(); + const auto team_size = 1u << team_size_bits; const uint32_t ldb = hashmap::get_size(hash_bitlen); const auto tid = threadIdx.x + blockDim.x * blockIdx.x; - const auto global_team_id = tid / TEAM_SIZE; + const auto global_team_id = tid >> team_size_bits; const auto query_id = blockIdx.y; - extern __shared__ float query_buffer[]; - const auto query_smem_buffer_length = - raft::ceildiv(dataset_desc.dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM; - for (uint32_t i = threadIdx.x; i < query_smem_buffer_length; i += blockDim.x) { - unsigned j = device::swizzling(i); - if (i < dataset_desc.dim) { - query_buffer[j] = cuvs::spatial::knn::detail::utils::mapping{}( - (query_ptr + query_id * dataset_desc.dim)[i]); - } else { - query_buffer[j] = 0.0; - } - } + extern __shared__ uint8_t smem[]; + // Load a query + dataset_desc = dataset_desc->setup_workspace(smem, query_ptr, query_id); + __syncthreads(); if (global_team_id >= search_width * graph_degree) { return; } @@ -393,33 +348,18 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel( const std::size_t child_id = neighbor_list_head_ptr[global_team_id % graph_degree]; - const auto compute_distance_flag = hashmap::insert( - visited_hashmap_ptr + (ldb * blockIdx.y), hash_bitlen, child_id); - - DISTANCE_T norm2; - switch (metric) { - case cuvs::distance::DistanceType::L2Expanded: - norm2 = dataset_desc.template compute_similarity( - query_buffer, child_id, compute_distance_flag); - break; - case cuvs::distance::DistanceType::InnerProduct: - norm2 = dataset_desc.template compute_similarity( - query_buffer, child_id, compute_distance_flag); - break; - default: break; - } + const auto compute_distance_flag = 
hashmap::insert( + team_size, visited_hashmap_ptr + (ldb * blockIdx.y), hash_bitlen, child_id); + + DISTANCE_T norm2 = dataset_desc->compute_distance(child_id, compute_distance_flag); if (compute_distance_flag) { - if (threadIdx.x % TEAM_SIZE == 0) { + if ((threadIdx.x & (team_size - 1)) == 0) { result_indices_ptr[ldd * blockIdx.y + global_team_id] = child_id; result_distances_ptr[ldd * blockIdx.y + global_team_id] = norm2; } } else { - if (threadIdx.x % TEAM_SIZE == 0) { + if ((threadIdx.x & (team_size - 1)) == 0) { result_distances_ptr[ldd * blockIdx.y + global_team_id] = utils::get_max_value(); } } @@ -434,66 +374,52 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel( } } -template +template void compute_distance_to_child_nodes( - const typename DATASET_DESCRIPTOR_T::INDEX_T* const - parent_node_list, // [num_queries, search_width] - typename DATASET_DESCRIPTOR_T::INDEX_T* const - parent_candidates_ptr, // [num_queries, search_width] - typename DATASET_DESCRIPTOR_T::DISTANCE_T* const - parent_distance_ptr, // [num_queries, search_width] - const std::size_t lds, - const uint32_t search_width, - const DATASET_DESCRIPTOR_T dataset_desc, - const typename DATASET_DESCRIPTOR_T::INDEX_T* const - neighbor_graph_ptr, // [dataset_size, graph_degree] - const std::uint32_t graph_degree, - const typename DATASET_DESCRIPTOR_T::DATA_T* query_ptr, // [num_queries, data_dim] - const std::uint32_t num_queries, - typename DATASET_DESCRIPTOR_T::INDEX_T* const - visited_hashmap_ptr, // [num_queries, 1 << hash_bitlen] - const std::uint32_t hash_bitlen, - typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr, // [num_queries, ldd] - typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr, // [num_queries, ldd] - const std::uint32_t ldd, // (*) ldd >= search_width * graph_degree + const IndexT* parent_node_list, // [num_queries, search_width] + IndexT* const parent_candidates_ptr, // [num_queries, search_width] + DistanceT* const parent_distance_ptr, // 
[num_queries, search_width] + std::size_t lds, + uint32_t search_width, + const dataset_descriptor_host& dataset_desc, + const IndexT* neighbor_graph_ptr, // [dataset_size, graph_degree] + std::uint32_t graph_degree, + const DataT* query_ptr, // [num_queries, data_dim] + std::uint32_t num_queries, + IndexT* visited_hashmap_ptr, // [num_queries, 1 << hash_bitlen] + std::uint32_t hash_bitlen, + IndexT* result_indices_ptr, // [num_queries, ldd] + DistanceT* result_distances_ptr, // [num_queries, ldd] + std::uint32_t ldd, // (*) ldd >= search_width * graph_degree SAMPLE_FILTER_T sample_filter, - const cuvs::distance::DistanceType metric, - cudaStream_t cuda_stream = 0) + cudaStream_t cuda_stream) { - const auto block_size = 128; - const dim3 grid_size( - (search_width * graph_degree + (block_size / TEAM_SIZE) - 1) / (block_size / TEAM_SIZE), - num_queries); - - const auto query_smem_buffer_length = - raft::ceildiv(dataset_desc.dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM; - - const auto smem_size = - query_smem_buffer_length * sizeof(float) + DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte; - - compute_distance_to_child_nodes_kernel - <<>>(parent_node_list, - parent_candidates_ptr, - parent_distance_ptr, - lds, - search_width, - dataset_desc, - neighbor_graph_ptr, - graph_degree, - query_ptr, - visited_hashmap_ptr, - hash_bitlen, - result_indices_ptr, - result_distances_ptr, - ldd, - sample_filter, - metric); + const auto block_size = 128; + const auto teams_per_block = block_size / dataset_desc.team_size; + const dim3 grid_size((search_width * graph_degree + teams_per_block - 1) / teams_per_block, + num_queries); + + compute_distance_to_child_nodes_kernel<<>>(parent_node_list, + parent_candidates_ptr, + parent_distance_ptr, + lds, + search_width, + dataset_desc.dev_ptr(), + neighbor_graph_ptr, + graph_degree, + query_ptr, + visited_hashmap_ptr, + hash_bitlen, + result_indices_ptr, + result_distances_ptr, + ldd, + sample_filter); } template @@ -639,49 +565,48 @@ void 
set_value_batch(T* const dev_ptr, // |<--- result_buffer_allocation_size --->| // |<--- result_buffer_size --->| // Double buffer (A) // |<--- result_buffer_size --->| // Double buffer (B) -template -struct search : search_plan_impl { - using DATA_T = typename DATASET_DESCRIPTOR_T::DATA_T; - using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; - using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; +template +struct search : search_plan_impl { + using base_type = search_plan_impl; + using DATA_T = typename base_type::DATA_T; + using INDEX_T = typename base_type::INDEX_T; + using DISTANCE_T = typename base_type::DISTANCE_T; static_assert(std::is_same_v, "Only float is supported as resulting distance"); - using search_plan_impl::max_queries; - using search_plan_impl::itopk_size; - using search_plan_impl::algo; - using search_plan_impl::team_size; - using search_plan_impl::search_width; - using search_plan_impl::min_iterations; - using search_plan_impl::max_iterations; - using search_plan_impl::thread_block_size; - using search_plan_impl::hashmap_mode; - using search_plan_impl::hashmap_min_bitlen; - using search_plan_impl::hashmap_max_fill_rate; - using search_plan_impl::num_random_samplings; - using search_plan_impl::rand_xor_mask; - - using search_plan_impl::dim; - using search_plan_impl::graph_degree; - using search_plan_impl::topk; - - using search_plan_impl::hash_bitlen; - - using search_plan_impl::small_hash_bitlen; - using search_plan_impl::small_hash_reset_interval; - using search_plan_impl::hashmap_size; - using search_plan_impl::dataset_size; - using search_plan_impl::result_buffer_size; - - using search_plan_impl::smem_size; - - using search_plan_impl::hashmap; - using search_plan_impl::num_executed_iterations; - using search_plan_impl::dev_seed; - using search_plan_impl::num_seeds; + using base_type::algo; + using base_type::hashmap_max_fill_rate; + using base_type::hashmap_min_bitlen; + using base_type::hashmap_mode; + using 
base_type::itopk_size; + using base_type::max_iterations; + using base_type::max_queries; + using base_type::min_iterations; + using base_type::num_random_samplings; + using base_type::rand_xor_mask; + using base_type::search_width; + using base_type::team_size; + using base_type::thread_block_size; + + using base_type::dim; + using base_type::graph_degree; + using base_type::topk; + + using base_type::hash_bitlen; + + using base_type::dataset_size; + using base_type::hashmap_size; + using base_type::result_buffer_size; + using base_type::small_hash_bitlen; + using base_type::small_hash_reset_interval; + + using base_type::smem_size; + + using base_type::dataset_desc; + using base_type::dev_seed; + using base_type::hashmap; + using base_type::num_executed_iterations; + using base_type::num_seeds; size_t result_buffer_allocation_size; rmm::device_uvector result_indices; // results_indices_buffer @@ -699,12 +624,11 @@ struct search : search_plan_impl { search(raft::resources const& res, search_params params, + const dataset_descriptor_host& dataset_desc, int64_t dim, int64_t graph_degree, - uint32_t topk, - cuvs::distance::DistanceType metric) - : search_plan_impl( - res, params, dim, graph_degree, topk, metric), + uint32_t topk) + : base_type(res, params, dataset_desc, dim, graph_degree, topk), result_indices(0, raft::resource::get_cuda_stream(res)), result_distances(0, raft::resource::get_cuda_stream(res)), parent_node_list(0, raft::resource::get_cuda_stream(res)), @@ -837,7 +761,6 @@ struct search : search_plan_impl { } void operator()(raft::resources const& res, - DATASET_DESCRIPTOR_T dataset_desc, raft::device_matrix_view graph, INDEX_T* const topk_indices_ptr, // [num_queries, topk] DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] @@ -865,21 +788,20 @@ struct search : search_plan_impl { } // Choose initial entry point candidates at random - random_pickup(dataset_desc, - queries_ptr, - num_queries, - result_buffer_size, - num_random_samplings, - 
rand_xor_mask, - dev_seed_ptr, - num_seeds, - result_indices.data(), - result_distances.data(), - result_buffer_allocation_size, - hashmap.data(), - hash_bitlen, - this->metric, - stream); + random_pickup(dataset_desc, + queries_ptr, + num_queries, + result_buffer_size, + num_random_samplings, + rand_xor_mask, + dev_seed_ptr, + num_seeds, + result_indices.data(), + result_distances.data(), + result_buffer_allocation_size, + hashmap.data(), + hash_bitlen, + stream); unsigned iter = 0; while (1) { @@ -931,7 +853,7 @@ struct search : search_plan_impl { } // Compute distance to child nodes that are adjacent to the parent node - compute_distance_to_child_nodes( + compute_distance_to_child_nodes( parent_node_list.data(), result_indices.data() + (1 - (iter & 0x1)) * result_buffer_size, result_distances.data() + (1 - (iter & 0x1)) * result_buffer_size, @@ -948,7 +870,6 @@ struct search : search_plan_impl { result_distances.data() + itopk_size, result_buffer_allocation_size, sample_filter, - this->metric, stream); iter++; @@ -1025,70 +946,5 @@ struct search : search_plan_impl { } }; -template -struct search, - SAMPLE_FILTER_T> - : public search_plan_impl, - SAMPLE_FILTER_T> { - using DATASET_DESCRIPTOR_T = cagra_q_dataset_descriptor_t; - using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; - using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; - - search(raft::resources const& res, - search_params params, - int64_t dim, - int64_t graph_degree, - uint32_t topk, - cuvs::distance::DistanceType metric) - : search_plan_impl( - res, params, dim, graph_degree, topk, metric) - { - THROW("The multi-kernel mode does not support VPQ"); - } - - void set_params(raft::resources const& res) {} - - void operator()(raft::resources const& res, - DATASET_DESCRIPTOR_T dataset_desc, - raft::device_matrix_view graph, - INDEX_T* const topk_indices_ptr, // [num_queries, topk] - DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] - const DATA_T* const queries_ptr, // 
[num_queries, dataset_dim] - const uint32_t num_queries, - const INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] - uint32_t* const num_executed_iterations, // [num_queries,] - uint32_t topk, - SAMPLE_FILTER_T sample_filter) - { - } -}; - } // namespace multi_kernel_search } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/search_plan.cuh b/cpp/src/neighbors/detail/cagra/search_plan.cuh index 0543224b3..16864ed19 100644 --- a/cpp/src/neighbors/detail/cagra/search_plan.cuh +++ b/cpp/src/neighbors/detail/cagra/search_plan.cuh @@ -18,10 +18,11 @@ #include "hashmap.hpp" +#include "compute_distance-ext.cuh" #include #include // #include "search_single_cta_inst.cuh" -// #include "topk_for_cagra/topk_core.cuh" +// #include "topk_for_cagra/topk.h" #include #include @@ -34,19 +35,12 @@ namespace cuvs::neighbors::cagra::detail { struct search_plan_impl_base : public search_params { - int64_t dataset_block_dim; int64_t dim; int64_t graph_degree; uint32_t topk; - cuvs::distance::DistanceType metric; - search_plan_impl_base(search_params params, - int64_t dim, - int64_t graph_degree, - uint32_t topk, - cuvs::distance::DistanceType metric) - : search_params(params), dim(dim), graph_degree(graph_degree), topk(topk), metric(metric) + search_plan_impl_base(search_params params, int64_t dim, int64_t graph_degree, uint32_t topk) + : search_params(params), dim(dim), graph_degree(graph_degree), topk(topk) { - set_dataset_block_and_team_size(dim); if (algo == search_algo::AUTO) { const size_t num_sm = raft::getMultiProcessorCount(); if (itopk_size <= 512 && search_params::max_queries >= num_sm * 2lu) { @@ -61,29 +55,13 @@ struct search_plan_impl_base : public search_params { } } } - - void set_dataset_block_and_team_size(int64_t dim) - { - constexpr int64_t max_dataset_block_dim = 512; - dataset_block_dim = 128; - while (dataset_block_dim < dim && dataset_block_dim < max_dataset_block_dim) { - dataset_block_dim *= 2; - } - // To keep binary size 
in check we limit only one team size specialization for each max_dim. - // TODO(tfeher): revise this decision. - switch (dataset_block_dim) { - case 128: team_size = 8; break; - case 256: team_size = 16; break; - default: team_size = 32; break; - } - } }; -template +template struct search_plan_impl : public search_plan_impl_base { - using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; - using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; - using DATA_T = typename DATASET_DESCRIPTOR_T::DATA_T; + using DATA_T = DataT; + using INDEX_T = IndexT; + using DISTANCE_T = DistanceT; int64_t hash_bitlen; @@ -100,23 +78,24 @@ struct search_plan_impl : public search_plan_impl_base { rmm::device_uvector hashmap; rmm::device_uvector num_executed_iterations; // device or managed? rmm::device_uvector dev_seed; + const dataset_descriptor_host& dataset_desc; search_plan_impl(raft::resources const& res, search_params params, + const dataset_descriptor_host& dataset_desc, int64_t dim, int64_t graph_degree, - uint32_t topk, - cuvs::distance::DistanceType metric) - : search_plan_impl_base(params, dim, graph_degree, topk, metric), + uint32_t topk) + : search_plan_impl_base(params, dim, graph_degree, topk), hashmap(0, raft::resource::get_cuda_stream(res)), num_executed_iterations(0, raft::resource::get_cuda_stream(res)), dev_seed(0, raft::resource::get_cuda_stream(res)), - num_seeds(0) + num_seeds(0), + dataset_desc(dataset_desc) { adjust_search_params(); check_params(); calc_hashmap_params(res); - set_dataset_block_and_team_size(dim); num_executed_iterations.resize(max_queries, raft::resource::get_cuda_stream(res)); RAFT_LOG_DEBUG("# algo = %d", static_cast(algo)); } @@ -124,7 +103,6 @@ struct search_plan_impl : public search_plan_impl_base { virtual ~search_plan_impl() {} virtual void operator()(raft::resources const& res, - DATASET_DESCRIPTOR_T dataset_desc, raft::device_matrix_view graph, INDEX_T* const result_indices_ptr, // [num_queries, topk] DISTANCE_T* const 
result_distances_ptr, // [num_queries, topk] @@ -160,6 +138,7 @@ struct search_plan_impl : public search_plan_impl_base { itopk32); itopk_size = itopk32; } + team_size = dataset_desc.team_size; } // defines hash_bitlen, small_hash_bitlen, small_hash_reset interval, hash_size @@ -292,10 +271,6 @@ struct search_plan_impl : public search_plan_impl_base { algo != search_algo::MULTI_KERNEL) { error_message += "An invalid kernel mode has been given: " + std::to_string((int)algo) + ""; } - if (team_size != 0 && team_size != 4 && team_size != 8 && team_size != 16 && team_size != 32) { - error_message += - "`team_size` must be 0, 4, 8, 16 or 32. " + std::to_string(team_size) + " has been given."; - } if (thread_block_size != 0 && thread_block_size != 64 && thread_block_size != 128 && thread_block_size != 256 && thread_block_size != 512 && thread_block_size != 1024) { error_message += "`thread_block_size` must be 0, 64, 128, 256 or 512. " + @@ -330,20 +305,4 @@ struct search_plan_impl : public search_plan_impl_base { } }; -// template -// struct search_plan { -// search_plan(raft::resources const& res, -// search_params param, -// int64_t dim, -// int64_t graph_degree) -// : plan(res, param, dim, graph_degree) -// { -// } -// void check(uint32_t topk) { plan.check(topk); } - -// // private: -// detail::search_plan_impl plan; -// }; -/** @} */ // end group cagra - } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh index 0a101cbfe..4abed6760 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh @@ -16,13 +16,13 @@ #pragma once #include "bitonic.hpp" -#include "compute_distance.hpp" +#include "compute_distance-ext.cuh" #include "device_common.hpp" #include "hashmap.hpp" #include "search_plan.cuh" #include "search_single_cta_kernel.cuh" #include "topk_by_radix.cuh" -#include 
"topk_for_cagra/topk_core.cuh" // TODO replace with raft topk +#include "topk_for_cagra/topk.h" // TODO replace with raft topk #include "utils.hpp" #include @@ -49,58 +49,56 @@ namespace cuvs::neighbors::cagra::detail { namespace single_cta_search { -template -struct search : search_plan_impl { - using DATA_T = typename DATASET_DESCRIPTOR_T::DATA_T; - using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; - using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; - - using search_plan_impl::max_queries; - using search_plan_impl::itopk_size; - using search_plan_impl::algo; - using search_plan_impl::team_size; - using search_plan_impl::search_width; - using search_plan_impl::min_iterations; - using search_plan_impl::max_iterations; - using search_plan_impl::thread_block_size; - using search_plan_impl::hashmap_mode; - using search_plan_impl::hashmap_min_bitlen; - using search_plan_impl::hashmap_max_fill_rate; - using search_plan_impl::num_random_samplings; - using search_plan_impl::rand_xor_mask; - - using search_plan_impl::dim; - using search_plan_impl::graph_degree; - using search_plan_impl::topk; - - using search_plan_impl::hash_bitlen; - - using search_plan_impl::small_hash_bitlen; - using search_plan_impl::small_hash_reset_interval; - using search_plan_impl::hashmap_size; - using search_plan_impl::dataset_size; - using search_plan_impl::result_buffer_size; - - using search_plan_impl::smem_size; - - using search_plan_impl::hashmap; - using search_plan_impl::num_executed_iterations; - using search_plan_impl::dev_seed; - using search_plan_impl::num_seeds; +template +struct search : search_plan_impl { + using base_type = search_plan_impl; + using DATA_T = typename base_type::DATA_T; + using INDEX_T = typename base_type::INDEX_T; + using DISTANCE_T = typename base_type::DISTANCE_T; + + using base_type::algo; + using base_type::hashmap_max_fill_rate; + using base_type::hashmap_min_bitlen; + using base_type::hashmap_mode; + using base_type::itopk_size; + using 
base_type::max_iterations; + using base_type::max_queries; + using base_type::min_iterations; + using base_type::num_random_samplings; + using base_type::rand_xor_mask; + using base_type::search_width; + using base_type::team_size; + using base_type::thread_block_size; + + using base_type::dim; + using base_type::graph_degree; + using base_type::topk; + + using base_type::hash_bitlen; + + using base_type::dataset_size; + using base_type::hashmap_size; + using base_type::result_buffer_size; + using base_type::small_hash_bitlen; + using base_type::small_hash_reset_interval; + + using base_type::smem_size; + + using base_type::dataset_desc; + using base_type::dev_seed; + using base_type::hashmap; + using base_type::num_executed_iterations; + using base_type::num_seeds; uint32_t num_itopk_candidates; search(raft::resources const& res, search_params params, + const dataset_descriptor_host& dataset_desc, int64_t dim, int64_t graph_degree, - uint32_t topk, - cuvs::distance::DistanceType metric) - : search_plan_impl( - res, params, dim, graph_degree, topk, metric) + uint32_t topk) + : base_type(res, params, dataset_desc, dim, graph_degree, topk) { set_params(res); } @@ -128,14 +126,11 @@ struct search : search_plan_impl { constexpr unsigned max_block_size = 1024; // const std::uint32_t topk_ws_size = 3; - const auto query_smem_buffer_length = - raft::ceildiv(dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM; const std::uint32_t base_smem_size = - sizeof(float) * query_smem_buffer_length + + dataset_desc.smem_ws_size_in_bytes + (sizeof(INDEX_T) + sizeof(DISTANCE_T)) * result_buffer_size_32 + sizeof(INDEX_T) * hashmap::get_size(small_hash_bitlen) + sizeof(INDEX_T) * search_width + - sizeof(std::uint32_t) * topk_ws_size + sizeof(std::uint32_t) + - DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte; + sizeof(std::uint32_t) * topk_ws_size + sizeof(std::uint32_t); smem_size = base_smem_size; if (num_itopk_candidates > 256) { // Tentatively calculate the required share memory size when 
radix @@ -212,7 +207,6 @@ struct search : search_plan_impl { } void operator()(raft::resources const& res, - DATASET_DESCRIPTOR_T dataset_desc, raft::device_matrix_view graph, INDEX_T* const result_indices_ptr, // [num_queries, topk] DISTANCE_T* const result_distances_ptr, // [num_queries, topk] @@ -224,28 +218,26 @@ struct search : search_plan_impl { SAMPLE_FILTER_T sample_filter) { cudaStream_t stream = raft::resource::get_cuda_stream(res); - select_and_run( - dataset_desc, - graph, - result_indices_ptr, - result_distances_ptr, - queries_ptr, - num_queries, - dev_seed_ptr, - num_executed_iterations, - *this, - topk, - num_itopk_candidates, - static_cast(thread_block_size), - smem_size, - hash_bitlen, - hashmap.data(), - small_hash_bitlen, - small_hash_reset_interval, - num_seeds, - sample_filter, - this->metric, - stream); + select_and_run(dataset_desc.dev_ptr(), + graph, + result_indices_ptr, + result_distances_ptr, + queries_ptr, + num_queries, + dev_seed_ptr, + num_executed_iterations, + *this, + topk, + num_itopk_candidates, + static_cast(thread_block_size), + smem_size, + hash_bitlen, + hashmap.data(), + small_hash_bitlen, + small_hash_reset_interval, + num_seeds, + sample_filter, + stream); } }; diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py index a361269a6..e37ceb1fa 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py @@ -39,8 +39,6 @@ #include "search_single_cta_inst.cuh" -#include "compute_distance.hpp" - namespace cuvs::neighbors::cagra::detail::single_cta_search { """ @@ -48,7 +46,6 @@ } // namespace cuvs::neighbors::cagra::detail::single_cta_search """ -mxdim_team = [(128, 8), (256, 16), (512, 32), (1024, 32)] # block = [(64, 16), (128, 8), (256, 4), (512, 2), (1024, 1)] # itopk_candidates = [64, 128, 256] # itopk_size = [64, 128, 256, 512] @@ -69,14 +66,13 @@ # 
knn for type_path, (data_t, idx_t, distance_t) in search_types.items(): - for (mxdim, team) in mxdim_team: - path = f"search_single_cta_{type_path}_dim{mxdim}_t{team}.cu" - with open(path, "w") as f: - f.write(header) - f.write( - f"instantiate_kernel_selection(\n {team}, {mxdim}, cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<{data_t} COMMA {idx_t} COMMA {distance_t}>, cuvs::neighbors::filtering::none_cagra_sample_filter);\n" - ) + path = f"search_single_cta_{type_path}.cu" + with open(path, "w") as f: + f.write(header) + f.write( + f"instantiate_kernel_selection(\n {data_t}, {idx_t}, {distance_t}, cuvs::neighbors::filtering::none_cagra_sample_filter);\n" + ) - f.write(trailer) - # For pasting into CMakeLists.txt - print(f"src/neighbors/detail/cagra/{path}") + f.write(trailer) + # For pasting into CMakeLists.txt + print(f"src/neighbors/detail/cagra/{path}") diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu rename to cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32.cu index c2cfb13c4..f8495bc01 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32.cu @@ -25,13 +25,10 @@ #include "search_single_cta_inst.cuh" -#include "compute_distance.hpp" - namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 8, - 128, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection(float, + uint32_t, + float, + cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu 
b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu deleted file mode 100644 index 4cf4a26f7..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 32, - 1024, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu deleted file mode 100644 index 692710476..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 16, - 256, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu deleted file mode 100644 index ed3a900ff..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 32, - 512, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim128_t8.cu rename to cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64.cu index 1e2b83492..0ef5c366f 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim128_t8.cu +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64.cu @@ -25,13 +25,10 @@ #include "search_single_cta_inst.cuh" -#include "compute_distance.hpp" - namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 8, - 128, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection(float, + uint64_t, + float, + cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu deleted file mode 100644 index 2c4da00db..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 32, - 1024, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu deleted file mode 100644 index 8b26a595f..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 8, - 128, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu deleted file mode 100644 index a93f893d4..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 16, - 256, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu deleted file mode 100644 index 4a7502e3e..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 32, - 512, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu rename to cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32.cu index 7d3e86f38..c21e6d1f4 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32.cu @@ -25,13 +25,10 @@ #include "search_single_cta_inst.cuh" -#include "compute_distance.hpp" - namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 8, - 128, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection(half, + uint32_t, + float, + cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim1024_t32.cu deleted file mode 100644 index 6c13df91a..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim1024_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 32, - 1024, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim512_t32.cu deleted file mode 100644 index 12aa72a24..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 32, - 512, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim256_t16.cu rename to cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64.cu index cfae9e367..b96ed0b22 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim256_t16.cu +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64.cu @@ -25,13 +25,10 @@ #include "search_single_cta_inst.cuh" -#include "compute_distance.hpp" - namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 16, - 256, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection(half, + uint64_t, + float, + cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu deleted file mode 100644 index 84a173d6d..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu 
+++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 32, - 1024, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu deleted file mode 100644 index d9c5198eb..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 16, - 256, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu deleted file mode 100644 index 3ba8f4e4d..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 32, - 512, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh index a4581d15e..26ca7b672 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh @@ -21,31 +21,27 @@ namespace cuvs::neighbors::cagra::detail::single_cta_search { -#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATASET_DESC_T, SAMPLE_FILTER_T) \ - template void select_and_run( \ - DATASET_DESC_T dataset_desc, \ - raft::device_matrix_view \ - graph, \ - typename DATASET_DESC_T::INDEX_T* const topk_indices_ptr, \ - typename DATASET_DESC_T::DISTANCE_T* const topk_distances_ptr, \ - const typename DATASET_DESC_T::DATA_T* const queries_ptr, \ - const uint32_t num_queries, \ - const typename DATASET_DESC_T::INDEX_T* dev_seed_ptr, \ - uint32_t* const num_executed_iterations, \ - const search_params& ps, \ - uint32_t topk, \ - uint32_t num_itopk_candidates, \ - uint32_t block_size, \ - uint32_t smem_size, \ - int64_t hash_bitlen, \ - typename DATASET_DESC_T::INDEX_T* hashmap_ptr, \ - size_t small_hash_bitlen, \ - size_t small_hash_reset_interval, \ - uint32_t num_seeds, \ - SAMPLE_FILTER_T sample_filter, \ - cuvs::distance::DistanceType metric, \ +#define instantiate_kernel_selection(DataT, IndexT, DistanceT, SampleFilterT) \ + template void select_and_run( \ + const 
dataset_descriptor_base_t* dataset_desc, \ + raft::device_matrix_view graph, \ + IndexT* topk_indices_ptr, \ + DistanceT* topk_distances_ptr, \ + const DataT* queries_ptr, \ + uint32_t num_queries, \ + const IndexT* dev_seed_ptr, \ + uint32_t* num_executed_iterations, \ + const search_params& ps, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + IndexT* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_seeds, \ + SampleFilterT sample_filter, \ cudaStream_t stream); -#define COMMA , - } // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu new file mode 100644 index 000000000..56a0d8ba9 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include "search_single_cta_inst.cuh" + +namespace cuvs::neighbors::cagra::detail::single_cta_search { +instantiate_kernel_selection(int8_t, + uint32_t, + float, + cuvs::neighbors::filtering::none_cagra_sample_filter); + +} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu deleted file mode 100644 index ad2ca16fc..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 32, - 1024, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu deleted file mode 100644 index 6130a84bc..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 8, - 128, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu deleted file mode 100644 index 1e7bee57c..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 16, - 256, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu deleted file mode 100644 index 7f789e3d0..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 32, - 512, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh deleted file mode 100644 index 10dda0389..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh +++ /dev/null @@ -1,588 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include -#include // RAFT_EXPLICIT - -#include - -namespace cuvs::neighbors::cagra::detail { -namespace single_cta_search { - -#ifdef _CUVS_EXPLICIT_INSTANTIATE_ONLY - -template -void select_and_run( - DATASET_DESCRIPTOR_T dataset_desc, - raft::device_matrix_view - graph, - typename DATASET_DESCRIPTOR_T::INDEX_T* const topk_indices_ptr, // [num_queries, topk] - typename DATASET_DESCRIPTOR_T::DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] - const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr, // [num_queries, dataset_dim] - const uint32_t num_queries, - const typename DATASET_DESCRIPTOR_T::INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] - uint32_t* const num_executed_iterations, // [num_queries,] - const search_params& ps, - uint32_t topk, - uint32_t num_itopk_candidates, - uint32_t block_size, // - uint32_t smem_size, - int64_t hash_bitlen, - typename DATASET_DESCRIPTOR_T::INDEX_T* hashmap_ptr, - size_t small_hash_bitlen, - size_t small_hash_reset_interval, - uint32_t num_seeds, - SAMPLE_FILTER_T sample_filter, - cuvs::distance::DistanceType metric, - cudaStream_t stream) RAFT_EXPLICIT; - -#endif // CUVS_EXPLICIT_INSTANTIATE_ONLY - -#define instantiate_single_cta_select_and_run( \ - TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T) \ - extern template void select_and_run< \ - TEAM_SIZE, \ - MAX_DATASET_DIM, \ - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, \ - SAMPLE_FILTER_T>( \ - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t \ - dataset, \ - raft::device_matrix_view graph, \ - INDEX_T* const topk_indices_ptr, \ - DISTANCE_T* const topk_distances_ptr, \ - const DATA_T* const queries_ptr, \ - const uint32_t num_queries, \ - const INDEX_T* dev_seed_ptr, \ - uint32_t* const num_executed_iterations, \ - const search_params& ps, \ - uint32_t topk, \ - uint32_t num_itopk_candidates, \ - uint32_t block_size, \ - uint32_t smem_size, \ - int64_t hash_bitlen, \ - 
INDEX_T* hashmap_ptr, \ - size_t small_hash_bitlen, \ - size_t small_hash_reset_interval, \ - uint32_t num_seeds, \ - SAMPLE_FILTER_T sample_filter, \ - cuvs::distance::DistanceType metric, \ - cudaStream_t stream); - -instantiate_single_cta_select_and_run( - 32, 1024, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 8, 128, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 16, 256, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 32, 512, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 32, 1024, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 8, 128, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 16, 256, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 32, 512, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 32, 1024, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 8, 128, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 16, 256, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 32, 512, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 32, 1024, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 8, 128, uint8_t, uint32_t, float, 
cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 16, 256, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 32, 512, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); - -#undef instantiate_single_cta_select_and_run - -#define instantiate_q_single_cta_select_and_run(TEAM_SIZE, \ - MAX_DATASET_DIM, \ - CODE_BOOK_T, \ - PQ_BITS, \ - PQ_CODE_BOOK_DIM, \ - DATA_T, \ - INDEX_T, \ - DISTANCE_T, \ - SAMPLE_FILTER_T) \ - extern template void \ - select_and_run, \ - SAMPLE_FILTER_T>( \ - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t dataset, \ - raft::device_matrix_view graph, \ - INDEX_T* const topk_indices_ptr, \ - DISTANCE_T* const topk_distances_ptr, \ - const DATA_T* const queries_ptr, \ - const uint32_t num_queries, \ - const INDEX_T* dev_seed_ptr, \ - uint32_t* const num_executed_iterations, \ - const search_params& ps, \ - uint32_t topk, \ - uint32_t num_itopk_candidates, \ - uint32_t block_size, \ - uint32_t smem_size, \ - int64_t hash_bitlen, \ - INDEX_T* hashmap_ptr, \ - size_t small_hash_bitlen, \ - size_t small_hash_reset_interval, \ - uint32_t num_seeds, \ - SAMPLE_FILTER_T sample_filter, \ - cuvs::distance::DistanceType metric, \ - cudaStream_t stream); - -instantiate_q_single_cta_select_and_run( - 8, 128, half, 8, 2, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 16, 256, half, 8, 2, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 32, 512, half, 8, 2, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 2, - half, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 8, 128, half, 8, 4, 
half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 16, 256, half, 8, 4, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 32, 512, half, 8, 4, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 4, - half, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -instantiate_q_single_cta_select_and_run( - 8, 128, half, 8, 2, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(16, - 256, - half, - 8, - 2, - float, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 512, - half, - 8, - 2, - float, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 2, - float, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 8, 128, half, 8, 4, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(16, - 256, - half, - 8, - 4, - float, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 512, - half, - 8, - 4, - float, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 4, - float, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -instantiate_q_single_cta_select_and_run( - 8, 128, half, 8, 2, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 16, 256, half, 8, 2, half, int64_t, float, 
cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 32, 512, half, 8, 2, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 32, 1024, half, 8, 2, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 8, 128, half, 8, 4, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 16, 256, half, 8, 4, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 32, 512, half, 8, 4, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 32, 1024, half, 8, 4, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); - -instantiate_q_single_cta_select_and_run( - 8, 128, half, 8, 2, float, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 16, 256, half, 8, 2, float, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 32, 512, half, 8, 2, float, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 2, - float, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 8, 128, half, 8, 4, float, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 16, 256, half, 8, 4, float, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 32, 512, half, 8, 4, float, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 4, - 
float, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -instantiate_q_single_cta_select_and_run(8, - 128, - half, - 8, - 2, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(16, - 256, - half, - 8, - 2, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 512, - half, - 8, - 2, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 2, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(8, - 128, - half, - 8, - 4, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(16, - 256, - half, - 8, - 4, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 512, - half, - 8, - 4, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 4, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -instantiate_q_single_cta_select_and_run(8, - 128, - half, - 8, - 2, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(16, - 256, - half, - 8, - 2, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 512, - half, - 8, - 2, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 2, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); 
-instantiate_q_single_cta_select_and_run(8, - 128, - half, - 8, - 4, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(16, - 256, - half, - 8, - 4, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 512, - half, - 8, - 4, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 4, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -instantiate_q_single_cta_select_and_run(8, - 128, - half, - 8, - 2, - uint8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(16, - 256, - half, - 8, - 2, - uint8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 512, - half, - 8, - 2, - uint8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 2, - uint8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(8, - 128, - half, - 8, - 4, - uint8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(16, - 256, - half, - 8, - 4, - uint8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 512, - half, - 8, - 4, - uint8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 4, - uint8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -instantiate_q_single_cta_select_and_run( - 8, 128, half, 8, 2, int8_t, int64_t, float, 
cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(16, - 256, - half, - 8, - 2, - int8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 512, - half, - 8, - 2, - int8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 2, - int8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 8, 128, half, 8, 4, int8_t, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(16, - 256, - half, - 8, - 4, - int8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 512, - half, - 8, - 4, - int8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 4, - int8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -#undef instantiate_q_single_cta_select_and_run - -} // namespace single_cta_search -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index a101cdc1f..d10313c5b 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -15,13 +15,15 @@ */ #pragma once +#include "search_single_cta_kernel.cuh" + #include "bitonic.hpp" -#include "compute_distance.hpp" +#include "compute_distance-ext.cuh" #include "device_common.hpp" #include "hashmap.hpp" #include "search_plan.cuh" #include "topk_by_radix.cuh" -#include "topk_for_cagra/topk_core.cuh" // TODO replace with raft topk +#include "topk_for_cagra/topk.h" // TODO replace with 
raft topk #include "utils.hpp" #include @@ -56,12 +58,11 @@ namespace single_cta_search { // #define _CLK_BREAKDOWN template -__device__ void pickup_next_parents(std::uint32_t* const terminate_flag, - INDEX_T* const next_parent_indices, - INDEX_T* const internal_topk_indices, - const std::size_t internal_topk_size, - const std::size_t dataset_size, - const std::uint32_t search_width) +RAFT_DEVICE_INLINE_FUNCTION void pickup_next_parents(std::uint32_t* const terminate_flag, + INDEX_T* const next_parent_indices, + INDEX_T* const internal_topk_indices, + const std::size_t internal_topk_size, + const std::uint32_t search_width) { constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask::value; // if (threadIdx.x >= 32) return; @@ -99,11 +100,12 @@ __device__ void pickup_next_parents(std::uint32_t* const terminate_flag, } template -__device__ inline void topk_by_bitonic_sort_1st(float* candidate_distances, // [num_candidates] - IdxT* candidate_indices, // [num_candidates] - const std::uint32_t num_candidates, - const std::uint32_t num_itopk, - unsigned MULTI_WARPS = 0) +RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_1st( + float* candidate_distances, // [num_candidates] + IdxT* candidate_indices, // [num_candidates] + const std::uint32_t num_candidates, + const std::uint32_t num_itopk, + unsigned MULTI_WARPS = 0) { const unsigned lane_id = threadIdx.x % 32; const unsigned warp_id = threadIdx.x / 32; @@ -202,15 +204,16 @@ __device__ inline void topk_by_bitonic_sort_1st(float* candidate_distances, // } template -__device__ inline void topk_by_bitonic_sort_2nd(float* itopk_distances, // [num_itopk] - IdxT* itopk_indices, // [num_itopk] - const std::uint32_t num_itopk, - float* candidate_distances, // [num_candidates] - IdxT* candidate_indices, // [num_candidates] - const std::uint32_t num_candidates, - std::uint32_t* work_buf, - const bool first, - unsigned MULTI_WARPS = 0) +RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_2nd( + float* itopk_distances, 
// [num_itopk] + IdxT* itopk_indices, // [num_itopk] + const std::uint32_t num_itopk, + float* candidate_distances, // [num_candidates] + IdxT* candidate_indices, // [num_candidates] + const std::uint32_t num_candidates, + std::uint32_t* work_buf, + const bool first, + unsigned MULTI_WARPS = 0) { const unsigned lane_id = threadIdx.x % 32; const unsigned warp_id = threadIdx.x / 32; @@ -410,16 +413,17 @@ __device__ inline void topk_by_bitonic_sort_2nd(float* itopk_distances, // [num template -__device__ void topk_by_bitonic_sort(float* itopk_distances, // [num_itopk] - IdxT* itopk_indices, // [num_itopk] - const std::uint32_t num_itopk, - float* candidate_distances, // [num_candidates] - IdxT* candidate_indices, // [num_candidates] - const std::uint32_t num_candidates, - std::uint32_t* work_buf, - const bool first, - const unsigned MULTI_WARPS_1, - const unsigned MULTI_WARPS_2) +RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort( + float* itopk_distances, // [num_itopk] + IdxT* itopk_indices, // [num_itopk] + const std::uint32_t num_itopk, + float* candidate_distances, // [num_candidates] + IdxT* candidate_indices, // [num_candidates] + const std::uint32_t num_candidates, + std::uint32_t* work_buf, + const bool first, + const unsigned MULTI_WARPS_1, + const unsigned MULTI_WARPS_2) { // The results in candidate_distances/indices are sorted by bitonic sort. 
topk_by_bitonic_sort_1st( @@ -439,11 +443,11 @@ __device__ void topk_by_bitonic_sort(float* itopk_distances, // [num_itopk] } template -__device__ inline void hashmap_restore(INDEX_T* const hashmap_ptr, - const size_t hashmap_bitlen, - const INDEX_T* itopk_indices, - const uint32_t itopk_size, - const uint32_t first_tid = 0) +RAFT_DEVICE_INLINE_FUNCTION void hashmap_restore(INDEX_T* const hashmap_ptr, + const size_t hashmap_bitlen, + const INDEX_T* itopk_indices, + const uint32_t itopk_size, + const uint32_t first_tid = 0) { constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask::value; if (threadIdx.x < first_tid) return; @@ -454,18 +458,16 @@ __device__ inline void hashmap_restore(INDEX_T* const hashmap_ptr, } // One query one thread block -template -__launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( +RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel( typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr, // [num_queries, top_k] typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr, // [num_queries, top_k] const std::uint32_t top_k, - DATASET_DESCRIPTOR_T dataset_desc, + const DATASET_DESCRIPTOR_T* dataset_desc, const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr, // [num_queries, dataset_dim] const typename DATASET_DESCRIPTOR_T::INDEX_T* const knn_graph, // [dataset_size, graph_degree] const std::uint32_t graph_degree, @@ -483,15 +485,13 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( const std::uint32_t hash_bitlen, const std::uint32_t small_hash_bitlen, const std::uint32_t small_hash_reset_interval, - SAMPLE_FILTER_T sample_filter, - cuvs::distance::DistanceType metric) + SAMPLE_FILTER_T sample_filter) { using LOAD_T = device::LOAD_128BIT_T; using DATA_T = typename DATASET_DESCRIPTOR_T::DATA_T; using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; - using QUERY_T = typename DATASET_DESCRIPTOR_T::QUERY_T; const auto query_id = 
blockIdx.y; @@ -512,7 +512,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( #endif _CLK_START(); - extern __shared__ std::uint32_t smem[]; + extern __shared__ uint8_t smem[]; // Layout of result_buffer // +----------------------+------------------------------+---------+ @@ -520,37 +520,28 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( // | | | upto 32 | // +----------------------+------------------------------+---------+ // |<--- result_buffer_size --->| - std::uint32_t result_buffer_size = internal_topk + (search_width * graph_degree); - std::uint32_t result_buffer_size_32 = result_buffer_size; - if (result_buffer_size % 32) { result_buffer_size_32 += 32 - (result_buffer_size % 32); } - const auto small_hash_size = hashmap::get_size(small_hash_bitlen); - - const auto query_smem_buffer_length = - raft::ceildiv(dataset_desc.dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM; - auto query_buffer = reinterpret_cast(smem); - auto result_indices_buffer = reinterpret_cast(query_buffer + query_smem_buffer_length); - auto result_distances_buffer = - reinterpret_cast(result_indices_buffer + result_buffer_size_32); - auto visited_hash_buffer = - reinterpret_cast(result_distances_buffer + result_buffer_size_32); - auto parent_list_buffer = reinterpret_cast(visited_hash_buffer + small_hash_size); - auto distance_work_buffer_ptr = - reinterpret_cast(parent_list_buffer + search_width); - auto topk_ws = reinterpret_cast(distance_work_buffer_ptr + - DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte); - auto terminate_flag = reinterpret_cast(topk_ws + 3); - auto smem_work_ptr = reinterpret_cast(terminate_flag + 1); + const auto result_buffer_size = internal_topk + (search_width * graph_degree); + const auto result_buffer_size_32 = raft::round_up_safe(result_buffer_size, 32); + const auto small_hash_size = hashmap::get_size(small_hash_bitlen); // Set smem working buffer for the distance calculation - dataset_desc.set_smem_ptr(distance_work_buffer_ptr); + dataset_desc = 
dataset_desc->setup_workspace(smem, queries_ptr, query_id); + + auto* __restrict__ result_indices_buffer = + reinterpret_cast(smem + dataset_desc->smem_ws_size_in_bytes()); + auto* __restrict__ result_distances_buffer = + reinterpret_cast(result_indices_buffer + result_buffer_size_32); + auto* __restrict__ visited_hash_buffer = + reinterpret_cast(result_distances_buffer + result_buffer_size_32); + auto* __restrict__ parent_list_buffer = + reinterpret_cast(visited_hash_buffer + small_hash_size); + auto* __restrict__ topk_ws = reinterpret_cast(parent_list_buffer + search_width); + auto* terminate_flag = reinterpret_cast(topk_ws + 3); + auto* __restrict__ smem_work_ptr = reinterpret_cast(terminate_flag + 1); // A flag for filtering. auto filter_flag = terminate_flag; - const DATA_T* const query_ptr = queries_ptr + query_id * dataset_desc.dim; - dataset_desc.template copy_query( - query_ptr, query_buffer, query_smem_buffer_length); - if (threadIdx.x == 0) { terminate_flag[0] = 0; topk_ws[0] = ~0u; @@ -570,18 +561,16 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( // compute distance to randomly selecting nodes _CLK_START(); const INDEX_T* const local_seed_ptr = seed_ptr ? 
seed_ptr + (num_seeds * query_id) : nullptr; - device::compute_distance_to_random_nodes(result_indices_buffer, - result_distances_buffer, - query_buffer, - dataset_desc, - result_buffer_size, - num_distilation, - rand_xor_mask, - local_seed_ptr, - num_seeds, - local_visited_hashmap_ptr, - hash_bitlen, - metric); + device::compute_distance_to_random_nodes(result_indices_buffer, + result_distances_buffer, + *dataset_desc, + result_buffer_size, + num_distilation, + rand_xor_mask, + local_seed_ptr, + num_seeds, + local_visited_hashmap_ptr, + hash_bitlen); __syncthreads(); _CLK_REC(clk_compute_1st_distance); @@ -666,7 +655,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( nullptr, topk_ws, true, - reinterpret_cast(smem_work_ptr)); + smem_work_ptr); _CLK_REC(clk_topk); // reset small-hash table @@ -683,12 +672,8 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( // pick up next parents if (threadIdx.x < 32) { _CLK_START(); - pickup_next_parents(terminate_flag, - parent_list_buffer, - result_indices_buffer, - internal_topk, - dataset_desc.size, - search_width); + pickup_next_parents( + terminate_flag, parent_list_buffer, result_indices_buffer, internal_topk, search_width); _CLK_REC(clk_pickup_parents); } @@ -706,20 +691,16 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( // compute the norms between child nodes and query node _CLK_START(); - constexpr unsigned max_n_frags = 8; - device::compute_distance_to_child_nodes( - result_indices_buffer + internal_topk, - result_distances_buffer + internal_topk, - query_buffer, - dataset_desc, - knn_graph, - graph_degree, - local_visited_hashmap_ptr, - hash_bitlen, - parent_list_buffer, - result_indices_buffer, - search_width, - metric); + device::compute_distance_to_child_nodes(result_indices_buffer + internal_topk, + result_distances_buffer + internal_topk, + *dataset_desc, + knn_graph, + graph_degree, + local_visited_hashmap_ptr, + hash_bitlen, + parent_list_buffer, + result_indices_buffer, + search_width); 
__syncthreads(); _CLK_REC(clk_compute_distance); @@ -815,50 +796,33 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( #endif } -template +template struct search_kernel_config { - using kernel_t = decltype(&search_kernel); + using kernel_t = decltype(&search_kernel<64, 64, 0, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>); template static auto choose_search_kernel(unsigned itopk_size) -> kernel_t { if (itopk_size <= 64) { - return search_kernel; } else if (itopk_size <= 128) { - return search_kernel; } else if (itopk_size <= 256) { - return search_kernel; } else if (itopk_size <= 512) { - return search_kernel; + return search_kernel<256, max_candidates, 0, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>; } else if (itopk_size <= 512) { - return search_kernel; + return search_kernel<512, max_candidates, 0, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>; } } THROW("No kernel for parametels itopk_size %u, num_itopk_candidates %u", @@ -905,40 +857,35 @@ struct search_kernel_config { } }; -template -void select_and_run( - DATASET_DESCRIPTOR_T dataset_desc, - raft::device_matrix_view - graph, - typename DATASET_DESCRIPTOR_T::INDEX_T* const topk_indices_ptr, // [num_queries, topk] - typename DATASET_DESCRIPTOR_T::DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] - const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr, // [num_queries, dataset_dim] - const uint32_t num_queries, - const typename DATASET_DESCRIPTOR_T::INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] - uint32_t* const num_executed_iterations, // [num_queries,] - const search_params& ps, - uint32_t topk, - uint32_t num_itopk_candidates, - uint32_t block_size, // - uint32_t smem_size, - int64_t hash_bitlen, - typename DATASET_DESCRIPTOR_T::INDEX_T* hashmap_ptr, - size_t small_hash_bitlen, - size_t small_hash_reset_interval, - uint32_t num_seeds, - SAMPLE_FILTER_T sample_filter, - cuvs::distance::DistanceType metric, - cudaStream_t stream) +template +void select_and_run(const dataset_descriptor_base_t* 
dataset_desc, + raft::device_matrix_view graph, + IndexT* topk_indices_ptr, // [num_queries, topk] + DistanceT* topk_distances_ptr, // [num_queries, topk] + const DataT* queries_ptr, // [num_queries, dataset_dim] + uint32_t num_queries, + const IndexT* dev_seed_ptr, // [num_queries, num_seeds] + uint32_t* num_executed_iterations, // [num_queries,] + const search_params& ps, + uint32_t topk, + uint32_t num_itopk_candidates, + uint32_t block_size, // + uint32_t smem_size, + int64_t hash_bitlen, + IndexT* hashmap_ptr, + size_t small_hash_bitlen, + size_t small_hash_reset_interval, + uint32_t num_seeds, + SampleFilterT sample_filter, + cudaStream_t stream) { auto kernel = - search_kernel_config:: - choose_itopk_and_mx_candidates(ps.itopk_size, num_itopk_candidates, block_size); - RAFT_CUDA_TRY(cudaFuncSetAttribute(kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - smem_size + DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte)); + search_kernel_config, + SampleFilterT>::choose_itopk_and_mx_candidates(ps.itopk_size, + num_itopk_candidates, + block_size); + RAFT_CUDA_TRY( + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); dim3 thread_dims(block_size, 1, 1); dim3 block_dims(1, num_queries, 1); RAFT_LOG_DEBUG( @@ -963,9 +910,9 @@ void select_and_run( hash_bitlen, small_hash_bitlen, small_hash_reset_interval, - sample_filter, - metric); - RAFT_CUDA_TRY(cudaPeekAtLastError()); + sample_filter); + // RAFT_CUDA_TRY(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } } // namespace single_cta_search } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh index 1ccec9219..7b7f44db7 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,10 +15,32 @@ */ #pragma once -#ifndef _CUVS_EXPLICIT_INSTANTIATE_ONLY -#include "search_single_cta_kernel-inl.cuh" -#endif +#include "compute_distance-ext.cuh" -#ifdef RAFT_COMPILED -#include "search_single_cta_kernel-ext.cuh" -#endif +#include + +namespace cuvs::neighbors::cagra::detail::single_cta_search { + +template +void select_and_run(const dataset_descriptor_base_t* dataset_desc, + raft::device_matrix_view graph, + IndexT* topk_indices_ptr, // [num_queries, topk] + DistanceT* topk_distances_ptr, // [num_queries, topk] + const DataT* queries_ptr, // [num_queries, dataset_dim] + uint32_t num_queries, + const IndexT* dev_seed_ptr, // [num_queries, num_seeds] + uint32_t* num_executed_iterations, // [num_queries,] + const search_params& ps, + uint32_t topk, + uint32_t num_itopk_candidates, + uint32_t block_size, // + uint32_t smem_size, + int64_t hash_bitlen, + IndexT* hashmap_ptr, + size_t small_hash_bitlen, + size_t small_hash_reset_interval, + uint32_t num_seeds, + SampleFilterT sample_filter, + cudaStream_t stream); + +} diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu new file mode 100644 index 000000000..ee6427170 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include "search_single_cta_inst.cuh" + +namespace cuvs::neighbors::cagra::detail::single_cta_search { +instantiate_kernel_selection(uint8_t, + uint32_t, + float, + cuvs::neighbors::filtering::none_cagra_sample_filter); + +} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu deleted file mode 100644 index 35e04ea6a..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 32, - 1024, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu deleted file mode 100644 index 614e6ca01..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 8, - 128, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu deleted file mode 100644 index 005afb566..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 16, - 256, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu deleted file mode 100644 index af30b2e24..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 32, - 512, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/topk_by_radix.cuh b/cpp/src/neighbors/detail/cagra/topk_by_radix.cuh index 67173026b..b6f97cb26 100644 --- a/cpp/src/neighbors/detail/cagra/topk_by_radix.cuh +++ b/cpp/src/neighbors/detail/cagra/topk_by_radix.cuh @@ -32,17 +32,17 @@ struct topk_by_radix_sort : topk_by_radix_sort_base {}; template struct topk_by_radix_sort> : topk_by_radix_sort_base { - __device__ void operator()(uint32_t topk, - uint32_t batch_size, - uint32_t len_x, - const uint32_t* _x, - const IdxT* _in_vals, - uint32_t* _y, - IdxT* _out_vals, - uint32_t* work, - uint32_t* _hints, - bool sort, - uint32_t* _smem) + RAFT_DEVICE_INLINE_FUNCTION void operator()(uint32_t topk, + uint32_t batch_size, + uint32_t len_x, + const uint32_t* _x, + const IdxT* _in_vals, + uint32_t* _y, + IdxT* _out_vals, + uint32_t* work, + uint32_t* _hints, + bool sort, + uint32_t* _smem) { std::uint8_t* const state = reinterpret_cast(work); topk_cta_11_core::state_bit_lenght, @@ -60,17 +60,17 @@ struct topk_by_radix_sort V))>> \ : topk_by_radix_sort_base { \ - __device__ void operator()(uint32_t topk, \ - uint32_t batch_size, \ - uint32_t len_x, \ - const uint32_t* _x, \ - const IdxT* _in_vals, \ - uint32_t* _y, \ - IdxT* _out_vals, \ - uint32_t* work, \ - uint32_t* _hints, \ - bool sort, \ - uint32_t* _smem) \ + RAFT_DEVICE_INLINE_FUNCTION void operator()(uint32_t topk, \ + uint32_t batch_size, \ + uint32_t len_x, \ + 
const uint32_t* _x, \ + const IdxT* _in_vals, \ + uint32_t* _y, \ + IdxT* _out_vals, \ + uint32_t* work, \ + uint32_t* _hints, \ + bool sort, \ + uint32_t* _smem) \ { \ assert(blockDim.x >= V / 4); \ std::uint8_t* state = (std::uint8_t*)work; \ diff --git a/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk.cu b/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk.cu new file mode 100644 index 000000000..72ff2cb85 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk.cu @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "topk_core.cuh" + +namespace cuvs::neighbors::cagra::detail { + +// +size_t _cuann_find_topk_bufferSize(uint32_t topK, + uint32_t sizeBatch, + uint32_t numElements, + cudaDataType_t sampleDtype) +{ + constexpr int numThreads = NUM_THREADS; + constexpr int stateBitLen = STATE_BIT_LENGTH; + assert(stateBitLen == 0 || stateBitLen == 8); + + size_t workspaceSize = 1; + // state + if (stateBitLen == 8) { + workspaceSize = _cuann_aligned( + sizeof(uint8_t) * get_state_size(numElements) * sizeBatch); + } + + return workspaceSize; +} + +template +void _cuann_find_topk(uint32_t topK, + uint32_t sizeBatch, + uint32_t numElements, + const float* inputKeys, // [sizeBatch, ldIK,] + uint32_t ldIK, // (*) ldIK >= numElements + const ValT* inputVals, // [sizeBatch, ldIV,] + uint32_t ldIV, // (*) ldIV >= numElements + float* outputKeys, // [sizeBatch, ldOK,] + uint32_t ldOK, // (*) ldOK >= topK + ValT* outputVals, // [sizeBatch, ldOV,] + uint32_t ldOV, // (*) ldOV >= topK + void* workspace, + bool sort, + uint32_t* hints, + cudaStream_t stream) +{ + assert(ldIK >= numElements); + assert(ldIV >= numElements); + assert(ldOK >= topK); + assert(ldOV >= topK); + + constexpr int numThreads = NUM_THREADS; + constexpr int stateBitLen = STATE_BIT_LENGTH; + assert(stateBitLen == 0 || stateBitLen == 8); + + uint8_t* state = NULL; + if (stateBitLen == 8) { state = (uint8_t*)workspace; } + + dim3 threads(numThreads, 1, 1); + dim3 blocks(sizeBatch, 1, 1); + + void (*cta_kernel)(uint32_t, + uint32_t, + uint32_t, + const uint32_t*, + uint32_t, + const ValT*, + uint32_t, + uint32_t*, + uint32_t, + ValT*, + uint32_t, + uint8_t*, + uint32_t*, + bool) = nullptr; + + // V:vecLen, K:maxTopk, T:numSortThreads +#define SET_KERNEL_VKT(V, K, T, ValT) \ + do { \ + assert(numThreads >= T); \ + assert((K % T) == 0); \ + assert((K / T) <= 4); \ + cta_kernel = kern_topk_cta_11; \ + } while (0) + + // V: vecLen +#define SET_KERNEL_V(V, ValT) \ + do { \ + if (topK <= 32) { \ + SET_KERNEL_VKT(V, 
32, 32, ValT); \ + } else if (topK <= 64) { \ + SET_KERNEL_VKT(V, 64, 32, ValT); \ + } else if (topK <= 96) { \ + SET_KERNEL_VKT(V, 96, 32, ValT); \ + } else if (topK <= 128) { \ + SET_KERNEL_VKT(V, 128, 32, ValT); \ + } else if (topK <= 192) { \ + SET_KERNEL_VKT(V, 192, 64, ValT); \ + } else if (topK <= 256) { \ + SET_KERNEL_VKT(V, 256, 64, ValT); \ + } else if (topK <= 384) { \ + SET_KERNEL_VKT(V, 384, 128, ValT); \ + } else if (topK <= 512) { \ + SET_KERNEL_VKT(V, 512, 128, ValT); \ + } else if (topK <= 768) { \ + SET_KERNEL_VKT(V, 768, 256, ValT); \ + } else if (topK <= 1024) { \ + SET_KERNEL_VKT(V, 1024, 256, ValT); \ + } \ + /* else if (topK <= 1536) { SET_KERNEL_VKT(V, 1536, 512); } */ \ + /* else if (topK <= 2048) { SET_KERNEL_VKT(V, 2048, 512); } */ \ + /* else if (topK <= 3072) { SET_KERNEL_VKT(V, 3072, 1024); } */ \ + /* else if (topK <= 4096) { SET_KERNEL_VKT(V, 4096, 1024); } */ \ + else { \ + RAFT_FAIL("topk must be lower than or equal to 1024"); \ + } \ + } while (0) + + int _vecLen = _get_vecLen(ldIK, 2); + if (_vecLen == 2) { + SET_KERNEL_V(2, ValT); + } else if (_vecLen == 1) { + SET_KERNEL_V(1, ValT); + } + + cta_kernel<<>>(topK, + sizeBatch, + numElements, + (const uint32_t*)inputKeys, + ldIK, + inputVals, + ldIV, + (uint32_t*)outputKeys, + ldOK, + outputVals, + ldOV, + state, + hints, + sort); + + return; +} + +template void _cuann_find_topk(uint32_t topK, + uint32_t sizeBatch, + uint32_t numElements, + const float* inputKeys, // [sizeBatch, ldIK,] + uint32_t ldIK, // (*) ldIK >= numElements + const uint32_t* inputVals, // [sizeBatch, ldIV,] + uint32_t ldIV, // (*) ldIV >= numElements + float* outputKeys, // [sizeBatch, ldOK,] + uint32_t ldOK, // (*) ldOK >= topK + uint32_t* outputVals, // [sizeBatch, ldOV,] + uint32_t ldOV, // (*) ldOV >= topK + void* workspace, + bool sort, + uint32_t* hint, + cudaStream_t stream); + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh 
b/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh index cbf99a556..65f9cfade 100644 --- a/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh +++ b/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh @@ -14,10 +14,15 @@ * limitations under the License. */ #pragma once + +#include "../utils.hpp" #include "topk.h" #include +#include +#include + #include #include #include @@ -25,7 +30,7 @@ namespace cuvs::neighbors::cagra::detail { // -__device__ inline uint32_t convert(uint32_t x) +RAFT_DEVICE_INLINE_FUNCTION constexpr uint32_t convert(uint32_t x) { if (x & 0x80000000) { return x ^ 0xffffffff; @@ -35,7 +40,7 @@ __device__ inline uint32_t convert(uint32_t x) } // -__device__ inline uint16_t convert(uint16_t x) +RAFT_DEVICE_INLINE_FUNCTION constexpr uint16_t convert(uint16_t x) { if (x & 0x8000) { return x ^ 0xffff; @@ -62,7 +67,7 @@ struct u16_vector { // template -__device__ inline void load_u32_vector(struct u32_vector& vec, const uint32_t* x, int i) +RAFT_DEVICE_INLINE_FUNCTION void load_u32_vector(struct u32_vector& vec, const uint32_t* x, int i) { if (vecLen == 1) { vec.x1 = ((uint1*)(x + i))[0]; @@ -77,7 +82,7 @@ __device__ inline void load_u32_vector(struct u32_vector& vec, const uint32_t* x // template -__device__ inline void load_u16_vector(struct u16_vector& vec, const uint16_t* x, int i) +RAFT_DEVICE_INLINE_FUNCTION void load_u16_vector(struct u16_vector& vec, const uint16_t* x, int i) { if (vecLen == 1) { vec.x1 = ((ushort1*)(x + i))[0]; @@ -92,7 +97,7 @@ __device__ inline void load_u16_vector(struct u16_vector& vec, const uint16_t* x // template -__device__ inline uint32_t get_element_from_u32_vector(struct u32_vector& vec, int i) +RAFT_DEVICE_INLINE_FUNCTION uint32_t get_element_from_u32_vector(struct u32_vector& vec, int i) { uint32_t xi; if (vecLen == 1) { @@ -134,7 +139,7 @@ __device__ inline uint32_t get_element_from_u32_vector(struct u32_vector& vec, i // template -__device__ inline uint16_t 
get_element_from_u16_vector(struct u16_vector& vec, int i) +RAFT_DEVICE_INLINE_FUNCTION uint16_t get_element_from_u16_vector(struct u16_vector& vec, int i) { uint16_t xi; if (vecLen == 1) { @@ -175,7 +180,7 @@ __device__ inline uint16_t get_element_from_u16_vector(struct u16_vector& vec, i } template -__device__ inline void block_scan(const T input, T& output) +RAFT_DEVICE_INLINE_FUNCTION void block_scan(const T input, T& output) { switch (blockDim.x) { case 32: { @@ -214,19 +219,19 @@ __device__ inline void block_scan(const T input, T& output) // template -__device__ inline void update_histogram(int itr, - uint32_t thread_id, - uint32_t num_threads, - uint32_t hint, - uint32_t threshold, - uint32_t& num_bins, - uint32_t& shift, - const T* x, // [nx,] - uint32_t nx, - uint32_t* hist, // [num_bins] - uint8_t* state, - uint32_t* output, // [topk] - uint32_t* output_count) +RAFT_DEVICE_INLINE_FUNCTION void update_histogram(int itr, + uint32_t thread_id, + uint32_t num_threads, + uint32_t hint, + uint32_t threshold, + uint32_t& num_bins, + uint32_t& shift, + const T* x, // [nx,] + uint32_t nx, + uint32_t* hist, // [num_bins] + uint8_t* state, + uint32_t* output, // [topk] + uint32_t* output_count) { if (sizeof(T) == 4) { // 32-bit (uint32_t) @@ -324,15 +329,16 @@ __device__ inline void update_histogram(int itr, } template -__device__ inline void select_best_index_for_next_threshold_core(uint32_t& my_index, - uint32_t& my_csum, - const unsigned num_bins, - const uint32_t* const hist, - const uint32_t nx_below_threshold, - const uint32_t max_threshold, - const uint32_t threshold, - const uint32_t shift, - const uint32_t topk) +RAFT_DEVICE_INLINE_FUNCTION void select_best_index_for_next_threshold_core( + uint32_t& my_index, + uint32_t& my_csum, + const unsigned num_bins, + const uint32_t* const hist, + const uint32_t nx_below_threshold, + const uint32_t max_threshold, + const uint32_t threshold, + const uint32_t shift, + const uint32_t topk) { typedef cub::BlockScan 
BlockScanT; __shared__ typename BlockScanT::TempStorage temp_storage; @@ -370,7 +376,7 @@ __device__ inline void select_best_index_for_next_threshold_core(uint32_t& my_in } // -__device__ inline void select_best_index_for_next_threshold( +RAFT_DEVICE_INLINE_FUNCTION void select_best_index_for_next_threshold( const uint32_t topk, const uint32_t threshold, const uint32_t max_threshold, @@ -469,17 +475,17 @@ __device__ inline void select_best_index_for_next_threshold( // template -__device__ inline void output_index_below_threshold(const uint32_t topk, - const uint32_t thread_id, - const uint32_t num_threads, - const uint32_t threshold, - const uint32_t nx_below_threshold, - const T* const x, // [nx,] - const uint32_t nx, - const uint8_t* state, - uint32_t* const output, // [topk] - uint32_t* const output_count, - uint32_t* const output_count_eq) +RAFT_DEVICE_INLINE_FUNCTION void output_index_below_threshold(const uint32_t topk, + const uint32_t thread_id, + const uint32_t num_threads, + const uint32_t threshold, + const uint32_t nx_below_threshold, + const T* const x, // [nx,] + const uint32_t nx, + const uint8_t* state, + uint32_t* const output, // [topk] + uint32_t* const output_count, + uint32_t* const output_count_eq) { int ii = 0; for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { @@ -530,7 +536,7 @@ __device__ inline void output_index_below_threshold(const uint32_t topk, // template -__device__ inline void swap(T& val1, T& val2) +RAFT_DEVICE_INLINE_FUNCTION constexpr void swap(T& val1, T& val2) { const T val0 = val1; val1 = val2; @@ -539,7 +545,7 @@ __device__ inline void swap(T& val1, T& val2) // template -__device__ inline bool swap_if_needed(K& key1, K& key2) +RAFT_DEVICE_INLINE_FUNCTION constexpr bool swap_if_needed(K& key1, K& key2) { if (key1 > key2) { swap(key1, key2); @@ -550,7 +556,7 @@ __device__ inline bool swap_if_needed(K& key1, K& key2) // template -__device__ inline bool swap_if_needed(K& key1, K& key2, 
V& val1, V& val2) +RAFT_DEVICE_INLINE_FUNCTION constexpr bool swap_if_needed(K& key1, K& key2, V& val1, V& val2) { if (key1 > key2) { swap(key1, key2); @@ -562,7 +568,8 @@ __device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2) // template -__device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2, bool ascending) +RAFT_DEVICE_INLINE_FUNCTION constexpr bool swap_if_needed( + K& key1, K& key2, V& val1, V& val2, bool ascending) { if (key1 == key2) { return false; } if ((key1 > key2) == ascending) { @@ -575,20 +582,20 @@ __device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2, bool a // template -__device__ inline T max_value_of(); +RAFT_DEVICE_INLINE_FUNCTION T max_value_of(); template <> -__device__ inline float max_value_of() +RAFT_DEVICE_INLINE_FUNCTION float max_value_of() { return FLT_MAX; } template <> -__device__ inline uint32_t max_value_of() +RAFT_DEVICE_INLINE_FUNCTION uint32_t max_value_of() { return ~0u; } template -__device__ __host__ inline uint32_t get_state_size(uint32_t len_x) +RAFT_INLINE_FUNCTION constexpr uint32_t get_state_size(uint32_t len_x) { #ifdef __CUDA_ARCH__ const uint32_t num_threads = blockDim.x; @@ -605,16 +612,16 @@ __device__ __host__ inline uint32_t get_state_size(uint32_t len_x) // template -__device__ inline void topk_cta_11_core(uint32_t topk, - uint32_t len_x, - const uint32_t* _x, // [size_batch, ld_x,] - const ValT* _in_vals, // [size_batch, ld_iv,] - uint32_t* _y, // [size_batch, ld_y,] - ValT* _out_vals, // [size_batch, ld_ov,] - uint8_t* _state, // [size_batch, ...,] - uint32_t* _hint, - bool sort, - uint32_t* _smem) +RAFT_DEVICE_INLINE_FUNCTION void topk_cta_11_core(uint32_t topk, + uint32_t len_x, + const uint32_t* _x, // [size_batch, ld_x,] + const ValT* _in_vals, // [size_batch, ld_iv,] + uint32_t* _y, // [size_batch, ld_y,] + ValT* _out_vals, // [size_batch, ld_ov,] + uint8_t* _state, // [size_batch, ...,] + uint32_t* _hint, + bool sort, + uint32_t* _smem) { uint32_t* 
const smem_out_vals = _smem; uint32_t* const hist = &(_smem[2 * maxTopk]); @@ -904,137 +911,4 @@ __launch_bounds__(1024, 1) RAFT_KERNEL _smem); } -// -size_t inline _cuann_find_topk_bufferSize(uint32_t topK, - uint32_t sizeBatch, - uint32_t numElements, - cudaDataType_t sampleDtype) -{ - constexpr int numThreads = NUM_THREADS; - constexpr int stateBitLen = STATE_BIT_LENGTH; - assert(stateBitLen == 0 || stateBitLen == 8); - - size_t workspaceSize = 1; - // state - if (stateBitLen == 8) { - workspaceSize = _cuann_aligned( - sizeof(uint8_t) * get_state_size(numElements) * sizeBatch); - } - - return workspaceSize; -} - -template -inline void _cuann_find_topk(uint32_t topK, - uint32_t sizeBatch, - uint32_t numElements, - const float* inputKeys, // [sizeBatch, ldIK,] - uint32_t ldIK, // (*) ldIK >= numElements - const ValT* inputVals, // [sizeBatch, ldIV,] - uint32_t ldIV, // (*) ldIV >= numElements - float* outputKeys, // [sizeBatch, ldOK,] - uint32_t ldOK, // (*) ldOK >= topK - ValT* outputVals, // [sizeBatch, ldOV,] - uint32_t ldOV, // (*) ldOV >= topK - void* workspace, - bool sort, - uint32_t* hints, - cudaStream_t stream) -{ - assert(ldIK >= numElements); - assert(ldIV >= numElements); - assert(ldOK >= topK); - assert(ldOV >= topK); - - constexpr int numThreads = NUM_THREADS; - constexpr int stateBitLen = STATE_BIT_LENGTH; - assert(stateBitLen == 0 || stateBitLen == 8); - - uint8_t* state = NULL; - if (stateBitLen == 8) { state = (uint8_t*)workspace; } - - dim3 threads(numThreads, 1, 1); - dim3 blocks(sizeBatch, 1, 1); - - void (*cta_kernel)(uint32_t, - uint32_t, - uint32_t, - const uint32_t*, - uint32_t, - const ValT*, - uint32_t, - uint32_t*, - uint32_t, - ValT*, - uint32_t, - uint8_t*, - uint32_t*, - bool) = nullptr; - - // V:vecLen, K:maxTopk, T:numSortThreads -#define SET_KERNEL_VKT(V, K, T, ValT) \ - do { \ - assert(numThreads >= T); \ - assert((K % T) == 0); \ - assert((K / T) <= 4); \ - cta_kernel = kern_topk_cta_11; \ - } while (0) - - // V: vecLen 
-#define SET_KERNEL_V(V, ValT) \ - do { \ - if (topK <= 32) { \ - SET_KERNEL_VKT(V, 32, 32, ValT); \ - } else if (topK <= 64) { \ - SET_KERNEL_VKT(V, 64, 32, ValT); \ - } else if (topK <= 96) { \ - SET_KERNEL_VKT(V, 96, 32, ValT); \ - } else if (topK <= 128) { \ - SET_KERNEL_VKT(V, 128, 32, ValT); \ - } else if (topK <= 192) { \ - SET_KERNEL_VKT(V, 192, 64, ValT); \ - } else if (topK <= 256) { \ - SET_KERNEL_VKT(V, 256, 64, ValT); \ - } else if (topK <= 384) { \ - SET_KERNEL_VKT(V, 384, 128, ValT); \ - } else if (topK <= 512) { \ - SET_KERNEL_VKT(V, 512, 128, ValT); \ - } else if (topK <= 768) { \ - SET_KERNEL_VKT(V, 768, 256, ValT); \ - } else if (topK <= 1024) { \ - SET_KERNEL_VKT(V, 1024, 256, ValT); \ - } \ - /* else if (topK <= 1536) { SET_KERNEL_VKT(V, 1536, 512); } */ \ - /* else if (topK <= 2048) { SET_KERNEL_VKT(V, 2048, 512); } */ \ - /* else if (topK <= 3072) { SET_KERNEL_VKT(V, 3072, 1024); } */ \ - /* else if (topK <= 4096) { SET_KERNEL_VKT(V, 4096, 1024); } */ \ - else { \ - RAFT_FAIL("topk must be lower than or equal to 1024"); \ - } \ - } while (0) - - int _vecLen = _get_vecLen(ldIK, 2); - if (_vecLen == 2) { - SET_KERNEL_V(2, ValT); - } else if (_vecLen == 1) { - SET_KERNEL_V(1, ValT); - } - - cta_kernel<<>>(topK, - sizeBatch, - numElements, - (const uint32_t*)inputKeys, - ldIK, - inputVals, - ldIV, - (uint32_t*)outputKeys, - ldOK, - outputVals, - ldOV, - state, - hints, - sort); - - return; -} } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp index 8ce20ec5c..0f8309328 100644 --- a/cpp/src/neighbors/detail/cagra/utils.hpp +++ b/cpp/src/neighbors/detail/cagra/utils.hpp @@ -125,24 +125,24 @@ union fp_conv { FP_T fp; }; template -_RAFT_HOST_DEVICE inline T get_max_value(); +_RAFT_HOST_DEVICE constexpr inline T get_max_value(); template <> -_RAFT_HOST_DEVICE inline float get_max_value() +_RAFT_HOST_DEVICE constexpr inline float get_max_value() { return 
FLT_MAX; }; template <> -_RAFT_HOST_DEVICE inline half get_max_value() +_RAFT_HOST_DEVICE constexpr inline half get_max_value() { return fp_conv{.bs = 0x7aff}.fp; }; template <> -_RAFT_HOST_DEVICE inline std::uint32_t get_max_value() +_RAFT_HOST_DEVICE constexpr inline std::uint32_t get_max_value() { return 0xffffffffu; }; template <> -_RAFT_HOST_DEVICE inline std::uint64_t get_max_value() +_RAFT_HOST_DEVICE constexpr inline std::uint64_t get_max_value() { return 0xfffffffffffffffflu; }; diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh index 9d2f9c175..4ce0849fd 100644 --- a/cpp/test/neighbors/ann_cagra.cuh +++ b/cpp/test/neighbors/ann_cagra.cuh @@ -706,7 +706,7 @@ inline std::vector generate_inputs() {graph_build_algo::IVF_PQ, graph_build_algo::NN_DESCENT}, {search_algo::AUTO}, {10}, - {0, 4, 8, 16, 32}, // team_size + {0, 8, 16, 32}, // team_size {64}, {1}, {cuvs::distance::DistanceType::L2Expanded},