From 0dbe5b254bb094c3ac6520f043854b792bf36482 Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 16 Aug 2024 11:30:50 +0200 Subject: [PATCH 01/41] [WIP] CAGRA - separable compilation for distance computation --- cpp/CMakeLists.txt | 218 ++++--- cpp/src/neighbors/detail/ann_utils.cuh | 16 +- .../neighbors/detail/cagra/cagra_search.cuh | 156 ++--- .../detail/cagra/compute_distance.hpp | 443 ++++++++++--- .../detail/cagra/compute_distance_vpq.cuh | 448 +++++++++++-- .../detail/cagra/distance_core-ext.cuh | 103 +++ .../detail/cagra/distance_core-impl.cuh | 41 ++ ...nerate.py => distance_core_00_generate.py} | 49 +- .../distance_core_float_uint32_dim1024_t32.cu | 32 + .../distance_core_float_uint32_dim128_t8.cu | 32 + .../distance_core_float_uint32_dim256_t16.cu | 32 + .../distance_core_float_uint32_dim512_t32.cu | 32 + .../distance_core_float_uint64_dim1024_t32.cu | 32 + .../distance_core_float_uint64_dim128_t8.cu | 32 + .../distance_core_float_uint64_dim256_t16.cu | 32 + .../distance_core_float_uint64_dim512_t32.cu | 32 + .../distance_core_half_uint32_dim1024_t32.cu | 32 + .../distance_core_half_uint32_dim128_t8.cu | 32 + .../distance_core_half_uint32_dim256_t16.cu | 32 + .../distance_core_half_uint32_dim512_t32.cu | 32 + .../distance_core_half_uint64_dim1024_t32.cu | 32 + .../distance_core_half_uint64_dim128_t8.cu | 32 + .../distance_core_half_uint64_dim256_t16.cu | 32 + .../distance_core_half_uint64_dim512_t32.cu | 32 + .../distance_core_int8_uint32_dim1024_t32.cu | 32 + .../distance_core_int8_uint32_dim128_t8.cu | 32 + .../distance_core_int8_uint32_dim256_t16.cu | 32 + .../distance_core_int8_uint32_dim512_t32.cu | 32 + .../distance_core_uint8_uint32_dim1024_t32.cu | 32 + .../distance_core_uint8_uint32_dim128_t8.cu | 32 + .../distance_core_uint8_uint32_dim256_t16.cu | 32 + .../distance_core_uint8_uint32_dim512_t32.cu | 32 + ...loat_uint32_dim1024_t32_8pq_2subd_half.cu} | 16 +- ...loat_uint32_dim1024_t32_8pq_4subd_half.cu} | 16 +- 
..._float_uint32_dim128_t8_8pq_2subd_half.cu} | 16 +- ..._float_uint32_dim128_t8_8pq_4subd_half.cu} | 16 +- ..._float_uint32_dim256_t16_8pq_2subd_half.cu | 32 + ..._float_uint32_dim256_t16_8pq_4subd_half.cu | 32 + ..._float_uint32_dim512_t32_8pq_2subd_half.cu | 32 + ..._float_uint32_dim512_t32_8pq_4subd_half.cu | 32 + ...float_uint64_dim1024_t32_8pq_2subd_half.cu | 32 + ...float_uint64_dim1024_t32_8pq_4subd_half.cu | 32 + ...q_float_uint64_dim128_t8_8pq_2subd_half.cu | 32 + ...q_float_uint64_dim128_t8_8pq_4subd_half.cu | 32 + ..._float_uint64_dim256_t16_8pq_2subd_half.cu | 32 + ..._float_uint64_dim256_t16_8pq_4subd_half.cu | 32 + ..._float_uint64_dim512_t32_8pq_2subd_half.cu | 32 + ..._float_uint64_dim512_t32_8pq_4subd_half.cu | 32 + ..._half_uint32_dim1024_t32_8pq_2subd_half.cu | 32 + ..._half_uint32_dim1024_t32_8pq_4subd_half.cu | 32 + ...pq_half_uint32_dim128_t8_8pq_2subd_half.cu | 32 + ...pq_half_uint32_dim128_t8_8pq_4subd_half.cu | 32 + ...q_half_uint32_dim256_t16_8pq_2subd_half.cu | 32 + ...q_half_uint32_dim256_t16_8pq_4subd_half.cu | 32 + ...q_half_uint32_dim512_t32_8pq_2subd_half.cu | 32 + ...q_half_uint32_dim512_t32_8pq_4subd_half.cu | 32 + ..._half_uint64_dim1024_t32_8pq_2subd_half.cu | 32 + ..._half_uint64_dim1024_t32_8pq_4subd_half.cu | 32 + ...pq_half_uint64_dim128_t8_8pq_2subd_half.cu | 32 + ...pq_half_uint64_dim128_t8_8pq_4subd_half.cu | 32 + ...q_half_uint64_dim256_t16_8pq_2subd_half.cu | 32 + ...q_half_uint64_dim256_t16_8pq_4subd_half.cu | 32 + ...q_half_uint64_dim512_t32_8pq_2subd_half.cu | 32 + ...q_half_uint64_dim512_t32_8pq_4subd_half.cu | 32 + ..._int8_uint32_dim1024_t32_8pq_2subd_half.cu | 32 + ..._int8_uint32_dim1024_t32_8pq_4subd_half.cu | 32 + ...pq_int8_uint32_dim128_t8_8pq_2subd_half.cu | 32 + ...pq_int8_uint32_dim128_t8_8pq_4subd_half.cu | 32 + ...q_int8_uint32_dim256_t16_8pq_2subd_half.cu | 32 + ...q_int8_uint32_dim256_t16_8pq_4subd_half.cu | 32 + ...q_int8_uint32_dim512_t32_8pq_2subd_half.cu | 32 + 
...q_int8_uint32_dim512_t32_8pq_4subd_half.cu | 32 + ...uint8_uint32_dim1024_t32_8pq_2subd_half.cu | 32 + ...uint8_uint32_dim1024_t32_8pq_4subd_half.cu | 32 + ...q_uint8_uint32_dim128_t8_8pq_2subd_half.cu | 32 + ...q_uint8_uint32_dim128_t8_8pq_4subd_half.cu | 32 + ..._uint8_uint32_dim256_t16_8pq_2subd_half.cu | 32 + ..._uint8_uint32_dim256_t16_8pq_4subd_half.cu | 32 + ..._uint8_uint32_dim512_t32_8pq_2subd_half.cu | 32 + ..._uint8_uint32_dim512_t32_8pq_4subd_half.cu | 32 + cpp/src/neighbors/detail/cagra/factory.cuh | 65 +- cpp/src/neighbors/detail/cagra/graph_core.cuh | 94 +-- .../cagra/q_search_multi_cta_00_generate.py | 83 --- ...float_uint32_dim1024_t32_8pq_2subd_half.cu | 36 -- ...float_uint32_dim1024_t32_8pq_4subd_half.cu | 36 -- ...a_float_uint32_dim128_t8_8pq_2subd_half.cu | 36 -- ...a_float_uint32_dim128_t8_8pq_4subd_half.cu | 36 -- ..._float_uint32_dim256_t16_8pq_2subd_half.cu | 36 -- ..._float_uint32_dim256_t16_8pq_4subd_half.cu | 36 -- ..._float_uint32_dim512_t32_8pq_2subd_half.cu | 36 -- ..._float_uint32_dim512_t32_8pq_4subd_half.cu | 36 -- ...float_uint64_dim1024_t32_8pq_2subd_half.cu | 36 -- ...float_uint64_dim1024_t32_8pq_4subd_half.cu | 36 -- ...a_float_uint64_dim128_t8_8pq_2subd_half.cu | 36 -- ...a_float_uint64_dim128_t8_8pq_4subd_half.cu | 36 -- ..._float_uint64_dim256_t16_8pq_2subd_half.cu | 36 -- ..._float_uint64_dim256_t16_8pq_4subd_half.cu | 36 -- ..._float_uint64_dim512_t32_8pq_2subd_half.cu | 36 -- ..._float_uint64_dim512_t32_8pq_4subd_half.cu | 36 -- ..._half_uint32_dim1024_t32_8pq_2subd_half.cu | 36 -- ..._half_uint32_dim1024_t32_8pq_4subd_half.cu | 36 -- ...a_half_uint32_dim256_t16_8pq_2subd_half.cu | 36 -- ...a_half_uint32_dim256_t16_8pq_4subd_half.cu | 36 -- ...a_half_uint32_dim512_t32_8pq_2subd_half.cu | 36 -- ...a_half_uint32_dim512_t32_8pq_4subd_half.cu | 36 -- ..._half_uint64_dim1024_t32_8pq_2subd_half.cu | 36 -- ..._half_uint64_dim1024_t32_8pq_4subd_half.cu | 36 -- ...a_half_uint64_dim256_t16_8pq_2subd_half.cu | 36 -- 
...a_half_uint64_dim256_t16_8pq_4subd_half.cu | 36 -- ...a_half_uint64_dim512_t32_8pq_2subd_half.cu | 36 -- ...a_half_uint64_dim512_t32_8pq_4subd_half.cu | 36 -- ..._int8_uint32_dim1024_t32_8pq_2subd_half.cu | 36 -- ..._int8_uint32_dim1024_t32_8pq_4subd_half.cu | 36 -- ...ta_int8_uint32_dim128_t8_8pq_2subd_half.cu | 36 -- ...ta_int8_uint32_dim128_t8_8pq_4subd_half.cu | 36 -- ...a_int8_uint32_dim256_t16_8pq_2subd_half.cu | 36 -- ...a_int8_uint32_dim256_t16_8pq_4subd_half.cu | 36 -- ...a_int8_uint32_dim512_t32_8pq_2subd_half.cu | 36 -- ...a_int8_uint32_dim512_t32_8pq_4subd_half.cu | 36 -- ...uint8_uint32_dim1024_t32_8pq_2subd_half.cu | 36 -- ...uint8_uint32_dim1024_t32_8pq_4subd_half.cu | 36 -- ...a_uint8_uint32_dim128_t8_8pq_2subd_half.cu | 36 -- ...a_uint8_uint32_dim128_t8_8pq_4subd_half.cu | 36 -- ..._uint8_uint32_dim256_t16_8pq_2subd_half.cu | 36 -- ..._uint8_uint32_dim256_t16_8pq_4subd_half.cu | 36 -- ..._uint8_uint32_dim512_t32_8pq_2subd_half.cu | 36 -- ..._uint8_uint32_dim512_t32_8pq_4subd_half.cu | 36 -- ...float_uint32_dim1024_t32_8pq_2subd_half.cu | 36 -- ...float_uint32_dim1024_t32_8pq_4subd_half.cu | 36 -- ...a_float_uint32_dim128_t8_8pq_2subd_half.cu | 36 -- ...a_float_uint32_dim128_t8_8pq_4subd_half.cu | 36 -- ..._float_uint32_dim256_t16_8pq_2subd_half.cu | 36 -- ..._float_uint32_dim256_t16_8pq_4subd_half.cu | 36 -- ..._float_uint32_dim512_t32_8pq_2subd_half.cu | 36 -- ..._float_uint32_dim512_t32_8pq_4subd_half.cu | 36 -- ...float_uint64_dim1024_t32_8pq_2subd_half.cu | 36 -- ...float_uint64_dim1024_t32_8pq_4subd_half.cu | 36 -- ...a_float_uint64_dim128_t8_8pq_2subd_half.cu | 36 -- ...a_float_uint64_dim128_t8_8pq_4subd_half.cu | 36 -- ..._float_uint64_dim256_t16_8pq_2subd_half.cu | 36 -- ..._float_uint64_dim256_t16_8pq_4subd_half.cu | 36 -- ..._float_uint64_dim512_t32_8pq_2subd_half.cu | 36 -- ..._float_uint64_dim512_t32_8pq_4subd_half.cu | 36 -- ..._half_uint32_dim1024_t32_8pq_2subd_half.cu | 36 -- ..._half_uint32_dim1024_t32_8pq_4subd_half.cu | 36 -- 
...ta_half_uint32_dim128_t8_8pq_2subd_half.cu | 36 -- ...ta_half_uint32_dim128_t8_8pq_4subd_half.cu | 36 -- ...a_half_uint32_dim256_t16_8pq_2subd_half.cu | 36 -- ...a_half_uint32_dim256_t16_8pq_4subd_half.cu | 36 -- ...a_half_uint32_dim512_t32_8pq_2subd_half.cu | 36 -- ...a_half_uint32_dim512_t32_8pq_4subd_half.cu | 36 -- ..._half_uint64_dim1024_t32_8pq_2subd_half.cu | 36 -- ..._half_uint64_dim1024_t32_8pq_4subd_half.cu | 36 -- ...ta_half_uint64_dim128_t8_8pq_2subd_half.cu | 36 -- ...ta_half_uint64_dim128_t8_8pq_4subd_half.cu | 36 -- ...a_half_uint64_dim256_t16_8pq_2subd_half.cu | 36 -- ...a_half_uint64_dim256_t16_8pq_4subd_half.cu | 36 -- ...a_half_uint64_dim512_t32_8pq_2subd_half.cu | 36 -- ...a_half_uint64_dim512_t32_8pq_4subd_half.cu | 36 -- ..._int8_uint32_dim1024_t32_8pq_2subd_half.cu | 36 -- ..._int8_uint32_dim1024_t32_8pq_4subd_half.cu | 36 -- ...ta_int8_uint32_dim128_t8_8pq_2subd_half.cu | 36 -- ...ta_int8_uint32_dim128_t8_8pq_4subd_half.cu | 36 -- ...a_int8_uint32_dim256_t16_8pq_2subd_half.cu | 36 -- ...a_int8_uint32_dim256_t16_8pq_4subd_half.cu | 36 -- ...a_int8_uint32_dim512_t32_8pq_2subd_half.cu | 36 -- ...a_int8_uint32_dim512_t32_8pq_4subd_half.cu | 36 -- ...uint8_uint32_dim1024_t32_8pq_2subd_half.cu | 36 -- ...uint8_uint32_dim1024_t32_8pq_4subd_half.cu | 36 -- ...a_uint8_uint32_dim128_t8_8pq_2subd_half.cu | 36 -- ...a_uint8_uint32_dim128_t8_8pq_4subd_half.cu | 36 -- ..._uint8_uint32_dim256_t16_8pq_2subd_half.cu | 36 -- ..._uint8_uint32_dim256_t16_8pq_4subd_half.cu | 36 -- ..._uint8_uint32_dim512_t32_8pq_2subd_half.cu | 36 -- ..._uint8_uint32_dim512_t32_8pq_4subd_half.cu | 36 -- .../detail/cagra/search_multi_cta.cuh | 162 +++-- .../cagra/search_multi_cta_00_generate.py | 20 +- ...t8.cu => search_multi_cta_float_uint32.cu} | 5 +- ...arch_multi_cta_float_uint32_dim1024_t32.cu | 37 -- ...earch_multi_cta_float_uint32_dim256_t16.cu | 37 -- ...earch_multi_cta_float_uint32_dim512_t32.cu | 37 -- ...t8.cu => search_multi_cta_float_uint64.cu} | 5 +- 
...arch_multi_cta_float_uint64_dim1024_t32.cu | 37 -- ...search_multi_cta_float_uint64_dim128_t8.cu | 37 -- ...earch_multi_cta_float_uint64_dim256_t16.cu | 37 -- ...earch_multi_cta_float_uint64_dim512_t32.cu | 37 -- ..._t8.cu => search_multi_cta_half_uint32.cu} | 5 +- ...earch_multi_cta_half_uint32_dim1024_t32.cu | 37 -- ...search_multi_cta_half_uint32_dim512_t32.cu | 37 -- ...t16.cu => search_multi_cta_half_uint64.cu} | 5 +- ...earch_multi_cta_half_uint64_dim1024_t32.cu | 37 -- ...search_multi_cta_half_uint64_dim256_t16.cu | 37 -- ...search_multi_cta_half_uint64_dim512_t32.cu | 37 -- .../detail/cagra/search_multi_cta_inst.cuh | 45 +- .../cagra/search_multi_cta_int8_uint32.cu | 34 + ...earch_multi_cta_int8_uint32_dim1024_t32.cu | 37 -- .../search_multi_cta_int8_uint32_dim128_t8.cu | 37 -- ...search_multi_cta_int8_uint32_dim256_t16.cu | 37 -- ...search_multi_cta_int8_uint32_dim512_t32.cu | 37 -- .../cagra/search_multi_cta_kernel-ext.cuh | 405 ------------ .../cagra/search_multi_cta_kernel-inl.cuh | 181 +++--- .../detail/cagra/search_multi_cta_kernel.cuh | 37 +- .../cagra/search_multi_cta_uint8_uint32.cu | 34 + ...arch_multi_cta_uint8_uint32_dim1024_t32.cu | 37 -- ...search_multi_cta_uint8_uint32_dim128_t8.cu | 37 -- ...earch_multi_cta_uint8_uint32_dim256_t16.cu | 37 -- ...earch_multi_cta_uint8_uint32_dim512_t32.cu | 37 -- .../detail/cagra/search_multi_kernel.cuh | 89 ++- .../neighbors/detail/cagra/search_plan.cuh | 59 +- .../detail/cagra/search_single_cta.cuh | 138 ++-- .../cagra/search_single_cta_00_generate.py | 20 +- ...8.cu => search_single_cta_float_uint32.cu} | 5 +- ...rch_single_cta_float_uint32_dim1024_t32.cu | 37 -- ...arch_single_cta_float_uint32_dim256_t16.cu | 37 -- ...arch_single_cta_float_uint32_dim512_t32.cu | 37 -- ...8.cu => search_single_cta_float_uint64.cu} | 5 +- ...rch_single_cta_float_uint64_dim1024_t32.cu | 37 -- ...earch_single_cta_float_uint64_dim128_t8.cu | 37 -- ...arch_single_cta_float_uint64_dim256_t16.cu | 37 -- 
...arch_single_cta_float_uint64_dim512_t32.cu | 37 -- ...16.cu => search_single_cta_half_uint32.cu} | 5 +- ...arch_single_cta_half_uint32_dim1024_t32.cu | 37 -- ...earch_single_cta_half_uint32_dim512_t32.cu | 37 -- ...t8.cu => search_single_cta_half_uint64.cu} | 5 +- ...arch_single_cta_half_uint64_dim1024_t32.cu | 37 -- ...earch_single_cta_half_uint64_dim256_t16.cu | 37 -- ...earch_single_cta_half_uint64_dim512_t32.cu | 37 -- .../detail/cagra/search_single_cta_inst.cuh | 47 +- .../cagra/search_single_cta_int8_uint32.cu | 34 + ...arch_single_cta_int8_uint32_dim1024_t32.cu | 37 -- ...search_single_cta_int8_uint32_dim128_t8.cu | 37 -- ...earch_single_cta_int8_uint32_dim256_t16.cu | 37 -- ...earch_single_cta_int8_uint32_dim512_t32.cu | 37 -- .../cagra/search_single_cta_kernel-ext.cuh | 588 ------------------ .../cagra/search_single_cta_kernel-inl.cuh | 201 +++--- .../detail/cagra/search_single_cta_kernel.cuh | 37 +- .../cagra/search_single_cta_uint8_uint32.cu | 34 + ...rch_single_cta_uint8_uint32_dim1024_t32.cu | 37 -- ...earch_single_cta_uint8_uint32_dim128_t8.cu | 37 -- ...arch_single_cta_uint8_uint32_dim256_t16.cu | 37 -- ...arch_single_cta_uint8_uint32_dim512_t32.cu | 37 -- .../neighbors/detail/cagra/topk_by_radix.cuh | 44 +- .../detail/cagra/topk_for_cagra/topk.cu | 171 +++++ .../detail/cagra/topk_for_cagra/topk_core.cuh | 258 ++------ 244 files changed, 4199 insertions(+), 7227 deletions(-) create mode 100644 cpp/src/neighbors/detail/cagra/distance_core-ext.cuh create mode 100644 cpp/src/neighbors/detail/cagra/distance_core-impl.cuh rename cpp/src/neighbors/detail/cagra/{q_search_single_cta_00_generate.py => distance_core_00_generate.py} (60%) create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim1024_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim256_t16.cu create mode 100644 
cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim1024_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim1024_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim1024_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim1024_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim1024_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim512_t32.cu rename cpp/src/neighbors/detail/cagra/{q_search_multi_cta_half_uint32_dim128_t8_8pq_2subd_half.cu => 
distance_core_vpq_float_uint32_dim1024_t32_8pq_2subd_half.cu} (53%) rename cpp/src/neighbors/detail/cagra/{q_search_multi_cta_half_uint32_dim128_t8_8pq_4subd_half.cu => distance_core_vpq_float_uint32_dim1024_t32_8pq_4subd_half.cu} (53%) rename cpp/src/neighbors/detail/cagra/{q_search_multi_cta_half_uint64_dim128_t8_8pq_2subd_half.cu => distance_core_vpq_float_uint32_dim128_t8_8pq_2subd_half.cu} (53%) rename cpp/src/neighbors/detail/cagra/{q_search_multi_cta_half_uint64_dim128_t8_8pq_4subd_half.cu => distance_core_vpq_float_uint32_dim128_t8_8pq_4subd_half.cu} (53%) create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim256_t16_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim256_t16_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim512_t32_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim512_t32_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim1024_t32_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim1024_t32_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim128_t8_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim128_t8_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim256_t16_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim256_t16_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim512_t32_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim512_t32_8pq_4subd_half.cu create mode 100644 
cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim1024_t32_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim1024_t32_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim128_t8_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim128_t8_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim256_t16_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim256_t16_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim512_t32_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim512_t32_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim1024_t32_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim1024_t32_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim128_t8_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim128_t8_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim256_t16_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim256_t16_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim512_t32_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim512_t32_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim1024_t32_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim1024_t32_8pq_4subd_half.cu create mode 100644 
cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim128_t8_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim128_t8_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim256_t16_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim256_t16_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim512_t32_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim512_t32_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim1024_t32_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim1024_t32_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim128_t8_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim128_t8_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim256_t16_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim256_t16_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim512_t32_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim512_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_00_generate.py delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_2subd_half.cu delete mode 100644 
cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_2subd_half.cu delete mode 100644 
cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu delete mode 100644 
cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_2subd_half.cu delete 
mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_2subd_half.cu 
delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu rename cpp/src/neighbors/detail/cagra/{search_multi_cta_half_uint64_dim128_t8.cu => 
search_multi_cta_float_uint32.cu} (85%) delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu rename cpp/src/neighbors/detail/cagra/{search_multi_cta_half_uint32_dim128_t8.cu => search_multi_cta_float_uint64.cu} (85%) delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu rename cpp/src/neighbors/detail/cagra/{search_multi_cta_float_uint32_dim128_t8.cu => search_multi_cta_half_uint32.cu} (85%) delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim1024_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim512_t32.cu rename cpp/src/neighbors/detail/cagra/{search_multi_cta_half_uint32_dim256_t16.cu => search_multi_cta_half_uint64.cu} (85%) delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu delete 
mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu rename cpp/src/neighbors/detail/cagra/{search_single_cta_half_uint32_dim128_t8.cu => search_single_cta_float_uint32.cu} (85%) delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu rename cpp/src/neighbors/detail/cagra/{search_single_cta_half_uint64_dim128_t8.cu => search_single_cta_float_uint64.cu} (85%) delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu rename cpp/src/neighbors/detail/cagra/{search_single_cta_half_uint32_dim256_t16.cu => search_single_cta_half_uint32.cu} (85%) delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim1024_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim512_t32.cu rename cpp/src/neighbors/detail/cagra/{search_single_cta_float_uint32_dim128_t8.cu => search_single_cta_half_uint64.cu} (85%) delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu delete mode 
100644 cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/topk_for_cagra/topk.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3b483538a..1e5a1723f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -199,6 +199,119 @@ endif() # ################################################################################################## # * cuvs --------------------------------------------------------------------- +add_library( + cuvs-cagra-search STATIC + src/neighbors/cagra_search_float.cu + src/neighbors/cagra_search_int8.cu + src/neighbors/cagra_search_uint8.cu + src/neighbors/detail/cagra/distance_core_float_uint32_dim128_t8.cu + src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim128_t8_8pq_2subd_half.cu + 
src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/distance_core_float_uint32_dim256_t16.cu + src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/distance_core_float_uint32_dim512_t32.cu + src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/distance_core_float_uint32_dim1024_t32.cu + src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim1024_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim1024_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/distance_core_half_uint32_dim128_t8.cu + src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim128_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/distance_core_half_uint32_dim256_t16.cu + src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/distance_core_half_uint32_dim512_t32.cu + src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/distance_core_half_uint32_dim1024_t32.cu + src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim1024_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim1024_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/distance_core_int8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim128_t8_8pq_2subd_half.cu + 
src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/distance_core_int8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/distance_core_int8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/distance_core_int8_uint32_dim1024_t32.cu + src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim1024_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim1024_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/distance_core_uint8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim128_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/distance_core_uint8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/distance_core_uint8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/distance_core_uint8_uint32_dim1024_t32.cu + src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim1024_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim1024_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/distance_core_float_uint64_dim128_t8.cu + src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim128_t8_8pq_2subd_half.cu + 
src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/distance_core_float_uint64_dim256_t16.cu + src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/distance_core_float_uint64_dim512_t32.cu + src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/distance_core_float_uint64_dim1024_t32.cu + src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim1024_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim1024_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/distance_core_half_uint64_dim128_t8.cu + src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim128_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/distance_core_half_uint64_dim256_t16.cu + src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/distance_core_half_uint64_dim512_t32.cu + src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/distance_core_half_uint64_dim1024_t32.cu + src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim1024_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim1024_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu + src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu + src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu + 
src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu + src/neighbors/detail/cagra/search_multi_cta_float_uint64.cu + src/neighbors/detail/cagra/search_multi_cta_half_uint64.cu + src/neighbors/detail/cagra/search_single_cta_float_uint32.cu + src/neighbors/detail/cagra/search_single_cta_half_uint32.cu + src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu + src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu + src/neighbors/detail/cagra/search_single_cta_float_uint64.cu + src/neighbors/detail/cagra/search_single_cta_half_uint64.cu +) + +file(GLOB_RECURSE distance_core_sources "src/neighbors/detail/cagra/distance_core_*.cu") +set_source_files_properties(${distance_core_sources} PROPERTIES COMPILE_FLAGS -maxrregcount=64) + +set_target_properties( + cuvs-cagra-search + PROPERTIES BUILD_RPATH "\$ORIGIN" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + CUDA_SEPARABLE_COMPILATION ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + POSITION_INDEPENDENT_CODE ON +) +target_link_libraries(cuvs-cagra-search PRIVATE raft::raft) +target_include_directories( + cuvs-cagra-search PRIVATE "$" +) +target_compile_options( + cuvs-cagra-search PRIVATE "$<$:${CUVS_CXX_FLAGS}>" + "$<$:${CUVS_CUDA_FLAGS}>" +) add_library( cuvs SHARED @@ -253,109 +366,11 @@ add_library( src/neighbors/cagra_extend_int8.cu src/neighbors/cagra_extend_uint8.cu src/neighbors/cagra_optimize.cu - src/neighbors/cagra_search_float.cu - src/neighbors/cagra_search_int8.cu - src/neighbors/cagra_search_uint8.cu src/neighbors/cagra_serialize_float.cu src/neighbors/cagra_serialize_int8.cu src/neighbors/cagra_serialize_uint8.cu src/neighbors/detail/cagra/cagra_build.cpp - src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_2subd_half.cu - 
src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_2subd_half.cu - 
src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu - 
src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu - src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu - src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu - src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu - src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu - src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu - 
src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu - src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu - src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu - src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu - src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu - src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu - src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu - src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu - src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu - src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu - src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu - src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu - src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu - src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu - src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu - src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu - src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu - src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu - src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu - src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu - src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu - src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu - src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu - src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu - src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu - src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu + src/neighbors/detail/cagra/topk_for_cagra/topk.cu $<$:src/neighbors/hnsw.cpp> 
src/neighbors/ivf_flat_index.cpp src/neighbors/ivf_flat/ivf_flat_build_extend_float_int64_t.cu @@ -446,7 +461,7 @@ if(NOT BUILD_CPU_ONLY) target_link_libraries( cuvs PUBLIC rmm::rmm raft::raft ${CUVS_CTK_MATH_DEPENDENCIES} - PRIVATE nvidia::cutlass::cutlass $ + PRIVATE nvidia::cutlass::cutlass $ cuvs-cagra-search ) endif() @@ -522,7 +537,8 @@ target_compile_options( "$<$:${CUVS_CUDA_FLAGS}>" ) # ensure CUDA symbols aren't relocated to the middle of the debug build binaries -target_link_options(cuvs PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") +# TODO (achirkin): disabled during experiments with CUDA_SEPARABLE_COMPILATION (otherwise didn't link) +# target_link_options(cuvs PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") # ################################################################################################## # * cuvs_c ------------------------------------------------------------------------------- diff --git a/cpp/src/neighbors/detail/ann_utils.cuh b/cpp/src/neighbors/detail/ann_utils.cuh index 1db2dca64..000de576b 100644 --- a/cpp/src/neighbors/detail/ann_utils.cuh +++ b/cpp/src/neighbors/detail/ann_utils.cuh @@ -224,7 +224,7 @@ inline void memzero(T* ptr, IdxT n_elems, rmm::cuda_stream_view stream) } template -RAFT_KERNEL outer_add_kernel(const T* a, IdxT len_a, const T* b, IdxT len_b, T* c) +__global__ void outer_add_kernel(const T* a, IdxT len_a, const T* b, IdxT len_b, T* c) { IdxT gid = threadIdx.x + blockDim.x * static_cast(blockIdx.x); IdxT i = gid / len_b; @@ -234,12 +234,12 @@ RAFT_KERNEL outer_add_kernel(const T* a, IdxT len_a, const T* b, IdxT len_b, T* } template -RAFT_KERNEL block_copy_kernel(const IdxT* in_offsets, - const IdxT* out_offsets, - IdxT n_blocks, - const T* in_data, - T* out_data, - IdxT n_mult) +__global__ void block_copy_kernel(const IdxT* in_offsets, + const IdxT* out_offsets, + IdxT n_blocks, + const T* in_data, + T* out_data, + IdxT n_mult) { IdxT i = static_cast(blockDim.x) * static_cast(blockIdx.x) + threadIdx.x; // 
find the source offset using the binary search. @@ -317,7 +317,7 @@ void outer_add(const T* a, IdxT len_a, const T* b, IdxT len_b, T* c, rmm::cuda_s } template -RAFT_KERNEL copy_selected_kernel( +__global__ void copy_selected_kernel( IdxT n_rows, IdxT n_cols, const S* src, const LabelT* row_ids, IdxT ld_src, T* dst, IdxT ld_dst) { IdxT gid = threadIdx.x + blockDim.x * static_cast(blockIdx.x); diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index cfb5f7919..ace278c45 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh @@ -85,29 +85,24 @@ inline return filter; } -template +template void search_main_core( raft::resources const& res, search_params params, - DatasetDescriptorT dataset_desc, - raft::device_matrix_view - graph, - raft::device_matrix_view - queries, - raft::device_matrix_view - neighbors, - raft::device_matrix_view - distances, + const dataset_descriptor_host& dataset_desc, + raft::device_matrix_view graph, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances, CagraSampleFilterT sample_filter = CagraSampleFilterT(), cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded) { RAFT_LOG_DEBUG("# dataset size = %lu, dim = %lu\n", - static_cast(dataset_desc.size), - static_cast(dataset_desc.dim)); + static_cast(graph.extent(0)), + static_cast(queries.extent(1))); RAFT_LOG_DEBUG("# query size = %lu, dim = %lu\n", static_cast(queries.extent(0)), static_cast(queries.extent(1))); - RAFT_EXPECTS(queries.extent(1) == dataset_desc.dim, "Queries and index dim must match"); const uint32_t topk = neighbors.extent(1); cudaDeviceProp deviceProp = raft::resource::get_device_properties(res); @@ -119,12 +114,12 @@ void search_main_core( "cagra::search(max_queries = %u, k = %u, dim = %zu)", params.max_queries, topk, - dataset_desc.dim); + queries.extent(1)); using 
CagraSampleFilterT_s = typename CagraSampleFilterT_Selector::type; - std::unique_ptr> plan = - factory::create( - res, params, dataset_desc.dim, graph.extent(1), topk, metric); + std::unique_ptr> plan = + factory::create( + res, params, dataset_desc, queries.extent(1), graph.extent(1), topk, metric); plan->check(topk); @@ -134,21 +129,17 @@ void search_main_core( for (unsigned qid = 0; qid < queries.extent(0); qid += max_queries) { const uint32_t n_queries = std::min(max_queries, queries.extent(0) - qid); - auto _topk_indices_ptr = - reinterpret_cast(neighbors.data_handle()) + - (topk * qid); + auto _topk_indices_ptr = reinterpret_cast(neighbors.data_handle()) + (topk * qid); auto _topk_distances_ptr = distances.data_handle() + (topk * qid); // todo(tfeher): one could keep distances optional and pass nullptr const auto* _query_ptr = queries.data_handle() + (query_dim * qid); const auto* _seed_ptr = plan->num_seeds > 0 - ? reinterpret_cast(plan->dev_seed.data()) + - (plan->num_seeds * qid) + ? 
reinterpret_cast(plan->dev_seed.data()) + (plan->num_seeds * qid) : nullptr; uint32_t* _num_executed_iterations = nullptr; (*plan)(res, - dataset_desc, graph, _topk_indices_ptr, _topk_distances_ptr, @@ -161,77 +152,6 @@ void search_main_core( } } -template -void launch_vpq_search_main_core( - raft::resources const& res, - const vpq_dataset* vpq_dset, - search_params params, - raft::device_matrix_view graph, - raft::device_matrix_view queries, - raft::device_matrix_view neighbors, - raft::device_matrix_view distances, - CagraSampleFilterT sample_filter, - const cuvs::distance::DistanceType metric) -{ - RAFT_EXPECTS(vpq_dset->pq_bits() == 8, "Only pq_bits = 8 is supported for now"); - RAFT_EXPECTS(vpq_dset->pq_len() == 2 || vpq_dset->pq_len() == 4, - "Only pq_len 2 or 4 is supported for now"); - RAFT_EXPECTS(vpq_dset->dim() % vpq_dset->pq_dim() == 0, - "dim must be a multiple of pq_dim at the moment"); - - const float vq_scale = 1.0f; - const float pq_scale = 1.0f; - - if (vpq_dset->pq_bits() == 8) { - if (vpq_dset->pq_len() == 2) { - using dataset_desc_t = cagra_q_dataset_descriptor_t; - dataset_desc_t dataset_desc(vpq_dset->data.data_handle(), - vpq_dset->encoded_row_length(), - vpq_dset->pq_dim(), - vpq_dset->vq_code_book.data_handle(), - vq_scale, - vpq_dset->pq_code_book.data_handle(), - pq_scale, - size_t(vpq_dset->n_rows()), - vpq_dset->dim()); - search_main_core( - res, params, dataset_desc, graph, queries, neighbors, distances, sample_filter, metric); - } else if (vpq_dset->pq_len() == 4) { - using dataset_desc_t = cagra_q_dataset_descriptor_t; - dataset_desc_t dataset_desc(vpq_dset->data.data_handle(), - vpq_dset->encoded_row_length(), - vpq_dset->pq_dim(), - vpq_dset->vq_code_book.data_handle(), - vq_scale, - vpq_dset->pq_code_book.data_handle(), - pq_scale, - size_t(vpq_dset->n_rows()), - vpq_dset->dim()); - search_main_core( - res, params, dataset_desc, graph, queries, neighbors, distances, sample_filter, metric); - } else { - RAFT_FAIL("Subspace 
dimension must be 2 or 4"); - } - } else { - RAFT_FAIL("Only 8-bit PQ is supported now"); - } -} - /** * @brief Search ANN using the constructed index. * @@ -264,6 +184,7 @@ void search_main(raft::resources const& res, raft::device_matrix_view distances, CagraSampleFilterT sample_filter = CagraSampleFilterT()) { + auto stream = raft::resource::get_cuda_stream(res); const auto& graph = index.graph(); auto graph_internal = raft::make_device_matrix_view( reinterpret_cast(graph.data_handle()), graph.extent(0), graph.extent(1)); @@ -273,39 +194,34 @@ void search_main(raft::resources const& res, // Dispatch search parameters based on the dataset kind. if (auto* strided_dset = dynamic_cast*>(&index.data()); strided_dset != nullptr) { - // Set TEAM_SIZE and DATASET_BLOCK_SIZE to zero tentatively since these parameters cannot be - // determined here. They are set just before kernel launch. - using dataset_desc_t = standard_dataset_descriptor_t; // Search using a plain (strided) row-major dataset - const dataset_desc_t dataset_desc(strided_dset->view().data_handle(), - strided_dset->n_rows(), - strided_dset->dim(), - strided_dset->stride()); - search_main_core(res, - params, - dataset_desc, - graph_internal, - queries, - neighbors, - distances, - sample_filter, - index.metric()); + auto desc = + dataset_descriptor_init(*strided_dset, stream); + search_main_core(res, + params, + desc, + graph_internal, + queries, + neighbors, + distances, + sample_filter, + index.metric()); } else if (auto* vpq_dset = dynamic_cast*>(&index.data()); vpq_dset != nullptr) { // Search using a compressed dataset RAFT_FAIL("FP32 VPQ dataset support is coming soon"); } else if (auto* vpq_dset = dynamic_cast*>(&index.data()); vpq_dset != nullptr) { - launch_vpq_search_main_core( - res, - vpq_dset, - params, - graph_internal, - queries, - neighbors, - distances, - sample_filter, - index.metric()); + auto desc = dataset_descriptor_init(*vpq_dset, stream); + search_main_core(res, + params, + desc, + 
graph_internal, + queries, + neighbors, + distances, + sample_filter, + index.metric()); } else if (auto* empty_dset = dynamic_cast*>(&index.data()); empty_dset != nullptr) { // Forgot to add a dataset. diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index 2b0c750ff..dcc5fe285 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -20,6 +20,8 @@ #include "utils.hpp" #include +#include +#include #include // TODO: This shouldn't be invoking spatial/knn @@ -37,22 +39,22 @@ using LOAD_128BIT_T = uint4; using LOAD_64BIT_T = uint64_t; template -_RAFT_DEVICE constexpr unsigned get_vlen() +RAFT_DEVICE_INLINE_FUNCTION constexpr unsigned get_vlen() { return utils::size_of() / utils::size_of(); } -template -_RAFT_DEVICE void compute_distance_to_random_nodes( +RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes( INDEX_T* const result_indices_ptr, // [num_pickup] DISTANCE_T* const result_distances_ptr, // [num_pickup] - const typename DATASET_DESCRIPTOR_T::QUERY_T* const query_buffer, + typename DATASET_DESCRIPTOR_T::ws_handle workspace, const DATASET_DESCRIPTOR_T& dataset_desc, - const std::size_t num_pickup, + const size_t num_pickup, const unsigned num_distilation, const uint64_t rand_xor_mask, const INDEX_T* const seed_ptr, // [num_seeds] @@ -64,9 +66,9 @@ _RAFT_DEVICE void compute_distance_to_random_nodes( const uint32_t num_blocks = 1) { uint32_t max_i = num_pickup; - if (max_i % (32 / TEAM_SIZE)) { max_i += (32 / TEAM_SIZE) - (max_i % (32 / TEAM_SIZE)); } + if (max_i % (32 / TeamSize)) { max_i += (32 / TeamSize) - (max_i % (32 / TeamSize)); } - for (uint32_t i = threadIdx.x / TEAM_SIZE; i < max_i; i += blockDim.x / TEAM_SIZE) { + for (uint32_t i = threadIdx.x / TeamSize; i < max_i; i += blockDim.x / TeamSize) { const bool valid_i = (i < num_pickup); INDEX_T best_index_team_local; @@ -88,17 +90,13 @@ _RAFT_DEVICE void 
compute_distance_to_random_nodes( switch (metric) { case cuvs::distance::DistanceType::L2Expanded: norm2 = - dataset_desc.template compute_similarity( - query_buffer, seed_index, valid_i); + dataset_desc.template compute_similarity( + workspace, seed_index, valid_i); break; case cuvs::distance::DistanceType::InnerProduct: norm2 = - dataset_desc.template compute_similarity( - query_buffer, seed_index, valid_i); + dataset_desc.template compute_similarity( + workspace, seed_index, valid_i); break; default: break; } @@ -109,7 +107,7 @@ _RAFT_DEVICE void compute_distance_to_random_nodes( } } - const unsigned lane_id = threadIdx.x % TEAM_SIZE; + const unsigned lane_id = threadIdx.x % TeamSize; if (valid_i && lane_id == 0) { if (hashmap::insert(visited_hash_ptr, hash_bitlen, best_index_team_local)) { result_distances_ptr[i] = best_norm2_team_local; @@ -122,29 +120,28 @@ _RAFT_DEVICE void compute_distance_to_random_nodes( } } -template -_RAFT_DEVICE void compute_distance_to_child_nodes( - INDEX_T* const result_child_indices_ptr, - DISTANCE_T* const result_child_distances_ptr, +template +RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes( + INDEX_T* result_child_indices_ptr, + DISTANCE_T* result_child_distances_ptr, // query - const typename DATASET_DESCRIPTOR_T::QUERY_T* const query_buffer, + typename DATASET_DESCRIPTOR_T::ws_handle workspace, // [dataset_dim, dataset_size] const DATASET_DESCRIPTOR_T& dataset_desc, // [knn_k, dataset_size] - const INDEX_T* const knn_graph, - const std::uint32_t knn_k, + const INDEX_T* knn_graph, + uint32_t knn_k, // hashmap - INDEX_T* const visited_hashmap_ptr, - const std::uint32_t hash_bitlen, - const INDEX_T* const parent_indices, - const INDEX_T* const internal_topk_list, - const std::uint32_t search_width, - const cuvs::distance::DistanceType metric) + INDEX_T* visited_hashmap_ptr, + uint32_t hash_bitlen, + const INDEX_T* parent_indices, + const INDEX_T* internal_topk_list, + uint32_t search_width, + 
cuvs::distance::DistanceType metric) { constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask::value; const INDEX_T invalid_index = utils::get_max_value(); @@ -168,10 +165,10 @@ _RAFT_DEVICE void compute_distance_to_child_nodes( __syncthreads(); // Compute the distance to child nodes - std::uint32_t max_i = knn_k * search_width; - if (max_i % (32 / TEAM_SIZE)) { max_i += (32 / TEAM_SIZE) - (max_i % (32 / TEAM_SIZE)); } - for (std::uint32_t tid = threadIdx.x; tid < max_i * TEAM_SIZE; tid += blockDim.x) { - const auto i = tid / TEAM_SIZE; + uint32_t max_i = knn_k * search_width; + if (max_i % (32 / TeamSize)) { max_i += (32 / TeamSize) - (max_i % (32 / TeamSize)); } + for (uint32_t tid = threadIdx.x; tid < max_i * TeamSize; tid += blockDim.x) { + const auto i = tid / TeamSize; const bool valid_i = (i < (knn_k * search_width)); INDEX_T child_id = invalid_index; if (valid_i) { child_id = result_child_indices_ptr[i]; } @@ -179,23 +176,19 @@ _RAFT_DEVICE void compute_distance_to_child_nodes( DISTANCE_T norm2; switch (metric) { case cuvs::distance::DistanceType::L2Expanded: - norm2 = dataset_desc.template compute_similarity( - query_buffer, child_id, child_id != invalid_index); + norm2 = dataset_desc.template compute_similarity( + workspace, child_id, child_id != invalid_index); break; case cuvs::distance::DistanceType::InnerProduct: norm2 = - dataset_desc.template compute_similarity( - query_buffer, child_id, child_id != invalid_index); + dataset_desc.template compute_similarity( + workspace, child_id, child_id != invalid_index); break; default: break; } // Store the distance - const unsigned lane_id = threadIdx.x % TEAM_SIZE; + const unsigned lane_id = threadIdx.x % TeamSize; if (valid_i && lane_id == 0) { if (child_id != invalid_index) { result_child_distances_ptr[i] = norm2; @@ -208,96 +201,264 @@ _RAFT_DEVICE void compute_distance_to_child_nodes( } // namespace device -template +template struct dataset_descriptor_base_t { - using INDEX_T = INDEX_T_; - using 
QUERY_T = QUERY_T_; - using DISTANCE_T = DISTANCE_T_; + using DATA_T = DataT; + using INDEX_T = IndexT; + using DISTANCE_T = DistanceT; + + struct distance_workspace; + using ws_handle = distance_workspace*; + + INDEX_T size; + uint32_t dim; + + _RAFT_HOST_DEVICE dataset_descriptor_base_t(INDEX_T size, uint32_t dim) : size(size), dim(dim) {} + + /** Total dynamic shared memory required by the descriptor. */ + _RAFT_HOST_DEVICE [[nodiscard]] virtual auto smem_ws_size_in_bytes() const -> uint32_t = 0; + + /** Set shared memory workspace (pointers). */ + _RAFT_DEVICE [[nodiscard]] virtual auto set_smem_ws(void* smem_ptr) const -> ws_handle = 0; + + /** Copy the query to the shared memory. */ + _RAFT_DEVICE virtual void copy_query(ws_handle smem_workspace, const DATA_T* query_ptr) const = 0; + + _RAFT_DEVICE virtual void compute_distance_to_random_nodes( + ws_handle smem_workspace, + INDEX_T* const result_indices_ptr, // [num_pickup] + DISTANCE_T* const result_distances_ptr, // [num_pickup] + const size_t num_pickup, + const unsigned num_distilation, + const uint64_t rand_xor_mask, + const INDEX_T* const seed_ptr, // [num_seeds] + const uint32_t num_seeds, + INDEX_T* const visited_hash_ptr, + const uint32_t hash_bitlen, + const cuvs::distance::DistanceType metric, + const uint32_t block_id = 0, + const uint32_t num_blocks = 1) const = 0; + + _RAFT_DEVICE virtual void compute_distance_to_child_nodes( + ws_handle smem_workspace, + INDEX_T* const result_child_indices_ptr, + DISTANCE_T* const result_child_distances_ptr, + // [knn_k, dataset_size] + const INDEX_T* const knn_graph, + const uint32_t knn_k, + // hashmap + INDEX_T* const visited_hashmap_ptr, + const uint32_t hash_bitlen, + const INDEX_T* const parent_indices, + const INDEX_T* const internal_topk_list, + const uint32_t search_width, + const cuvs::distance::DistanceType metric) const = 0; +}; + +template +struct dataset_descriptor_host { + dataset_descriptor_base_t* dev_ptr = nullptr; + uint32_t 
smem_ws_size_in_bytes = 0; + uint32_t team_size = 0; + uint32_t dataset_block_dim = 0; + + template + dataset_descriptor_host(const DescriptorImpl& dd_host, + rmm::cuda_stream_view stream, + uint32_t team_size, + uint32_t dataset_block_dim) + : stream_{stream}, + smem_ws_size_in_bytes{dd_host.smem_ws_size_in_bytes()}, + team_size{team_size}, + dataset_block_dim{dataset_block_dim} + { + RAFT_CUDA_TRY(cudaMallocAsync(&dev_ptr, sizeof(DescriptorImpl), stream_)); + } + + ~dataset_descriptor_host() noexcept + { + if (dev_ptr == nullptr) { return; } + RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(dev_ptr, stream_)); + } - const INDEX_T size; - const std::uint32_t dim; + dataset_descriptor_host(dataset_descriptor_host&& other) + { + std::swap(this->dev_ptr, other.dev_ptr); + std::swap(this->smem_ws_size_in_bytes, other.smem_ws_size_in_bytes); + std::swap(this->stream_, other.stream_); + std::swap(this->team_size, other.team_size); + std::swap(this->dataset_block_dim, other.dataset_block_dim); + } + dataset_descriptor_host& operator=(dataset_descriptor_host&& b) + { + auto& a = *this; + std::swap(a.dev_ptr, b.dev_ptr); + std::swap(a.smem_ws_size_in_bytes, b.smem_ws_size_in_bytes); + std::swap(a.stream_, b.stream_); + std::swap(a.team_size, b.team_size); + std::swap(a.dataset_block_dim, b.dataset_block_dim); + return a; + } + dataset_descriptor_host(const dataset_descriptor_host&) = delete; + dataset_descriptor_host& operator=(const dataset_descriptor_host&) = delete; - dataset_descriptor_base_t(const INDEX_T size, const std::uint32_t dim) : size(size), dim(dim) {} + private: + rmm::cuda_stream_view stream_; }; -template -struct standard_dataset_descriptor_t - : public dataset_descriptor_base_t { - using LOAD_T = device::LOAD_128BIT_T; - using DATA_T = DATA_T_; - using QUERY_T = typename dataset_descriptor_base_t::QUERY_T; - - const DATA_T* const ptr; - const std::size_t ld; - using dataset_descriptor_base_t::size; - using dataset_descriptor_base_t::dim; - - 
standard_dataset_descriptor_t(const DATA_T* const ptr, - const std::size_t size, - const std::uint32_t dim, - const std::size_t ld) - : dataset_descriptor_base_t(size, dim), ptr(ptr), ld(ld) +template +struct standard_dataset_descriptor_t : public dataset_descriptor_base_t { + using base_type = dataset_descriptor_base_t; + using LOAD_T = device::LOAD_128BIT_T; + using QUERY_T = float; + using base_type::dim; + using typename base_type::DATA_T; + using typename base_type::DISTANCE_T; + using typename base_type::INDEX_T; + using typename base_type::ws_handle; + + const DATA_T* ptr; + size_t ld; + uint32_t smem_query_buffer_length; + + _RAFT_HOST_DEVICE standard_dataset_descriptor_t(const DATA_T* ptr, + INDEX_T size, + uint32_t dim, + size_t ld) + : base_type(size, dim), + ptr(ptr), + ld(ld), + smem_query_buffer_length{raft::round_up_safe(dim, DatasetBlockDim)} + { + } + + _RAFT_HOST_DEVICE [[nodiscard]] auto smem_ws_size_in_bytes() const -> uint32_t { + return smem_query_buffer_length * sizeof(QUERY_T); } - static const std::uint32_t smem_buffer_size_in_byte = 0; - __device__ void set_smem_ptr(void* const){}; + _RAFT_DEVICE [[nodiscard]] auto set_smem_ws(void* smem_ptr) const -> ws_handle + { + return reinterpret_cast(smem_ptr); + } - template - __device__ void copy_query(const DATA_T* const dmem_query_ptr, - QUERY_T* const smem_query_ptr, - const std::uint32_t query_smem_buffer_length) + _RAFT_DEVICE void copy_query(ws_handle smem_workspace, const DATA_T* query_ptr) const { - for (unsigned i = threadIdx.x; i < query_smem_buffer_length; i += blockDim.x) { + auto buf = smem_query_buffer(smem_workspace); + for (unsigned i = threadIdx.x; i < smem_query_buffer_length; i += blockDim.x) { unsigned j = device::swizzling(i); if (i < dim) { - smem_query_ptr[j] = - cuvs::spatial::knn::detail::utils::mapping{}(dmem_query_ptr[i]); + buf[j] = cuvs::spatial::knn::detail::utils::mapping{}(query_ptr[i]); } else { - smem_query_ptr[j] = 0.0; + buf[j] = 0.0; } } } + _RAFT_DEVICE void 
compute_distance_to_random_nodes( + ws_handle smem_workspace, + INDEX_T* const result_indices_ptr, // [num_pickup] + DISTANCE_T* const result_distances_ptr, // [num_pickup] + const size_t num_pickup, + const unsigned num_distilation, + const uint64_t rand_xor_mask, + const INDEX_T* const seed_ptr, // [num_seeds] + const uint32_t num_seeds, + INDEX_T* const visited_hash_ptr, + const uint32_t hash_bitlen, + const cuvs::distance::DistanceType metric, + const uint32_t block_id = 0, + const uint32_t num_blocks = 1) const + { + return device::compute_distance_to_random_nodes(result_indices_ptr, + result_distances_ptr, + smem_workspace, + *this, + num_pickup, + num_distilation, + rand_xor_mask, + seed_ptr, + num_seeds, + visited_hash_ptr, + hash_bitlen, + metric, + block_id, + num_blocks); + } + + _RAFT_DEVICE void compute_distance_to_child_nodes(ws_handle smem_workspace, + INDEX_T* const result_child_indices_ptr, + DISTANCE_T* const result_child_distances_ptr, + // [knn_k, dataset_size] + const INDEX_T* const knn_graph, + const uint32_t knn_k, + // hashmap + INDEX_T* const visited_hashmap_ptr, + const uint32_t hash_bitlen, + const INDEX_T* const parent_indices, + const INDEX_T* const internal_topk_list, + const uint32_t search_width, + const cuvs::distance::DistanceType metric) const + { + return device::compute_distance_to_child_nodes( + result_child_indices_ptr, + result_child_distances_ptr, + smem_workspace, + *this, + knn_graph, + knn_k, + visited_hashmap_ptr, + hash_bitlen, + parent_indices, + internal_topk_list, + search_width, + metric); + } + template - std::enable_if_t __device__ - dist_op(T a, T b) const + RAFT_DEVICE_INLINE_FUNCTION auto dist_op(T a, T b) const + -> std::enable_if_t { T diff = a - b; return diff * diff; } template - std::enable_if_t __device__ - dist_op(T a, T b) const + RAFT_DEVICE_INLINE_FUNCTION auto dist_op(T a, T b) const + -> std::enable_if_t { return -a * b; } - template - __device__ DISTANCE_T compute_similarity(const QUERY_T* const 
query_ptr, - const INDEX_T dataset_i, - const bool valid) const + template + RAFT_DEVICE_INLINE_FUNCTION auto compute_similarity(ws_handle smem_workspace, + const INDEX_T dataset_i, + const bool valid) const -> DISTANCE_T { + auto query_ptr = smem_query_buffer(smem_workspace); const auto dataset_ptr = ptr + dataset_i * ld; - const unsigned lane_id = threadIdx.x % TEAM_SIZE; + const unsigned lane_id = threadIdx.x % TeamSize; constexpr unsigned vlen = device::get_vlen(); // #include (DATASET_BLOCK_DIM, TEAM_SIZE * vlen); + constexpr unsigned reg_nelem = raft::ceildiv(DatasetBlockDim, TeamSize * vlen); raft::TxN_t dl_buff[reg_nelem]; DISTANCE_T norm2 = 0; if (valid) { - for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += DATASET_BLOCK_DIM) { + for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += DatasetBlockDim) { #pragma unroll for (uint32_t e = 0; e < reg_nelem; e++) { - const uint32_t k = (lane_id + (TEAM_SIZE * e)) * vlen + elem_offset; + const uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset; if (k >= dim) break; dl_buff[e].load(dataset_ptr, k); } #pragma unroll for (uint32_t e = 0; e < reg_nelem; e++) { - const uint32_t k = (lane_id + (TEAM_SIZE * e)) * vlen + elem_offset; + const uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset; if (k >= dim) break; #pragma unroll for (uint32_t v = 0; v < vlen; v++) { @@ -313,11 +474,101 @@ struct standard_dataset_descriptor_t } } } - for (uint32_t offset = TEAM_SIZE / 2; offset > 0; offset >>= 1) { + for (uint32_t offset = TeamSize / 2; offset > 0; offset >>= 1) { norm2 += __shfl_xor_sync(0xffffffff, norm2, offset); } return norm2; } + + private: + RAFT_DEVICE_INLINE_FUNCTION constexpr auto smem_query_buffer(ws_handle smem_workspace) const + -> QUERY_T* + { + return reinterpret_cast(smem_workspace); + } }; +extern template struct standard_dataset_descriptor_t<8, 128, float, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<16, 256, float, uint32_t, float>; 
+extern template struct standard_dataset_descriptor_t<32, 512, float, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 1024, float, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<8, 128, half, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<16, 256, half, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 512, half, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 1024, half, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<8, 128, int8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<16, 256, int8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 512, int8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 1024, int8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<8, 128, uint8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<16, 256, uint8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 512, uint8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 1024, uint8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<8, 128, float, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<16, 256, float, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<32, 512, float, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<32, 1024, float, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<8, 128, half, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<16, 256, half, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<32, 512, half, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<32, 1024, half, uint64_t, float>; + +template +__launch_bounds__(1, 1) __global__ void 
standard_dataset_descriptor_init_kernel( + dataset_descriptor_base_t* out, + const DataT* ptr, + IndexT size, + uint32_t dim, + size_t ld) +{ + new (out) standard_dataset_descriptor_t( + ptr, size, dim, ld); +} + +template +auto standard_dataset_descriptor_init(const strided_dataset& dataset, + rmm::cuda_stream_view stream) + -> dataset_descriptor_host +{ + standard_dataset_descriptor_t dd_host{ + dataset.view().data_handle(), IndexT(dataset.n_rows()), dataset.dim(), dataset.stride()}; + dataset_descriptor_host result{ + dd_host, stream, TeamSize, DatasetBlockDim}; + standard_dataset_descriptor_init_kernel + <<<1, 1, 0, stream>>>(result.dev_ptr, dd_host.ptr, dd_host.size, dd_host.dim, dd_host.ld); + return result; +} + +template +auto dataset_descriptor_init(const strided_dataset& dataset, + rmm::cuda_stream_view stream) + -> dataset_descriptor_host +{ + constexpr int64_t max_dataset_block_dim = 512; + int64_t dataset_block_dim = 128; + while (dataset_block_dim < dataset.dim() && dataset_block_dim < max_dataset_block_dim) { + dataset_block_dim *= 2; + } + switch (dataset_block_dim) { + case 128: + return standard_dataset_descriptor_init<8, 128, DataT, IndexT, DistanceT, DatasetIdxT>( + dataset, stream); + case 256: + return standard_dataset_descriptor_init<16, 256, DataT, IndexT, DistanceT, DatasetIdxT>( + dataset, stream); + default: + return standard_dataset_descriptor_init<32, 512, DataT, IndexT, DistanceT, DatasetIdxT>( + dataset, stream); + } +} + } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh index 68973662f..45eae30c5 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh @@ -22,38 +22,74 @@ #include namespace cuvs::neighbors::cagra::detail { -template -struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t { + +template +struct 
cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t { + using base_type = dataset_descriptor_base_t; + using CODE_BOOK_T = CodeBookT; using LOAD_T = device::LOAD_128BIT_T; - using DATA_T = DATA_T_; - using CODE_BOOK_T = CODE_BOOK_T_; - using QUERY_T = typename dataset_descriptor_base_t::QUERY_T; + using QUERY_T = half; + using base_type::dim; + using typename base_type::DATA_T; + using typename base_type::DISTANCE_T; + using typename base_type::INDEX_T; + using typename base_type::ws_handle; static_assert(std::is_same_v, "Only CODE_BOOK_T = `half` is supported now"); const std::uint8_t* encoded_dataset_ptr; - const std::uint32_t encoded_dataset_dim; - const std::uint32_t n_subspace; + std::uint32_t encoded_dataset_dim; + std::uint32_t n_subspace; const CODE_BOOK_T* vq_code_book_ptr; - const float vq_scale; + float vq_scale; const CODE_BOOK_T* pq_code_book_ptr; - const float pq_scale; - using dataset_descriptor_base_t::size; - using dataset_descriptor_base_t::dim; + float pq_scale; + + uint32_t smem_query_buffer_length; - // Set on device - CODE_BOOK_T* smem_pq_code_book_ptr; - static const std::uint32_t smem_buffer_size_in_byte = + static constexpr std::uint32_t kSMemCodeBookSizeInBytes = (1 << PQ_BITS) * PQ_LEN * utils::size_of(); - __device__ void set_smem_ptr(void* const smem_ptr) + _RAFT_HOST_DEVICE cagra_q_dataset_descriptor_t(const std::uint8_t* encoded_dataset_ptr, + std::uint32_t encoded_dataset_dim, + std::uint32_t n_subspace, + const CODE_BOOK_T* vq_code_book_ptr, + float vq_scale, + const CODE_BOOK_T* pq_code_book_ptr, + float pq_scale, + std::size_t size, + std::uint32_t dim) + : base_type(size, dim), + encoded_dataset_ptr(encoded_dataset_ptr), + encoded_dataset_dim(encoded_dataset_dim), + n_subspace(n_subspace), + vq_code_book_ptr(vq_code_book_ptr), + vq_scale(vq_scale), + pq_code_book_ptr(pq_code_book_ptr), + pq_scale(pq_scale), + smem_query_buffer_length{raft::round_up_safe(dim, DatasetBlockDim)} + { + } + + _RAFT_HOST_DEVICE 
[[nodiscard]] auto smem_ws_size_in_bytes() const -> uint32_t { - smem_pq_code_book_ptr = reinterpret_cast(smem_ptr); + /* SMEM workspace layout: + 1. Codebook (kSMemCodeBookSizeInBytes bytes) + 2. Queries (smem_query_buffer_length elems) + */ + return kSMemCodeBookSizeInBytes + smem_query_buffer_length * sizeof(QUERY_T); + } + + _RAFT_DEVICE [[nodiscard]] auto set_smem_ws(void* smem_ptr) const -> ws_handle + { + auto codebook_buf = reinterpret_cast(smem_ptr); // Copy PQ table for (unsigned i = threadIdx.x * 2; i < (1 << PQ_BITS) * PQ_LEN; i += blockDim.x * 2) { @@ -68,44 +104,23 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t(smem_pq_code_book_ptr)[smem_index] = buf2; + codebook_buf[smem_index] = buf2; } + return reinterpret_cast(smem_ptr); } - cagra_q_dataset_descriptor_t(const std::uint8_t* encoded_dataset_ptr, - const std::uint32_t encoded_dataset_dim, - const std::uint32_t n_subspace, - const CODE_BOOK_T* const vq_code_book_ptr, - const float vq_scale, - const CODE_BOOK_T* const pq_code_book_ptr, - const float pq_scale, - const std::size_t size, - const std::uint32_t dim) - : dataset_descriptor_base_t(size, dim), - encoded_dataset_ptr(encoded_dataset_ptr), - encoded_dataset_dim(encoded_dataset_dim), - n_subspace(n_subspace), - vq_code_book_ptr(vq_code_book_ptr), - vq_scale(vq_scale), - pq_code_book_ptr(pq_code_book_ptr), - pq_scale(pq_scale) - { - } - - template - __device__ void copy_query(const DATA_T* const dmem_query_ptr, - QUERY_T* const smem_query_ptr, - const std::uint32_t query_smem_buffer_length) + _RAFT_DEVICE void copy_query(ws_handle smem_workspace, const DATA_T* query_ptr) const { constexpr cuvs::spatial::knn::detail::utils::mapping mapping{}; + auto smem_query_ptr = smem_query_buffer(smem_workspace); for (unsigned i = threadIdx.x * 2; i < dim; i += blockDim.x * 2) { half2 buf2{0, 0}; - if (i < dim) { buf2.x = mapping(dmem_query_ptr[i]); } - if (i + 1 < dim) { buf2.y = mapping(dmem_query_ptr[i + 1]); } + if (i < dim) { 
buf2.x = mapping(query_ptr[i]); } + if (i + 1 < dim) { buf2.y = mapping(query_ptr[i + 1]); } if ((PQ_BITS == 8) && (PQ_LEN % 2 == 0)) { // Use swizzling in the condition to reduce bank conflicts in shared // memory, which are likely to occur when pq_code_book_dim is large. - ((half2*)smem_query_ptr)[device::swizzling(i / 2)] = + ((half2*)smem_query_ptr)[device::swizzling(i / 2)] = buf2; } else { (reinterpret_cast(smem_query_ptr + i))[0] = buf2; @@ -113,26 +128,88 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t - __device__ DISTANCE_T compute_similarity(const QUERY_T* const query_ptr, - const INDEX_T node_id, - const bool valid) const + _RAFT_DEVICE void compute_distance_to_random_nodes( + ws_handle smem_workspace, + INDEX_T* const result_indices_ptr, // [num_pickup] + DISTANCE_T* const result_distances_ptr, // [num_pickup] + const size_t num_pickup, + const unsigned num_distilation, + const uint64_t rand_xor_mask, + const INDEX_T* const seed_ptr, // [num_seeds] + const uint32_t num_seeds, + INDEX_T* const visited_hash_ptr, + const uint32_t hash_bitlen, + const cuvs::distance::DistanceType metric, + const uint32_t block_id = 0, + const uint32_t num_blocks = 1) const { - float norm = 0; + return device::compute_distance_to_random_nodes(result_indices_ptr, + result_distances_ptr, + smem_workspace, + *this, + num_pickup, + num_distilation, + rand_xor_mask, + seed_ptr, + num_seeds, + visited_hash_ptr, + hash_bitlen, + metric, + block_id, + num_blocks); + } + + _RAFT_DEVICE void compute_distance_to_child_nodes(ws_handle smem_workspace, + INDEX_T* const result_child_indices_ptr, + DISTANCE_T* const result_child_distances_ptr, + // [knn_k, dataset_size] + const INDEX_T* const knn_graph, + const uint32_t knn_k, + // hashmap + INDEX_T* const visited_hashmap_ptr, + const uint32_t hash_bitlen, + const INDEX_T* const parent_indices, + const INDEX_T* const internal_topk_list, + const uint32_t search_width, + const cuvs::distance::DistanceType metric) 
const + { + return device::compute_distance_to_child_nodes( + result_child_indices_ptr, + result_child_distances_ptr, + smem_workspace, + *this, + knn_graph, + knn_k, + visited_hashmap_ptr, + hash_bitlen, + parent_indices, + internal_topk_list, + search_width, + metric); + } + + template + RAFT_DEVICE_INLINE_FUNCTION DISTANCE_T compute_similarity(ws_handle smem_workspace, + const INDEX_T node_id, + const bool valid) const + { + auto codebook_ptr = smem_pq_code_book_ptr(smem_workspace); + auto query_ptr = smem_query_buffer(smem_workspace); + float norm = 0; if (valid) { - const unsigned lane_id = threadIdx.x % TEAM_SIZE; + const unsigned lane_id = threadIdx.x % TeamSize; const uint32_t vq_code = *(reinterpret_cast( encoded_dataset_ptr + (static_cast(encoded_dataset_dim) * node_id))); if (PQ_BITS == 8) { - for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += DATASET_BLOCK_DIM) { + for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += DatasetBlockDim) { constexpr unsigned vlen = 4; // **** DO NOT CHANGE **** constexpr unsigned nelem = - raft::div_rounding_up_unsafe(DATASET_BLOCK_DIM / PQ_LEN, TEAM_SIZE * vlen); + raft::div_rounding_up_unsafe(DatasetBlockDim / PQ_LEN, TeamSize * vlen); // Loading PQ codes uint32_t pq_codes[nelem]; #pragma unroll for (std::uint32_t e = 0; e < nelem; e++) { - const std::uint32_t k = (lane_id + (TEAM_SIZE * e)) * vlen + elem_offset / PQ_LEN; + const std::uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset / PQ_LEN; if (k >= n_subspace) break; // Loading 4 x 8-bit PQ-codes using 32-bit load ops (from device memory) pq_codes[e] = *(reinterpret_cast( @@ -145,7 +222,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t= n_subspace) break; // Loading VQ code-book raft::TxN_t vq_vals[PQ_LEN]; @@ -167,10 +244,10 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t( - query_ptr))[device::swizzling(d / 2)]; + query_ptr))[device::swizzling(d / 2)]; // Loading PQ code book in 
smem - diff2 -= *(reinterpret_cast( - smem_pq_code_book_ptr + (1 << PQ_BITS) * 2 * (m / 2) + (2 * (pq_code & 0xff)))); + diff2 -= *(reinterpret_cast(codebook_ptr + (1 << PQ_BITS) * 2 * (m / 2) + + (2 * (pq_code & 0xff)))); diff2 -= vq_vals[d1 / vlen].val.data[(d1 % vlen) / 2]; norm2 += diff2 * diff2; } @@ -182,7 +259,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t= n_subspace) break; // Loading VQ code-book raft::TxN_t vq_vals[PQ_LEN]; @@ -202,7 +279,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t= dim) break; raft::TxN_t pq_vals; pq_vals.load( - reinterpret_cast(smem_pq_code_book_ptr + PQ_LEN * (pq_code & 0xff)), + reinterpret_cast(codebook_ptr + PQ_LEN * (pq_code & 0xff)), 0); // (from L1$ or smem) #pragma unroll for (std::uint32_t m = 0; m < PQ_LEN; m++) { @@ -221,11 +298,250 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t 0; offset >>= 1) { + for (uint32_t offset = TeamSize / 2; offset > 0; offset >>= 1) { norm += __shfl_xor_sync(0xffffffff, norm, offset); } return norm; } + + private: + RAFT_DEVICE_INLINE_FUNCTION constexpr auto smem_pq_code_book_ptr(ws_handle smem_workspace) const + -> CODE_BOOK_T* + { + return reinterpret_cast(smem_workspace); + } + + RAFT_DEVICE_INLINE_FUNCTION constexpr auto smem_query_buffer(ws_handle smem_workspace) const + -> QUERY_T* + { + return reinterpret_cast(reinterpret_cast(smem_workspace) + + kSMemCodeBookSizeInBytes); + } }; -} // namespace cuvs::neighbors::cagra::detail \ No newline at end of file +extern template struct standard_dataset_descriptor_t<8, 128, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, float, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<16, 256, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, float, 
uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, float, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 512, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, float, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 1024, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, float, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<8, 128, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, half, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<16, 256, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, half, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 512, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, half, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 1024, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, half, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<8, 128, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, int8_t, 
uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, int8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<16, 256, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, int8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 512, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, int8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 1024, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, int8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<8, 128, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, uint8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<16, 256, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, uint8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 512, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, uint8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 1024, uint8_t, uint32_t, float>; +extern template struct 
cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, uint8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<8, 128, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, float, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<16, 256, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, float, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<32, 512, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, float, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<32, 1024, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, float, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<8, 128, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, half, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<16, 256, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, half, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<32, 512, half, uint64_t, float>; +extern template struct 
cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, half, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<32, 1024, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, half, uint64_t, float>; + +template +__launch_bounds__(1, 1) __global__ + void vpq_dataset_descriptor_init_kernel(dataset_descriptor_base_t* out, + const std::uint8_t* encoded_dataset_ptr, + std::uint32_t encoded_dataset_dim, + std::uint32_t n_subspace, + const CodeBookT* vq_code_book_ptr, + float vq_scale, + const CodeBookT* pq_code_book_ptr, + float pq_scale, + std::size_t size, + std::uint32_t dim) +{ + new (out) cagra_q_dataset_descriptor_t(encoded_dataset_ptr, + encoded_dataset_dim, + n_subspace, + vq_code_book_ptr, + vq_scale, + pq_code_book_ptr, + pq_scale, + size, + dim); +} + +template +auto vpq_dataset_descriptor_init(const vpq_dataset& dataset, + rmm::cuda_stream_view stream) + -> dataset_descriptor_host +{ + const float vq_scale = 1.0f; + const float pq_scale = 1.0f; + cagra_q_dataset_descriptor_t + dd_host{dataset.data.data_handle(), + dataset.encoded_row_length(), + dataset.pq_dim(), + dataset.vq_code_book.data_handle(), + vq_scale, + dataset.pq_code_book.data_handle(), + pq_scale, + IndexT(dataset.n_rows()), + dataset.dim()}; + dataset_descriptor_host result{ + dd_host, stream, TeamSize, DatasetBlockDim}; + vpq_dataset_descriptor_init_kernel<<<1, 1, 0, stream>>>(result.dev_ptr, + dd_host.encoded_dataset_ptr, + dd_host.encoded_dataset_dim, + dd_host.n_subspace, + dd_host.vq_code_book_ptr, + dd_host.vq_scale, + dd_host.pq_code_book_ptr, + dd_host.pq_scale, + dd_host.size, + dd_host.dim); + return result; +} + +template +auto vpq_dataset_descriptor_init_runtime(const vpq_dataset& dataset, + rmm::cuda_stream_view stream) + +{ + 
if (dataset.pq_bits() == 8) { + if (dataset.pq_len() == 2) { + return vpq_dataset_descriptor_init(dataset, stream); + } else if (dataset.pq_len() == 4) { + return vpq_dataset_descriptor_init(dataset, stream); + } else { + RAFT_FAIL("Subspace dimension must be 2 or 4"); + } + } else { + RAFT_FAIL("Only 8-bit PQ is supported now"); + } +} + +template +auto dataset_descriptor_init(const vpq_dataset& dataset, + rmm::cuda_stream_view stream) + -> dataset_descriptor_host +{ + constexpr int64_t max_dataset_block_dim = 512; + int64_t dataset_block_dim = 128; + while (dataset_block_dim < dataset.dim() && dataset_block_dim < max_dataset_block_dim) { + dataset_block_dim *= 2; + } + switch (dataset_block_dim) { + case 128: + return vpq_dataset_descriptor_init_runtime<8, 128, DataT, IndexT, DistanceT, DatasetIdxT>( + dataset, stream); + case 256: + return vpq_dataset_descriptor_init_runtime<16, 256, DataT, IndexT, DistanceT, DatasetIdxT>( + dataset, stream); + default: + return vpq_dataset_descriptor_init_runtime<32, 512, DataT, IndexT, DistanceT, DatasetIdxT>( + dataset, stream); + } +} + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core-ext.cuh b/cpp/src/neighbors/detail/cagra/distance_core-ext.cuh new file mode 100644 index 000000000..2dec5aa10 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core-ext.cuh @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +extern template struct standard_dataset_descriptor_t<8, 128, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, float, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<16, 256, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, float, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 512, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, float, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 1024, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, float, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<8, 128, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, half, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<16, 256, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, half, uint32_t, float>; +extern 
template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, half, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 512, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, half, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 1024, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, half, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<8, 128, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, int8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<16, 256, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, int8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 512, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, int8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 1024, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, int8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<8, 128, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, uint8_t, uint32_t, 
float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, uint8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<16, 256, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, uint8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 512, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, uint8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 1024, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, uint8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<8, 128, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, float, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<16, 256, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, float, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<32, 512, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, float, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<32, 1024, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, 
half, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, float, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<8, 128, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, half, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<16, 256, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, half, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<32, 512, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, half, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<32, 1024, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, half, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core-impl.cuh b/cpp/src/neighbors/detail/cagra/distance_core-impl.cuh new file mode 100644 index 000000000..4e72daf63 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core-impl.cuh @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +// template +// __launch_bounds__(1, 1) __global__ void standard_dataset_descriptor_init_kernel( +// dataset_descriptor_base_t* out, +// const DataT* ptr, +// IndexT size, +// uint32_t dim, +// size_t ld) +// { +// new (out) standard_dataset_descriptor_t( +// ptr, size, dim, ld); +// (void)out->set_smem_ws(out); +// } + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/distance_core_00_generate.py similarity index 60% rename from cpp/src/neighbors/detail/cagra/q_search_single_cta_00_generate.py rename to cpp/src/neighbors/detail/cagra/distance_core_00_generate.py index bc5f506ac..a6d563bf5 100644 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_00_generate.py +++ b/cpp/src/neighbors/detail/cagra/distance_core_00_generate.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -header = """/* +template = """/* * Copyright (c) 2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); @@ -29,22 +29,21 @@ */ /* - * NOTE: this file is generated by q_search_single_cta_00_generate.py + * NOTE: this file is generated by distance_core_00_generate.py * * Make changes there and run in this directory: * - * > python q_search_single_cta_00_generate.py + * > python distance_core_00_generate.py * */ -#include "search_single_cta_inst.cuh" -#include "compute_distance_vpq.cuh" +{includes} -namespace cuvs::neighbors::cagra::detail::single_cta_search { -""" +namespace cuvs::neighbors::cagra::detail {{ + +{instances} -trailer = """ -} // namespace cuvs::neighbors::cagra::detail::single_cta_search +}} // namespace cuvs::neighbors::cagra::detail """ mxdim_team = [(128, 8), (256, 16), (512, 32), (1024, 32)] @@ -54,7 +53,7 @@ # mxelem = [64, 128, 256] pq_bits = [8] -subspace_dims = [2, 4] +pq_lens = [2, 4] # rblock = [(256, 4), (512, 2), (1024, 1)] # rcandidates = [32] @@ -70,19 +69,31 @@ half_uint64=("half", "uint64_t", "float"), ) +distance_core_ext = [] + # knn for type_path, (data_t, idx_t, distance_t) in search_types.items(): for (mxdim, team) in mxdim_team: + # CAGRA + path = f"distance_core_{type_path}_dim{mxdim}_t{team}.cu" + includes = '#include "compute_distance.hpp"' + decl = f"template struct standard_dataset_descriptor_t<{team}, {mxdim}, {data_t}, {idx_t}, {distance_t}>;" + distance_core_ext.append(f"extern {decl}") + with open(path, "w") as f: + f.write(template.format(includes=includes, instances=decl)); + print(f"src/neighbors/detail/cagra/{path}") + + # CAGRA-Q for code_book_t in code_book_types: - for subspace_dim in subspace_dims: + for pq_len in pq_lens: for pq_bit in pq_bits: - path = f"q_search_single_cta_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{subspace_dim}subd_{code_book_t}.cu" + path = f"distance_core_vpq_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{pq_len}subd_{code_book_t}.cu" + decl = f"template struct cagra_q_dataset_descriptor_t<{team}, {mxdim}, {pq_bit}, {pq_len}, 
{code_book_t}, {data_t}, {idx_t}, {distance_t}>;" + includes = '#include "compute_distance_vpq.cuh"' + distance_core_ext.append(f"extern {decl}") with open(path, "w") as f: - f.write(header) - f.write( - f"instantiate_kernel_selection(\n {team}, {mxdim}, cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<{data_t} COMMA {code_book_t} COMMA {pq_bit} COMMA {subspace_dim} COMMA {distance_t} COMMA {idx_t}>, cuvs::neighbors::filtering::none_cagra_sample_filter);\n" - ) - - f.write(trailer) - # For pasting into CMakeLists.txt + f.write(template.format(includes=includes, instances=decl)); print(f"src/neighbors/detail/cagra/{path}") + +with open("distance_core-ext.cuh", "w") as f: + f.write(template.format(includes='#include "compute_distance.hpp"', instances="\n".join(distance_core_ext))) diff --git a/cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim1024_t32.cu new file mode 100644 index 000000000..dcdd35467 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim1024_t32.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<32, 1024, float, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim128_t8.cu new file mode 100644 index 000000000..1e79be961 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim128_t8.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<8, 128, float, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim256_t16.cu new file mode 100644 index 000000000..a2a50f110 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim256_t16.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<16, 256, float, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim512_t32.cu new file mode 100644 index 000000000..ebe530cae --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim512_t32.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<32, 512, float, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim1024_t32.cu new file mode 100644 index 000000000..47c31adfc --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim1024_t32.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<32, 1024, float, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim128_t8.cu new file mode 100644 index 000000000..43b4ad1fb --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim128_t8.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<8, 128, float, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim256_t16.cu new file mode 100644 index 000000000..b654d509b --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim256_t16.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<16, 256, float, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim512_t32.cu new file mode 100644 index 000000000..2d95145eb --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim512_t32.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<32, 512, float, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim1024_t32.cu new file mode 100644 index 000000000..e66c27e86 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim1024_t32.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<32, 1024, half, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim128_t8.cu new file mode 100644 index 000000000..353569911 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim128_t8.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<8, 128, half, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim256_t16.cu new file mode 100644 index 000000000..df34f0f64 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim256_t16.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<16, 256, half, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim512_t32.cu new file mode 100644 index 000000000..bfb038bbb --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim512_t32.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<32, 512, half, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim1024_t32.cu new file mode 100644 index 000000000..6d28a341e --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim1024_t32.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<32, 1024, half, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim128_t8.cu new file mode 100644 index 000000000..39a3723f0 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim128_t8.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<8, 128, half, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim256_t16.cu new file mode 100644 index 000000000..a1ca21a47 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim256_t16.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<16, 256, half, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim512_t32.cu new file mode 100644 index 000000000..f051409bb --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim512_t32.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<32, 512, half, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim1024_t32.cu new file mode 100644 index 000000000..3ab24df00 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim1024_t32.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<32, 1024, int8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim128_t8.cu new file mode 100644 index 000000000..29c4f30bc --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim128_t8.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<8, 128, int8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim256_t16.cu new file mode 100644 index 000000000..dde230be5 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim256_t16.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<16, 256, int8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim512_t32.cu new file mode 100644 index 000000000..168a5e534 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim512_t32.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<32, 512, int8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim1024_t32.cu new file mode 100644 index 000000000..fe9de9690 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim1024_t32.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<32, 1024, uint8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim128_t8.cu new file mode 100644 index 000000000..d664cdc64 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim128_t8.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<8, 128, uint8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim256_t16.cu new file mode 100644 index 000000000..e30f8be8a --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim256_t16.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<16, 256, uint8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim512_t32.cu new file mode 100644 index 000000000..a14a6cfb9 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim512_t32.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<32, 512, uint8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim1024_t32_8pq_2subd_half.cu similarity index 53% rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim128_t8_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim1024_t32_8pq_2subd_half.cu index 1116eaaa4..9ec47cb7c 100644 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim128_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim1024_t32_8pq_2subd_half.cu @@ -15,22 +15,18 @@ */ /* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py + * NOTE: this file is generated by distance_core_00_generate.py * * Make changes there and run in this directory: * - * > python q_search_multi_cta_00_generate.py + * > python distance_core_00_generate.py * */ #include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); +namespace cuvs::neighbors::cagra::detail { -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search +template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, float, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git 
a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim1024_t32_8pq_4subd_half.cu similarity index 53% rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim128_t8_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim1024_t32_8pq_4subd_half.cu index 7e3ec363d..7f9f2d0c3 100644 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim128_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim1024_t32_8pq_4subd_half.cu @@ -15,22 +15,18 @@ */ /* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py + * NOTE: this file is generated by distance_core_00_generate.py * * Make changes there and run in this directory: * - * > python q_search_multi_cta_00_generate.py + * > python distance_core_00_generate.py * */ #include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); +namespace cuvs::neighbors::cagra::detail { -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search +template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, float, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim128_t8_8pq_2subd_half.cu similarity index 53% rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim128_t8_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim128_t8_8pq_2subd_half.cu index 
af60c776a..d04b7fcbf 100644 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim128_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim128_t8_8pq_2subd_half.cu @@ -15,22 +15,18 @@ */ /* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py + * NOTE: this file is generated by distance_core_00_generate.py * * Make changes there and run in this directory: * - * > python q_search_multi_cta_00_generate.py + * > python distance_core_00_generate.py * */ #include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); +namespace cuvs::neighbors::cagra::detail { -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search +template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, float, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim128_t8_8pq_4subd_half.cu similarity index 53% rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim128_t8_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim128_t8_8pq_4subd_half.cu index 5dd79a79b..4f8148cdc 100644 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim128_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim128_t8_8pq_4subd_half.cu @@ -15,22 +15,18 @@ */ /* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py + * NOTE: this file is generated by distance_core_00_generate.py * * Make changes there and run in this directory: * - * > 
python q_search_multi_cta_00_generate.py + * > python distance_core_00_generate.py * */ #include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); +namespace cuvs::neighbors::cagra::detail { -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search +template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, float, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim256_t16_8pq_2subd_half.cu new file mode 100644 index 000000000..5d2359cd3 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim256_t16_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, float, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim256_t16_8pq_4subd_half.cu new file mode 100644 index 000000000..a35f988b6 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim256_t16_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, float, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim512_t32_8pq_2subd_half.cu new file mode 100644 index 000000000..04489fae2 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim512_t32_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, float, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim512_t32_8pq_4subd_half.cu new file mode 100644 index 000000000..862c6ef5b --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim512_t32_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, float, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim1024_t32_8pq_2subd_half.cu new file mode 100644 index 000000000..cdbac6a11 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim1024_t32_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, float, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim1024_t32_8pq_4subd_half.cu new file mode 100644 index 000000000..0bb71833a --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim1024_t32_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, float, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim128_t8_8pq_2subd_half.cu new file mode 100644 index 000000000..3ac72c549 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim128_t8_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, float, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim128_t8_8pq_4subd_half.cu new file mode 100644 index 000000000..086219443 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim128_t8_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, float, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim256_t16_8pq_2subd_half.cu new file mode 100644 index 000000000..71417b4b1 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim256_t16_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, float, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim256_t16_8pq_4subd_half.cu new file mode 100644 index 000000000..cb9de4e7a --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim256_t16_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, float, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim512_t32_8pq_2subd_half.cu new file mode 100644 index 000000000..007dc7a5a --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim512_t32_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, float, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim512_t32_8pq_4subd_half.cu new file mode 100644 index 000000000..8b03a6188 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim512_t32_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, float, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim1024_t32_8pq_2subd_half.cu new file mode 100644 index 000000000..74436b559 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim1024_t32_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, half, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim1024_t32_8pq_4subd_half.cu new file mode 100644 index 000000000..8bcae9232 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim1024_t32_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, half, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim128_t8_8pq_2subd_half.cu new file mode 100644 index 000000000..971ac1e2e --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim128_t8_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, half, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim128_t8_8pq_4subd_half.cu new file mode 100644 index 000000000..3c8eb14bb --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim128_t8_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, half, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim256_t16_8pq_2subd_half.cu new file mode 100644 index 000000000..b71e6f4d2 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim256_t16_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, half, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim256_t16_8pq_4subd_half.cu new file mode 100644 index 000000000..d459f0807 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim256_t16_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, half, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim512_t32_8pq_2subd_half.cu new file mode 100644 index 000000000..6263bd775 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim512_t32_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, half, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim512_t32_8pq_4subd_half.cu new file mode 100644 index 000000000..fc5af809f --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim512_t32_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, half, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim1024_t32_8pq_2subd_half.cu new file mode 100644 index 000000000..38ef65dac --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim1024_t32_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, half, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim1024_t32_8pq_4subd_half.cu new file mode 100644 index 000000000..9cae97b0d --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim1024_t32_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, half, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim128_t8_8pq_2subd_half.cu new file mode 100644 index 000000000..91859bf6a --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim128_t8_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, half, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim128_t8_8pq_4subd_half.cu new file mode 100644 index 000000000..ed6435244 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim128_t8_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, half, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim256_t16_8pq_2subd_half.cu new file mode 100644 index 000000000..9d25412d3 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim256_t16_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, half, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim256_t16_8pq_4subd_half.cu new file mode 100644 index 000000000..b2379526f --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim256_t16_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, half, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim512_t32_8pq_2subd_half.cu new file mode 100644 index 000000000..aa5e5147d --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim512_t32_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, half, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim512_t32_8pq_4subd_half.cu new file mode 100644 index 000000000..89aa65c78 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim512_t32_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, half, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim1024_t32_8pq_2subd_half.cu new file mode 100644 index 000000000..a9132d69c --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim1024_t32_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, int8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim1024_t32_8pq_4subd_half.cu new file mode 100644 index 000000000..9c2e1b798 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim1024_t32_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, int8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim128_t8_8pq_2subd_half.cu new file mode 100644 index 000000000..422145374 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim128_t8_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, int8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim128_t8_8pq_4subd_half.cu new file mode 100644 index 000000000..515119b54 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim128_t8_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, int8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim256_t16_8pq_2subd_half.cu new file mode 100644 index 000000000..227192197 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim256_t16_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, int8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim256_t16_8pq_4subd_half.cu new file mode 100644 index 000000000..f87036bb2 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim256_t16_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, int8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim512_t32_8pq_2subd_half.cu new file mode 100644 index 000000000..e581bad72 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim512_t32_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, int8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim512_t32_8pq_4subd_half.cu new file mode 100644 index 000000000..02b621192 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim512_t32_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, int8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim1024_t32_8pq_2subd_half.cu new file mode 100644 index 000000000..e51d80aaa --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim1024_t32_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, uint8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim1024_t32_8pq_4subd_half.cu new file mode 100644 index 000000000..c6975e620 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim1024_t32_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, uint8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim128_t8_8pq_2subd_half.cu new file mode 100644 index 000000000..c55f75af0 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim128_t8_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, uint8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim128_t8_8pq_4subd_half.cu new file mode 100644 index 000000000..2bf8ab622 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim128_t8_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, uint8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim256_t16_8pq_2subd_half.cu new file mode 100644 index 000000000..b51b42159 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim256_t16_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, uint8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim256_t16_8pq_4subd_half.cu new file mode 100644 index 000000000..ac86a9489 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim256_t16_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, uint8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim512_t32_8pq_2subd_half.cu new file mode 100644 index 000000000..414e21ece --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim512_t32_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, uint8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim512_t32_8pq_4subd_half.cu new file mode 100644 index 000000000..7b98c37b3 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim512_t32_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by distance_core_00_generate.py + * + * Make changes there and run in this directory: + * + * > python distance_core_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, uint8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/factory.cuh b/cpp/src/neighbors/detail/cagra/factory.cuh index 183d6051f..5dd902bea 100644 --- a/cpp/src/neighbors/detail/cagra/factory.cuh +++ b/cpp/src/neighbors/detail/cagra/factory.cuh @@ -17,7 +17,7 @@ #pragma once #include "search_multi_cta.cuh" -#include "search_multi_kernel.cuh" +// #include "search_multi_kernel.cuh" #include "search_plan.cuh" #include "search_single_cta.cuh" @@ -25,70 +25,47 @@ namespace cuvs::neighbors::cagra::detail { -template class factory { - using T = typename DATASET_DESCRIPTOR_T::DATA_T; - using IdxT = typename DATASET_DESCRIPTOR_T::INDEX_T; - using DistanceT = typename DATASET_DESCRIPTOR_T::DISTANCE_T; - public: /** * Create a search structure for dataset with dim features. 
*/ - static std::unique_ptr> create( + static std::unique_ptr> create( raft::resources const& res, search_params const& params, + const dataset_descriptor_host& dataset_desc, int64_t dim, int64_t graph_degree, uint32_t topk, const cuvs::distance::DistanceType metric) { search_plan_impl_base plan(params, dim, graph_degree, topk, metric); - switch (plan.dataset_block_dim) { - case 128: - switch (plan.team_size) { - case 8: return dispatch_kernel<128, 8>(res, plan); break; - default: THROW("Incorrect team size %lu", plan.team_size); - } - break; - case 256: - switch (plan.team_size) { - case 16: return dispatch_kernel<256, 16>(res, plan); break; - default: THROW("Incorrect team size %lu", plan.team_size); - } - break; - case 512: - switch (plan.team_size) { - case 32: return dispatch_kernel<512, 32>(res, plan); break; - default: THROW("Incorrect team size %lu", plan.team_size); - } - break; - default: THROW("Incorrect dataset_block_dim (%lu)\n", plan.dataset_block_dim); - } - return std::unique_ptr>(); + return dispatch_kernel(res, plan, dataset_desc); } private: - template - static std::unique_ptr> - dispatch_kernel(raft::resources const& res, search_plan_impl_base& plan) + static std::unique_ptr> + dispatch_kernel(raft::resources const& res, + search_plan_impl_base& plan, + const dataset_descriptor_host& dataset_desc) { if (plan.algo == search_algo::SINGLE_CTA) { - return std::unique_ptr>( - new single_cta_search:: - search( - res, plan, plan.dim, plan.graph_degree, plan.topk, plan.metric)); + return std::make_unique< + single_cta_search::search>( + res, plan, dataset_desc, plan.dim, plan.graph_degree, plan.topk, plan.metric); } else if (plan.algo == search_algo::MULTI_CTA) { - return std::unique_ptr>( - new multi_cta_search:: - search( - res, plan, plan.dim, plan.graph_degree, plan.topk, plan.metric)); + return std::make_unique< + multi_cta_search::search>( + res, plan, dataset_desc, plan.dim, plan.graph_degree, plan.topk, plan.metric); } else { - return 
std::unique_ptr>( - new multi_kernel_search:: - search( - res, plan, plan.dim, plan.graph_degree, plan.topk, plan.metric)); + // return std::make_unique< + // multi_kernel_search::search>( + // res, plan, dataset_desc, plan.dim, plan.graph_degree, plan.topk, plan.metric); + RAFT_FAIL("WIP!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"); } } }; diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 515be75df..9edbbf5c1 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -73,12 +73,12 @@ __device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2, bool a } template -RAFT_KERNEL kern_sort(const DATA_T* const dataset, // [dataset_chunk_size, dataset_dim] - const IdxT dataset_size, - const uint32_t dataset_dim, - IdxT* const knn_graph, // [graph_chunk_size, graph_degree] - const uint32_t graph_size, - const uint32_t graph_degree) +__global__ void kern_sort(const DATA_T* const dataset, // [dataset_chunk_size, dataset_dim] + const IdxT dataset_size, + const uint32_t dataset_dim, + IdxT* const knn_graph, // [graph_chunk_size, graph_degree] + const uint32_t graph_size, + const uint32_t graph_degree) { const IdxT srcNode = (blockDim.x * blockIdx.x + threadIdx.x) / raft::WarpSize; if (srcNode >= graph_size) { return; } @@ -129,15 +129,15 @@ RAFT_KERNEL kern_sort(const DATA_T* const dataset, // [dataset_chunk_size, data } template -RAFT_KERNEL kern_prune(const IdxT* const knn_graph, // [graph_chunk_size, graph_degree] - const uint32_t graph_size, - const uint32_t graph_degree, - const uint32_t degree, - const uint32_t batch_size, - const uint32_t batch_id, - uint8_t* const detour_count, // [graph_chunk_size, graph_degree] - uint32_t* const num_no_detour_edges, // [graph_size] - uint64_t* const stats) +__global__ void kern_prune(const IdxT* const knn_graph, // [graph_chunk_size, graph_degree] + const uint32_t graph_size, + const uint32_t graph_degree, + const 
uint32_t degree, + const uint32_t batch_size, + const uint32_t batch_id, + uint8_t* const detour_count, // [graph_chunk_size, graph_degree] + uint32_t* const num_no_detour_edges, // [graph_size] + uint64_t* const stats) { __shared__ uint32_t smem_num_detour[MAX_DEGREE]; uint64_t* const num_retain = stats; @@ -192,11 +192,11 @@ RAFT_KERNEL kern_prune(const IdxT* const knn_graph, // [graph_chunk_size, graph } template -RAFT_KERNEL kern_make_rev_graph(const IdxT* const dest_nodes, // [graph_size] - IdxT* const rev_graph, // [size, degree] - uint32_t* const rev_graph_count, // [graph_size] - const uint32_t graph_size, - const uint32_t degree) +__global__ void kern_make_rev_graph(const IdxT* const dest_nodes, // [graph_size] + IdxT* const rev_graph, // [size, degree] + uint32_t* const rev_graph_count, // [graph_size] + const uint32_t graph_size, + const uint32_t degree) { const uint32_t tid = threadIdx.x + (blockDim.x * blockIdx.x); const uint32_t tnum = blockDim.x * gridDim.x; @@ -221,16 +221,16 @@ __device__ __host__ LabelT get_root_label(IdxT i, const LabelT* label) } template -RAFT_KERNEL kern_mst_opt_update_graph(IdxT* mst_graph, // [graph_size, graph_degree] - const IdxT* candidate_edges, // [graph_size] - IdxT* outgoing_num_edges, // [graph_size] - IdxT* incoming_num_edges, // [graph_size] - const IdxT* outgoing_max_edges, // [graph_size] - const IdxT* incoming_max_edges, // [graph_size] - const IdxT* label, // [graph_size] - const uint32_t graph_size, - const uint32_t graph_degree, - uint64_t* stats) +__global__ void kern_mst_opt_update_graph(IdxT* mst_graph, // [graph_size, graph_degree] + const IdxT* candidate_edges, // [graph_size] + IdxT* outgoing_num_edges, // [graph_size] + IdxT* incoming_num_edges, // [graph_size] + const IdxT* outgoing_max_edges, // [graph_size] + const IdxT* incoming_max_edges, // [graph_size] + const IdxT* label, // [graph_size] + const uint32_t graph_size, + const uint32_t graph_degree, + uint64_t* stats) { const uint64_t i = 
threadIdx.x + (blockDim.x * blockIdx.x); if (i >= graph_size) return; @@ -310,11 +310,11 @@ RAFT_KERNEL kern_mst_opt_update_graph(IdxT* mst_graph, // [graph } template -RAFT_KERNEL kern_mst_opt_labeling(IdxT* label, // [graph_size] - const IdxT* mst_graph, // [graph_size, graph_degree] - const uint32_t graph_size, - const uint32_t graph_degree, - uint64_t* stats) +__global__ void kern_mst_opt_labeling(IdxT* label, // [graph_size] + const IdxT* mst_graph, // [graph_size, graph_degree] + const uint32_t graph_size, + const uint32_t graph_degree, + uint64_t* stats) { const uint64_t i = threadIdx.x + (blockDim.x * blockIdx.x); if (i >= graph_size) return; @@ -348,10 +348,10 @@ RAFT_KERNEL kern_mst_opt_labeling(IdxT* label, // [graph_size] } template -RAFT_KERNEL kern_mst_opt_cluster_size(IdxT* cluster_size, // [graph_size] - const IdxT* label, // [graph_size] - const uint32_t graph_size, - uint64_t* stats) +__global__ void kern_mst_opt_cluster_size(IdxT* cluster_size, // [graph_size] + const IdxT* label, // [graph_size] + const uint32_t graph_size, + uint64_t* stats) { const uint64_t i = threadIdx.x + (blockDim.x * blockIdx.x); if (i >= graph_size) return; @@ -375,14 +375,14 @@ RAFT_KERNEL kern_mst_opt_cluster_size(IdxT* cluster_size, // [graph_size] } template -RAFT_KERNEL kern_mst_opt_postprocessing(IdxT* outgoing_num_edges, // [graph_size] - IdxT* incoming_num_edges, // [graph_size] - IdxT* outgoing_max_edges, // [graph_size] - IdxT* incoming_max_edges, // [graph_size] - const IdxT* cluster_size, // [graph_size] - const uint32_t graph_size, - const uint32_t graph_degree, - uint64_t* stats) +__global__ void kern_mst_opt_postprocessing(IdxT* outgoing_num_edges, // [graph_size] + IdxT* incoming_num_edges, // [graph_size] + IdxT* outgoing_max_edges, // [graph_size] + IdxT* incoming_max_edges, // [graph_size] + const IdxT* cluster_size, // [graph_size] + const uint32_t graph_size, + const uint32_t graph_degree, + uint64_t* stats) { const uint64_t i = threadIdx.x + 
(blockDim.x * blockIdx.x); if (i >= graph_size) return; diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_00_generate.py deleted file mode 100644 index 63171373f..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_00_generate.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -header = """/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -""" - -trailer = """ -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search -""" - -mxdim_team = [(128, 8), (256, 16), (512, 32), (1024, 32)] -pq_bits = [8] -subspace_dims = [2, 4] -# block = [(64, 16), (128, 8), (256, 4), (512, 2), (1024, 1)] -# mxelem = [64, 128, 256] -load_types = ["uint4"] -code_book_types = ["half"] -search_types = dict( - float_uint32=( - "float", - "uint32_t", - "float", - ), # data_t, vec_idx_t, distance_t - half_uint32=("half", "uint32_t", "float"), - int8_uint32=("int8_t", "uint32_t", "float"), - uint8_uint32=("uint8_t", "uint32_t", "float"), - float_uint64=("float", "uint64_t", "float"), - half_uint64=("half", "uint64_t", "float"), -) -# knn -for type_path, (data_t, idx_t, distance_t) in search_types.items(): - for (mxdim, team) in mxdim_team: - for code_book_t in code_book_types: - for subspace_dim in subspace_dims: - for pq_bit in pq_bits: - path = f"q_search_multi_cta_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{subspace_dim}subd_{code_book_t}.cu" - with open(path, "w") as f: - f.write(header) - f.write( - f"instantiate_kernel_selection(\n {team}, {mxdim}, cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<{data_t} COMMA {code_book_t} COMMA {pq_bit} COMMA {subspace_dim} COMMA {distance_t} COMMA {idx_t}>, cuvs::neighbors::filtering::none_cagra_sample_filter);\n" - ) - f.write(trailer) - # For pasting into CMakeLists.txt - print(f"src/neighbors/detail/cagra/{path}") diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 
index 9ec7ce3dd..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index 292a1429a..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index 1a5ad50e3..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index 0ab23d7eb..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index 5d94a501a..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index 56534dc05..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index 7ff962058..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index 3387a32a3..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index 2d3f2cb1d..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index 73dd8cd4b..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index b5e33602d..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index 32fe0d628..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index e2726ea26..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index b4ebd49c4..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index 72f198c92..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index dfb667a7f..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index c583569f6..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index fedfb5146..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index 2b6e8e3da..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index 4a97fb752..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index 675cd3c93..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index b42b3289c..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index 0db4296f1..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index 4a2610dc7..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index b1c15662e..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index 201f68fb5..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index 26744ed76..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index 1bce71bef..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index 694304f3c..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index e6a563731..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index 5c554af3f..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index 965b43c07..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index 97a4f8092..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index bdd1719b3..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index e39bc1e2d..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index 599cf327a..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index 621c5a249..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index cbed3ef8a..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index 7428bfd9e..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index 70efefdb0..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index 4039b8582..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index 022eb0e05..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index e48b2ed71..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index 64f08530f..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_multi_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_multi_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index b40322741..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index 36273d0d4..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index ef483437a..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index d9ebb1b85..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index e86524ee0..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index 9f2b7fbc7..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index 1ce4f5520..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index 2d6f93ef0..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index 5f3267410..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index 631ac7938..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index ea8faee1c..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index 061b1a04e..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index 15610d853..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index f984b46f0..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index 45299f272..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index fcb91be8c..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index b594fedab..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index a82be6b55..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index d80fef52c..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index e2c3ef4f7..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index 98889811d..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index f5e9d12c9..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index 4f14910b4..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index 67d52f8d5..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index 1420918a1..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index eb0a72da3..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index 7a98b59a9..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index 7e07033c7..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index 857f32712..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index 3c00c5223..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index e5c4c7b69..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index 22359d71b..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index 37c783f19..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index 0a4049d79..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index 773f567c4..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index dfc176abd..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index 680c32655..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index e57881e82..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index 525004f2e..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index 7af2ef124..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index 0fd36c31b..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index d4cc5f449..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 1024, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index aa58ac2b7..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index 189c3ed9c..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(8, - 128, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index 9dc9aaae3..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index 100110313..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(16, - 256, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index 8d4e0aeee..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index 4c7318735..000000000 --- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by q_search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python q_search_single_cta_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" -#include "search_single_cta_inst.cuh" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection(32, - 512, - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t< - uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh index efbf9b56d..34467c916 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh @@ -21,7 +21,7 @@ #include "hashmap.hpp" #include "search_multi_cta_kernel.cuh" #include "search_plan.cuh" -#include "topk_for_cagra/topk_core.cuh" // TODO replace with raft topk if possible +#include "topk_for_cagra/topk.h" // TODO replace with raft topk if possible #include "utils.hpp" #include @@ -51,48 +51,46 @@ namespace cuvs::neighbors::cagra::detail { namespace multi_cta_search { -template - -struct search : public search_plan_impl { - using DATA_T = typename DATASET_DESCRIPTOR_T::DATA_T; - using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; - using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; - - using search_plan_impl::max_queries; - using search_plan_impl::itopk_size; - using search_plan_impl::algo; - using search_plan_impl::team_size; - using search_plan_impl::search_width; - using search_plan_impl::min_iterations; - using search_plan_impl::max_iterations; - using search_plan_impl::thread_block_size; - using search_plan_impl::hashmap_mode; - using search_plan_impl::hashmap_min_bitlen; - using search_plan_impl::hashmap_max_fill_rate; - using 
search_plan_impl::num_random_samplings; - using search_plan_impl::rand_xor_mask; - - using search_plan_impl::dim; - using search_plan_impl::graph_degree; - using search_plan_impl::topk; - - using search_plan_impl::hash_bitlen; - - using search_plan_impl::small_hash_bitlen; - using search_plan_impl::small_hash_reset_interval; - using search_plan_impl::hashmap_size; - using search_plan_impl::dataset_size; - using search_plan_impl::result_buffer_size; - - using search_plan_impl::smem_size; - - using search_plan_impl::hashmap; - using search_plan_impl::num_executed_iterations; - using search_plan_impl::dev_seed; - using search_plan_impl::num_seeds; +template +struct search : public search_plan_impl { + using base_type = search_plan_impl; + using DATA_T = typename base_type::DATA_T; + using INDEX_T = typename base_type ::INDEX_T; + using DISTANCE_T = typename base_type::DISTANCE_T; + + using base_type::algo; + using base_type::hashmap_max_fill_rate; + using base_type::hashmap_min_bitlen; + using base_type::hashmap_mode; + using base_type::itopk_size; + using base_type::max_iterations; + using base_type::max_queries; + using base_type::min_iterations; + using base_type::num_random_samplings; + using base_type::rand_xor_mask; + using base_type::search_width; + using base_type::team_size; + using base_type::thread_block_size; + + using base_type::dim; + using base_type::graph_degree; + using base_type::topk; + + using base_type::hash_bitlen; + + using base_type::dataset_size; + using base_type::hashmap_size; + using base_type::result_buffer_size; + using base_type::small_hash_bitlen; + using base_type::small_hash_reset_interval; + + using base_type::smem_size; + + using base_type::dataset_desc; + using base_type::dev_seed; + using base_type::hashmap; + using base_type::num_executed_iterations; + using base_type::num_seeds; uint32_t num_cta_per_query; rmm::device_uvector intermediate_indices; @@ -102,12 +100,12 @@ struct search : public search_plan_impl { 
search(raft::resources const& res, search_params params, + const dataset_descriptor_host& dataset_desc, int64_t dim, int64_t graph_degree, uint32_t topk, cuvs::distance::DistanceType metric) - : search_plan_impl( - res, params, dim, graph_degree, topk, metric), + : base_type(res, params, dataset_desc, dim, graph_degree, topk, metric), intermediate_indices(0, raft::resource::get_cuda_stream(res)), intermediate_distances(0, raft::resource::get_cuda_stream(res)), topk_workspace(0, raft::resource::get_cuda_stream(res)) @@ -129,13 +127,9 @@ struct search : public search_plan_impl { // constexpr unsigned max_result_buffer_size = 256; RAFT_EXPECTS(result_buffer_size_32 <= 256, "Result buffer size cannot exceed 256"); - const auto query_smem_buffer_length = - raft::ceildiv(dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM; - - smem_size = sizeof(float) * query_smem_buffer_length + + smem_size = dataset_desc.smem_ws_size_in_bytes + (sizeof(INDEX_T) + sizeof(DISTANCE_T)) * result_buffer_size_32 + - sizeof(uint32_t) * search_width + sizeof(uint32_t) + - DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte; + sizeof(uint32_t) * search_width + sizeof(uint32_t); RAFT_LOG_DEBUG("# smem_size: %u", smem_size); // @@ -204,44 +198,38 @@ struct search : public search_plan_impl { ~search() {} - void operator()( - raft::resources const& res, - // raft::device_matrix_view dataset, - DATASET_DESCRIPTOR_T dataset_desc, - raft::device_matrix_view - graph, - typename DATASET_DESCRIPTOR_T::INDEX_T* const topk_indices_ptr, // [num_queries, topk] - typename DATASET_DESCRIPTOR_T::DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] - const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr, // [num_queries, dataset_dim] - const uint32_t num_queries, - const typename DATASET_DESCRIPTOR_T::INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] - uint32_t* const num_executed_iterations, // [num_queries,] - uint32_t topk, - SAMPLE_FILTER_T sample_filter) + void operator()(raft::resources const& res, + 
raft::device_matrix_view graph, + INDEX_T* const topk_indices_ptr, // [num_queries, topk] + DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] + const DATA_T* const queries_ptr, // [num_queries, dataset_dim] + const uint32_t num_queries, + const INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] + uint32_t* const num_executed_iterations, // [num_queries,] + uint32_t topk, + SAMPLE_FILTER_T sample_filter) { cudaStream_t stream = raft::resource::get_cuda_stream(res); - - select_and_run( - dataset_desc, - graph, - intermediate_indices.data(), - intermediate_distances.data(), - queries_ptr, - num_queries, - dev_seed_ptr, - num_executed_iterations, - *this, - topk, - thread_block_size, - result_buffer_size, - smem_size, - hash_bitlen, - hashmap.data(), - num_cta_per_query, - num_seeds, - sample_filter, - this->metric, - stream); + select_and_run(dataset_desc.dev_ptr, + graph, + intermediate_indices.data(), + intermediate_distances.data(), + queries_ptr, + num_queries, + dev_seed_ptr, + num_executed_iterations, + *this, + topk, + thread_block_size, + result_buffer_size, + smem_size, + hash_bitlen, + hashmap.data(), + num_cta_per_query, + num_seeds, + sample_filter, + this->metric, + stream); RAFT_CUDA_TRY(cudaPeekAtLastError()); // Select the top-k results from the intermediate results diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py index cb63c0e03..42d104f89 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py @@ -48,7 +48,6 @@ } // namespace cuvs::neighbors::cagra::detail::multi_cta_search """ -mxdim_team = [(128, 8), (256, 16), (512, 32), (1024, 32)] # block = [(64, 16), (128, 8), (256, 4), (512, 2), (1024, 1)] # mxelem = [64, 128, 256] load_types = ["uint4"] @@ -66,13 +65,12 @@ ) # knn for type_path, (data_t, idx_t, distance_t) in search_types.items(): - for (mxdim, team) in 
mxdim_team: - path = f"search_multi_cta_{type_path}_dim{mxdim}_t{team}.cu" - with open(path, "w") as f: - f.write(header) - f.write( - f"instantiate_kernel_selection(\n {team}, {mxdim}, cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<{data_t} COMMA {idx_t} COMMA {distance_t}>, cuvs::neighbors::filtering::none_cagra_sample_filter);\n" - ) - f.write(trailer) - # For pasting into CMakeLists.txt - print(f"src/neighbors/detail/cagra/{path}") + path = f"search_multi_cta_{type_path}.cu" + with open(path, "w") as f: + f.write(header) + f.write( + f"instantiate_kernel_selection(\n {data_t}, {idx_t}, {distance_t}, cuvs::neighbors::filtering::none_cagra_sample_filter);\n" + ) + f.write(trailer) + # For pasting into CMakeLists.txt + print(f"src/neighbors/detail/cagra/{path}") diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu similarity index 85% rename from cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu rename to cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu index 2a14699f4..3912e0b0f 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu @@ -29,9 +29,6 @@ namespace cuvs::neighbors::cagra::detail::multi_cta_search { instantiate_kernel_selection( - 8, - 128, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); + float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu deleted file mode 100644 index 0bf4a192f..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu +++ 
/dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 32, - 1024, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu deleted file mode 100644 index a77859b7d..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 16, - 256, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu deleted file mode 100644 index ab49fa9f2..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 32, - 512, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64.cu similarity index 85% rename from cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim128_t8.cu rename to cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64.cu index 157942dc5..45c8c0602 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim128_t8.cu +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64.cu @@ -29,9 +29,6 @@ namespace cuvs::neighbors::cagra::detail::multi_cta_search { instantiate_kernel_selection( - 8, - 128, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); + float, uint64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu deleted file mode 100644 index c38eeb009..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 32, - 1024, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu deleted file mode 100644 index 3094ddaeb..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 8, - 128, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu deleted file mode 100644 index 91725d185..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 16, - 256, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu deleted file mode 100644 index 0f452a6fa..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 32, - 512, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu similarity index 85% rename from cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu rename to cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu index ea38b60c0..8c40dce5a 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu @@ -29,9 +29,6 @@ namespace cuvs::neighbors::cagra::detail::multi_cta_search { instantiate_kernel_selection( - 8, - 128, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); + half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim1024_t32.cu deleted file mode 100644 index cfe7a7aef..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim1024_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 32, - 1024, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim512_t32.cu deleted file mode 100644 index 292859382..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 32, - 512, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64.cu similarity index 85% rename from cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim256_t16.cu rename to cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64.cu index ee2400037..d1cbac723 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim256_t16.cu +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64.cu @@ -29,9 +29,6 @@ namespace cuvs::neighbors::cagra::detail::multi_cta_search { instantiate_kernel_selection( - 16, - 256, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); + half, uint64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu deleted file mode 100644 index 13044f12d..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 32, - 1024, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu deleted file mode 100644 index 2ce6f292d..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 16, - 256, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu deleted file mode 100644 index 2d607eb8d..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 32, - 512, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_inst.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_inst.cuh index b1cfaf870..30bbd60aa 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_inst.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_inst.cuh @@ -21,30 +21,27 @@ namespace cuvs::neighbors::cagra::detail::multi_cta_search { -#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATASET_DESC_T, SAMPLE_FILTER_T) \ - template void select_and_run( \ - DATASET_DESC_T dataset_desc, \ - raft::device_matrix_view \ - graph, \ - typename DATASET_DESC_T::INDEX_T* const topk_indices_ptr, \ - typename DATASET_DESC_T::DISTANCE_T* const topk_distances_ptr, \ - const typename DATASET_DESC_T::DATA_T* const queries_ptr, \ - const uint32_t num_queries, \ - const typename DATASET_DESC_T::INDEX_T* dev_seed_ptr, \ - uint32_t* const num_executed_iterations, \ - const search_params& ps, \ - uint32_t topk, \ - uint32_t block_size, \ - uint32_t result_buffer_size, \ - uint32_t smem_size, \ - int64_t hash_bitlen, \ - typename DATASET_DESC_T::INDEX_T* hashmap_ptr, \ - uint32_t num_cta_per_query, \ - uint32_t num_seeds, \ - SAMPLE_FILTER_T sample_filter, \ - cuvs::distance::DistanceType metric, \ +#define instantiate_kernel_selection(DataT, IndexT, DistanceT, SampleFilterT) \ + template void select_and_run( \ + const dataset_descriptor_base_t* dataset_desc, \ + raft::device_matrix_view 
graph, \ + IndexT* topk_indices_ptr, \ + DistanceT* topk_distances_ptr, \ + const DataT* queries_ptr, \ + uint32_t num_queries, \ + const IndexT* dev_seed_ptr, \ + uint32_t* num_executed_iterations, \ + const search_params& ps, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + IndexT* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_seeds, \ + SampleFilterT sample_filter, \ + cuvs::distance::DistanceType metric, \ cudaStream_t stream); -#define COMMA , - } // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu new file mode 100644 index 000000000..5f15b8bc5 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include "search_multi_cta_inst.cuh" + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail::multi_cta_search { +instantiate_kernel_selection( + int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); + +} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu deleted file mode 100644 index c28adbf80..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 32, - 1024, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu deleted file mode 100644 index af5f13397..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 8, - 128, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu deleted file mode 100644 index bcc7b9b8c..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 16, - 256, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu deleted file mode 100644 index 916196c35..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 32, - 512, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh deleted file mode 100644 index b00d6617c..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh +++ /dev/null @@ -1,405 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include "compute_distance_vpq.cuh" -#include // none_cagra_sample_filter -#include // RAFT_EXPLICIT - -#include - -namespace cuvs::neighbors::cagra::detail { -namespace multi_cta_search { - -#ifdef CUVS_EXPLICIT_INSTANTIATE_ONLY - -template -void select_and_run( - DATASET_DESCRIPTOR_T dataset_desc, - raft::device_matrix_view - graph, - typename DATASET_DESCRIPTOR_T::INDEX_T* const topk_indices_ptr, // [num_queries, topk] - typename DATASET_DESCRIPTOR_T::DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] - const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr, // [num_queries, dataset_dim] - const uint32_t num_queries, - const typename DATASET_DESCRIPTOR_T::INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] - uint32_t* const num_executed_iterations, // [num_queries,] - const search_params& ps, - uint32_t topk, - // multi_cta_search (params struct) - uint32_t block_size, // - uint32_t result_buffer_size, - uint32_t smem_size, - int64_t hash_bitlen, - typename DATASET_DESCRIPTOR_T::INDEX_T* hashmap_ptr, - uint32_t num_cta_per_query, - uint32_t num_seeds, - SAMPLE_FILTER_T sample_filter, - cuvs::distance::DistanceType metric, - cudaStream_t stream) RAFT_EXPLICIT; -#endif // RAFT_EXPLICIT_INSTANTIATE_ONLY - -#define instantiate_kernel_selection( \ - TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T) \ - extern template void select_and_run< \ - TEAM_SIZE, \ - MAX_DATASET_DIM, \ - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, \ - SAMPLE_FILTER_T>( \ - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t \ - dataset_desc, \ - raft::device_matrix_view graph, \ - INDEX_T* const topk_indices_ptr, \ - DISTANCE_T* const topk_distances_ptr, \ - const DATA_T* const queries_ptr, \ - const uint32_t num_queries, \ - const INDEX_T* dev_seed_ptr, \ - uint32_t* const num_executed_iterations, \ - const search_params& ps, \ - uint32_t topk, \ - uint32_t block_size, \ - uint32_t result_buffer_size, 
\ - uint32_t smem_size, \ - int64_t hash_bitlen, \ - INDEX_T* hashmap_ptr, \ - uint32_t num_cta_per_query, \ - uint32_t num_seeds, \ - SAMPLE_FILTER_T sample_filter, \ - cuvs::distance::DistanceType metric, \ - cudaStream_t stream); - -instantiate_kernel_selection( - 32, 1024, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 8, 128, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 16, 256, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 32, 512, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 32, 1024, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 8, 128, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 16, 256, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 32, 512, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 32, 1024, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 8, 128, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 16, 256, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 32, 512, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 32, 1024, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 8, 128, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 16, 256, uint8_t, uint32_t, 
float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 32, 512, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); - -#undef instantiate_kernel_selection - -#define instantiate_q_kernel_selection(TEAM_SIZE, \ - MAX_DATASET_DIM, \ - CODE_BOOK_T, \ - PQ_BITS, \ - PQ_CODE_BOOK_DIM, \ - DATA_T, \ - INDEX_T, \ - DISTANCE_T, \ - SAMPLE_FILTER_T) \ - extern template void \ - select_and_run, \ - SAMPLE_FILTER_T>( \ - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t dataset_desc, \ - raft::device_matrix_view graph, \ - INDEX_T* const topk_indices_ptr, \ - DISTANCE_T* const topk_distances_ptr, \ - const DATA_T* const queries_ptr, \ - const uint32_t num_queries, \ - const INDEX_T* dev_seed_ptr, \ - uint32_t* const num_executed_iterations, \ - const search_params& ps, \ - uint32_t topk, \ - uint32_t block_size, \ - uint32_t result_buffer_size, \ - uint32_t smem_size, \ - int64_t hash_bitlen, \ - INDEX_T* hashmap_ptr, \ - uint32_t num_cta_per_query, \ - uint32_t num_seeds, \ - SAMPLE_FILTER_T sample_filter, \ - cuvs::distance::DistanceType metric, \ - cudaStream_t stream); - -instantiate_q_kernel_selection( - 8, 128, half, 8, 2, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection( - 16, 256, half, 8, 2, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection( - 32, 512, half, 8, 2, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 1024, - half, - 8, - 2, - half, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection( - 8, 128, half, 8, 4, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection( - 16, 256, half, 8, 4, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); 
-instantiate_q_kernel_selection( - 32, 512, half, 8, 4, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 1024, - half, - 8, - 4, - half, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -instantiate_q_kernel_selection( - 8, 128, half, 8, 2, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(16, - 256, - half, - 8, - 2, - float, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 512, - half, - 8, - 2, - float, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 1024, - half, - 8, - 2, - float, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection( - 8, 128, half, 8, 4, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(16, - 256, - half, - 8, - 4, - float, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 512, - half, - 8, - 4, - float, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 1024, - half, - 8, - 4, - float, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -instantiate_q_kernel_selection(8, - 128, - half, - 8, - 2, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(16, - 256, - half, - 8, - 2, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 512, - half, - 8, - 2, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 1024, - half, - 8, - 2, - uint8_t, - uint32_t, - float, - 
cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(8, - 128, - half, - 8, - 4, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(16, - 256, - half, - 8, - 4, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 512, - half, - 8, - 4, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 1024, - half, - 8, - 4, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -instantiate_q_kernel_selection(8, - 128, - half, - 8, - 2, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(16, - 256, - half, - 8, - 2, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 512, - half, - 8, - 2, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 1024, - half, - 8, - 2, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(8, - 128, - half, - 8, - 4, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(16, - 256, - half, - 8, - 4, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 512, - half, - 8, - 4, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_kernel_selection(32, - 1024, - half, - 8, - 4, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -#undef instantiate_q_kernel_selection -} // namespace multi_cta_search -} // namespace cuvs::neighbors::cagra::detail diff --git 
a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh index 4d2030c6c..0c856aa19 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh @@ -15,12 +15,14 @@ */ #pragma once +#include "search_multi_cta_kernel.cuh" + #include "bitonic.hpp" #include "compute_distance.hpp" #include "device_common.hpp" #include "hashmap.hpp" #include "search_plan.cuh" -#include "topk_for_cagra/topk_core.cuh" // TODO replace with raft topk if possible +#include "topk_for_cagra/topk.h" // TODO replace with raft topk if possible #include "utils.hpp" #include @@ -130,17 +132,13 @@ __device__ inline void topk_by_bitonic_sort(float* distances, // [num_elements] // // multiple CTAs per single query // -template +template __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr, // [num_queries, num_cta_per_query, itopk_size] typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr, // [num_queries, num_cta_per_query, itopk_size] - DATASET_DESCRIPTOR_T dataset_desc, + const DATASET_DESCRIPTOR_T* dataset_desc, const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr, // [num_queries, dataset_dim] const typename DATASET_DESCRIPTOR_T::INDEX_T* const knn_graph, // [dataset_size, graph_degree] const uint32_t graph_degree, @@ -162,7 +160,6 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( using DATA_T = typename DATASET_DESCRIPTOR_T::DATA_T; using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; - using QUERY_T = typename DATASET_DESCRIPTOR_T::QUERY_T; const auto num_queries = gridDim.y; const auto query_id = blockIdx.y; @@ -184,7 +181,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( #endif _CLK_START(); - extern __shared__ uint32_t smem[]; + extern __shared__ uint8_t smem[]; // 
Layout of result_buffer // +----------------+------------------------------+---------+ @@ -197,21 +194,16 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( if (result_buffer_size % 32) { result_buffer_size_32 += 32 - (result_buffer_size % 32); } assert(result_buffer_size_32 <= MAX_ELEMENTS); - const auto query_smem_buffer_length = - raft::ceildiv(dataset_desc.dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM; - auto query_buffer = reinterpret_cast(smem); - auto result_indices_buffer = reinterpret_cast(query_buffer + query_smem_buffer_length); + // Set smem working buffer for the distance calculation + auto distance_workspace = dataset_desc->set_smem_ws(smem); + + auto result_indices_buffer = + reinterpret_cast(smem + dataset_desc->smem_ws_size_in_bytes()); auto result_distances_buffer = reinterpret_cast(result_indices_buffer + result_buffer_size_32); auto parent_indices_buffer = reinterpret_cast(result_distances_buffer + result_buffer_size_32); - auto distance_work_buffer_ptr = - reinterpret_cast(parent_indices_buffer + search_width); - auto terminate_flag = reinterpret_cast(distance_work_buffer_ptr + - DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte); - - // Set smem working buffer for the distance calculation - dataset_desc.set_smem_ptr(distance_work_buffer_ptr); + auto terminate_flag = reinterpret_cast(parent_indices_buffer + search_width); #if 0 /* debug */ @@ -220,9 +212,8 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( result_distances_buffer[i] = utils::get_max_value(); } #endif - const DATA_T* const query_ptr = queries_ptr + (dataset_desc.dim * query_id); - dataset_desc.template copy_query( - query_ptr, query_buffer, query_smem_buffer_length); + const DATA_T* const query_ptr = queries_ptr + query_id * dataset_desc->dim; + dataset_desc->copy_query(distance_workspace, query_ptr); if (threadIdx.x == 0) { terminate_flag[0] = 0; } INDEX_T* const local_visited_hashmap_ptr = @@ -236,20 +227,19 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( 
uint32_t block_id = cta_id + (num_cta_per_query * query_id); uint32_t num_blocks = num_cta_per_query * num_queries; - device::compute_distance_to_random_nodes(result_indices_buffer, - result_distances_buffer, - query_buffer, - dataset_desc, - result_buffer_size, - num_distilation, - rand_xor_mask, - local_seed_ptr, - num_seeds, - local_visited_hashmap_ptr, - hash_bitlen, - metric, - block_id, - num_blocks); + dataset_desc->compute_distance_to_random_nodes(distance_workspace, + result_indices_buffer, + result_distances_buffer, + result_buffer_size, + num_distilation, + rand_xor_mask, + local_seed_ptr, + num_seeds, + local_visited_hashmap_ptr, + hash_bitlen, + metric, + block_id, + num_blocks); __syncthreads(); _CLK_REC(clk_compute_1st_distance); @@ -279,21 +269,17 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( // compute the norms between child nodes and query node _CLK_START(); - // constexpr unsigned max_n_frags = 16; - constexpr unsigned max_n_frags = 0; - device::compute_distance_to_child_nodes( - result_indices_buffer + itopk_size, - result_distances_buffer + itopk_size, - query_buffer, - dataset_desc, - knn_graph, - graph_degree, - local_visited_hashmap_ptr, - hash_bitlen, - parent_indices_buffer, - result_indices_buffer, - search_width, - metric); + dataset_desc->compute_distance_to_child_nodes(distance_workspace, + result_indices_buffer + itopk_size, + result_distances_buffer + itopk_size, + knn_graph, + graph_degree, + local_visited_hashmap_ptr, + hash_bitlen, + parent_indices_buffer, + result_indices_buffer, + search_width, + metric); _CLK_REC(clk_compute_distance); __syncthreads(); @@ -409,84 +395,59 @@ void set_value_batch(T* const dev_ptr, <<>>(dev_ptr, ld, val, count, batch_size); } -template +template struct search_kernel_config { // Search kernel function type. Note that the actual values for the template value // parameters do not matter, because they are not part of the function signature. 
The // second to fourth value parameters will be selected by the choose_* functions below. - using kernel_t = decltype(&search_kernel); + using kernel_t = decltype(&search_kernel<128, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>); static auto choose_buffer_size(unsigned result_buffer_size, unsigned block_size) -> kernel_t { if (result_buffer_size <= 64) { - return search_kernel; + return search_kernel<64, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>; } else if (result_buffer_size <= 128) { - return search_kernel; + return search_kernel<128, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>; } else if (result_buffer_size <= 256) { - return search_kernel; + return search_kernel<256, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>; } THROW("Result buffer size %u larger than max buffer size %u", result_buffer_size, 256); } }; -template -void select_and_run( - DATASET_DESCRIPTOR_T dataset_desc, - raft::device_matrix_view - graph, - typename DATASET_DESCRIPTOR_T::INDEX_T* const topk_indices_ptr, // [num_queries, topk] - typename DATASET_DESCRIPTOR_T::DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] - const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr, // [num_queries, dataset_dim] - const uint32_t num_queries, - const typename DATASET_DESCRIPTOR_T::INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] - uint32_t* const num_executed_iterations, // [num_queries,] - const search_params& ps, - uint32_t topk, - // multi_cta_search (params struct) - uint32_t block_size, // - uint32_t result_buffer_size, - uint32_t smem_size, - int64_t hash_bitlen, - typename DATASET_DESCRIPTOR_T::INDEX_T* hashmap_ptr, - uint32_t num_cta_per_query, - uint32_t num_seeds, - SAMPLE_FILTER_T sample_filter, - cuvs::distance::DistanceType metric, - cudaStream_t stream) +template +void select_and_run(const dataset_descriptor_base_t* dataset_desc, + raft::device_matrix_view graph, + IndexT* topk_indices_ptr, // [num_queries, topk] + DistanceT* topk_distances_ptr, // [num_queries, topk] + const DataT* queries_ptr, 
// [num_queries, dataset_dim] + uint32_t num_queries, + const IndexT* dev_seed_ptr, // [num_queries, num_seeds] + uint32_t* num_executed_iterations, // [num_queries,] + const search_params& ps, + uint32_t topk, + // multi_cta_search (params struct) + uint32_t block_size, // + uint32_t result_buffer_size, + uint32_t smem_size, + int64_t hash_bitlen, + IndexT* hashmap_ptr, + uint32_t num_cta_per_query, + uint32_t num_seeds, + SampleFilterT sample_filter, + cuvs::distance::DistanceType metric, + cudaStream_t stream) { auto kernel = - search_kernel_config:: - choose_buffer_size(result_buffer_size, block_size); + search_kernel_config, + SampleFilterT>::choose_buffer_size(result_buffer_size, block_size); - RAFT_CUDA_TRY(cudaFuncSetAttribute(kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - smem_size + DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte)); + RAFT_CUDA_TRY( + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); // Initialize hash table const uint32_t hash_size = hashmap::get_size(hash_bitlen); - set_value_batch(hashmap_ptr, - hash_size, - utils::get_max_value(), - hash_size, - num_queries, - stream); + set_value_batch( + hashmap_ptr, hash_size, utils::get_max_value(), hash_size, num_queries, stream); dim3 block_dims(block_size, 1, 1); dim3 grid_dims(num_cta_per_query, num_queries, 1); diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh index 2cbb758f9..aa403647d 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,10 +15,33 @@ */ #pragma once -#ifndef CUVS_EXPLICIT_INSTANTIATE_ONLY -#include "search_multi_cta_kernel-inl.cuh" -#endif +#include "compute_distance.hpp" -#ifdef RAFT_COMPILED -#include "search_multi_cta_kernel-ext.cuh" -#endif +#include + +namespace cuvs::neighbors::cagra::detail::multi_cta_search { + +template +void select_and_run(const dataset_descriptor_base_t* dataset_desc, + raft::device_matrix_view graph, + IndexT* topk_indices_ptr, // [num_queries, topk] + DistanceT* topk_distances_ptr, // [num_queries, topk] + const DataT* queries_ptr, // [num_queries, dataset_dim] + uint32_t num_queries, + const IndexT* dev_seed_ptr, // [num_queries, num_seeds] + uint32_t* num_executed_iterations, // [num_queries,] + const search_params& ps, + uint32_t topk, + // multi_cta_search (params struct) + uint32_t block_size, // + uint32_t result_buffer_size, + uint32_t smem_size, + int64_t hash_bitlen, + IndexT* hashmap_ptr, + uint32_t num_cta_per_query, + uint32_t num_seeds, + SampleFilterT sample_filter, + cuvs::distance::DistanceType metric, + cudaStream_t stream); + +} diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu new file mode 100644 index 000000000..cd3b271a1 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include "search_multi_cta_inst.cuh" + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail::multi_cta_search { +instantiate_kernel_selection( + uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); + +} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu deleted file mode 100644 index 3fa12d933..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 32, - 1024, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu deleted file mode 100644 index e2f25a1c2..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 8, - 128, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu deleted file mode 100644 index 4cd206d8c..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 16, - 256, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu deleted file mode 100644 index 56989a1d5..000000000 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_multi_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_multi_cta_00_generate.py - * - */ - -#include "search_multi_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - 32, - 512, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh index bc1266fb4..5697e503a 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh @@ -20,7 +20,7 @@ #include "device_common.hpp" #include "hashmap.hpp" #include "search_plan.cuh" -#include "topk_for_cagra/topk_core.cuh" //todo replace with raft kernel +#include "topk_for_cagra/topk.h" //todo replace with raft kernel #include "utils.hpp" #include @@ -639,49 +639,48 @@ void set_value_batch(T* const dev_ptr, // |<--- result_buffer_allocation_size --->| // |<--- result_buffer_size --->| // Double buffer (A) // |<--- result_buffer_size --->| // Double buffer (B) -template -struct search : search_plan_impl { - using DATA_T = typename DATASET_DESCRIPTOR_T::DATA_T; - using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; - using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; +template +struct search : search_plan_impl { + using base_type = search_plan_impl; + using DATA_T = typename base_type::DATA_T; + using INDEX_T = typename base_type::INDEX_T; + using DISTANCE_T = typename base_type::DISTANCE_T; static_assert(std::is_same_v, "Only float is supported as resulting distance"); - using search_plan_impl::max_queries; - using search_plan_impl::itopk_size; - using search_plan_impl::algo; - using 
search_plan_impl::team_size; - using search_plan_impl::search_width; - using search_plan_impl::min_iterations; - using search_plan_impl::max_iterations; - using search_plan_impl::thread_block_size; - using search_plan_impl::hashmap_mode; - using search_plan_impl::hashmap_min_bitlen; - using search_plan_impl::hashmap_max_fill_rate; - using search_plan_impl::num_random_samplings; - using search_plan_impl::rand_xor_mask; - - using search_plan_impl::dim; - using search_plan_impl::graph_degree; - using search_plan_impl::topk; - - using search_plan_impl::hash_bitlen; - - using search_plan_impl::small_hash_bitlen; - using search_plan_impl::small_hash_reset_interval; - using search_plan_impl::hashmap_size; - using search_plan_impl::dataset_size; - using search_plan_impl::result_buffer_size; - - using search_plan_impl::smem_size; - - using search_plan_impl::hashmap; - using search_plan_impl::num_executed_iterations; - using search_plan_impl::dev_seed; - using search_plan_impl::num_seeds; + using base_type::algo; + using base_type::hashmap_max_fill_rate; + using base_type::hashmap_min_bitlen; + using base_type::hashmap_mode; + using base_type::itopk_size; + using base_type::max_iterations; + using base_type::max_queries; + using base_type::min_iterations; + using base_type::num_random_samplings; + using base_type::rand_xor_mask; + using base_type::search_width; + using base_type::team_size; + using base_type::thread_block_size; + + using base_type::dim; + using base_type::graph_degree; + using base_type::topk; + + using base_type::hash_bitlen; + + using base_type::dataset_size; + using base_type::hashmap_size; + using base_type::result_buffer_size; + using base_type::small_hash_bitlen; + using base_type::small_hash_reset_interval; + + using base_type::smem_size; + + using base_type::dataset_desc; + using base_type::dev_seed; + using base_type::hashmap; + using base_type::num_executed_iterations; + using base_type::num_seeds; size_t result_buffer_allocation_size; 
rmm::device_uvector result_indices; // results_indices_buffer @@ -699,12 +698,12 @@ struct search : search_plan_impl { search(raft::resources const& res, search_params params, + const dataset_descriptor_host& dataset_desc, int64_t dim, int64_t graph_degree, uint32_t topk, cuvs::distance::DistanceType metric) - : search_plan_impl( - res, params, dim, graph_degree, topk, metric), + : base_type(res, params, dataset_desc, dim, graph_degree, topk, metric), result_indices(0, raft::resource::get_cuda_stream(res)), result_distances(0, raft::resource::get_cuda_stream(res)), parent_node_list(0, raft::resource::get_cuda_stream(res)), @@ -837,7 +836,6 @@ struct search : search_plan_impl { } void operator()(raft::resources const& res, - DATASET_DESCRIPTOR_T dataset_desc, raft::device_matrix_view graph, INDEX_T* const topk_indices_ptr, // [num_queries, topk] DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] @@ -1067,8 +1065,7 @@ struct search( - res, params, dim, graph_degree, topk, metric) + : base_type(res, params, dim, graph_degree, topk, metric) { THROW("The multi-kernel mode does not support VPQ"); } diff --git a/cpp/src/neighbors/detail/cagra/search_plan.cuh b/cpp/src/neighbors/detail/cagra/search_plan.cuh index 0543224b3..436293e03 100644 --- a/cpp/src/neighbors/detail/cagra/search_plan.cuh +++ b/cpp/src/neighbors/detail/cagra/search_plan.cuh @@ -18,10 +18,11 @@ #include "hashmap.hpp" +#include "compute_distance.hpp" #include #include // #include "search_single_cta_inst.cuh" -// #include "topk_for_cagra/topk_core.cuh" +// #include "topk_for_cagra/topk.h" #include #include @@ -34,7 +35,6 @@ namespace cuvs::neighbors::cagra::detail { struct search_plan_impl_base : public search_params { - int64_t dataset_block_dim; int64_t dim; int64_t graph_degree; uint32_t topk; @@ -46,7 +46,6 @@ struct search_plan_impl_base : public search_params { cuvs::distance::DistanceType metric) : search_params(params), dim(dim), graph_degree(graph_degree), topk(topk), metric(metric) { - 
set_dataset_block_and_team_size(dim); if (algo == search_algo::AUTO) { const size_t num_sm = raft::getMultiProcessorCount(); if (itopk_size <= 512 && search_params::max_queries >= num_sm * 2lu) { @@ -61,29 +60,13 @@ struct search_plan_impl_base : public search_params { } } } - - void set_dataset_block_and_team_size(int64_t dim) - { - constexpr int64_t max_dataset_block_dim = 512; - dataset_block_dim = 128; - while (dataset_block_dim < dim && dataset_block_dim < max_dataset_block_dim) { - dataset_block_dim *= 2; - } - // To keep binary size in check we limit only one team size specialization for each max_dim. - // TODO(tfeher): revise this decision. - switch (dataset_block_dim) { - case 128: team_size = 8; break; - case 256: team_size = 16; break; - default: team_size = 32; break; - } - } }; -template +template struct search_plan_impl : public search_plan_impl_base { - using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; - using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; - using DATA_T = typename DATASET_DESCRIPTOR_T::DATA_T; + using DATA_T = DataT; + using INDEX_T = IndexT; + using DISTANCE_T = DistanceT; int64_t hash_bitlen; @@ -100,9 +83,11 @@ struct search_plan_impl : public search_plan_impl_base { rmm::device_uvector hashmap; rmm::device_uvector num_executed_iterations; // device or managed? 
rmm::device_uvector dev_seed; + const dataset_descriptor_host& dataset_desc; search_plan_impl(raft::resources const& res, search_params params, + const dataset_descriptor_host& dataset_desc, int64_t dim, int64_t graph_degree, uint32_t topk, @@ -111,12 +96,12 @@ struct search_plan_impl : public search_plan_impl_base { hashmap(0, raft::resource::get_cuda_stream(res)), num_executed_iterations(0, raft::resource::get_cuda_stream(res)), dev_seed(0, raft::resource::get_cuda_stream(res)), - num_seeds(0) + num_seeds(0), + dataset_desc(dataset_desc) { adjust_search_params(); check_params(); calc_hashmap_params(res); - set_dataset_block_and_team_size(dim); num_executed_iterations.resize(max_queries, raft::resource::get_cuda_stream(res)); RAFT_LOG_DEBUG("# algo = %d", static_cast(algo)); } @@ -124,7 +109,6 @@ struct search_plan_impl : public search_plan_impl_base { virtual ~search_plan_impl() {} virtual void operator()(raft::resources const& res, - DATASET_DESCRIPTOR_T dataset_desc, raft::device_matrix_view graph, INDEX_T* const result_indices_ptr, // [num_queries, topk] DISTANCE_T* const result_distances_ptr, // [num_queries, topk] @@ -133,7 +117,7 @@ struct search_plan_impl : public search_plan_impl_base { const INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] std::uint32_t* const num_executed_iterations, // [num_queries] uint32_t topk, - SAMPLE_FILTER_T sample_filter){}; + SAMPLE_FILTER_T sample_filter) {}; void adjust_search_params() { @@ -160,6 +144,7 @@ struct search_plan_impl : public search_plan_impl_base { itopk32); itopk_size = itopk32; } + team_size = dataset_desc.team_size; } // defines hash_bitlen, small_hash_bitlen, small_hash_reset interval, hash_size @@ -292,10 +277,6 @@ struct search_plan_impl : public search_plan_impl_base { algo != search_algo::MULTI_KERNEL) { error_message += "An invalid kernel mode has been given: " + std::to_string((int)algo) + ""; } - if (team_size != 0 && team_size != 4 && team_size != 8 && team_size != 16 && team_size != 32) { - 
error_message += - "`team_size` must be 0, 4, 8, 16 or 32. " + std::to_string(team_size) + " has been given."; - } if (thread_block_size != 0 && thread_block_size != 64 && thread_block_size != 128 && thread_block_size != 256 && thread_block_size != 512 && thread_block_size != 1024) { error_message += "`thread_block_size` must be 0, 64, 128, 256 or 512. " + @@ -330,20 +311,4 @@ struct search_plan_impl : public search_plan_impl_base { } }; -// template -// struct search_plan { -// search_plan(raft::resources const& res, -// search_params param, -// int64_t dim, -// int64_t graph_degree) -// : plan(res, param, dim, graph_degree) -// { -// } -// void check(uint32_t topk) { plan.check(topk); } - -// // private: -// detail::search_plan_impl plan; -// }; -/** @} */ // end group cagra - } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh index 0a101cbfe..187fe71e3 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh @@ -22,7 +22,7 @@ #include "search_plan.cuh" #include "search_single_cta_kernel.cuh" #include "topk_by_radix.cuh" -#include "topk_for_cagra/topk_core.cuh" // TODO replace with raft topk +#include "topk_for_cagra/topk.h" // TODO replace with raft topk #include "utils.hpp" #include @@ -49,58 +49,57 @@ namespace cuvs::neighbors::cagra::detail { namespace single_cta_search { -template -struct search : search_plan_impl { - using DATA_T = typename DATASET_DESCRIPTOR_T::DATA_T; - using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; - using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; - - using search_plan_impl::max_queries; - using search_plan_impl::itopk_size; - using search_plan_impl::algo; - using search_plan_impl::team_size; - using search_plan_impl::search_width; - using search_plan_impl::min_iterations; - using search_plan_impl::max_iterations; - using 
search_plan_impl::thread_block_size; - using search_plan_impl::hashmap_mode; - using search_plan_impl::hashmap_min_bitlen; - using search_plan_impl::hashmap_max_fill_rate; - using search_plan_impl::num_random_samplings; - using search_plan_impl::rand_xor_mask; - - using search_plan_impl::dim; - using search_plan_impl::graph_degree; - using search_plan_impl::topk; - - using search_plan_impl::hash_bitlen; - - using search_plan_impl::small_hash_bitlen; - using search_plan_impl::small_hash_reset_interval; - using search_plan_impl::hashmap_size; - using search_plan_impl::dataset_size; - using search_plan_impl::result_buffer_size; - - using search_plan_impl::smem_size; - - using search_plan_impl::hashmap; - using search_plan_impl::num_executed_iterations; - using search_plan_impl::dev_seed; - using search_plan_impl::num_seeds; +template +struct search : search_plan_impl { + using base_type = search_plan_impl; + using DATA_T = typename base_type::DATA_T; + using INDEX_T = typename base_type::INDEX_T; + using DISTANCE_T = typename base_type::DISTANCE_T; + + using base_type::algo; + using base_type::hashmap_max_fill_rate; + using base_type::hashmap_min_bitlen; + using base_type::hashmap_mode; + using base_type::itopk_size; + using base_type::max_iterations; + using base_type::max_queries; + using base_type::min_iterations; + using base_type::num_random_samplings; + using base_type::rand_xor_mask; + using base_type::search_width; + using base_type::team_size; + using base_type::thread_block_size; + + using base_type::dim; + using base_type::graph_degree; + using base_type::topk; + + using base_type::hash_bitlen; + + using base_type::dataset_size; + using base_type::hashmap_size; + using base_type::result_buffer_size; + using base_type::small_hash_bitlen; + using base_type::small_hash_reset_interval; + + using base_type::smem_size; + + using base_type::dataset_desc; + using base_type::dev_seed; + using base_type::hashmap; + using base_type::num_executed_iterations; + using 
base_type::num_seeds; uint32_t num_itopk_candidates; search(raft::resources const& res, search_params params, + const dataset_descriptor_host& dataset_desc, int64_t dim, int64_t graph_degree, uint32_t topk, cuvs::distance::DistanceType metric) - : search_plan_impl( - res, params, dim, graph_degree, topk, metric) + : base_type(res, params, dataset_desc, dim, graph_degree, topk, metric) { set_params(res); } @@ -128,14 +127,11 @@ struct search : search_plan_impl { constexpr unsigned max_block_size = 1024; // const std::uint32_t topk_ws_size = 3; - const auto query_smem_buffer_length = - raft::ceildiv(dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM; const std::uint32_t base_smem_size = - sizeof(float) * query_smem_buffer_length + + dataset_desc.smem_ws_size_in_bytes + (sizeof(INDEX_T) + sizeof(DISTANCE_T)) * result_buffer_size_32 + sizeof(INDEX_T) * hashmap::get_size(small_hash_bitlen) + sizeof(INDEX_T) * search_width + - sizeof(std::uint32_t) * topk_ws_size + sizeof(std::uint32_t) + - DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte; + sizeof(std::uint32_t) * topk_ws_size + sizeof(std::uint32_t); smem_size = base_smem_size; if (num_itopk_candidates > 256) { // Tentatively calculate the required share memory size when radix @@ -212,7 +208,6 @@ struct search : search_plan_impl { } void operator()(raft::resources const& res, - DATASET_DESCRIPTOR_T dataset_desc, raft::device_matrix_view graph, INDEX_T* const result_indices_ptr, // [num_queries, topk] DISTANCE_T* const result_distances_ptr, // [num_queries, topk] @@ -224,28 +219,27 @@ struct search : search_plan_impl { SAMPLE_FILTER_T sample_filter) { cudaStream_t stream = raft::resource::get_cuda_stream(res); - select_and_run( - dataset_desc, - graph, - result_indices_ptr, - result_distances_ptr, - queries_ptr, - num_queries, - dev_seed_ptr, - num_executed_iterations, - *this, - topk, - num_itopk_candidates, - static_cast(thread_block_size), - smem_size, - hash_bitlen, - hashmap.data(), - small_hash_bitlen, - 
small_hash_reset_interval, - num_seeds, - sample_filter, - this->metric, - stream); + select_and_run(dataset_desc.dev_ptr, + graph, + result_indices_ptr, + result_distances_ptr, + queries_ptr, + num_queries, + dev_seed_ptr, + num_executed_iterations, + *this, + topk, + num_itopk_candidates, + static_cast(thread_block_size), + smem_size, + hash_bitlen, + hashmap.data(), + small_hash_bitlen, + small_hash_reset_interval, + num_seeds, + sample_filter, + this->metric, + stream); } }; diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py index a361269a6..b401aed1a 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py @@ -48,7 +48,6 @@ } // namespace cuvs::neighbors::cagra::detail::single_cta_search """ -mxdim_team = [(128, 8), (256, 16), (512, 32), (1024, 32)] # block = [(64, 16), (128, 8), (256, 4), (512, 2), (1024, 1)] # itopk_candidates = [64, 128, 256] # itopk_size = [64, 128, 256, 512] @@ -69,14 +68,13 @@ # knn for type_path, (data_t, idx_t, distance_t) in search_types.items(): - for (mxdim, team) in mxdim_team: - path = f"search_single_cta_{type_path}_dim{mxdim}_t{team}.cu" - with open(path, "w") as f: - f.write(header) - f.write( - f"instantiate_kernel_selection(\n {team}, {mxdim}, cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<{data_t} COMMA {idx_t} COMMA {distance_t}>, cuvs::neighbors::filtering::none_cagra_sample_filter);\n" - ) + path = f"search_single_cta_{type_path}.cu" + with open(path, "w") as f: + f.write(header) + f.write( + f"instantiate_kernel_selection(\n {data_t}, {idx_t}, {distance_t}, cuvs::neighbors::filtering::none_cagra_sample_filter);\n" + ) - f.write(trailer) - # For pasting into CMakeLists.txt - print(f"src/neighbors/detail/cagra/{path}") + f.write(trailer) + # For pasting into CMakeLists.txt + print(f"src/neighbors/detail/cagra/{path}") diff --git 
a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32.cu similarity index 85% rename from cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim128_t8.cu rename to cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32.cu index 1e2b83492..e50335908 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim128_t8.cu +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32.cu @@ -29,9 +29,6 @@ namespace cuvs::neighbors::cagra::detail::single_cta_search { instantiate_kernel_selection( - 8, - 128, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); + float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu deleted file mode 100644 index 4cf4a26f7..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 32, - 1024, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu deleted file mode 100644 index 692710476..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 16, - 256, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu deleted file mode 100644 index ed3a900ff..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 32, - 512, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64.cu similarity index 85% rename from cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu rename to cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64.cu index c2cfb13c4..167e243ac 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64.cu @@ -29,9 +29,6 @@ namespace cuvs::neighbors::cagra::detail::single_cta_search { instantiate_kernel_selection( - 8, - 128, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); + float, uint64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu deleted file mode 100644 index 2c4da00db..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 32, - 1024, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu deleted file mode 100644 index 8b26a595f..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 8, - 128, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu deleted file mode 100644 index a93f893d4..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 16, - 256, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu deleted file mode 100644 index 4a7502e3e..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 32, - 512, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32.cu similarity index 85% rename from cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim256_t16.cu rename to cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32.cu index cfae9e367..901ddeac1 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim256_t16.cu +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32.cu @@ -29,9 +29,6 @@ namespace cuvs::neighbors::cagra::detail::single_cta_search { instantiate_kernel_selection( - 16, - 256, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); + half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim1024_t32.cu deleted file mode 100644 index 6c13df91a..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim1024_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 32, - 1024, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim512_t32.cu deleted file mode 100644 index 12aa72a24..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 32, - 512, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64.cu similarity index 85% rename from cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu rename to cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64.cu index 7d3e86f38..7ef167666 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64.cu @@ -29,9 +29,6 @@ namespace cuvs::neighbors::cagra::detail::single_cta_search { instantiate_kernel_selection( - 8, - 128, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); + half, uint64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu deleted file mode 100644 index 84a173d6d..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 32, - 1024, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu deleted file mode 100644 index d9c5198eb..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 16, - 256, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu deleted file mode 100644 index 3ba8f4e4d..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 32, - 512, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh index a4581d15e..2a9974575 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh @@ -21,31 +21,28 @@ namespace cuvs::neighbors::cagra::detail::single_cta_search { -#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATASET_DESC_T, SAMPLE_FILTER_T) \ - template void select_and_run( \ - DATASET_DESC_T dataset_desc, \ - raft::device_matrix_view \ - graph, \ - typename DATASET_DESC_T::INDEX_T* const topk_indices_ptr, \ - typename DATASET_DESC_T::DISTANCE_T* const topk_distances_ptr, \ - const typename DATASET_DESC_T::DATA_T* const queries_ptr, \ - const uint32_t num_queries, \ - const typename DATASET_DESC_T::INDEX_T* dev_seed_ptr, \ - uint32_t* const num_executed_iterations, \ - const search_params& ps, \ - uint32_t topk, \ - uint32_t num_itopk_candidates, \ - uint32_t block_size, \ - uint32_t smem_size, \ - int64_t hash_bitlen, \ - typename DATASET_DESC_T::INDEX_T* hashmap_ptr, \ - size_t small_hash_bitlen, \ - size_t small_hash_reset_interval, \ - uint32_t num_seeds, \ - SAMPLE_FILTER_T sample_filter, \ - cuvs::distance::DistanceType metric, \ +#define instantiate_kernel_selection(DataT, IndexT, DistanceT, SampleFilterT) \ + template void select_and_run( \ + const 
dataset_descriptor_base_t* dataset_desc, \ + raft::device_matrix_view graph, \ + IndexT* topk_indices_ptr, \ + DistanceT* topk_distances_ptr, \ + const DataT* queries_ptr, \ + uint32_t num_queries, \ + const IndexT* dev_seed_ptr, \ + uint32_t* num_executed_iterations, \ + const search_params& ps, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + IndexT* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_seeds, \ + SampleFilterT sample_filter, \ + cuvs::distance::DistanceType metric, \ cudaStream_t stream); -#define COMMA , - } // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu new file mode 100644 index 000000000..d2ab1d8a2 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include "search_single_cta_inst.cuh" + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail::single_cta_search { +instantiate_kernel_selection( + int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); + +} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu deleted file mode 100644 index ad2ca16fc..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 32, - 1024, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu deleted file mode 100644 index 6130a84bc..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 8, - 128, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu deleted file mode 100644 index 1e7bee57c..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 16, - 256, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu deleted file mode 100644 index 7f789e3d0..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 32, - 512, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh deleted file mode 100644 index 79f6e153c..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh +++ /dev/null @@ -1,588 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include -#include // RAFT_EXPLICIT - -#include - -namespace cuvs::neighbors::cagra::detail { -namespace single_cta_search { - -#ifdef CUVS_EXPLICIT_INSTANTIATE_ONLY - -template -void select_and_run( - DATASET_DESCRIPTOR_T dataset_desc, - raft::device_matrix_view - graph, - typename DATASET_DESCRIPTOR_T::INDEX_T* const topk_indices_ptr, // [num_queries, topk] - typename DATASET_DESCRIPTOR_T::DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] - const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr, // [num_queries, dataset_dim] - const uint32_t num_queries, - const typename DATASET_DESCRIPTOR_T::INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] - uint32_t* const num_executed_iterations, // [num_queries,] - const search_params& ps, - uint32_t topk, - uint32_t num_itopk_candidates, - uint32_t block_size, // - uint32_t smem_size, - int64_t hash_bitlen, - typename DATASET_DESCRIPTOR_T::INDEX_T* hashmap_ptr, - size_t small_hash_bitlen, - size_t small_hash_reset_interval, - uint32_t num_seeds, - SAMPLE_FILTER_T sample_filter, - cuvs::distance::DistanceType metric, - cudaStream_t stream) RAFT_EXPLICIT; - -#endif // RAFT_EXPLICIT_INSTANTIATE_ONLY - -#define instantiate_single_cta_select_and_run( \ - TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T) \ - extern template void select_and_run< \ - TEAM_SIZE, \ - MAX_DATASET_DIM, \ - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, \ - SAMPLE_FILTER_T>( \ - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t \ - dataset, \ - raft::device_matrix_view graph, \ - INDEX_T* const topk_indices_ptr, \ - DISTANCE_T* const topk_distances_ptr, \ - const DATA_T* const queries_ptr, \ - const uint32_t num_queries, \ - const INDEX_T* dev_seed_ptr, \ - uint32_t* const num_executed_iterations, \ - const search_params& ps, \ - uint32_t topk, \ - uint32_t num_itopk_candidates, \ - uint32_t block_size, \ - uint32_t smem_size, \ - int64_t hash_bitlen, \ - 
INDEX_T* hashmap_ptr, \ - size_t small_hash_bitlen, \ - size_t small_hash_reset_interval, \ - uint32_t num_seeds, \ - SAMPLE_FILTER_T sample_filter, \ - cuvs::distance::DistanceType metric, \ - cudaStream_t stream); - -instantiate_single_cta_select_and_run( - 32, 1024, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 8, 128, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 16, 256, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 32, 512, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 32, 1024, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 8, 128, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 16, 256, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 32, 512, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 32, 1024, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 8, 128, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 16, 256, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 32, 512, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 32, 1024, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 8, 128, uint8_t, uint32_t, float, 
cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 16, 256, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_single_cta_select_and_run( - 32, 512, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); - -#undef instantiate_single_cta_select_and_run - -#define instantiate_q_single_cta_select_and_run(TEAM_SIZE, \ - MAX_DATASET_DIM, \ - CODE_BOOK_T, \ - PQ_BITS, \ - PQ_CODE_BOOK_DIM, \ - DATA_T, \ - INDEX_T, \ - DISTANCE_T, \ - SAMPLE_FILTER_T) \ - extern template void \ - select_and_run, \ - SAMPLE_FILTER_T>( \ - cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t dataset, \ - raft::device_matrix_view graph, \ - INDEX_T* const topk_indices_ptr, \ - DISTANCE_T* const topk_distances_ptr, \ - const DATA_T* const queries_ptr, \ - const uint32_t num_queries, \ - const INDEX_T* dev_seed_ptr, \ - uint32_t* const num_executed_iterations, \ - const search_params& ps, \ - uint32_t topk, \ - uint32_t num_itopk_candidates, \ - uint32_t block_size, \ - uint32_t smem_size, \ - int64_t hash_bitlen, \ - INDEX_T* hashmap_ptr, \ - size_t small_hash_bitlen, \ - size_t small_hash_reset_interval, \ - uint32_t num_seeds, \ - SAMPLE_FILTER_T sample_filter, \ - cuvs::distance::DistanceType metric, \ - cudaStream_t stream); - -instantiate_q_single_cta_select_and_run( - 8, 128, half, 8, 2, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 16, 256, half, 8, 2, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 32, 512, half, 8, 2, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 2, - half, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 8, 128, half, 8, 4, 
half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 16, 256, half, 8, 4, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 32, 512, half, 8, 4, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 4, - half, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -instantiate_q_single_cta_select_and_run( - 8, 128, half, 8, 2, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(16, - 256, - half, - 8, - 2, - float, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 512, - half, - 8, - 2, - float, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 2, - float, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 8, 128, half, 8, 4, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(16, - 256, - half, - 8, - 4, - float, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 512, - half, - 8, - 4, - float, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 4, - float, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -instantiate_q_single_cta_select_and_run( - 8, 128, half, 8, 2, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 16, 256, half, 8, 2, half, int64_t, float, 
cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 32, 512, half, 8, 2, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 32, 1024, half, 8, 2, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 8, 128, half, 8, 4, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 16, 256, half, 8, 4, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 32, 512, half, 8, 4, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 32, 1024, half, 8, 4, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); - -instantiate_q_single_cta_select_and_run( - 8, 128, half, 8, 2, float, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 16, 256, half, 8, 2, float, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 32, 512, half, 8, 2, float, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 2, - float, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 8, 128, half, 8, 4, float, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 16, 256, half, 8, 4, float, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 32, 512, half, 8, 4, float, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 4, - 
float, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -instantiate_q_single_cta_select_and_run(8, - 128, - half, - 8, - 2, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(16, - 256, - half, - 8, - 2, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 512, - half, - 8, - 2, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 2, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(8, - 128, - half, - 8, - 4, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(16, - 256, - half, - 8, - 4, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 512, - half, - 8, - 4, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 4, - uint8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -instantiate_q_single_cta_select_and_run(8, - 128, - half, - 8, - 2, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(16, - 256, - half, - 8, - 2, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 512, - half, - 8, - 2, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 2, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); 
-instantiate_q_single_cta_select_and_run(8, - 128, - half, - 8, - 4, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(16, - 256, - half, - 8, - 4, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 512, - half, - 8, - 4, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 4, - int8_t, - uint32_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -instantiate_q_single_cta_select_and_run(8, - 128, - half, - 8, - 2, - uint8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(16, - 256, - half, - 8, - 2, - uint8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 512, - half, - 8, - 2, - uint8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 2, - uint8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(8, - 128, - half, - 8, - 4, - uint8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(16, - 256, - half, - 8, - 4, - uint8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 512, - half, - 8, - 4, - uint8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 4, - uint8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -instantiate_q_single_cta_select_and_run( - 8, 128, half, 8, 2, int8_t, int64_t, float, 
cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(16, - 256, - half, - 8, - 2, - int8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 512, - half, - 8, - 2, - int8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 2, - int8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run( - 8, 128, half, 8, 4, int8_t, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(16, - 256, - half, - 8, - 4, - int8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 512, - half, - 8, - 4, - int8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); -instantiate_q_single_cta_select_and_run(32, - 1024, - half, - 8, - 4, - int8_t, - int64_t, - float, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -#undef instantiate_q_single_cta_select_and_run - -} // namespace single_cta_search -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index a101cdc1f..61354413d 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -15,13 +15,15 @@ */ #pragma once +#include "search_single_cta_kernel.cuh" + #include "bitonic.hpp" #include "compute_distance.hpp" #include "device_common.hpp" #include "hashmap.hpp" #include "search_plan.cuh" #include "topk_by_radix.cuh" -#include "topk_for_cagra/topk_core.cuh" // TODO replace with raft topk +#include "topk_for_cagra/topk.h" // TODO replace with raft topk #include "utils.hpp" 
#include @@ -60,7 +62,6 @@ __device__ void pickup_next_parents(std::uint32_t* const terminate_flag, INDEX_T* const next_parent_indices, INDEX_T* const internal_topk_indices, const std::size_t internal_topk_size, - const std::size_t dataset_size, const std::uint32_t search_width) { constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask::value; @@ -454,9 +455,7 @@ __device__ inline void hashmap_restore(INDEX_T* const hashmap_ptr, } // One query one thread block -template (dataset_desc.dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM; - auto query_buffer = reinterpret_cast(smem); - auto result_indices_buffer = reinterpret_cast(query_buffer + query_smem_buffer_length); + // Set smem working buffer for the distance calculation + auto distance_workspace = dataset_desc->set_smem_ws(smem); + + auto result_indices_buffer = + reinterpret_cast(smem + dataset_desc->smem_ws_size_in_bytes()); auto result_distances_buffer = reinterpret_cast(result_indices_buffer + result_buffer_size_32); auto visited_hash_buffer = reinterpret_cast(result_distances_buffer + result_buffer_size_32); auto parent_list_buffer = reinterpret_cast(visited_hash_buffer + small_hash_size); - auto distance_work_buffer_ptr = - reinterpret_cast(parent_list_buffer + search_width); - auto topk_ws = reinterpret_cast(distance_work_buffer_ptr + - DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte); - auto terminate_flag = reinterpret_cast(topk_ws + 3); - auto smem_work_ptr = reinterpret_cast(terminate_flag + 1); - - // Set smem working buffer for the distance calculation - dataset_desc.set_smem_ptr(distance_work_buffer_ptr); + auto topk_ws = reinterpret_cast(parent_list_buffer + search_width); + auto terminate_flag = reinterpret_cast(topk_ws + 3); + auto smem_work_ptr = reinterpret_cast(terminate_flag + 1); // A flag for filtering. 
auto filter_flag = terminate_flag; - const DATA_T* const query_ptr = queries_ptr + query_id * dataset_desc.dim; - dataset_desc.template copy_query( - query_ptr, query_buffer, query_smem_buffer_length); + const DATA_T* const query_ptr = queries_ptr + query_id * dataset_desc->dim; + dataset_desc->copy_query(distance_workspace, query_ptr); if (threadIdx.x == 0) { terminate_flag[0] = 0; @@ -570,18 +562,17 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( // compute distance to randomly selecting nodes _CLK_START(); const INDEX_T* const local_seed_ptr = seed_ptr ? seed_ptr + (num_seeds * query_id) : nullptr; - device::compute_distance_to_random_nodes(result_indices_buffer, - result_distances_buffer, - query_buffer, - dataset_desc, - result_buffer_size, - num_distilation, - rand_xor_mask, - local_seed_ptr, - num_seeds, - local_visited_hashmap_ptr, - hash_bitlen, - metric); + dataset_desc->compute_distance_to_random_nodes(distance_workspace, + result_indices_buffer, + result_distances_buffer, + result_buffer_size, + num_distilation, + rand_xor_mask, + local_seed_ptr, + num_seeds, + local_visited_hashmap_ptr, + hash_bitlen, + metric); __syncthreads(); _CLK_REC(clk_compute_1st_distance); @@ -683,12 +674,8 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( // pick up next parents if (threadIdx.x < 32) { _CLK_START(); - pickup_next_parents(terminate_flag, - parent_list_buffer, - result_indices_buffer, - internal_topk, - dataset_desc.size, - search_width); + pickup_next_parents( + terminate_flag, parent_list_buffer, result_indices_buffer, internal_topk, search_width); _CLK_REC(clk_pickup_parents); } @@ -706,20 +693,17 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( // compute the norms between child nodes and query node _CLK_START(); - constexpr unsigned max_n_frags = 8; - device::compute_distance_to_child_nodes( - result_indices_buffer + internal_topk, - result_distances_buffer + internal_topk, - query_buffer, - dataset_desc, - knn_graph, - graph_degree, 
- local_visited_hashmap_ptr, - hash_bitlen, - parent_list_buffer, - result_indices_buffer, - search_width, - metric); + dataset_desc->compute_distance_to_child_nodes(distance_workspace, + result_indices_buffer + internal_topk, + result_distances_buffer + internal_topk, + knn_graph, + graph_degree, + local_visited_hashmap_ptr, + hash_bitlen, + parent_list_buffer, + result_indices_buffer, + search_width, + metric); __syncthreads(); _CLK_REC(clk_compute_distance); @@ -815,50 +799,33 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( #endif } -template +template struct search_kernel_config { - using kernel_t = decltype(&search_kernel); + using kernel_t = decltype(&search_kernel<64, 64, 0, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>); template static auto choose_search_kernel(unsigned itopk_size) -> kernel_t { if (itopk_size <= 64) { - return search_kernel; } else if (itopk_size <= 128) { - return search_kernel; } else if (itopk_size <= 256) { - return search_kernel; } else if (itopk_size <= 512) { - return search_kernel; + return search_kernel<256, max_candidates, 0, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>; } else if (itopk_size <= 512) { - return search_kernel; + return search_kernel<512, max_candidates, 0, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>; } } THROW("No kernel for parametels itopk_size %u, num_itopk_candidates %u", @@ -905,40 +860,36 @@ struct search_kernel_config { } }; -template -void select_and_run( - DATASET_DESCRIPTOR_T dataset_desc, - raft::device_matrix_view - graph, - typename DATASET_DESCRIPTOR_T::INDEX_T* const topk_indices_ptr, // [num_queries, topk] - typename DATASET_DESCRIPTOR_T::DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] - const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr, // [num_queries, dataset_dim] - const uint32_t num_queries, - const typename DATASET_DESCRIPTOR_T::INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] - uint32_t* const num_executed_iterations, // [num_queries,] - const search_params& ps, - 
uint32_t topk, - uint32_t num_itopk_candidates, - uint32_t block_size, // - uint32_t smem_size, - int64_t hash_bitlen, - typename DATASET_DESCRIPTOR_T::INDEX_T* hashmap_ptr, - size_t small_hash_bitlen, - size_t small_hash_reset_interval, - uint32_t num_seeds, - SAMPLE_FILTER_T sample_filter, - cuvs::distance::DistanceType metric, - cudaStream_t stream) +template +void select_and_run(const dataset_descriptor_base_t* dataset_desc, + raft::device_matrix_view graph, + IndexT* topk_indices_ptr, // [num_queries, topk] + DistanceT* topk_distances_ptr, // [num_queries, topk] + const DataT* queries_ptr, // [num_queries, dataset_dim] + uint32_t num_queries, + const IndexT* dev_seed_ptr, // [num_queries, num_seeds] + uint32_t* num_executed_iterations, // [num_queries,] + const search_params& ps, + uint32_t topk, + uint32_t num_itopk_candidates, + uint32_t block_size, // + uint32_t smem_size, + int64_t hash_bitlen, + IndexT* hashmap_ptr, + size_t small_hash_bitlen, + size_t small_hash_reset_interval, + uint32_t num_seeds, + SampleFilterT sample_filter, + cuvs::distance::DistanceType metric, + cudaStream_t stream) { auto kernel = - search_kernel_config:: - choose_itopk_and_mx_candidates(ps.itopk_size, num_itopk_candidates, block_size); - RAFT_CUDA_TRY(cudaFuncSetAttribute(kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - smem_size + DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte)); + search_kernel_config, + SampleFilterT>::choose_itopk_and_mx_candidates(ps.itopk_size, + num_itopk_candidates, + block_size); + RAFT_CUDA_TRY( + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); dim3 thread_dims(block_size, 1, 1); dim3 block_dims(1, num_queries, 1); RAFT_LOG_DEBUG( diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh index 40de22f2e..d21c7f7aa 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh +++ 
b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,10 +15,33 @@ */ #pragma once -#ifndef CUVS_EXPLICIT_INSTANTIATE_ONLY -#include "search_single_cta_kernel-inl.cuh" -#endif +#include "compute_distance.hpp" -#ifdef RAFT_COMPILED -#include "search_single_cta_kernel-ext.cuh" -#endif +#include + +namespace cuvs::neighbors::cagra::detail::single_cta_search { + +template +void select_and_run(const dataset_descriptor_base_t* dataset_desc, + raft::device_matrix_view graph, + IndexT* topk_indices_ptr, // [num_queries, topk] + DistanceT* topk_distances_ptr, // [num_queries, topk] + const DataT* queries_ptr, // [num_queries, dataset_dim] + uint32_t num_queries, + const IndexT* dev_seed_ptr, // [num_queries, num_seeds] + uint32_t* num_executed_iterations, // [num_queries,] + const search_params& ps, + uint32_t topk, + uint32_t num_itopk_candidates, + uint32_t block_size, // + uint32_t smem_size, + int64_t hash_bitlen, + IndexT* hashmap_ptr, + size_t small_hash_bitlen, + size_t small_hash_reset_interval, + uint32_t num_seeds, + SampleFilterT sample_filter, + cuvs::distance::DistanceType metric, + cudaStream_t stream); + +} diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu new file mode 100644 index 000000000..6404c640c --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include "search_single_cta_inst.cuh" + +#include "compute_distance.hpp" + +namespace cuvs::neighbors::cagra::detail::single_cta_search { +instantiate_kernel_selection( + uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); + +} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu deleted file mode 100644 index 35e04ea6a..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 32, - 1024, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu deleted file mode 100644 index 614e6ca01..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 8, - 128, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu deleted file mode 100644 index 005afb566..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 16, - 256, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu deleted file mode 100644 index af30b2e24..000000000 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by search_single_cta_00_generate.py - * - * Make changes there and run in this directory: - * - * > python search_single_cta_00_generate.py - * - */ - -#include "search_single_cta_inst.cuh" - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - 32, - 512, - cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t, - cuvs::neighbors::filtering::none_cagra_sample_filter); - -} // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/topk_by_radix.cuh b/cpp/src/neighbors/detail/cagra/topk_by_radix.cuh index 67173026b..b6f97cb26 100644 --- a/cpp/src/neighbors/detail/cagra/topk_by_radix.cuh +++ b/cpp/src/neighbors/detail/cagra/topk_by_radix.cuh @@ -32,17 +32,17 @@ struct topk_by_radix_sort : topk_by_radix_sort_base {}; template struct topk_by_radix_sort> : topk_by_radix_sort_base { - __device__ void operator()(uint32_t topk, - uint32_t batch_size, - uint32_t len_x, - const uint32_t* _x, - const IdxT* _in_vals, - uint32_t* _y, - IdxT* _out_vals, - uint32_t* work, - uint32_t* _hints, - bool sort, - uint32_t* _smem) + RAFT_DEVICE_INLINE_FUNCTION void operator()(uint32_t topk, + uint32_t batch_size, + uint32_t len_x, + const uint32_t* _x, + const IdxT* _in_vals, + uint32_t* _y, + IdxT* _out_vals, + uint32_t* work, + uint32_t* _hints, + bool sort, + uint32_t* _smem) { std::uint8_t* const state = reinterpret_cast(work); topk_cta_11_core::state_bit_lenght, @@ -60,17 +60,17 @@ struct topk_by_radix_sort V))>> \ : topk_by_radix_sort_base { \ - __device__ void operator()(uint32_t topk, \ - uint32_t batch_size, \ - uint32_t len_x, \ - const uint32_t* _x, \ - const IdxT* _in_vals, \ - uint32_t* _y, \ - IdxT* _out_vals, \ - uint32_t* work, \ - uint32_t* _hints, \ - bool sort, \ - uint32_t* _smem) \ + RAFT_DEVICE_INLINE_FUNCTION void operator()(uint32_t topk, \ + uint32_t batch_size, \ + uint32_t len_x, \ + 
const uint32_t* _x, \ + const IdxT* _in_vals, \ + uint32_t* _y, \ + IdxT* _out_vals, \ + uint32_t* work, \ + uint32_t* _hints, \ + bool sort, \ + uint32_t* _smem) \ { \ assert(blockDim.x >= V / 4); \ std::uint8_t* state = (std::uint8_t*)work; \ diff --git a/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk.cu b/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk.cu new file mode 100644 index 000000000..72ff2cb85 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk.cu @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "topk_core.cuh" + +namespace cuvs::neighbors::cagra::detail { + +// +size_t _cuann_find_topk_bufferSize(uint32_t topK, + uint32_t sizeBatch, + uint32_t numElements, + cudaDataType_t sampleDtype) +{ + constexpr int numThreads = NUM_THREADS; + constexpr int stateBitLen = STATE_BIT_LENGTH; + assert(stateBitLen == 0 || stateBitLen == 8); + + size_t workspaceSize = 1; + // state + if (stateBitLen == 8) { + workspaceSize = _cuann_aligned( + sizeof(uint8_t) * get_state_size(numElements) * sizeBatch); + } + + return workspaceSize; +} + +template +void _cuann_find_topk(uint32_t topK, + uint32_t sizeBatch, + uint32_t numElements, + const float* inputKeys, // [sizeBatch, ldIK,] + uint32_t ldIK, // (*) ldIK >= numElements + const ValT* inputVals, // [sizeBatch, ldIV,] + uint32_t ldIV, // (*) ldIV >= numElements + float* outputKeys, // [sizeBatch, ldOK,] + uint32_t ldOK, // (*) ldOK >= topK + ValT* outputVals, // [sizeBatch, ldOV,] + uint32_t ldOV, // (*) ldOV >= topK + void* workspace, + bool sort, + uint32_t* hints, + cudaStream_t stream) +{ + assert(ldIK >= numElements); + assert(ldIV >= numElements); + assert(ldOK >= topK); + assert(ldOV >= topK); + + constexpr int numThreads = NUM_THREADS; + constexpr int stateBitLen = STATE_BIT_LENGTH; + assert(stateBitLen == 0 || stateBitLen == 8); + + uint8_t* state = NULL; + if (stateBitLen == 8) { state = (uint8_t*)workspace; } + + dim3 threads(numThreads, 1, 1); + dim3 blocks(sizeBatch, 1, 1); + + void (*cta_kernel)(uint32_t, + uint32_t, + uint32_t, + const uint32_t*, + uint32_t, + const ValT*, + uint32_t, + uint32_t*, + uint32_t, + ValT*, + uint32_t, + uint8_t*, + uint32_t*, + bool) = nullptr; + + // V:vecLen, K:maxTopk, T:numSortThreads +#define SET_KERNEL_VKT(V, K, T, ValT) \ + do { \ + assert(numThreads >= T); \ + assert((K % T) == 0); \ + assert((K / T) <= 4); \ + cta_kernel = kern_topk_cta_11; \ + } while (0) + + // V: vecLen +#define SET_KERNEL_V(V, ValT) \ + do { \ + if (topK <= 32) { \ + SET_KERNEL_VKT(V, 
32, 32, ValT); \ + } else if (topK <= 64) { \ + SET_KERNEL_VKT(V, 64, 32, ValT); \ + } else if (topK <= 96) { \ + SET_KERNEL_VKT(V, 96, 32, ValT); \ + } else if (topK <= 128) { \ + SET_KERNEL_VKT(V, 128, 32, ValT); \ + } else if (topK <= 192) { \ + SET_KERNEL_VKT(V, 192, 64, ValT); \ + } else if (topK <= 256) { \ + SET_KERNEL_VKT(V, 256, 64, ValT); \ + } else if (topK <= 384) { \ + SET_KERNEL_VKT(V, 384, 128, ValT); \ + } else if (topK <= 512) { \ + SET_KERNEL_VKT(V, 512, 128, ValT); \ + } else if (topK <= 768) { \ + SET_KERNEL_VKT(V, 768, 256, ValT); \ + } else if (topK <= 1024) { \ + SET_KERNEL_VKT(V, 1024, 256, ValT); \ + } \ + /* else if (topK <= 1536) { SET_KERNEL_VKT(V, 1536, 512); } */ \ + /* else if (topK <= 2048) { SET_KERNEL_VKT(V, 2048, 512); } */ \ + /* else if (topK <= 3072) { SET_KERNEL_VKT(V, 3072, 1024); } */ \ + /* else if (topK <= 4096) { SET_KERNEL_VKT(V, 4096, 1024); } */ \ + else { \ + RAFT_FAIL("topk must be lower than or equal to 1024"); \ + } \ + } while (0) + + int _vecLen = _get_vecLen(ldIK, 2); + if (_vecLen == 2) { + SET_KERNEL_V(2, ValT); + } else if (_vecLen == 1) { + SET_KERNEL_V(1, ValT); + } + + cta_kernel<<>>(topK, + sizeBatch, + numElements, + (const uint32_t*)inputKeys, + ldIK, + inputVals, + ldIV, + (uint32_t*)outputKeys, + ldOK, + outputVals, + ldOV, + state, + hints, + sort); + + return; +} + +template void _cuann_find_topk(uint32_t topK, + uint32_t sizeBatch, + uint32_t numElements, + const float* inputKeys, // [sizeBatch, ldIK,] + uint32_t ldIK, // (*) ldIK >= numElements + const uint32_t* inputVals, // [sizeBatch, ldIV,] + uint32_t ldIV, // (*) ldIV >= numElements + float* outputKeys, // [sizeBatch, ldOK,] + uint32_t ldOK, // (*) ldOK >= topK + uint32_t* outputVals, // [sizeBatch, ldOV,] + uint32_t ldOV, // (*) ldOV >= topK + void* workspace, + bool sort, + uint32_t* hint, + cudaStream_t stream); + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh 
b/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh index cbf99a556..65f9cfade 100644 --- a/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh +++ b/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh @@ -14,10 +14,15 @@ * limitations under the License. */ #pragma once + +#include "../utils.hpp" #include "topk.h" #include +#include +#include + #include #include #include @@ -25,7 +30,7 @@ namespace cuvs::neighbors::cagra::detail { // -__device__ inline uint32_t convert(uint32_t x) +RAFT_DEVICE_INLINE_FUNCTION constexpr uint32_t convert(uint32_t x) { if (x & 0x80000000) { return x ^ 0xffffffff; @@ -35,7 +40,7 @@ __device__ inline uint32_t convert(uint32_t x) } // -__device__ inline uint16_t convert(uint16_t x) +RAFT_DEVICE_INLINE_FUNCTION constexpr uint16_t convert(uint16_t x) { if (x & 0x8000) { return x ^ 0xffff; @@ -62,7 +67,7 @@ struct u16_vector { // template -__device__ inline void load_u32_vector(struct u32_vector& vec, const uint32_t* x, int i) +RAFT_DEVICE_INLINE_FUNCTION void load_u32_vector(struct u32_vector& vec, const uint32_t* x, int i) { if (vecLen == 1) { vec.x1 = ((uint1*)(x + i))[0]; @@ -77,7 +82,7 @@ __device__ inline void load_u32_vector(struct u32_vector& vec, const uint32_t* x // template -__device__ inline void load_u16_vector(struct u16_vector& vec, const uint16_t* x, int i) +RAFT_DEVICE_INLINE_FUNCTION void load_u16_vector(struct u16_vector& vec, const uint16_t* x, int i) { if (vecLen == 1) { vec.x1 = ((ushort1*)(x + i))[0]; @@ -92,7 +97,7 @@ __device__ inline void load_u16_vector(struct u16_vector& vec, const uint16_t* x // template -__device__ inline uint32_t get_element_from_u32_vector(struct u32_vector& vec, int i) +RAFT_DEVICE_INLINE_FUNCTION uint32_t get_element_from_u32_vector(struct u32_vector& vec, int i) { uint32_t xi; if (vecLen == 1) { @@ -134,7 +139,7 @@ __device__ inline uint32_t get_element_from_u32_vector(struct u32_vector& vec, i // template -__device__ inline uint16_t 
get_element_from_u16_vector(struct u16_vector& vec, int i) +RAFT_DEVICE_INLINE_FUNCTION uint16_t get_element_from_u16_vector(struct u16_vector& vec, int i) { uint16_t xi; if (vecLen == 1) { @@ -175,7 +180,7 @@ __device__ inline uint16_t get_element_from_u16_vector(struct u16_vector& vec, i } template -__device__ inline void block_scan(const T input, T& output) +RAFT_DEVICE_INLINE_FUNCTION void block_scan(const T input, T& output) { switch (blockDim.x) { case 32: { @@ -214,19 +219,19 @@ __device__ inline void block_scan(const T input, T& output) // template -__device__ inline void update_histogram(int itr, - uint32_t thread_id, - uint32_t num_threads, - uint32_t hint, - uint32_t threshold, - uint32_t& num_bins, - uint32_t& shift, - const T* x, // [nx,] - uint32_t nx, - uint32_t* hist, // [num_bins] - uint8_t* state, - uint32_t* output, // [topk] - uint32_t* output_count) +RAFT_DEVICE_INLINE_FUNCTION void update_histogram(int itr, + uint32_t thread_id, + uint32_t num_threads, + uint32_t hint, + uint32_t threshold, + uint32_t& num_bins, + uint32_t& shift, + const T* x, // [nx,] + uint32_t nx, + uint32_t* hist, // [num_bins] + uint8_t* state, + uint32_t* output, // [topk] + uint32_t* output_count) { if (sizeof(T) == 4) { // 32-bit (uint32_t) @@ -324,15 +329,16 @@ __device__ inline void update_histogram(int itr, } template -__device__ inline void select_best_index_for_next_threshold_core(uint32_t& my_index, - uint32_t& my_csum, - const unsigned num_bins, - const uint32_t* const hist, - const uint32_t nx_below_threshold, - const uint32_t max_threshold, - const uint32_t threshold, - const uint32_t shift, - const uint32_t topk) +RAFT_DEVICE_INLINE_FUNCTION void select_best_index_for_next_threshold_core( + uint32_t& my_index, + uint32_t& my_csum, + const unsigned num_bins, + const uint32_t* const hist, + const uint32_t nx_below_threshold, + const uint32_t max_threshold, + const uint32_t threshold, + const uint32_t shift, + const uint32_t topk) { typedef cub::BlockScan 
BlockScanT; __shared__ typename BlockScanT::TempStorage temp_storage; @@ -370,7 +376,7 @@ __device__ inline void select_best_index_for_next_threshold_core(uint32_t& my_in } // -__device__ inline void select_best_index_for_next_threshold( +RAFT_DEVICE_INLINE_FUNCTION void select_best_index_for_next_threshold( const uint32_t topk, const uint32_t threshold, const uint32_t max_threshold, @@ -469,17 +475,17 @@ __device__ inline void select_best_index_for_next_threshold( // template -__device__ inline void output_index_below_threshold(const uint32_t topk, - const uint32_t thread_id, - const uint32_t num_threads, - const uint32_t threshold, - const uint32_t nx_below_threshold, - const T* const x, // [nx,] - const uint32_t nx, - const uint8_t* state, - uint32_t* const output, // [topk] - uint32_t* const output_count, - uint32_t* const output_count_eq) +RAFT_DEVICE_INLINE_FUNCTION void output_index_below_threshold(const uint32_t topk, + const uint32_t thread_id, + const uint32_t num_threads, + const uint32_t threshold, + const uint32_t nx_below_threshold, + const T* const x, // [nx,] + const uint32_t nx, + const uint8_t* state, + uint32_t* const output, // [topk] + uint32_t* const output_count, + uint32_t* const output_count_eq) { int ii = 0; for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) { @@ -530,7 +536,7 @@ __device__ inline void output_index_below_threshold(const uint32_t topk, // template -__device__ inline void swap(T& val1, T& val2) +RAFT_DEVICE_INLINE_FUNCTION constexpr void swap(T& val1, T& val2) { const T val0 = val1; val1 = val2; @@ -539,7 +545,7 @@ __device__ inline void swap(T& val1, T& val2) // template -__device__ inline bool swap_if_needed(K& key1, K& key2) +RAFT_DEVICE_INLINE_FUNCTION constexpr bool swap_if_needed(K& key1, K& key2) { if (key1 > key2) { swap(key1, key2); @@ -550,7 +556,7 @@ __device__ inline bool swap_if_needed(K& key1, K& key2) // template -__device__ inline bool swap_if_needed(K& key1, K& key2, 
V& val1, V& val2) +RAFT_DEVICE_INLINE_FUNCTION constexpr bool swap_if_needed(K& key1, K& key2, V& val1, V& val2) { if (key1 > key2) { swap(key1, key2); @@ -562,7 +568,8 @@ __device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2) // template -__device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2, bool ascending) +RAFT_DEVICE_INLINE_FUNCTION constexpr bool swap_if_needed( + K& key1, K& key2, V& val1, V& val2, bool ascending) { if (key1 == key2) { return false; } if ((key1 > key2) == ascending) { @@ -575,20 +582,20 @@ __device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2, bool a // template -__device__ inline T max_value_of(); +RAFT_DEVICE_INLINE_FUNCTION T max_value_of(); template <> -__device__ inline float max_value_of() +RAFT_DEVICE_INLINE_FUNCTION float max_value_of() { return FLT_MAX; } template <> -__device__ inline uint32_t max_value_of() +RAFT_DEVICE_INLINE_FUNCTION uint32_t max_value_of() { return ~0u; } template -__device__ __host__ inline uint32_t get_state_size(uint32_t len_x) +RAFT_INLINE_FUNCTION constexpr uint32_t get_state_size(uint32_t len_x) { #ifdef __CUDA_ARCH__ const uint32_t num_threads = blockDim.x; @@ -605,16 +612,16 @@ __device__ __host__ inline uint32_t get_state_size(uint32_t len_x) // template -__device__ inline void topk_cta_11_core(uint32_t topk, - uint32_t len_x, - const uint32_t* _x, // [size_batch, ld_x,] - const ValT* _in_vals, // [size_batch, ld_iv,] - uint32_t* _y, // [size_batch, ld_y,] - ValT* _out_vals, // [size_batch, ld_ov,] - uint8_t* _state, // [size_batch, ...,] - uint32_t* _hint, - bool sort, - uint32_t* _smem) +RAFT_DEVICE_INLINE_FUNCTION void topk_cta_11_core(uint32_t topk, + uint32_t len_x, + const uint32_t* _x, // [size_batch, ld_x,] + const ValT* _in_vals, // [size_batch, ld_iv,] + uint32_t* _y, // [size_batch, ld_y,] + ValT* _out_vals, // [size_batch, ld_ov,] + uint8_t* _state, // [size_batch, ...,] + uint32_t* _hint, + bool sort, + uint32_t* _smem) { uint32_t* 
const smem_out_vals = _smem; uint32_t* const hist = &(_smem[2 * maxTopk]); @@ -904,137 +911,4 @@ __launch_bounds__(1024, 1) RAFT_KERNEL _smem); } -// -size_t inline _cuann_find_topk_bufferSize(uint32_t topK, - uint32_t sizeBatch, - uint32_t numElements, - cudaDataType_t sampleDtype) -{ - constexpr int numThreads = NUM_THREADS; - constexpr int stateBitLen = STATE_BIT_LENGTH; - assert(stateBitLen == 0 || stateBitLen == 8); - - size_t workspaceSize = 1; - // state - if (stateBitLen == 8) { - workspaceSize = _cuann_aligned( - sizeof(uint8_t) * get_state_size(numElements) * sizeBatch); - } - - return workspaceSize; -} - -template -inline void _cuann_find_topk(uint32_t topK, - uint32_t sizeBatch, - uint32_t numElements, - const float* inputKeys, // [sizeBatch, ldIK,] - uint32_t ldIK, // (*) ldIK >= numElements - const ValT* inputVals, // [sizeBatch, ldIV,] - uint32_t ldIV, // (*) ldIV >= numElements - float* outputKeys, // [sizeBatch, ldOK,] - uint32_t ldOK, // (*) ldOK >= topK - ValT* outputVals, // [sizeBatch, ldOV,] - uint32_t ldOV, // (*) ldOV >= topK - void* workspace, - bool sort, - uint32_t* hints, - cudaStream_t stream) -{ - assert(ldIK >= numElements); - assert(ldIV >= numElements); - assert(ldOK >= topK); - assert(ldOV >= topK); - - constexpr int numThreads = NUM_THREADS; - constexpr int stateBitLen = STATE_BIT_LENGTH; - assert(stateBitLen == 0 || stateBitLen == 8); - - uint8_t* state = NULL; - if (stateBitLen == 8) { state = (uint8_t*)workspace; } - - dim3 threads(numThreads, 1, 1); - dim3 blocks(sizeBatch, 1, 1); - - void (*cta_kernel)(uint32_t, - uint32_t, - uint32_t, - const uint32_t*, - uint32_t, - const ValT*, - uint32_t, - uint32_t*, - uint32_t, - ValT*, - uint32_t, - uint8_t*, - uint32_t*, - bool) = nullptr; - - // V:vecLen, K:maxTopk, T:numSortThreads -#define SET_KERNEL_VKT(V, K, T, ValT) \ - do { \ - assert(numThreads >= T); \ - assert((K % T) == 0); \ - assert((K / T) <= 4); \ - cta_kernel = kern_topk_cta_11; \ - } while (0) - - // V: vecLen 
-#define SET_KERNEL_V(V, ValT) \ - do { \ - if (topK <= 32) { \ - SET_KERNEL_VKT(V, 32, 32, ValT); \ - } else if (topK <= 64) { \ - SET_KERNEL_VKT(V, 64, 32, ValT); \ - } else if (topK <= 96) { \ - SET_KERNEL_VKT(V, 96, 32, ValT); \ - } else if (topK <= 128) { \ - SET_KERNEL_VKT(V, 128, 32, ValT); \ - } else if (topK <= 192) { \ - SET_KERNEL_VKT(V, 192, 64, ValT); \ - } else if (topK <= 256) { \ - SET_KERNEL_VKT(V, 256, 64, ValT); \ - } else if (topK <= 384) { \ - SET_KERNEL_VKT(V, 384, 128, ValT); \ - } else if (topK <= 512) { \ - SET_KERNEL_VKT(V, 512, 128, ValT); \ - } else if (topK <= 768) { \ - SET_KERNEL_VKT(V, 768, 256, ValT); \ - } else if (topK <= 1024) { \ - SET_KERNEL_VKT(V, 1024, 256, ValT); \ - } \ - /* else if (topK <= 1536) { SET_KERNEL_VKT(V, 1536, 512); } */ \ - /* else if (topK <= 2048) { SET_KERNEL_VKT(V, 2048, 512); } */ \ - /* else if (topK <= 3072) { SET_KERNEL_VKT(V, 3072, 1024); } */ \ - /* else if (topK <= 4096) { SET_KERNEL_VKT(V, 4096, 1024); } */ \ - else { \ - RAFT_FAIL("topk must be lower than or equal to 1024"); \ - } \ - } while (0) - - int _vecLen = _get_vecLen(ldIK, 2); - if (_vecLen == 2) { - SET_KERNEL_V(2, ValT); - } else if (_vecLen == 1) { - SET_KERNEL_V(1, ValT); - } - - cta_kernel<<>>(topK, - sizeBatch, - numElements, - (const uint32_t*)inputKeys, - ldIK, - inputVals, - ldIV, - (uint32_t*)outputKeys, - ldOK, - outputVals, - ldOV, - state, - hints, - sort); - - return; -} } // namespace cuvs::neighbors::cagra::detail From ba52b132b9067398b99d839e7ccf603c0ce6d25a Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 16 Aug 2024 16:15:39 +0200 Subject: [PATCH 02/41] Fix style --- cpp/CMakeLists.txt | 3 ++- .../neighbors/detail/cagra/search_multi_cta_float_uint32.cu | 6 ++++-- .../neighbors/detail/cagra/search_multi_cta_float_uint64.cu | 6 ++++-- .../neighbors/detail/cagra/search_multi_cta_half_uint32.cu | 6 ++++-- .../neighbors/detail/cagra/search_multi_cta_half_uint64.cu | 6 ++++-- 
.../neighbors/detail/cagra/search_multi_cta_int8_uint32.cu | 6 ++++-- .../neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu | 6 ++++-- cpp/src/neighbors/detail/cagra/search_plan.cuh | 2 +- .../detail/cagra/search_single_cta_float_uint32.cu | 6 ++++-- .../detail/cagra/search_single_cta_float_uint64.cu | 6 ++++-- .../neighbors/detail/cagra/search_single_cta_half_uint32.cu | 6 ++++-- .../neighbors/detail/cagra/search_single_cta_half_uint64.cu | 6 ++++-- .../neighbors/detail/cagra/search_single_cta_int8_uint32.cu | 6 ++++-- .../detail/cagra/search_single_cta_uint8_uint32.cu | 6 ++++-- 14 files changed, 51 insertions(+), 26 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1e5a1723f..cf2baa8b7 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -537,7 +537,8 @@ target_compile_options( "$<$:${CUVS_CUDA_FLAGS}>" ) # ensure CUDA symbols aren't relocated to the middle of the debug build binaries -# TODO (achirkin): disabled during experiments with CUDA_SEPARABLE_COMPILATION (otherwise did't link) +# +# TODO(achirkin): disabled during experiments with CUDA_SEPARABLE_COMPILATION (otherwise did't link) # target_link_options(cuvs PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") # ################################################################################################## diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu index 3912e0b0f..39064e003 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu @@ -28,7 +28,9 @@ #include "compute_distance.hpp" namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection(float, + uint32_t, + float, + cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace 
cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64.cu index 45c8c0602..fc2f6ce9c 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64.cu +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64.cu @@ -28,7 +28,9 @@ #include "compute_distance.hpp" namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - float, uint64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection(float, + uint64_t, + float, + cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu index 8c40dce5a..8a255c450 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu @@ -28,7 +28,9 @@ #include "compute_distance.hpp" namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection(half, + uint32_t, + float, + cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64.cu index d1cbac723..016c8d875 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64.cu +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64.cu @@ -28,7 +28,9 @@ #include "compute_distance.hpp" namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - half, uint64_t, float, 
cuvs::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection(half, + uint64_t, + float, + cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu index 5f15b8bc5..17e0c67ff 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu @@ -28,7 +28,9 @@ #include "compute_distance.hpp" namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection(int8_t, + uint32_t, + float, + cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu index cd3b271a1..480f7ab45 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu @@ -28,7 +28,9 @@ #include "compute_distance.hpp" namespace cuvs::neighbors::cagra::detail::multi_cta_search { -instantiate_kernel_selection( - uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection(uint8_t, + uint32_t, + float, + cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_plan.cuh b/cpp/src/neighbors/detail/cagra/search_plan.cuh index 436293e03..e483a33cf 100644 --- a/cpp/src/neighbors/detail/cagra/search_plan.cuh +++ b/cpp/src/neighbors/detail/cagra/search_plan.cuh @@ -117,7 +117,7 @@ struct search_plan_impl : public search_plan_impl_base { 
const INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] std::uint32_t* const num_executed_iterations, // [num_queries] uint32_t topk, - SAMPLE_FILTER_T sample_filter) {}; + SAMPLE_FILTER_T sample_filter){}; void adjust_search_params() { diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32.cu index e50335908..6cc3f1976 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32.cu +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32.cu @@ -28,7 +28,9 @@ #include "compute_distance.hpp" namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection(float, + uint32_t, + float, + cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64.cu index 167e243ac..0e5039733 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64.cu +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64.cu @@ -28,7 +28,9 @@ #include "compute_distance.hpp" namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - float, uint64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection(float, + uint64_t, + float, + cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32.cu index 901ddeac1..5c8dc25bd 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32.cu +++ 
b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32.cu @@ -28,7 +28,9 @@ #include "compute_distance.hpp" namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection(half, + uint32_t, + float, + cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64.cu index 7ef167666..63c089850 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64.cu +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64.cu @@ -28,7 +28,9 @@ #include "compute_distance.hpp" namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - half, uint64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection(half, + uint64_t, + float, + cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu index d2ab1d8a2..c0c0e9c02 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu @@ -28,7 +28,9 @@ #include "compute_distance.hpp" namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection(int8_t, + uint32_t, + float, + cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu 
b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu index 6404c640c..80ea54fe5 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu @@ -28,7 +28,9 @@ #include "compute_distance.hpp" namespace cuvs::neighbors::cagra::detail::single_cta_search { -instantiate_kernel_selection( - uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection(uint8_t, + uint32_t, + float, + cuvs::neighbors::filtering::none_cagra_sample_filter); } // namespace cuvs::neighbors::cagra::detail::single_cta_search From 434e50a832914553921b2dc2d1968e39c83f072b Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 19 Aug 2024 12:09:45 +0200 Subject: [PATCH 03/41] Add missing multi-kernel implementation --- .../detail/cagra/compute_distance.hpp | 23 ++ .../detail/cagra/compute_distance_vpq.cuh | 16 + cpp/src/neighbors/detail/cagra/factory.cuh | 9 +- cpp/src/neighbors/detail/cagra/hashmap.hpp | 26 +- .../detail/cagra/search_multi_kernel.cuh | 367 ++++++------------ 5 files changed, 186 insertions(+), 255 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index dcc5fe285..c1f001c95 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -224,6 +224,13 @@ struct dataset_descriptor_base_t { /** Copy the query to the shared memory. */ _RAFT_DEVICE virtual void copy_query(ws_handle smem_workspace, const DATA_T* query_ptr) const = 0; + /** Compute the distance from the query vector (stored in the smem_workspace) and a dataset vector + * given by the dataset_index. 
*/ + _RAFT_DEVICE virtual auto compute_distance(ws_handle smem_workspace, + INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) const -> DISTANCE_T = 0; + _RAFT_DEVICE virtual void compute_distance_to_random_nodes( ws_handle smem_workspace, INDEX_T* const result_indices_ptr, // [num_pickup] @@ -434,6 +441,22 @@ struct standard_dataset_descriptor_t : public dataset_descriptor_base_t DISTANCE_T + { + switch (metric) { + case cuvs::distance::DistanceType::L2Expanded: + return compute_similarity( + smem_workspace, dataset_index, valid); + case cuvs::distance::DistanceType::InnerProduct: + return compute_similarity( + smem_workspace, dataset_index, valid); + default: return 0; + } + } + template RAFT_DEVICE_INLINE_FUNCTION auto compute_similarity(ws_handle smem_workspace, const INDEX_T dataset_i, diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh index 45eae30c5..7446c5991 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh @@ -188,6 +188,22 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t DISTANCE_T + { + switch (metric) { + case cuvs::distance::DistanceType::L2Expanded: + return compute_similarity( + smem_workspace, dataset_index, valid); + case cuvs::distance::DistanceType::InnerProduct: + return compute_similarity( + smem_workspace, dataset_index, valid); + default: return 0; + } + } + template RAFT_DEVICE_INLINE_FUNCTION DISTANCE_T compute_similarity(ws_handle smem_workspace, const INDEX_T node_id, diff --git a/cpp/src/neighbors/detail/cagra/factory.cuh b/cpp/src/neighbors/detail/cagra/factory.cuh index 5dd902bea..2a6d9add0 100644 --- a/cpp/src/neighbors/detail/cagra/factory.cuh +++ b/cpp/src/neighbors/detail/cagra/factory.cuh @@ -17,7 +17,7 @@ #pragma once #include "search_multi_cta.cuh" -// #include "search_multi_kernel.cuh" +#include "search_multi_kernel.cuh" 
#include "search_plan.cuh" #include "search_single_cta.cuh" @@ -62,10 +62,9 @@ class factory { multi_cta_search::search>( res, plan, dataset_desc, plan.dim, plan.graph_degree, plan.topk, plan.metric); } else { - // return std::make_unique< - // multi_kernel_search::search>( - // res, plan, dataset_desc, plan.dim, plan.graph_degree, plan.topk, plan.metric); - RAFT_FAIL("WIP!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"); + return std::make_unique< + multi_kernel_search::search>( + res, plan, dataset_desc, plan.dim, plan.graph_degree, plan.topk, plan.metric); } } }; diff --git a/cpp/src/neighbors/detail/cagra/hashmap.hpp b/cpp/src/neighbors/detail/cagra/hashmap.hpp index dd6c6c844..2c62dda90 100644 --- a/cpp/src/neighbors/detail/cagra/hashmap.hpp +++ b/cpp/src/neighbors/detail/cagra/hashmap.hpp @@ -29,10 +29,12 @@ namespace cuvs::neighbors::cagra::detail { namespace hashmap { -_RAFT_HOST_DEVICE inline uint32_t get_size(const uint32_t bitlen) { return 1U << bitlen; } +RAFT_INLINE_FUNCTION uint32_t get_size(const uint32_t bitlen) { return 1U << bitlen; } template -_RAFT_DEVICE inline void init(IdxT* const table, const unsigned bitlen, unsigned FIRST_TID = 0) +RAFT_DEVICE_INLINE_FUNCTION void init(IdxT* const table, + const unsigned bitlen, + unsigned FIRST_TID = 0) { if (threadIdx.x < FIRST_TID) return; for (unsigned i = threadIdx.x - FIRST_TID; i < get_size(bitlen); i += blockDim.x - FIRST_TID) { @@ -41,7 +43,9 @@ _RAFT_DEVICE inline void init(IdxT* const table, const unsigned bitlen, unsigned } template -_RAFT_DEVICE inline uint32_t insert(IdxT* const table, const uint32_t bitlen, const IdxT key) +RAFT_DEVICE_INLINE_FUNCTION uint32_t insert(IdxT* const table, + const uint32_t bitlen, + const IdxT key) { // Open addressing is used for collision resolution const uint32_t size = get_size(bitlen); @@ -68,7 +72,9 @@ _RAFT_DEVICE inline uint32_t insert(IdxT* const table, const uint32_t bitlen, co } template -_RAFT_DEVICE inline uint32_t insert(IdxT* const table, const uint32_t 
bitlen, const IdxT key) +RAFT_DEVICE_INLINE_FUNCTION uint32_t insert(IdxT* const table, + const uint32_t bitlen, + const IdxT key) { IdxT ret = 0; if (threadIdx.x % TEAM_SIZE == 0) { ret = insert(table, bitlen, key); } @@ -78,5 +84,17 @@ _RAFT_DEVICE inline uint32_t insert(IdxT* const table, const uint32_t bitlen, co return ret; } +template +RAFT_DEVICE_INLINE_FUNCTION uint32_t +insert(unsigned team_size, IdxT* const table, const uint32_t bitlen, const IdxT key) +{ + IdxT ret = 0; + if (threadIdx.x % team_size == 0) { ret = insert(table, bitlen, key); } + for (unsigned offset = 1; offset < team_size; offset *= 2) { + ret |= __shfl_xor_sync(0xffffffff, ret, offset); + } + return ret; +} + } // namespace hashmap } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh index 5697e503a..c228c51d8 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh @@ -93,9 +93,10 @@ void get_value(T* const host_ptr, const T* const dev_ptr, cudaStream_t cuda_stre } // MAX_DATASET_DIM : must equal to or greater than dataset_dim -template +template RAFT_KERNEL random_pickup_kernel( - const DATASET_DESCRIPTOR_T dataset_desc, + const DATASET_DESCRIPTOR_T* dataset_desc, + uint32_t team_size, const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr, // [num_queries, dataset_dim] const std::size_t num_pickup, const unsigned num_distilation, @@ -114,22 +115,14 @@ RAFT_KERNEL random_pickup_kernel( using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; const auto ldb = hashmap::get_size(hash_bitlen); - const auto global_team_index = (blockIdx.x * blockDim.x + threadIdx.x) / TEAM_SIZE; + const auto global_team_index = (blockIdx.x * blockDim.x + threadIdx.x) / team_size; const uint32_t query_id = blockIdx.y; if (global_team_index >= num_pickup) { return; } + extern __shared__ uint8_t smem[]; + auto 
distance_workspace = dataset_desc->set_smem_ws(smem); // Load a query - extern __shared__ float query_buffer[]; - const auto query_smem_buffer_length = - raft::ceildiv(dataset_desc.dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM; - for (uint32_t i = threadIdx.x; i < query_smem_buffer_length; i += blockDim.x) { - unsigned j = device::swizzling(i); - if (i < dataset_desc.dim) { - query_buffer[j] = cuvs::spatial::knn::detail::utils::mapping{}( - (queries_ptr + query_id * dataset_desc.dim)[i]); - } else { - query_buffer[j] = 0.0; - } - } + dataset_desc->copy_query(distance_workspace, queries_ptr + query_id * dataset_desc->dim); + __syncthreads(); INDEX_T best_index_team_local; @@ -141,27 +134,10 @@ RAFT_KERNEL random_pickup_kernel( } else { // Chose a seed node randomly seed_index = - device::xorshift64((global_team_index ^ rand_xor_mask) * (i + 1)) % dataset_desc.size; - } - - DISTANCE_T norm2; - switch (metric) { - case cuvs::distance::DistanceType::L2Expanded: - norm2 = dataset_desc.template compute_similarity( - query_buffer, seed_index, true); - break; - case cuvs::distance::DistanceType::InnerProduct: - norm2 = - dataset_desc.template compute_similarity( - query_buffer, seed_index, true); - break; - default: break; + device::xorshift64((global_team_index ^ rand_xor_mask) * (i + 1)) % dataset_desc->size; } + DISTANCE_T norm2 = dataset_desc->compute_distance(distance_workspace, seed_index, metric, true); if (norm2 < best_norm2_team_local) { best_norm2_team_local = norm2; best_index_team_local = seed_index; @@ -169,7 +145,7 @@ RAFT_KERNEL random_pickup_kernel( } const auto store_gmem_index = global_team_index + (ldr * query_id); - if (threadIdx.x % TEAM_SIZE == 0) { + if (threadIdx.x % team_size == 0) { if (hashmap::insert( visited_hashmap_ptr + (ldb * query_id), hash_bitlen, best_index_team_local)) { result_distances_ptr[store_gmem_index] = best_norm2_team_local; @@ -182,47 +158,43 @@ RAFT_KERNEL random_pickup_kernel( } // MAX_DATASET_DIM : must be equal to or greater 
than dataset_dim -template -void random_pickup( - const DATASET_DESCRIPTOR_T dataset_desc, - const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr, // [num_queries, dataset_dim] - const std::size_t num_queries, - const std::size_t num_pickup, - const unsigned num_distilation, - const uint64_t rand_xor_mask, - const typename DATASET_DESCRIPTOR_T::INDEX_T* seed_ptr, // [num_queries, num_seeds] - const uint32_t num_seeds, - typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr, // [num_queries, ldr] - typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr, // [num_queries, ldr] - const std::size_t ldr, // (*) ldr >= num_pickup - typename DATASET_DESCRIPTOR_T::INDEX_T* const visited_hashmap_ptr, // [num_queries, 1 << bitlen] - const std::uint32_t hash_bitlen, - const cuvs::distance::DistanceType metric, - cudaStream_t const cuda_stream = 0) +template +void random_pickup(const dataset_descriptor_host& dataset_desc, + const DataT* queries_ptr, // [num_queries, dataset_dim] + std::size_t num_queries, + std::size_t num_pickup, + unsigned num_distilation, + uint64_t rand_xor_mask, + const IndexT* seed_ptr, // [num_queries, num_seeds] + uint32_t num_seeds, + IndexT* result_indices_ptr, // [num_queries, ldr] + DistanceT* result_distances_ptr, // [num_queries, ldr] + std::size_t ldr, // (*) ldr >= num_pickup + IndexT* visited_hashmap_ptr, // [num_queries, 1 << bitlen] + std::uint32_t hash_bitlen, + cuvs::distance::DistanceType metric, + cudaStream_t cuda_stream) { const auto block_size = 256u; - const auto num_teams_per_threadblock = block_size / TEAM_SIZE; + const auto num_teams_per_threadblock = block_size / dataset_desc.team_size; const dim3 grid_size((num_pickup + num_teams_per_threadblock - 1) / num_teams_per_threadblock, num_queries); - const auto query_smem_buffer_length = - raft::ceildiv(dataset_desc.dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM; - const auto smem_size = query_smem_buffer_length * sizeof(float); - - random_pickup_kernel 
- <<>>(dataset_desc, - queries_ptr, - num_pickup, - num_distilation, - rand_xor_mask, - seed_ptr, - num_seeds, - result_indices_ptr, - result_distances_ptr, - ldr, - visited_hashmap_ptr, - hash_bitlen, - metric); + random_pickup_kernel<<>>( + dataset_desc.dev_ptr, + dataset_desc.team_size, + queries_ptr, + num_pickup, + num_distilation, + rand_xor_mask, + seed_ptr, + num_seeds, + result_indices_ptr, + result_distances_ptr, + ldr, + visited_hashmap_ptr, + hash_bitlen, + metric); } template @@ -325,9 +297,7 @@ void pickup_next_parents(INDEX_T* const parent_candidates_ptr, // [num_queries, terminate_flag); } -template RAFT_KERNEL compute_distance_to_child_nodes_kernel( const typename DATASET_DESCRIPTOR_T::INDEX_T* const @@ -338,7 +308,8 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel( parent_distance_ptr, // [num_queries, search_width] const std::size_t lds, const std::uint32_t search_width, - const DATASET_DESCRIPTOR_T dataset_desc, + const DATASET_DESCRIPTOR_T* dataset_desc, + uint32_t team_size, const typename DATASET_DESCRIPTOR_T::INDEX_T* const neighbor_graph_ptr, // [dataset_size, graph_degree] const std::uint32_t graph_degree, @@ -357,21 +328,14 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel( const uint32_t ldb = hashmap::get_size(hash_bitlen); const auto tid = threadIdx.x + blockDim.x * blockIdx.x; - const auto global_team_id = tid / TEAM_SIZE; + const auto global_team_id = tid / team_size; const auto query_id = blockIdx.y; - extern __shared__ float query_buffer[]; - const auto query_smem_buffer_length = - raft::ceildiv(dataset_desc.dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM; - for (uint32_t i = threadIdx.x; i < query_smem_buffer_length; i += blockDim.x) { - unsigned j = device::swizzling(i); - if (i < dataset_desc.dim) { - query_buffer[j] = cuvs::spatial::knn::detail::utils::mapping{}( - (query_ptr + query_id * dataset_desc.dim)[i]); - } else { - query_buffer[j] = 0.0; - } - } + extern __shared__ uint8_t smem[]; + auto distance_workspace = 
dataset_desc->set_smem_ws(smem); + // Load a query + dataset_desc->copy_query(distance_workspace, query_ptr + query_id * dataset_desc->dim); + __syncthreads(); if (global_team_id >= search_width * graph_degree) { return; } @@ -393,33 +357,19 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel( const std::size_t child_id = neighbor_list_head_ptr[global_team_id % graph_degree]; - const auto compute_distance_flag = hashmap::insert( - visited_hashmap_ptr + (ldb * blockIdx.y), hash_bitlen, child_id); - - DISTANCE_T norm2; - switch (metric) { - case cuvs::distance::DistanceType::L2Expanded: - norm2 = dataset_desc.template compute_similarity( - query_buffer, child_id, compute_distance_flag); - break; - case cuvs::distance::DistanceType::InnerProduct: - norm2 = dataset_desc.template compute_similarity( - query_buffer, child_id, compute_distance_flag); - break; - default: break; - } + const auto compute_distance_flag = hashmap::insert( + team_size, visited_hashmap_ptr + (ldb * blockIdx.y), hash_bitlen, child_id); + + DISTANCE_T norm2 = + dataset_desc->compute_distance(distance_workspace, child_id, metric, compute_distance_flag); if (compute_distance_flag) { - if (threadIdx.x % TEAM_SIZE == 0) { + if (threadIdx.x % team_size == 0) { result_indices_ptr[ldd * blockIdx.y + global_team_id] = child_id; result_distances_ptr[ldd * blockIdx.y + global_team_id] = norm2; } } else { - if (threadIdx.x % TEAM_SIZE == 0) { + if (threadIdx.x % team_size == 0) { result_distances_ptr[ldd * blockIdx.y + global_team_id] = utils::get_max_value(); } } @@ -434,66 +384,55 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel( } } -template +template void compute_distance_to_child_nodes( - const typename DATASET_DESCRIPTOR_T::INDEX_T* const - parent_node_list, // [num_queries, search_width] - typename DATASET_DESCRIPTOR_T::INDEX_T* const - parent_candidates_ptr, // [num_queries, search_width] - typename DATASET_DESCRIPTOR_T::DISTANCE_T* const - parent_distance_ptr, // [num_queries, search_width] 
- const std::size_t lds, - const uint32_t search_width, - const DATASET_DESCRIPTOR_T dataset_desc, - const typename DATASET_DESCRIPTOR_T::INDEX_T* const - neighbor_graph_ptr, // [dataset_size, graph_degree] - const std::uint32_t graph_degree, - const typename DATASET_DESCRIPTOR_T::DATA_T* query_ptr, // [num_queries, data_dim] - const std::uint32_t num_queries, - typename DATASET_DESCRIPTOR_T::INDEX_T* const - visited_hashmap_ptr, // [num_queries, 1 << hash_bitlen] - const std::uint32_t hash_bitlen, - typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr, // [num_queries, ldd] - typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr, // [num_queries, ldd] - const std::uint32_t ldd, // (*) ldd >= search_width * graph_degree + const IndexT* parent_node_list, // [num_queries, search_width] + IndexT* const parent_candidates_ptr, // [num_queries, search_width] + DistanceT* const parent_distance_ptr, // [num_queries, search_width] + std::size_t lds, + uint32_t search_width, + const dataset_descriptor_host& dataset_desc, + const IndexT* neighbor_graph_ptr, // [dataset_size, graph_degree] + std::uint32_t graph_degree, + const DataT* query_ptr, // [num_queries, data_dim] + std::uint32_t num_queries, + IndexT* visited_hashmap_ptr, // [num_queries, 1 << hash_bitlen] + std::uint32_t hash_bitlen, + IndexT* result_indices_ptr, // [num_queries, ldd] + DistanceT* result_distances_ptr, // [num_queries, ldd] + std::uint32_t ldd, // (*) ldd >= search_width * graph_degree SAMPLE_FILTER_T sample_filter, - const cuvs::distance::DistanceType metric, - cudaStream_t cuda_stream = 0) + cuvs::distance::DistanceType metric, + cudaStream_t cuda_stream) { - const auto block_size = 128; - const dim3 grid_size( - (search_width * graph_degree + (block_size / TEAM_SIZE) - 1) / (block_size / TEAM_SIZE), - num_queries); - - const auto query_smem_buffer_length = - raft::ceildiv(dataset_desc.dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM; - - const auto smem_size = - 
query_smem_buffer_length * sizeof(float) + DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte; - - compute_distance_to_child_nodes_kernel - <<>>(parent_node_list, - parent_candidates_ptr, - parent_distance_ptr, - lds, - search_width, - dataset_desc, - neighbor_graph_ptr, - graph_degree, - query_ptr, - visited_hashmap_ptr, - hash_bitlen, - result_indices_ptr, - result_distances_ptr, - ldd, - sample_filter, - metric); + const auto block_size = 128; + const auto teams_per_block = block_size / dataset_desc.team_size; + const dim3 grid_size((search_width * graph_degree + teams_per_block - 1) / teams_per_block, + num_queries); + + compute_distance_to_child_nodes_kernel<<>>(parent_node_list, + parent_candidates_ptr, + parent_distance_ptr, + lds, + search_width, + dataset_desc.dev_ptr, + dataset_desc.team_size, + neighbor_graph_ptr, + graph_degree, + query_ptr, + visited_hashmap_ptr, + hash_bitlen, + result_indices_ptr, + result_distances_ptr, + ldd, + sample_filter, + metric); } template @@ -863,21 +802,21 @@ struct search : search_plan_impl { } // Choose initial entry point candidates at random - random_pickup(dataset_desc, - queries_ptr, - num_queries, - result_buffer_size, - num_random_samplings, - rand_xor_mask, - dev_seed_ptr, - num_seeds, - result_indices.data(), - result_distances.data(), - result_buffer_allocation_size, - hashmap.data(), - hash_bitlen, - this->metric, - stream); + random_pickup(dataset_desc, + queries_ptr, + num_queries, + result_buffer_size, + num_random_samplings, + rand_xor_mask, + dev_seed_ptr, + num_seeds, + result_indices.data(), + result_distances.data(), + result_buffer_allocation_size, + hashmap.data(), + hash_bitlen, + this->metric, + stream); unsigned iter = 0; while (1) { @@ -929,7 +868,7 @@ struct search : search_plan_impl { } // Compute distance to child nodes that are adjacent to the parent node - compute_distance_to_child_nodes( + compute_distance_to_child_nodes( parent_node_list.data(), result_indices.data() + (1 - (iter & 0x1)) * 
result_buffer_size, result_distances.data() + (1 - (iter & 0x1)) * result_buffer_size, @@ -1023,69 +962,5 @@ struct search : search_plan_impl { } }; -template -struct search, - SAMPLE_FILTER_T> - : public search_plan_impl, - SAMPLE_FILTER_T> { - using DATASET_DESCRIPTOR_T = cagra_q_dataset_descriptor_t; - using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; - using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; - - search(raft::resources const& res, - search_params params, - int64_t dim, - int64_t graph_degree, - uint32_t topk, - cuvs::distance::DistanceType metric) - : base_type(res, params, dim, graph_degree, topk, metric) - { - THROW("The multi-kernel mode does not support VPQ"); - } - - void set_params(raft::resources const& res) {} - - void operator()(raft::resources const& res, - DATASET_DESCRIPTOR_T dataset_desc, - raft::device_matrix_view graph, - INDEX_T* const topk_indices_ptr, // [num_queries, topk] - DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] - const DATA_T* const queries_ptr, // [num_queries, dataset_dim] - const uint32_t num_queries, - const INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] - uint32_t* const num_executed_iterations, // [num_queries,] - uint32_t topk, - SAMPLE_FILTER_T sample_filter) - { - } -}; - } // namespace multi_kernel_search } // namespace cuvs::neighbors::cagra::detail From 63525501c3d50a585e3049ef42f2dff033f265dc Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 19 Aug 2024 15:08:09 +0200 Subject: [PATCH 04/41] Move common code out of virtual functions scope (aiming for more inlining) --- .../detail/cagra/compute_distance.hpp | 279 ++---------------- .../detail/cagra/compute_distance_vpq.cuh | 79 +---- .../neighbors/detail/cagra/device_common.hpp | 144 +++++++++ .../cagra/search_multi_cta_kernel-inl.cuh | 50 ++-- .../detail/cagra/search_multi_kernel.cuh | 6 +- .../cagra/search_single_cta_kernel-inl.cuh | 46 +-- 6 files changed, 228 insertions(+), 376 deletions(-) diff --git 
a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index c1f001c95..fab4f8bf4 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -32,174 +32,6 @@ #include namespace cuvs::neighbors::cagra::detail { -namespace device { - -// using LOAD_256BIT_T = ulonglong4; -using LOAD_128BIT_T = uint4; -using LOAD_64BIT_T = uint64_t; - -template -RAFT_DEVICE_INLINE_FUNCTION constexpr unsigned get_vlen() -{ - return utils::size_of() / utils::size_of(); -} - -template -RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes( - INDEX_T* const result_indices_ptr, // [num_pickup] - DISTANCE_T* const result_distances_ptr, // [num_pickup] - typename DATASET_DESCRIPTOR_T::ws_handle workspace, - const DATASET_DESCRIPTOR_T& dataset_desc, - const size_t num_pickup, - const unsigned num_distilation, - const uint64_t rand_xor_mask, - const INDEX_T* const seed_ptr, // [num_seeds] - const uint32_t num_seeds, - INDEX_T* const visited_hash_ptr, - const uint32_t hash_bitlen, - const cuvs::distance::DistanceType metric, - const uint32_t block_id = 0, - const uint32_t num_blocks = 1) -{ - uint32_t max_i = num_pickup; - if (max_i % (32 / TeamSize)) { max_i += (32 / TeamSize) - (max_i % (32 / TeamSize)); } - - for (uint32_t i = threadIdx.x / TeamSize; i < max_i; i += blockDim.x / TeamSize) { - const bool valid_i = (i < num_pickup); - - INDEX_T best_index_team_local; - DISTANCE_T best_norm2_team_local = utils::get_max_value(); - for (uint32_t j = 0; j < num_distilation; j++) { - // Select a node randomly and compute the distance to it - INDEX_T seed_index; - if (valid_i) { - // uint32_t gid = i + (num_pickup * (j + (num_distilation * block_id))); - uint32_t gid = block_id + (num_blocks * (i + (num_pickup * j))); - if (seed_ptr && (gid < num_seeds)) { - seed_index = seed_ptr[gid]; - } else { - seed_index = device::xorshift64(gid ^ rand_xor_mask) % 
dataset_desc.size; - } - } - - DISTANCE_T norm2; - switch (metric) { - case cuvs::distance::DistanceType::L2Expanded: - norm2 = - dataset_desc.template compute_similarity( - workspace, seed_index, valid_i); - break; - case cuvs::distance::DistanceType::InnerProduct: - norm2 = - dataset_desc.template compute_similarity( - workspace, seed_index, valid_i); - break; - default: break; - } - - if (valid_i && (norm2 < best_norm2_team_local)) { - best_norm2_team_local = norm2; - best_index_team_local = seed_index; - } - } - - const unsigned lane_id = threadIdx.x % TeamSize; - if (valid_i && lane_id == 0) { - if (hashmap::insert(visited_hash_ptr, hash_bitlen, best_index_team_local)) { - result_distances_ptr[i] = best_norm2_team_local; - result_indices_ptr[i] = best_index_team_local; - } else { - result_distances_ptr[i] = utils::get_max_value(); - result_indices_ptr[i] = utils::get_max_value(); - } - } - } -} - -template -RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes( - INDEX_T* result_child_indices_ptr, - DISTANCE_T* result_child_distances_ptr, - // query - typename DATASET_DESCRIPTOR_T::ws_handle workspace, - // [dataset_dim, dataset_size] - const DATASET_DESCRIPTOR_T& dataset_desc, - // [knn_k, dataset_size] - const INDEX_T* knn_graph, - uint32_t knn_k, - // hashmap - INDEX_T* visited_hashmap_ptr, - uint32_t hash_bitlen, - const INDEX_T* parent_indices, - const INDEX_T* internal_topk_list, - uint32_t search_width, - cuvs::distance::DistanceType metric) -{ - constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask::value; - const INDEX_T invalid_index = utils::get_max_value(); - - // Read child indices of parents from knn graph and check if the distance - // computaiton is necessary. 
- for (uint32_t i = threadIdx.x; i < knn_k * search_width; i += blockDim.x) { - const INDEX_T smem_parent_id = parent_indices[i / knn_k]; - INDEX_T child_id = invalid_index; - if (smem_parent_id != invalid_index) { - const auto parent_id = internal_topk_list[smem_parent_id] & ~index_msb_1_mask; - child_id = knn_graph[(i % knn_k) + (static_cast(knn_k) * parent_id)]; - } - if (child_id != invalid_index) { - if (hashmap::insert(visited_hashmap_ptr, hash_bitlen, child_id) == 0) { - child_id = invalid_index; - } - } - result_child_indices_ptr[i] = child_id; - } - __syncthreads(); - - // Compute the distance to child nodes - uint32_t max_i = knn_k * search_width; - if (max_i % (32 / TeamSize)) { max_i += (32 / TeamSize) - (max_i % (32 / TeamSize)); } - for (uint32_t tid = threadIdx.x; tid < max_i * TeamSize; tid += blockDim.x) { - const auto i = tid / TeamSize; - const bool valid_i = (i < (knn_k * search_width)); - INDEX_T child_id = invalid_index; - if (valid_i) { child_id = result_child_indices_ptr[i]; } - - DISTANCE_T norm2; - switch (metric) { - case cuvs::distance::DistanceType::L2Expanded: - norm2 = dataset_desc.template compute_similarity( - workspace, child_id, child_id != invalid_index); - break; - case cuvs::distance::DistanceType::InnerProduct: - norm2 = - dataset_desc.template compute_similarity( - workspace, child_id, child_id != invalid_index); - break; - default: break; - } - - // Store the distance - const unsigned lane_id = threadIdx.x % TeamSize; - if (valid_i && lane_id == 0) { - if (child_id != invalid_index) { - result_child_distances_ptr[i] = norm2; - } else { - result_child_distances_ptr[i] = utils::get_max_value(); - } - } - } -} - -} // namespace device template struct dataset_descriptor_base_t { @@ -207,6 +39,21 @@ struct dataset_descriptor_base_t { using INDEX_T = IndexT; using DISTANCE_T = DistanceT; + /** + * Maximum expected size of the descriptor struct. 
+ * This covers all standard and VPQ descriptors; we need this to copy the descriptor from global + * memory. Increase this if new fields are needed (but try to keep the descriptors small really). + */ + static constexpr size_t kMaxStructSize = 64; + + template + static inline constexpr void assert_struct_size() + { + static_assert(ActualSize <= MaximumSize, + "The maximum descriptor size is tracked in the dataset_descriptor_base_t. " + "Update this constant if implementing a new, larger descriptor."); + } + struct distance_workspace; using ws_handle = distance_workspace*; @@ -215,6 +62,9 @@ struct dataset_descriptor_base_t { _RAFT_HOST_DEVICE dataset_descriptor_base_t(INDEX_T size, uint32_t dim) : size(size), dim(dim) {} + /** How many threads are involved in computing a single distance. */ + _RAFT_HOST_DEVICE [[nodiscard]] virtual auto team_size() const -> uint32_t = 0; + /** Total dynamic shared memory required by the descriptor. */ _RAFT_HOST_DEVICE [[nodiscard]] virtual auto smem_ws_size_in_bytes() const -> uint32_t = 0; @@ -230,36 +80,6 @@ struct dataset_descriptor_base_t { INDEX_T dataset_index, cuvs::distance::DistanceType metric, bool valid) const -> DISTANCE_T = 0; - - _RAFT_DEVICE virtual void compute_distance_to_random_nodes( - ws_handle smem_workspace, - INDEX_T* const result_indices_ptr, // [num_pickup] - DISTANCE_T* const result_distances_ptr, // [num_pickup] - const size_t num_pickup, - const unsigned num_distilation, - const uint64_t rand_xor_mask, - const INDEX_T* const seed_ptr, // [num_seeds] - const uint32_t num_seeds, - INDEX_T* const visited_hash_ptr, - const uint32_t hash_bitlen, - const cuvs::distance::DistanceType metric, - const uint32_t block_id = 0, - const uint32_t num_blocks = 1) const = 0; - - _RAFT_DEVICE virtual void compute_distance_to_child_nodes( - ws_handle smem_workspace, - INDEX_T* const result_child_indices_ptr, - DISTANCE_T* const result_child_distances_ptr, - // [knn_k, dataset_size] - const INDEX_T* const knn_graph, - 
const uint32_t knn_k, - // hashmap - INDEX_T* const visited_hashmap_ptr, - const uint32_t hash_bitlen, - const INDEX_T* const parent_indices, - const INDEX_T* const internal_topk_list, - const uint32_t search_width, - const cuvs::distance::DistanceType metric) const = 0; }; template @@ -341,8 +161,11 @@ struct standard_dataset_descriptor_t : public dataset_descriptor_base_t(dim, DatasetBlockDim)} { + base_type::template assert_struct_size(); } + _RAFT_HOST_DEVICE [[nodiscard]] auto team_size() const -> uint32_t { return TeamSize; } + _RAFT_HOST_DEVICE [[nodiscard]] auto smem_ws_size_in_bytes() const -> uint32_t { return smem_query_buffer_length * sizeof(QUERY_T); @@ -366,66 +189,6 @@ struct standard_dataset_descriptor_t : public dataset_descriptor_base_t(result_indices_ptr, - result_distances_ptr, - smem_workspace, - *this, - num_pickup, - num_distilation, - rand_xor_mask, - seed_ptr, - num_seeds, - visited_hash_ptr, - hash_bitlen, - metric, - block_id, - num_blocks); - } - - _RAFT_DEVICE void compute_distance_to_child_nodes(ws_handle smem_workspace, - INDEX_T* const result_child_indices_ptr, - DISTANCE_T* const result_child_distances_ptr, - // [knn_k, dataset_size] - const INDEX_T* const knn_graph, - const uint32_t knn_k, - // hashmap - INDEX_T* const visited_hashmap_ptr, - const uint32_t hash_bitlen, - const INDEX_T* const parent_indices, - const INDEX_T* const internal_topk_list, - const uint32_t search_width, - const cuvs::distance::DistanceType metric) const - { - return device::compute_distance_to_child_nodes( - result_child_indices_ptr, - result_child_distances_ptr, - smem_workspace, - *this, - knn_graph, - knn_k, - visited_hashmap_ptr, - hash_bitlen, - parent_indices, - internal_topk_list, - search_width, - metric); - } - template RAFT_DEVICE_INLINE_FUNCTION auto dist_op(T a, T b) const -> std::enable_if_t diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh index 7446c5991..19de93951 
100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh @@ -42,18 +42,19 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t, "Only CODE_BOOK_T = `half` is supported now"); + static_assert(std::is_same_v, + "Only CODE_BOOK_T = " + "`half` is supported " + "now"); const std::uint8_t* encoded_dataset_ptr; + const CODE_BOOK_T* vq_code_book_ptr; + const CODE_BOOK_T* pq_code_book_ptr; std::uint32_t encoded_dataset_dim; std::uint32_t n_subspace; - const CODE_BOOK_T* vq_code_book_ptr; float vq_scale; - const CODE_BOOK_T* pq_code_book_ptr; float pq_scale; - uint32_t smem_query_buffer_length; - static constexpr std::uint32_t kSMemCodeBookSizeInBytes = (1 << PQ_BITS) * PQ_LEN * utils::size_of(); @@ -73,10 +74,11 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t(dim, DatasetBlockDim)} + pq_scale(pq_scale) { + base_type::template assert_struct_size(); } + _RAFT_HOST_DEVICE [[nodiscard]] auto team_size() const -> uint32_t { return TeamSize; } _RAFT_HOST_DEVICE [[nodiscard]] auto smem_ws_size_in_bytes() const -> uint32_t { @@ -84,7 +86,8 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t(dim, DatasetBlockDim) * sizeof(QUERY_T); } _RAFT_DEVICE [[nodiscard]] auto set_smem_ws(void* smem_ptr) const -> ws_handle @@ -128,66 +131,6 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t(result_indices_ptr, - result_distances_ptr, - smem_workspace, - *this, - num_pickup, - num_distilation, - rand_xor_mask, - seed_ptr, - num_seeds, - visited_hash_ptr, - hash_bitlen, - metric, - block_id, - num_blocks); - } - - _RAFT_DEVICE void compute_distance_to_child_nodes(ws_handle smem_workspace, - INDEX_T* const result_child_indices_ptr, - DISTANCE_T* const result_child_distances_ptr, - // [knn_k, dataset_size] - const INDEX_T* const knn_graph, - const uint32_t knn_k, - // hashmap - INDEX_T* const visited_hashmap_ptr, - const uint32_t 
hash_bitlen, - const INDEX_T* const parent_indices, - const INDEX_T* const internal_topk_list, - const uint32_t search_width, - const cuvs::distance::DistanceType metric) const - { - return device::compute_distance_to_child_nodes( - result_child_indices_ptr, - result_child_distances_ptr, - smem_workspace, - *this, - knn_graph, - knn_k, - visited_hashmap_ptr, - hash_bitlen, - parent_indices, - internal_topk_list, - search_width, - metric); - } - _RAFT_DEVICE auto compute_distance(ws_handle smem_workspace, INDEX_T dataset_index, cuvs::distance::DistanceType metric, diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp index 192d81aa8..06b517f58 100644 --- a/cpp/src/neighbors/detail/cagra/device_common.hpp +++ b/cpp/src/neighbors/detail/cagra/device_common.hpp @@ -15,8 +15,11 @@ */ #pragma once +#include "hashmap.hpp" #include "utils.hpp" +#include + // TODO: This shouldn't be invoking anything in detail APIs outside of cuvs/neighbors #include @@ -31,6 +34,16 @@ namespace device { // warpSize for compile time calculation constexpr unsigned warp_size = 32; +// using LOAD_256BIT_T = ulonglong4; +using LOAD_128BIT_T = uint4; +using LOAD_64BIT_T = uint64_t; + +template +RAFT_DEVICE_INLINE_FUNCTION constexpr unsigned get_vlen() +{ + return utils::size_of() / utils::size_of(); +} + /** Xorshift rondem number generator. * * See https://en.wikipedia.org/wiki/Xorshift#xorshift for reference. 
@@ -56,5 +69,136 @@ _RAFT_DEVICE inline T swizzling(T x) } } +template +RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes( + IndexT* result_indices_ptr, // [num_pickup] + DistanceT* result_distances_ptr, // [num_pickup] + typename DATASET_DESCRIPTOR_T::ws_handle workspace, + const DATASET_DESCRIPTOR_T& dataset_desc, + size_t num_pickup, + unsigned num_distilation, + uint64_t rand_xor_mask, + const IndexT* seed_ptr, // [num_seeds] + uint32_t num_seeds, + IndexT* visited_hash_ptr, + uint32_t hash_bitlen, + cuvs::distance::DistanceType metric, + uint32_t block_id = 0, + uint32_t num_blocks = 1) +{ + const auto team_size = dataset_desc.team_size(); + uint32_t max_i = num_pickup; + if (max_i % (warp_size / team_size)) { + max_i += (warp_size / team_size) - (max_i % (warp_size / team_size)); + } + + for (uint32_t i = threadIdx.x / team_size; i < max_i; i += blockDim.x / team_size) { + const bool valid_i = (i < num_pickup); + + IndexT best_index_team_local; + DistanceT best_norm2_team_local = utils::get_max_value(); + for (uint32_t j = 0; j < num_distilation; j++) { + // Select a node randomly and compute the distance to it + IndexT seed_index; + if (valid_i) { + // uint32_t gid = i + (num_pickup * (j + (num_distilation * block_id))); + uint32_t gid = block_id + (num_blocks * (i + (num_pickup * j))); + if (seed_ptr && (gid < num_seeds)) { + seed_index = seed_ptr[gid]; + } else { + seed_index = device::xorshift64(gid ^ rand_xor_mask) % dataset_desc.size; + } + } + + auto norm2 = dataset_desc.compute_distance(workspace, seed_index, metric, valid_i); + + if (valid_i && (norm2 < best_norm2_team_local)) { + best_norm2_team_local = norm2; + best_index_team_local = seed_index; + } + } + + const unsigned lane_id = threadIdx.x % team_size; + if (valid_i && lane_id == 0) { + if (hashmap::insert(visited_hash_ptr, hash_bitlen, best_index_team_local)) { + result_distances_ptr[i] = best_norm2_team_local; + result_indices_ptr[i] = best_index_team_local; + } else { + 
result_distances_ptr[i] = utils::get_max_value(); + result_indices_ptr[i] = utils::get_max_value(); + } + } + } +} + +template +RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes( + IndexT* result_child_indices_ptr, + DistanceT* result_child_distances_ptr, + // query + typename DATASET_DESCRIPTOR_T::ws_handle workspace, + // [dataset_dim, dataset_size] + const DATASET_DESCRIPTOR_T& dataset_desc, + // [knn_k, dataset_size] + const IndexT* knn_graph, + uint32_t knn_k, + // hashmap + IndexT* visited_hashmap_ptr, + uint32_t hash_bitlen, + const IndexT* parent_indices, + const IndexT* internal_topk_list, + uint32_t search_width, + cuvs::distance::DistanceType metric) +{ + constexpr IndexT index_msb_1_mask = utils::gen_index_msb_1_mask::value; + const IndexT invalid_index = utils::get_max_value(); + + // Read child indices of parents from knn graph and check if the distance + // computaiton is necessary. + for (uint32_t i = threadIdx.x; i < knn_k * search_width; i += blockDim.x) { + const IndexT smem_parent_id = parent_indices[i / knn_k]; + IndexT child_id = invalid_index; + if (smem_parent_id != invalid_index) { + const auto parent_id = internal_topk_list[smem_parent_id] & ~index_msb_1_mask; + child_id = knn_graph[(i % knn_k) + (static_cast(knn_k) * parent_id)]; + } + if (child_id != invalid_index) { + if (hashmap::insert(visited_hashmap_ptr, hash_bitlen, child_id) == 0) { + child_id = invalid_index; + } + } + result_child_indices_ptr[i] = child_id; + } + __syncthreads(); + + // Compute the distance to child nodes + uint32_t max_i = knn_k * search_width; + const auto team_size = dataset_desc.team_size(); + if (max_i % (warp_size / team_size)) { + max_i += (warp_size / team_size) - (max_i % (warp_size / team_size)); + } + for (uint32_t tid = threadIdx.x; tid < max_i * team_size; tid += blockDim.x) { + const auto i = tid / team_size; + const bool valid_i = (i < (knn_k * search_width)); + IndexT child_id = invalid_index; + if (valid_i) { child_id = 
result_child_indices_ptr[i]; } + + auto norm2 = + dataset_desc.compute_distance(workspace, child_id, metric, child_id != invalid_index); + + // Store the distance + const unsigned lane_id = threadIdx.x % team_size; + if (valid_i && lane_id == 0) { + if (child_id != invalid_index) { + result_child_distances_ptr[i] = norm2; + } else { + result_child_distances_ptr[i] = utils::get_max_value(); + } + } + } +} + } // namespace device } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh index 0c856aa19..aaf1ce7ef 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh @@ -227,19 +227,20 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( uint32_t block_id = cta_id + (num_cta_per_query * query_id); uint32_t num_blocks = num_cta_per_query * num_queries; - dataset_desc->compute_distance_to_random_nodes(distance_workspace, - result_indices_buffer, - result_distances_buffer, - result_buffer_size, - num_distilation, - rand_xor_mask, - local_seed_ptr, - num_seeds, - local_visited_hashmap_ptr, - hash_bitlen, - metric, - block_id, - num_blocks); + device::compute_distance_to_random_nodes(result_indices_buffer, + result_distances_buffer, + distance_workspace, + *dataset_desc, + result_buffer_size, + num_distilation, + rand_xor_mask, + local_seed_ptr, + num_seeds, + local_visited_hashmap_ptr, + hash_bitlen, + metric, + block_id, + num_blocks); __syncthreads(); _CLK_REC(clk_compute_1st_distance); @@ -269,17 +270,18 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( // compute the norms between child nodes and query node _CLK_START(); - dataset_desc->compute_distance_to_child_nodes(distance_workspace, - result_indices_buffer + itopk_size, - result_distances_buffer + itopk_size, - knn_graph, - graph_degree, - local_visited_hashmap_ptr, - hash_bitlen, - 
parent_indices_buffer, - result_indices_buffer, - search_width, - metric); + device::compute_distance_to_child_nodes(result_indices_buffer + itopk_size, + result_distances_buffer + itopk_size, + distance_workspace, + *dataset_desc, + knn_graph, + graph_degree, + local_visited_hashmap_ptr, + hash_bitlen, + parent_indices_buffer, + result_indices_buffer, + search_width, + metric); _CLK_REC(clk_compute_distance); __syncthreads(); diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh index c228c51d8..3d25128e9 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh @@ -96,7 +96,6 @@ void get_value(T* const host_ptr, const T* const dev_ptr, cudaStream_t cuda_stre template RAFT_KERNEL random_pickup_kernel( const DATASET_DESCRIPTOR_T* dataset_desc, - uint32_t team_size, const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr, // [num_queries, dataset_dim] const std::size_t num_pickup, const unsigned num_distilation, @@ -114,6 +113,7 @@ RAFT_KERNEL random_pickup_kernel( using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; + const auto team_size = dataset_desc->team_size(); const auto ldb = hashmap::get_size(hash_bitlen); const auto global_team_index = (blockIdx.x * blockDim.x + threadIdx.x) / team_size; const uint32_t query_id = blockIdx.y; @@ -182,7 +182,6 @@ void random_pickup(const dataset_descriptor_host& data random_pickup_kernel<<>>( dataset_desc.dev_ptr, - dataset_desc.team_size, queries_ptr, num_pickup, num_distilation, @@ -309,7 +308,6 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel( const std::size_t lds, const std::uint32_t search_width, const DATASET_DESCRIPTOR_T* dataset_desc, - uint32_t team_size, const typename DATASET_DESCRIPTOR_T::INDEX_T* const neighbor_graph_ptr, // [dataset_size, graph_degree] const std::uint32_t graph_degree, @@ 
-326,6 +324,7 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel( using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; + const auto team_size = dataset_desc->team_size(); const uint32_t ldb = hashmap::get_size(hash_bitlen); const auto tid = threadIdx.x + blockDim.x * blockIdx.x; const auto global_team_id = tid / team_size; @@ -422,7 +421,6 @@ void compute_distance_to_child_nodes( lds, search_width, dataset_desc.dev_ptr, - dataset_desc.team_size, neighbor_graph_ptr, graph_degree, query_ptr, diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 61354413d..6e64a12a9 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -562,17 +562,18 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( // compute distance to randomly selecting nodes _CLK_START(); const INDEX_T* const local_seed_ptr = seed_ptr ? 
seed_ptr + (num_seeds * query_id) : nullptr; - dataset_desc->compute_distance_to_random_nodes(distance_workspace, - result_indices_buffer, - result_distances_buffer, - result_buffer_size, - num_distilation, - rand_xor_mask, - local_seed_ptr, - num_seeds, - local_visited_hashmap_ptr, - hash_bitlen, - metric); + device::compute_distance_to_random_nodes(result_indices_buffer, + result_distances_buffer, + distance_workspace, + *dataset_desc, + result_buffer_size, + num_distilation, + rand_xor_mask, + local_seed_ptr, + num_seeds, + local_visited_hashmap_ptr, + hash_bitlen, + metric); __syncthreads(); _CLK_REC(clk_compute_1st_distance); @@ -693,17 +694,18 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( // compute the norms between child nodes and query node _CLK_START(); - dataset_desc->compute_distance_to_child_nodes(distance_workspace, - result_indices_buffer + internal_topk, - result_distances_buffer + internal_topk, - knn_graph, - graph_degree, - local_visited_hashmap_ptr, - hash_bitlen, - parent_list_buffer, - result_indices_buffer, - search_width, - metric); + device::compute_distance_to_child_nodes(result_indices_buffer + internal_topk, + result_distances_buffer + internal_topk, + distance_workspace, + *dataset_desc, + knn_graph, + graph_degree, + local_visited_hashmap_ptr, + hash_bitlen, + parent_list_buffer, + result_indices_buffer, + search_width, + metric); __syncthreads(); _CLK_REC(clk_compute_distance); From d161f79f142b1415a2a614fd593e98df321568c4 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 20 Aug 2024 11:05:00 +0200 Subject: [PATCH 05/41] Make small descriptor functions into fields --- .../detail/cagra/compute_distance.hpp | 87 ++++++++++--------- .../detail/cagra/compute_distance_vpq.cuh | 30 +++---- .../neighbors/detail/cagra/device_common.hpp | 4 +- .../cagra/search_multi_cta_kernel-inl.cuh | 2 +- .../detail/cagra/search_multi_kernel.cuh | 4 +- .../cagra/search_single_cta_kernel-inl.cuh | 2 +- 6 files changed, 65 insertions(+), 64 
deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index fab4f8bf4..c867cff5f 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -44,7 +44,7 @@ struct dataset_descriptor_base_t { * This covers all standard and VPQ descriptors; we need this to copy the descriptor from global * memory. Increase this if new fields are needed (but try to keep the descriptors small really). */ - static constexpr size_t kMaxStructSize = 64; + static constexpr size_t kMaxStructSize = 72; template static inline constexpr void assert_struct_size() @@ -57,18 +57,24 @@ struct dataset_descriptor_base_t { struct distance_workspace; using ws_handle = distance_workspace*; + /** Number of records in the database. */ INDEX_T size; + /** Dimensionality of the data/queries. */ uint32_t dim; - - _RAFT_HOST_DEVICE dataset_descriptor_base_t(INDEX_T size, uint32_t dim) : size(size), dim(dim) {} - /** How many threads are involved in computing a single distance. */ - _RAFT_HOST_DEVICE [[nodiscard]] virtual auto team_size() const -> uint32_t = 0; - + uint32_t team_size; /** Total dynamic shared memory required by the descriptor. */ - _RAFT_HOST_DEVICE [[nodiscard]] virtual auto smem_ws_size_in_bytes() const -> uint32_t = 0; + uint32_t smem_ws_size_in_bytes; + + _RAFT_HOST_DEVICE dataset_descriptor_base_t(INDEX_T size, + uint32_t dim, + uint32_t team_size, + uint32_t smem_ws_size_in_bytes) + : size(size), dim(dim), team_size(team_size), smem_ws_size_in_bytes(smem_ws_size_in_bytes) + { + } - /** Set shared memory workspace (pointers). */ + /** Setup the shared memory workspace (e.g. assign pointers or prepare a lookup table). */ _RAFT_DEVICE [[nodiscard]] virtual auto set_smem_ws(void* smem_ptr) const -> ws_handle = 0; /** Copy the query to the shared memory. 
*/ @@ -92,11 +98,10 @@ struct dataset_descriptor_host { template dataset_descriptor_host(const DescriptorImpl& dd_host, rmm::cuda_stream_view stream, - uint32_t team_size, uint32_t dataset_block_dim) : stream_{stream}, - smem_ws_size_in_bytes{dd_host.smem_ws_size_in_bytes()}, - team_size{team_size}, + smem_ws_size_in_bytes{dd_host.smem_ws_size_in_bytes}, + team_size{dd_host.team_size}, dataset_block_dim{dataset_block_dim} { RAFT_CUDA_TRY(cudaMallocAsync(&dev_ptr, sizeof(DescriptorImpl), stream_)); @@ -143,6 +148,7 @@ struct standard_dataset_descriptor_t : public dataset_descriptor_base_t(dim, DatasetBlockDim)} + : base_type(size, dim, TeamSize, get_smem_ws_size_in_bytes(dim)), ptr(ptr), ld(ld) { base_type::template assert_struct_size(); } - _RAFT_HOST_DEVICE [[nodiscard]] auto team_size() const -> uint32_t { return TeamSize; } - - _RAFT_HOST_DEVICE [[nodiscard]] auto smem_ws_size_in_bytes() const -> uint32_t - { - return smem_query_buffer_length * sizeof(QUERY_T); - } - _RAFT_DEVICE [[nodiscard]] auto set_smem_ws(void* smem_ptr) const -> ws_handle { return reinterpret_cast(smem_ptr); @@ -178,8 +173,9 @@ struct standard_dataset_descriptor_t : public dataset_descriptor_base_t{}(query_ptr[i]); @@ -189,21 +185,6 @@ struct standard_dataset_descriptor_t : public dataset_descriptor_base_t - RAFT_DEVICE_INLINE_FUNCTION auto dist_op(T a, T b) const - -> std::enable_if_t - { - T diff = a - b; - return diff * diff; - } - - template - RAFT_DEVICE_INLINE_FUNCTION auto dist_op(T a, T b) const - -> std::enable_if_t - { - return -a * b; - } - _RAFT_DEVICE auto compute_distance(ws_handle smem_workspace, INDEX_T dataset_index, cuvs::distance::DistanceType metric, @@ -220,6 +201,22 @@ struct standard_dataset_descriptor_t : public dataset_descriptor_base_t + RAFT_DEVICE_INLINE_FUNCTION constexpr static auto dist_op(T a, T b) + -> std::enable_if_t + { + T diff = a - b; + return diff * diff; + } + + template + RAFT_DEVICE_INLINE_FUNCTION constexpr static auto dist_op(T a, T b) + -> 
std::enable_if_t + { + return -a * b; + } + template RAFT_DEVICE_INLINE_FUNCTION auto compute_similarity(ws_handle smem_workspace, const INDEX_T dataset_i, @@ -266,12 +263,17 @@ struct standard_dataset_descriptor_t : public dataset_descriptor_base_t QUERY_T* { return reinterpret_cast(smem_workspace); } + + _RAFT_HOST_DEVICE [[nodiscard]] constexpr static auto get_smem_ws_size_in_bytes(uint32_t dim) + -> uint32_t + { + return raft::round_up_safe(dim, DatasetBlockDim) * sizeof(QUERY_T); + } }; extern template struct standard_dataset_descriptor_t<8, 128, float, uint32_t, float>; @@ -327,8 +329,7 @@ auto standard_dataset_descriptor_init(const strided_dataset& { standard_dataset_descriptor_t dd_host{ dataset.view().data_handle(), IndexT(dataset.n_rows()), dataset.dim(), dataset.stride()}; - dataset_descriptor_host result{ - dd_host, stream, TeamSize, DatasetBlockDim}; + dataset_descriptor_host result{dd_host, stream, DatasetBlockDim}; standard_dataset_descriptor_init_kernel <<<1, 1, 0, stream>>>(result.dev_ptr, dd_host.ptr, dd_host.size, dd_host.dim, dd_host.ld); return result; diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh index 19de93951..45fe23a76 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh @@ -37,6 +37,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t(); } - _RAFT_HOST_DEVICE [[nodiscard]] auto team_size() const -> uint32_t { return TeamSize; } - - _RAFT_HOST_DEVICE [[nodiscard]] auto smem_ws_size_in_bytes() const -> uint32_t - { - /* SMEM workspace layout: - 1. Codebook (kSMemCodeBookSizeInBytes bytes) - 2. 
Queries (smem_query_buffer_length elems) - */ - return kSMemCodeBookSizeInBytes + - raft::round_up_safe(dim, DatasetBlockDim) * sizeof(QUERY_T); - } _RAFT_DEVICE [[nodiscard]] auto set_smem_ws(void* smem_ptr) const -> ws_handle { @@ -147,6 +137,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t RAFT_DEVICE_INLINE_FUNCTION DISTANCE_T compute_similarity(ws_handle smem_workspace, const INDEX_T node_id, @@ -263,7 +254,6 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t CODE_BOOK_T* { @@ -276,6 +266,17 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t(reinterpret_cast(smem_workspace) + kSMemCodeBookSizeInBytes); } + + _RAFT_HOST_DEVICE [[nodiscard]] constexpr static auto get_smem_ws_size_in_bytes(uint32_t dim) + -> uint32_t + { + /* SMEM workspace layout: + 1. Codebook (kSMemCodeBookSizeInBytes bytes) + 2. Queries (smem_query_buffer_length elems) + */ + return kSMemCodeBookSizeInBytes + + raft::round_up_safe(dim, DatasetBlockDim) * sizeof(QUERY_T); + } }; extern template struct standard_dataset_descriptor_t<8, 128, float, uint32_t, float>; @@ -421,8 +422,7 @@ auto vpq_dataset_descriptor_init(const vpq_dataset& data pq_scale, IndexT(dataset.n_rows()), dataset.dim()}; - dataset_descriptor_host result{ - dd_host, stream, TeamSize, DatasetBlockDim}; + dataset_descriptor_host result{dd_host, stream, DatasetBlockDim}; vpq_dataset_descriptor_init_kernelset_smem_ws(smem); auto result_indices_buffer = - reinterpret_cast(smem + dataset_desc->smem_ws_size_in_bytes()); + reinterpret_cast(smem + dataset_desc->smem_ws_size_in_bytes); auto result_distances_buffer = reinterpret_cast(result_indices_buffer + result_buffer_size_32); auto parent_indices_buffer = diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh index 3d25128e9..7f4e1f316 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh +++ 
b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh @@ -113,7 +113,7 @@ RAFT_KERNEL random_pickup_kernel( using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; - const auto team_size = dataset_desc->team_size(); + const auto team_size = dataset_desc->team_size; const auto ldb = hashmap::get_size(hash_bitlen); const auto global_team_index = (blockIdx.x * blockDim.x + threadIdx.x) / team_size; const uint32_t query_id = blockIdx.y; @@ -324,7 +324,7 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel( using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; - const auto team_size = dataset_desc->team_size(); + const auto team_size = dataset_desc->team_size; const uint32_t ldb = hashmap::get_size(hash_bitlen); const auto tid = threadIdx.x + blockDim.x * blockIdx.x; const auto global_team_id = tid / team_size; diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 6e64a12a9..f4887a142 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -527,7 +527,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( auto distance_workspace = dataset_desc->set_smem_ws(smem); auto result_indices_buffer = - reinterpret_cast(smem + dataset_desc->smem_ws_size_in_bytes()); + reinterpret_cast(smem + dataset_desc->smem_ws_size_in_bytes); auto result_distances_buffer = reinterpret_cast(result_indices_buffer + result_buffer_size_32); auto visited_hash_buffer = From 35c38134596650802b682088fdaedaf3bf56a82c Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 20 Aug 2024 13:31:25 +0200 Subject: [PATCH 06/41] Minor updates to improve reg count --- .../detail/cagra/compute_distance.hpp | 41 ++++++++++++------- .../detail/cagra/compute_distance_vpq.cuh | 4 +- 2 files changed, 29 
insertions(+), 16 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index c867cff5f..f0aef1ae9 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -74,6 +74,19 @@ struct dataset_descriptor_base_t { { } + RAFT_DEVICE_INLINE_FUNCTION void copy_descriptor_per_block( + dataset_descriptor_base_t* target) const + { + using word_type = uint32_t; + constexpr auto kStructWords = kMaxStructSize / sizeof(word_type); + auto* dst = reinterpret_cast(target); + auto* src = reinterpret_cast(this); + for (unsigned i = threadIdx.x; i < kStructWords; i += blockDim.x) { + dst[i] = src[i]; + } + __syncthreads(); + } + /** Setup the shared memory workspace (e.g. assign pointers or prepare a lookup table). */ _RAFT_DEVICE [[nodiscard]] virtual auto set_smem_ws(void* smem_ptr) const -> ws_handle = 0; @@ -90,10 +103,11 @@ struct dataset_descriptor_base_t { template struct dataset_descriptor_host { - dataset_descriptor_base_t* dev_ptr = nullptr; - uint32_t smem_ws_size_in_bytes = 0; - uint32_t team_size = 0; - uint32_t dataset_block_dim = 0; + using dev_descriptor_t = dataset_descriptor_base_t; + dev_descriptor_t* dev_ptr = nullptr; + uint32_t smem_ws_size_in_bytes = 0; + uint32_t team_size = 0; + uint32_t dataset_block_dim = 0; template dataset_descriptor_host(const DescriptorImpl& dd_host, @@ -104,7 +118,7 @@ struct dataset_descriptor_host { team_size{dd_host.team_size}, dataset_block_dim{dataset_block_dim} { - RAFT_CUDA_TRY(cudaMallocAsync(&dev_ptr, sizeof(DescriptorImpl), stream_)); + RAFT_CUDA_TRY(cudaMallocAsync(&dev_ptr, dev_descriptor_t::kMaxStructSize, stream_)); } ~dataset_descriptor_host() noexcept @@ -222,17 +236,16 @@ struct standard_dataset_descriptor_t : public dataset_descriptor_base_t DISTANCE_T { - auto query_ptr = smem_query_buffer(smem_workspace); - const auto dataset_ptr = ptr + dataset_i * ld; - const unsigned 
lane_id = threadIdx.x % TeamSize; - constexpr unsigned vlen = device::get_vlen(); - // #include (DatasetBlockDim, TeamSize * vlen); - raft::TxN_t dl_buff[reg_nelem]; + auto query_ptr = smem_query_buffer(smem_workspace); + const auto dataset_ptr = ptr + dataset_i * ld; + const unsigned lane_id = threadIdx.x % TeamSize; DISTANCE_T norm2 = 0; if (valid) { for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += DatasetBlockDim) { + constexpr unsigned vlen = device::get_vlen(); + constexpr unsigned reg_nelem = raft::ceildiv(DatasetBlockDim, TeamSize * vlen); + raft::TxN_t dl_buff[reg_nelem]; #pragma unroll for (uint32_t e = 0; e < reg_nelem; e++) { const uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset; @@ -257,6 +270,7 @@ struct standard_dataset_descriptor_t : public dataset_descriptor_base_t 0; offset >>= 1) { norm2 += __shfl_xor_sync(0xffffffff, norm2, offset); } @@ -269,8 +283,7 @@ struct standard_dataset_descriptor_t : public dataset_descriptor_base_t(smem_workspace); } - _RAFT_HOST_DEVICE [[nodiscard]] constexpr static auto get_smem_ws_size_in_bytes(uint32_t dim) - -> uint32_t + RAFT_INLINE_FUNCTION constexpr static auto get_smem_ws_size_in_bytes(uint32_t dim) -> uint32_t { return raft::round_up_safe(dim, DatasetBlockDim) * sizeof(QUERY_T); } diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh index 45fe23a76..1932f131c 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh @@ -248,6 +248,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t 0; offset >>= 1) { norm += __shfl_xor_sync(0xffffffff, norm, offset); } @@ -267,8 +268,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t uint32_t + RAFT_INLINE_FUNCTION constexpr static auto get_smem_ws_size_in_bytes(uint32_t dim) -> uint32_t { /* SMEM workspace layout: 1. 
Codebook (kSMemCodeBookSizeInBytes bytes) From 4b5dcd3b1b835ad0de68da2da2ff147edf21831f Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 21 Aug 2024 10:31:32 +0200 Subject: [PATCH 07/41] Refactor distance_core -> compute_distance, and update the instance list --- cpp/CMakeLists.txt | 128 ++++---- cpp/include/cuvs/neighbors/common.hpp | 41 ++- .../neighbors/detail/cagra/cagra_search.cuh | 7 +- .../detail/cagra/compute_distance-ext.hpp | 86 +++++ .../detail/cagra/compute_distance.hpp | 219 ------------- ...ate.py => compute_distance_00_generate.py} | 48 ++- .../cagra/compute_distance_standard.cuh | 307 ++++++++++++++++++ ...tance_standard_float_uint32_dim128_t16.cu} | 8 +- ...tance_standard_float_uint32_dim256_t32.cu} | 8 +- ...istance_standard_float_uint32_dim64_t8.cu} | 8 +- ...tance_standard_float_uint64_dim128_t16.cu} | 8 +- ...tance_standard_float_uint64_dim256_t32.cu} | 8 +- ...istance_standard_float_uint64_dim64_t8.cu} | 8 +- ...stance_standard_half_uint32_dim128_t16.cu} | 8 +- ...stance_standard_half_uint32_dim256_t32.cu} | 8 +- ...distance_standard_half_uint32_dim64_t8.cu} | 8 +- ...stance_standard_half_uint64_dim128_t16.cu} | 8 +- ...stance_standard_half_uint64_dim256_t32.cu} | 8 +- ..._distance_standard_half_uint64_dim64_t8.cu | 32 ++ ...stance_standard_int8_uint32_dim128_t16.cu} | 8 +- ...stance_standard_int8_uint32_dim256_t32.cu} | 8 +- ...distance_standard_int8_uint32_dim64_t8.cu} | 8 +- ...tance_standard_uint8_uint32_dim128_t16.cu} | 8 +- ...tance_standard_uint8_uint32_dim256_t32.cu} | 8 +- ...istance_standard_uint8_uint32_dim64_t8.cu} | 8 +- .../detail/cagra/compute_distance_vpq.cuh | 136 ++------ ...float_uint32_dim128_t16_8pq_2subd_half.cu} | 6 +- ...float_uint32_dim128_t16_8pq_4subd_half.cu} | 6 +- ...float_uint32_dim256_t32_8pq_2subd_half.cu} | 6 +- ...float_uint32_dim256_t32_8pq_4subd_half.cu} | 6 +- ...pq_float_uint32_dim64_t8_8pq_2subd_half.cu | 32 ++ ...pq_float_uint32_dim64_t8_8pq_4subd_half.cu | 32 ++ 
...float_uint64_dim128_t16_8pq_2subd_half.cu} | 6 +- ...float_uint64_dim128_t16_8pq_4subd_half.cu} | 6 +- ...float_uint64_dim256_t32_8pq_2subd_half.cu} | 6 +- ...float_uint64_dim256_t32_8pq_4subd_half.cu} | 6 +- ...pq_float_uint64_dim64_t8_8pq_2subd_half.cu | 32 ++ ...pq_float_uint64_dim64_t8_8pq_4subd_half.cu | 32 ++ ..._half_uint32_dim128_t16_8pq_2subd_half.cu} | 6 +- ..._half_uint32_dim128_t16_8pq_4subd_half.cu} | 6 +- ..._half_uint32_dim256_t32_8pq_2subd_half.cu} | 6 +- ..._half_uint32_dim256_t32_8pq_4subd_half.cu} | 6 +- ...pq_half_uint32_dim64_t8_8pq_2subd_half.cu} | 6 +- ...pq_half_uint32_dim64_t8_8pq_4subd_half.cu} | 6 +- ..._half_uint64_dim128_t16_8pq_2subd_half.cu} | 6 +- ..._half_uint64_dim128_t16_8pq_4subd_half.cu} | 6 +- ..._half_uint64_dim256_t32_8pq_2subd_half.cu} | 6 +- ..._half_uint64_dim256_t32_8pq_4subd_half.cu} | 6 +- ...pq_half_uint64_dim64_t8_8pq_2subd_half.cu} | 6 +- ...pq_half_uint64_dim64_t8_8pq_4subd_half.cu} | 6 +- ..._int8_uint32_dim128_t16_8pq_2subd_half.cu} | 6 +- ..._int8_uint32_dim128_t16_8pq_4subd_half.cu} | 6 +- ..._int8_uint32_dim256_t32_8pq_2subd_half.cu} | 6 +- ..._int8_uint32_dim256_t32_8pq_4subd_half.cu} | 6 +- ...vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu | 32 ++ ...vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu | 32 ++ ...uint8_uint32_dim128_t16_8pq_2subd_half.cu} | 6 +- ...uint8_uint32_dim128_t16_8pq_4subd_half.cu} | 6 +- ...uint8_uint32_dim256_t32_8pq_2subd_half.cu} | 6 +- ...uint8_uint32_dim256_t32_8pq_4subd_half.cu} | 6 +- ...pq_uint8_uint32_dim64_t8_8pq_2subd_half.cu | 32 ++ ...pq_uint8_uint32_dim64_t8_8pq_4subd_half.cu | 32 ++ .../neighbors/detail/cagra/device_common.hpp | 2 +- .../detail/cagra/distance_core-ext.cuh | 103 ------ .../detail/cagra/distance_core-impl.cuh | 41 --- .../distance_core_float_uint64_dim1024_t32.cu | 32 -- .../distance_core_half_uint64_dim1024_t32.cu | 32 -- .../distance_core_half_uint64_dim512_t32.cu | 32 -- .../distance_core_int8_uint32_dim128_t8.cu | 32 -- 
.../distance_core_uint8_uint32_dim1024_t32.cu | 32 -- .../distance_core_uint8_uint32_dim128_t8.cu | 32 -- .../distance_core_uint8_uint32_dim512_t32.cu | 32 -- ...float_uint64_dim1024_t32_8pq_2subd_half.cu | 32 -- ...float_uint64_dim1024_t32_8pq_4subd_half.cu | 32 -- ...q_float_uint64_dim128_t8_8pq_2subd_half.cu | 32 -- ...q_float_uint64_dim128_t8_8pq_4subd_half.cu | 32 -- ..._half_uint32_dim1024_t32_8pq_2subd_half.cu | 32 -- ..._half_uint32_dim1024_t32_8pq_4subd_half.cu | 32 -- ...pq_half_uint32_dim128_t8_8pq_2subd_half.cu | 32 -- ...pq_half_uint32_dim128_t8_8pq_4subd_half.cu | 32 -- ..._half_uint64_dim1024_t32_8pq_2subd_half.cu | 32 -- ..._half_uint64_dim1024_t32_8pq_4subd_half.cu | 32 -- ...pq_half_uint64_dim128_t8_8pq_2subd_half.cu | 32 -- ...pq_half_uint64_dim128_t8_8pq_4subd_half.cu | 32 -- ..._int8_uint32_dim1024_t32_8pq_2subd_half.cu | 32 -- ..._int8_uint32_dim1024_t32_8pq_4subd_half.cu | 32 -- ...pq_int8_uint32_dim128_t8_8pq_2subd_half.cu | 32 -- ...pq_int8_uint32_dim128_t8_8pq_4subd_half.cu | 32 -- ...uint8_uint32_dim1024_t32_8pq_2subd_half.cu | 32 -- ...uint8_uint32_dim1024_t32_8pq_4subd_half.cu | 32 -- ...q_uint8_uint32_dim128_t8_8pq_2subd_half.cu | 32 -- ...q_uint8_uint32_dim128_t8_8pq_4subd_half.cu | 32 -- .../detail/cagra/search_multi_kernel.cuh | 1 - 93 files changed, 985 insertions(+), 1590 deletions(-) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance-ext.hpp rename cpp/src/neighbors/detail/cagra/{distance_core_00_generate.py => compute_distance_00_generate.py} (70%) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh rename cpp/src/neighbors/detail/cagra/{distance_core_half_uint32_dim1024_t32.cu => compute_distance_standard_float_uint32_dim128_t16.cu} (78%) rename cpp/src/neighbors/detail/cagra/{distance_core_float_uint32_dim512_t32.cu => compute_distance_standard_float_uint32_dim256_t32.cu} (78%) rename cpp/src/neighbors/detail/cagra/{distance_core_half_uint32_dim128_t8.cu => 
compute_distance_standard_float_uint32_dim64_t8.cu} (78%) rename cpp/src/neighbors/detail/cagra/{distance_core_float_uint64_dim256_t16.cu => compute_distance_standard_float_uint64_dim128_t16.cu} (78%) rename cpp/src/neighbors/detail/cagra/{distance_core_float_uint64_dim512_t32.cu => compute_distance_standard_float_uint64_dim256_t32.cu} (78%) rename cpp/src/neighbors/detail/cagra/{distance_core_half_uint64_dim128_t8.cu => compute_distance_standard_float_uint64_dim64_t8.cu} (78%) rename cpp/src/neighbors/detail/cagra/{distance_core_float_uint32_dim128_t8.cu => compute_distance_standard_half_uint32_dim128_t16.cu} (78%) rename cpp/src/neighbors/detail/cagra/{distance_core_half_uint32_dim256_t16.cu => compute_distance_standard_half_uint32_dim256_t32.cu} (78%) rename cpp/src/neighbors/detail/cagra/{distance_core_float_uint32_dim1024_t32.cu => compute_distance_standard_half_uint32_dim64_t8.cu} (77%) rename cpp/src/neighbors/detail/cagra/{distance_core_float_uint64_dim128_t8.cu => compute_distance_standard_half_uint64_dim128_t16.cu} (78%) rename cpp/src/neighbors/detail/cagra/{distance_core_half_uint64_dim256_t16.cu => compute_distance_standard_half_uint64_dim256_t32.cu} (78%) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu rename cpp/src/neighbors/detail/cagra/{distance_core_int8_uint32_dim256_t16.cu => compute_distance_standard_int8_uint32_dim128_t16.cu} (78%) rename cpp/src/neighbors/detail/cagra/{distance_core_int8_uint32_dim512_t32.cu => compute_distance_standard_int8_uint32_dim256_t32.cu} (78%) rename cpp/src/neighbors/detail/cagra/{distance_core_half_uint32_dim512_t32.cu => compute_distance_standard_int8_uint32_dim64_t8.cu} (78%) rename cpp/src/neighbors/detail/cagra/{distance_core_int8_uint32_dim1024_t32.cu => compute_distance_standard_uint8_uint32_dim128_t16.cu} (78%) rename cpp/src/neighbors/detail/cagra/{distance_core_uint8_uint32_dim256_t16.cu => compute_distance_standard_uint8_uint32_dim256_t32.cu} (78%) 
rename cpp/src/neighbors/detail/cagra/{distance_core_float_uint32_dim256_t16.cu => compute_distance_standard_uint8_uint32_dim64_t8.cu} (78%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_float_uint32_dim256_t16_8pq_2subd_half.cu => compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu} (82%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_float_uint32_dim256_t16_8pq_4subd_half.cu => compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu} (82%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_float_uint32_dim512_t32_8pq_2subd_half.cu => compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu} (82%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_float_uint32_dim512_t32_8pq_4subd_half.cu => compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu} (82%) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_float_uint64_dim256_t16_8pq_2subd_half.cu => compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu} (82%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_float_uint64_dim256_t16_8pq_4subd_half.cu => compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu} (82%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_float_uint64_dim512_t32_8pq_2subd_half.cu => compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu} (82%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_float_uint64_dim512_t32_8pq_4subd_half.cu => compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu} (82%) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu rename 
cpp/src/neighbors/detail/cagra/{distance_core_vpq_half_uint32_dim256_t16_8pq_2subd_half.cu => compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu} (82%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_half_uint32_dim256_t16_8pq_4subd_half.cu => compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu} (82%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_half_uint32_dim512_t32_8pq_2subd_half.cu => compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu} (82%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_half_uint32_dim512_t32_8pq_4subd_half.cu => compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu} (82%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_float_uint32_dim1024_t32_8pq_2subd_half.cu => compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu} (80%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_float_uint32_dim1024_t32_8pq_4subd_half.cu => compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu} (80%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_half_uint64_dim256_t16_8pq_2subd_half.cu => compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu} (82%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_half_uint64_dim256_t16_8pq_4subd_half.cu => compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu} (82%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_half_uint64_dim512_t32_8pq_2subd_half.cu => compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu} (82%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_half_uint64_dim512_t32_8pq_4subd_half.cu => compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu} (82%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_float_uint32_dim128_t8_8pq_2subd_half.cu => compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu} (80%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_float_uint32_dim128_t8_8pq_4subd_half.cu => 
compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu} (80%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_int8_uint32_dim256_t16_8pq_2subd_half.cu => compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu} (82%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_int8_uint32_dim256_t16_8pq_4subd_half.cu => compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu} (82%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_int8_uint32_dim512_t32_8pq_2subd_half.cu => compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu} (82%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_int8_uint32_dim512_t32_8pq_4subd_half.cu => compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu} (82%) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_uint8_uint32_dim256_t16_8pq_2subd_half.cu => compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu} (82%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_uint8_uint32_dim256_t16_8pq_4subd_half.cu => compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu} (82%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_uint8_uint32_dim512_t32_8pq_2subd_half.cu => compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu} (82%) rename cpp/src/neighbors/detail/cagra/{distance_core_vpq_uint8_uint32_dim512_t32_8pq_4subd_half.cu => compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu} (82%) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core-ext.cuh delete mode 100644 
cpp/src/neighbors/detail/cagra/distance_core-impl.cuh delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim1024_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim1024_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim512_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim128_t8.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim1024_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim128_t8.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim512_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim1024_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim1024_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim128_t8_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim128_t8_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim1024_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim1024_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim128_t8_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim128_t8_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim1024_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim1024_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim128_t8_8pq_2subd_half.cu delete mode 100644 
cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim128_t8_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim1024_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim1024_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim128_t8_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim128_t8_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim1024_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim1024_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim128_t8_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim128_t8_8pq_4subd_half.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cf2baa8b7..e685e4582 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -204,78 +204,60 @@ add_library( src/neighbors/cagra_search_float.cu src/neighbors/cagra_search_int8.cu src/neighbors/cagra_search_uint8.cu - src/neighbors/detail/cagra/distance_core_float_uint32_dim128_t8.cu - src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/distance_core_float_uint32_dim256_t16.cu - src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/distance_core_float_uint32_dim512_t32.cu - src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim512_t32_8pq_4subd_half.cu - 
src/neighbors/detail/cagra/distance_core_float_uint32_dim1024_t32.cu - src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim1024_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim1024_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/distance_core_half_uint32_dim128_t8.cu - src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/distance_core_half_uint32_dim256_t16.cu - src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/distance_core_half_uint32_dim512_t32.cu - src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/distance_core_half_uint32_dim1024_t32.cu - src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim1024_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim1024_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/distance_core_int8_uint32_dim128_t8.cu - src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/distance_core_int8_uint32_dim256_t16.cu - src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/distance_core_int8_uint32_dim512_t32.cu - src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim512_t32_8pq_4subd_half.cu - 
src/neighbors/detail/cagra/distance_core_int8_uint32_dim1024_t32.cu - src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim1024_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim1024_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/distance_core_uint8_uint32_dim128_t8.cu - src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/distance_core_uint8_uint32_dim256_t16.cu - src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/distance_core_uint8_uint32_dim512_t32.cu - src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/distance_core_uint8_uint32_dim1024_t32.cu - src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim1024_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim1024_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/distance_core_float_uint64_dim128_t8.cu - src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/distance_core_float_uint64_dim256_t16.cu - src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/distance_core_float_uint64_dim512_t32.cu - src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim512_t32_8pq_4subd_half.cu - 
src/neighbors/detail/cagra/distance_core_float_uint64_dim1024_t32.cu - src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim1024_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim1024_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/distance_core_half_uint64_dim128_t8.cu - src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/distance_core_half_uint64_dim256_t16.cu - src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/distance_core_half_uint64_dim512_t32.cu - src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/distance_core_half_uint64_dim1024_t32.cu - src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim1024_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim1024_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim64_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu + 
src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim64_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim64_t8.cu + src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu + 
src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu + 
src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu @@ -290,7 +272,7 @@ add_library( src/neighbors/detail/cagra/search_single_cta_half_uint64.cu ) -file(GLOB_RECURSE distance_core_sources "src/neighbors/detail/cagra/distance_core_*.cu") +file(GLOB_RECURSE distance_core_sources "src/neighbors/detail/cagra/compute_distance_*.cu") set_source_files_properties(${distance_core_sources} PROPERTIES COMPILE_FLAGS -maxrregcount=64) set_target_properties( diff --git a/cpp/include/cuvs/neighbors/common.hpp b/cpp/include/cuvs/neighbors/common.hpp index 414438067..8218b5f52 100644 --- a/cpp/include/cuvs/neighbors/common.hpp +++ b/cpp/include/cuvs/neighbors/common.hpp @@ -172,6 +172,22 @@ struct owning_dataset : public strided_dataset { }; }; +template +struct is_strided_dataset : std::false_type {}; + +template +struct is_strided_dataset> : std::true_type {}; + +template +struct is_strided_dataset> : std::true_type {}; + +template +struct is_strided_dataset> + : std::true_type {}; + +template +inline constexpr bool is_strided_dataset_v = is_strided_dataset::value; + /** * @brief Contstruct a strided matrix from any mdarray or mdspan. * @@ -284,23 +300,25 @@ auto make_aligned_dataset(const raft::resources& res, const SrcT& src, uint32_t */ template struct vpq_dataset : public dataset { + using index_type = IdxT; + using math_type = MathT; /** Vector Quantization codebook - "coarse cluster centers". */ - raft::device_matrix vq_code_book; + raft::device_matrix vq_code_book; /** Product Quantization codebook - "fine cluster centers". */ - raft::device_matrix pq_code_book; + raft::device_matrix pq_code_book; /** Compressed dataset. 
*/ - raft::device_matrix data; + raft::device_matrix data; - vpq_dataset(raft::device_matrix&& vq_code_book, - raft::device_matrix&& pq_code_book, - raft::device_matrix&& data) + vpq_dataset(raft::device_matrix&& vq_code_book, + raft::device_matrix&& pq_code_book, + raft::device_matrix&& data) : vq_code_book{std::move(vq_code_book)}, pq_code_book{std::move(pq_code_book)}, data{std::move(data)} { } - [[nodiscard]] auto n_rows() const noexcept -> IdxT final { return data.extent(0); } + [[nodiscard]] auto n_rows() const noexcept -> index_type final { return data.extent(0); } [[nodiscard]] auto dim() const noexcept -> uint32_t final { return vq_code_book.extent(1); } [[nodiscard]] auto is_owning() const noexcept -> bool final { return true; } @@ -354,6 +372,15 @@ struct vpq_dataset : public dataset { } }; +template +struct is_vpq_dataset : std::false_type {}; + +template +struct is_vpq_dataset> : std::true_type {}; + +template +inline constexpr bool is_vpq_dataset_v = is_vpq_dataset::value; + namespace filtering { /* A filter that filters nothing. This is the default behavior. 
*/ diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index ace278c45..a47916cb3 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh @@ -16,7 +16,7 @@ #pragma once -#include "compute_distance_vpq.cuh" +#include "compute_distance-ext.hpp" #include "factory.cuh" #include "search_plan.cuh" #include "search_single_cta_inst.cuh" @@ -195,8 +195,7 @@ void search_main(raft::resources const& res, if (auto* strided_dset = dynamic_cast*>(&index.data()); strided_dset != nullptr) { // Search using a plain (strided) row-major dataset - auto desc = - dataset_descriptor_init(*strided_dset, stream); + auto desc = dataset_descriptor_init(*strided_dset, stream); search_main_core(res, params, desc, @@ -212,7 +211,7 @@ void search_main(raft::resources const& res, RAFT_FAIL("FP32 VPQ dataset support is coming soon"); } else if (auto* vpq_dset = dynamic_cast*>(&index.data()); vpq_dset != nullptr) { - auto desc = dataset_descriptor_init(*vpq_dset, stream); + auto desc = dataset_descriptor_init(*vpq_dset, stream); search_main_core(res, params, desc, diff --git a/cpp/src/neighbors/detail/cagra/compute_distance-ext.hpp b/cpp/src/neighbors/detail/cagra/compute_distance-ext.hpp new file mode 100644 index 000000000..18e445333 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance-ext.hpp @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +extern template struct standard_dataset_descriptor_t<8, 64, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<16, 128, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 256, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<8, 64, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<16, 128, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 256, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, 
half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<8, 64, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, int8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<16, 128, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, int8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 256, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, int8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<8, 64, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, uint8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<16, 128, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, uint8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 256, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, uint8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<8, 64, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 64, 
8, 2, half, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<16, 128, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<32, 256, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<8, 64, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<16, 128, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<32, 256, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index f0aef1ae9..542a3eb7b 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -152,223 +152,4 @@ struct dataset_descriptor_host { rmm::cuda_stream_view stream_; }; -template -struct 
standard_dataset_descriptor_t : public dataset_descriptor_base_t { - using base_type = dataset_descriptor_base_t; - using LOAD_T = device::LOAD_128BIT_T; - using QUERY_T = float; - using base_type::dim; - using base_type::smem_ws_size_in_bytes; - using typename base_type::DATA_T; - using typename base_type::DISTANCE_T; - using typename base_type::INDEX_T; - using typename base_type::ws_handle; - - const DATA_T* ptr; - size_t ld; - - _RAFT_HOST_DEVICE standard_dataset_descriptor_t(const DATA_T* ptr, - INDEX_T size, - uint32_t dim, - size_t ld) - : base_type(size, dim, TeamSize, get_smem_ws_size_in_bytes(dim)), ptr(ptr), ld(ld) - { - base_type::template assert_struct_size(); - } - - _RAFT_DEVICE [[nodiscard]] auto set_smem_ws(void* smem_ptr) const -> ws_handle - { - return reinterpret_cast(smem_ptr); - } - - _RAFT_DEVICE void copy_query(ws_handle smem_workspace, const DATA_T* query_ptr) const - { - auto buf = smem_query_buffer(smem_workspace); - auto buf_len = smem_ws_size_in_bytes / sizeof(QUERY_T); - for (unsigned i = threadIdx.x; i < buf_len; i += blockDim.x) { - unsigned j = device::swizzling(i); - if (i < dim) { - buf[j] = cuvs::spatial::knn::detail::utils::mapping{}(query_ptr[i]); - } else { - buf[j] = 0.0; - } - } - } - - _RAFT_DEVICE auto compute_distance(ws_handle smem_workspace, - INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) const -> DISTANCE_T - { - switch (metric) { - case cuvs::distance::DistanceType::L2Expanded: - return compute_similarity( - smem_workspace, dataset_index, valid); - case cuvs::distance::DistanceType::InnerProduct: - return compute_similarity( - smem_workspace, dataset_index, valid); - default: return 0; - } - } - - private: - template - RAFT_DEVICE_INLINE_FUNCTION constexpr static auto dist_op(T a, T b) - -> std::enable_if_t - { - T diff = a - b; - return diff * diff; - } - - template - RAFT_DEVICE_INLINE_FUNCTION constexpr static auto dist_op(T a, T b) - -> std::enable_if_t - { - return -a * b; - } - - 
template - RAFT_DEVICE_INLINE_FUNCTION auto compute_similarity(ws_handle smem_workspace, - const INDEX_T dataset_i, - const bool valid) const -> DISTANCE_T - { - auto query_ptr = smem_query_buffer(smem_workspace); - const auto dataset_ptr = ptr + dataset_i * ld; - const unsigned lane_id = threadIdx.x % TeamSize; - - DISTANCE_T norm2 = 0; - if (valid) { - for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += DatasetBlockDim) { - constexpr unsigned vlen = device::get_vlen(); - constexpr unsigned reg_nelem = raft::ceildiv(DatasetBlockDim, TeamSize * vlen); - raft::TxN_t dl_buff[reg_nelem]; -#pragma unroll - for (uint32_t e = 0; e < reg_nelem; e++) { - const uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset; - if (k >= dim) break; - dl_buff[e].load(dataset_ptr, k); - } -#pragma unroll - for (uint32_t e = 0; e < reg_nelem; e++) { - const uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset; - if (k >= dim) break; -#pragma unroll - for (uint32_t v = 0; v < vlen; v++) { - const uint32_t kv = k + v; - // Note this loop can go above the dataset_dim for padded arrays. This is not a problem - // because: - // - Above the last element (dataset_dim-1), the query array is filled with zeros. - // - The data buffer has to be also padded with zeros. 
- DISTANCE_T d = query_ptr[device::swizzling(kv)]; - norm2 += dist_op( - d, cuvs::spatial::knn::detail::utils::mapping{}(dl_buff[e].val.data[v])); - } - } - } - } -#pragma unroll - for (uint32_t offset = TeamSize / 2; offset > 0; offset >>= 1) { - norm2 += __shfl_xor_sync(0xffffffff, norm2, offset); - } - return norm2; - } - - RAFT_DEVICE_INLINE_FUNCTION constexpr auto smem_query_buffer(ws_handle smem_workspace) const - -> QUERY_T* - { - return reinterpret_cast(smem_workspace); - } - - RAFT_INLINE_FUNCTION constexpr static auto get_smem_ws_size_in_bytes(uint32_t dim) -> uint32_t - { - return raft::round_up_safe(dim, DatasetBlockDim) * sizeof(QUERY_T); - } -}; - -extern template struct standard_dataset_descriptor_t<8, 128, float, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<16, 256, float, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 512, float, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 1024, float, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<8, 128, half, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<16, 256, half, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 512, half, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 1024, half, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<8, 128, int8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<16, 256, int8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 512, int8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 1024, int8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<8, 128, uint8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<16, 256, uint8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 512, uint8_t, uint32_t, float>; 
-extern template struct standard_dataset_descriptor_t<32, 1024, uint8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<8, 128, float, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<16, 256, float, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<32, 512, float, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<32, 1024, float, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<8, 128, half, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<16, 256, half, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<32, 512, half, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<32, 1024, half, uint64_t, float>; - -template -__launch_bounds__(1, 1) __global__ void standard_dataset_descriptor_init_kernel( - dataset_descriptor_base_t* out, - const DataT* ptr, - IndexT size, - uint32_t dim, - size_t ld) -{ - new (out) standard_dataset_descriptor_t( - ptr, size, dim, ld); -} - -template -auto standard_dataset_descriptor_init(const strided_dataset& dataset, - rmm::cuda_stream_view stream) - -> dataset_descriptor_host -{ - standard_dataset_descriptor_t dd_host{ - dataset.view().data_handle(), IndexT(dataset.n_rows()), dataset.dim(), dataset.stride()}; - dataset_descriptor_host result{dd_host, stream, DatasetBlockDim}; - standard_dataset_descriptor_init_kernel - <<<1, 1, 0, stream>>>(result.dev_ptr, dd_host.ptr, dd_host.size, dd_host.dim, dd_host.ld); - return result; -} - -template -auto dataset_descriptor_init(const strided_dataset& dataset, - rmm::cuda_stream_view stream) - -> dataset_descriptor_host -{ - constexpr int64_t max_dataset_block_dim = 512; - int64_t dataset_block_dim = 128; - while (dataset_block_dim < dataset.dim() && dataset_block_dim < max_dataset_block_dim) { - dataset_block_dim *= 2; - } - switch (dataset_block_dim) { - case 128: - return standard_dataset_descriptor_init<8, 128, 
DataT, IndexT, DistanceT, DatasetIdxT>( - dataset, stream); - case 256: - return standard_dataset_descriptor_init<16, 256, DataT, IndexT, DistanceT, DatasetIdxT>( - dataset, stream); - default: - return standard_dataset_descriptor_init<32, 512, DataT, IndexT, DistanceT, DatasetIdxT>( - dataset, stream); - } -} - } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_00_generate.py b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py similarity index 70% rename from cpp/src/neighbors/detail/cagra/distance_core_00_generate.py rename to cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py index a6d563bf5..53c8f7f28 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_00_generate.py +++ b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os +import glob template = """/* * Copyright (c) 2024, NVIDIA CORPORATION. 
@@ -29,11 +31,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -46,7 +48,7 @@ }} // namespace cuvs::neighbors::cagra::detail """ -mxdim_team = [(128, 8), (256, 16), (512, 32), (1024, 32)] +mxdim_team = [(64, 8), (128, 16), (256, 32)] # block = [(64, 16), (128, 8), (256, 4), (512, 2), (1024, 1)] # itopk_candidates = [64, 128, 256] # itopk_size = [64, 128, 256, 512] @@ -69,31 +71,49 @@ half_uint64=("half", "uint64_t", "float"), ) -distance_core_ext = [] +compute_distance_ext = [] +cmake_list = [] -# knn + + + +# Cleanup first +for f in glob.glob("compute_distance_standard_*.cu"): + os.remove(f) +for f in glob.glob("compute_distance_vpq_*.cu"): + os.remove(f) + +# Generate new files for type_path, (data_t, idx_t, distance_t) in search_types.items(): for (mxdim, team) in mxdim_team: # CAGRA - path = f"distance_core_{type_path}_dim{mxdim}_t{team}.cu" - includes = '#include "compute_distance.hpp"' + path = f"compute_distance_standard_{type_path}_dim{mxdim}_t{team}.cu" + includes = '#include "compute_distance_standard.cuh"' decl = f"template struct standard_dataset_descriptor_t<{team}, {mxdim}, {data_t}, {idx_t}, {distance_t}>;" - distance_core_ext.append(f"extern {decl}") + compute_distance_ext.append(f"extern {decl}") with open(path, "w") as f: f.write(template.format(includes=includes, instances=decl)); - print(f"src/neighbors/detail/cagra/{path}") + cmake_list.append(f" src/neighbors/detail/cagra/{path}") # CAGRA-Q for code_book_t in code_book_types: for pq_len in pq_lens: for pq_bit in pq_bits: - path = f"distance_core_vpq_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{pq_len}subd_{code_book_t}.cu" + path = f"compute_distance_vpq_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{pq_len}subd_{code_book_t}.cu" decl = f"template struct 
cagra_q_dataset_descriptor_t<{team}, {mxdim}, {pq_bit}, {pq_len}, {code_book_t}, {data_t}, {idx_t}, {distance_t}>;" includes = '#include "compute_distance_vpq.cuh"' - distance_core_ext.append(f"extern {decl}") + compute_distance_ext.append(f"extern {decl}") with open(path, "w") as f: f.write(template.format(includes=includes, instances=decl)); - print(f"src/neighbors/detail/cagra/{path}") + cmake_list.append(f" src/neighbors/detail/cagra/{path}") + +with open("compute_distance-ext.hpp", "w") as f: + includes = ''' +#include "compute_distance_standard.cuh" +#include "compute_distance_vpq.cuh" +''' + f.write(template.format(includes=includes, instances="\n".join(compute_distance_ext))) -with open("distance_core-ext.cuh", "w") as f: - f.write(template.format(includes='#include "compute_distance.hpp"', instances="\n".join(distance_core_ext))) +cmake_list.sort() +for path in cmake_list: + print(path) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh new file mode 100644 index 000000000..1760b68a2 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "compute_distance.hpp" + +#include +#include +#include +#include + +// TODO: This shouldn't be invoking spatial/knn +#include "../ann_utils.cuh" + +#include + +#include + +namespace cuvs::neighbors::cagra::detail { + +template +struct standard_dataset_descriptor_t : public dataset_descriptor_base_t { + using base_type = dataset_descriptor_base_t; + using LOAD_T = device::LOAD_128BIT_T; + using QUERY_T = float; + using base_type::dim; + using base_type::smem_ws_size_in_bytes; + using typename base_type::DATA_T; + using typename base_type::DISTANCE_T; + using typename base_type::INDEX_T; + using typename base_type::ws_handle; + + const DATA_T* ptr; + size_t ld; + + _RAFT_HOST_DEVICE standard_dataset_descriptor_t(const DATA_T* ptr, + INDEX_T size, + uint32_t dim, + size_t ld) + : base_type(size, dim, TeamSize, get_smem_ws_size_in_bytes(dim)), ptr(ptr), ld(ld) + { + base_type::template assert_struct_size(); + } + + _RAFT_DEVICE [[nodiscard]] auto set_smem_ws(void* smem_ptr) const -> ws_handle + { + return reinterpret_cast(smem_ptr); + } + + _RAFT_DEVICE void copy_query(ws_handle smem_workspace, const DATA_T* query_ptr) const + { + auto buf = smem_query_buffer(smem_workspace); + auto buf_len = smem_ws_size_in_bytes / sizeof(QUERY_T); + for (unsigned i = threadIdx.x; i < buf_len; i += blockDim.x) { + unsigned j = device::swizzling(i); + if (i < dim) { + buf[j] = cuvs::spatial::knn::detail::utils::mapping{}(query_ptr[i]); + } else { + buf[j] = 0.0; + } + } + } + + _RAFT_DEVICE auto compute_distance(ws_handle smem_workspace, + INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) const -> DISTANCE_T + { + switch (metric) { + case cuvs::distance::DistanceType::L2Expanded: + return compute_similarity( + smem_workspace, dataset_index, valid); + case cuvs::distance::DistanceType::InnerProduct: + return compute_similarity( + smem_workspace, dataset_index, valid); + default: return 0; + } + } + + private: + template + 
RAFT_DEVICE_INLINE_FUNCTION constexpr static auto dist_op(T a, T b) + -> std::enable_if_t + { + T diff = a - b; + return diff * diff; + } + + template + RAFT_DEVICE_INLINE_FUNCTION constexpr static auto dist_op(T a, T b) + -> std::enable_if_t + { + return -a * b; + } + + template + RAFT_DEVICE_INLINE_FUNCTION auto compute_similarity(ws_handle smem_workspace, + const INDEX_T dataset_i, + const bool valid) const -> DISTANCE_T + { + auto query_ptr = smem_query_buffer(smem_workspace); + const auto dataset_ptr = ptr + dataset_i * ld; + const unsigned lane_id = threadIdx.x % TeamSize; + + DISTANCE_T norm2 = 0; + if (valid) { + for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += DatasetBlockDim) { + constexpr unsigned vlen = device::get_vlen(); + constexpr unsigned reg_nelem = raft::ceildiv(DatasetBlockDim, TeamSize * vlen); + raft::TxN_t dl_buff[reg_nelem]; +#pragma unroll + for (uint32_t e = 0; e < reg_nelem; e++) { + const uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset; + if (k >= dim) break; + dl_buff[e].load(dataset_ptr, k); + } +#pragma unroll + for (uint32_t e = 0; e < reg_nelem; e++) { + const uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset; + if (k >= dim) break; +#pragma unroll + for (uint32_t v = 0; v < vlen; v++) { + const uint32_t kv = k + v; + // Note this loop can go above the dataset_dim for padded arrays. This is not a problem + // because: + // - Above the last element (dataset_dim-1), the query array is filled with zeros. + // - The data buffer has to be also padded with zeros. 
+ DISTANCE_T d = query_ptr[device::swizzling(kv)]; + norm2 += dist_op( + d, cuvs::spatial::knn::detail::utils::mapping{}(dl_buff[e].val.data[v])); + } + } + } + } +#pragma unroll + for (uint32_t offset = TeamSize / 2; offset > 0; offset >>= 1) { + norm2 += __shfl_xor_sync(0xffffffff, norm2, offset); + } + return norm2; + } + + RAFT_DEVICE_INLINE_FUNCTION constexpr auto smem_query_buffer(ws_handle smem_workspace) const + -> QUERY_T* + { + return reinterpret_cast(smem_workspace); + } + + RAFT_INLINE_FUNCTION constexpr static auto get_smem_ws_size_in_bytes(uint32_t dim) -> uint32_t + { + return raft::round_up_safe(dim, DatasetBlockDim) * sizeof(QUERY_T); + } +}; + +template +__launch_bounds__(1, 1) __global__ void standard_dataset_descriptor_init_kernel( + dataset_descriptor_base_t* out, + const DataT* ptr, + IndexT size, + uint32_t dim, + size_t ld) +{ + new (out) standard_dataset_descriptor_t( + ptr, size, dim, ld); +} + +template +auto standard_dataset_descriptor_init(const DatasetT& dataset, rmm::cuda_stream_view stream) + -> dataset_descriptor_host +{ + standard_dataset_descriptor_t dd_host{ + dataset.view().data_handle(), IndexT(dataset.n_rows()), dataset.dim(), dataset.stride()}; + dataset_descriptor_host result{dd_host, stream, DatasetBlockDim}; + standard_dataset_descriptor_init_kernel + <<<1, 1, 0, stream>>>(result.dev_ptr, dd_host.ptr, dd_host.size, dd_host.dim, dd_host.ld); + return result; +} + +template +using enable_strided = std::enable_if_t, ReturnT>; + +template +auto dataset_descriptor_init(const DatasetT& dataset, rmm::cuda_stream_view stream) + -> enable_strided> +{ + constexpr int64_t max_dataset_block_dim = 256; + int64_t dataset_block_dim = 64; + while (dataset_block_dim < dataset.dim() && dataset_block_dim < max_dataset_block_dim) { + dataset_block_dim *= 2; + } + switch (dataset_block_dim) { + case 64: + return standard_dataset_descriptor_init<8, 64, DataT, IndexT, DistanceT>(dataset, stream); + case 128: + return 
standard_dataset_descriptor_init<16, 128, DataT, IndexT, DistanceT>(dataset, stream); + default: + return standard_dataset_descriptor_init<32, 256, DataT, IndexT, DistanceT>(dataset, stream); + } +} + +// template +// struct descriptor_instance_spec { +// template +// struct standard_descriptor { +// template +// struct dataset_instance_spec { +// static auto init(const cagra::search_params& params, +// const DatasetT& dataset, +// rmm::cuda_stream_view stream) +// -> dataset_descriptor_host +// { +// standard_dataset_descriptor_t +// dd_host{ +// dataset.view().data_handle(), IndexT(dataset.n_rows()), dataset.dim(), +// dataset.stride()}; +// dataset_descriptor_host result{dd_host, stream, +// DatasetBlockDim}; standard_dataset_descriptor_init_kernel +// <<<1, 1, 0, stream>>>(result.dev_ptr, dd_host.ptr, dd_host.size, dd_host.dim, +// dd_host.ld); +// return result; +// } + +// static auto error(const cagra::search_params& params, +// const DatasetT& dataset, +// rmm::cuda_stream_view stream) +// -> dataset_descriptor_host +// { +// RAFT_FAIL("Invalid team_size {%u} - no kernel instance found for this value."); +// } + +// static auto priority(const cagra::search_params& params, const DatasetT& dataset) -> double +// { +// // If explicit team_size is specified and doesn't match the instance, discard it +// if (params.team_size != 0 && TeamSize != params.team_size) { return -1.0; } +// // Otherwise, favor the closest dataset dimensionality. +// return 1.0 / (0.1 + std::abs(dataset.dim() - DatasetBlockDim)); +// } +// }; +// }; +// }; + +// template +// struct instance_selector { +// template +// static auto init(const cagra::search_params& params, +// const DatasetT& dataset, +// rmm::cuda_stream_view stream) -> ReturnT = 0; +// }; + +// template +// struct instance_selector { +// template +// static auto select_worker(const cagra::search_params& params, const DatasetT& dataset) +// { +// auto p = Spec::priority(params, dataset); +// return std::make_tuple(p >= 0 ? 
&(Spec::init) : &(Spec::error), p); +// } +// } + +// template +// struct instance_selector { +// template +// static auto select_worker(const cagra::search_params& params, const DatasetT& dataset) +// { +// auto p0 = Spec::priority(params, dataset); +// auto sel = instance_selector::select_worker(params, dataset); +// return p0 > std::get(sel) ? std::make_tuple(&(Spec::init), p0) : sel; +// } +// } + +// template + +// template +// auto dataset_descriptor_init(const cagra::search_params& params, +// const strided_dataset& dataset, +// rmm::cuda_stream_view stream) +// -> dataset_descriptor_host +// { +// } + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu similarity index 78% rename from cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim1024_t32.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu index e66c27e86..6b0303285 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim1024_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu @@ -15,18 +15,18 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ -#include "compute_distance.hpp" +#include "compute_distance_standard.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<32, 1024, half, uint32_t, float>; +template struct standard_dataset_descriptor_t<16, 128, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim512_t32.cu 
b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu similarity index 78% rename from cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim512_t32.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu index ebe530cae..a10f67c85 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim512_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu @@ -15,18 +15,18 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ -#include "compute_distance.hpp" +#include "compute_distance_standard.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<32, 512, float, uint32_t, float>; +template struct standard_dataset_descriptor_t<32, 256, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu similarity index 78% rename from cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim128_t8.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu index 353569911..9718aa7bc 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim128_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu @@ -15,18 +15,18 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ -#include 
"compute_distance.hpp" +#include "compute_distance_standard.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<8, 128, half, uint32_t, float>; +template struct standard_dataset_descriptor_t<8, 64, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t16.cu similarity index 78% rename from cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim256_t16.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t16.cu index b654d509b..1467dad74 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim256_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t16.cu @@ -15,18 +15,18 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ -#include "compute_distance.hpp" +#include "compute_distance_standard.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<16, 256, float, uint64_t, float>; +template struct standard_dataset_descriptor_t<16, 128, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t32.cu similarity index 78% rename from cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim512_t32.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t32.cu index 2d95145eb..c48b9cd10 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim512_t32.cu +++ 
b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t32.cu @@ -15,18 +15,18 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ -#include "compute_distance.hpp" +#include "compute_distance_standard.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<32, 512, float, uint64_t, float>; +template struct standard_dataset_descriptor_t<32, 256, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim64_t8.cu similarity index 78% rename from cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim128_t8.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim64_t8.cu index 39a3723f0..295df79ed 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim128_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim64_t8.cu @@ -15,18 +15,18 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ -#include "compute_distance.hpp" +#include "compute_distance_standard.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<8, 128, half, uint64_t, float>; +template struct standard_dataset_descriptor_t<8, 64, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim128_t8.cu 
b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t16.cu similarity index 78% rename from cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim128_t8.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t16.cu index 1e79be961..bfe636577 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim128_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t16.cu @@ -15,18 +15,18 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ -#include "compute_distance.hpp" +#include "compute_distance_standard.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<8, 128, float, uint32_t, float>; +template struct standard_dataset_descriptor_t<16, 128, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t32.cu similarity index 78% rename from cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim256_t16.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t32.cu index df34f0f64..45bb2a253 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim256_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t32.cu @@ -15,18 +15,18 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ -#include 
"compute_distance.hpp" +#include "compute_distance_standard.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<16, 256, half, uint32_t, float>; +template struct standard_dataset_descriptor_t<32, 256, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu similarity index 77% rename from cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim1024_t32.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu index dcdd35467..4ffa62df9 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim1024_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu @@ -15,18 +15,18 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ -#include "compute_distance.hpp" +#include "compute_distance_standard.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<32, 1024, float, uint32_t, float>; +template struct standard_dataset_descriptor_t<8, 64, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t16.cu similarity index 78% rename from cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim128_t8.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t16.cu index 43b4ad1fb..0bd9c3b67 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim128_t8.cu +++ 
b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t16.cu @@ -15,18 +15,18 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ -#include "compute_distance.hpp" +#include "compute_distance_standard.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<8, 128, float, uint64_t, float>; +template struct standard_dataset_descriptor_t<16, 128, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t32.cu similarity index 78% rename from cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim256_t16.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t32.cu index a1ca21a47..4506a783a 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim256_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t32.cu @@ -15,18 +15,18 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ -#include "compute_distance.hpp" +#include "compute_distance_standard.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<16, 256, half, uint64_t, float>; +template struct standard_dataset_descriptor_t<32, 256, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu 
b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu new file mode 100644 index 000000000..c1d5b4746 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<8, 64, half, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t16.cu similarity index 78% rename from cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim256_t16.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t16.cu index dde230be5..d816a76a8 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim256_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t16.cu @@ -15,18 +15,18 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * 
* Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ -#include "compute_distance.hpp" +#include "compute_distance_standard.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<16, 256, int8_t, uint32_t, float>; +template struct standard_dataset_descriptor_t<16, 128, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t32.cu similarity index 78% rename from cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim512_t32.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t32.cu index 168a5e534..294717d64 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim512_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t32.cu @@ -15,18 +15,18 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ -#include "compute_distance.hpp" +#include "compute_distance_standard.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<32, 512, int8_t, uint32_t, float>; +template struct standard_dataset_descriptor_t<32, 256, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim64_t8.cu similarity index 78% rename from cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim512_t32.cu rename to 
cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim64_t8.cu index bfb038bbb..30041dc4b 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_half_uint32_dim512_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim64_t8.cu @@ -15,18 +15,18 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ -#include "compute_distance.hpp" +#include "compute_distance_standard.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<32, 512, half, uint32_t, float>; +template struct standard_dataset_descriptor_t<8, 64, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t16.cu similarity index 78% rename from cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim1024_t32.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t16.cu index 3ab24df00..50fe00608 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim1024_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t16.cu @@ -15,18 +15,18 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ -#include "compute_distance.hpp" +#include "compute_distance_standard.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<32, 1024, int8_t, uint32_t, float>; +template struct 
standard_dataset_descriptor_t<16, 128, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t32.cu similarity index 78% rename from cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim256_t16.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t32.cu index e30f8be8a..a006b8113 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim256_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t32.cu @@ -15,18 +15,18 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ -#include "compute_distance.hpp" +#include "compute_distance_standard.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<16, 256, uint8_t, uint32_t, float>; +template struct standard_dataset_descriptor_t<32, 256, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim64_t8.cu similarity index 78% rename from cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim256_t16.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim64_t8.cu index a2a50f110..13181abdd 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_float_uint32_dim256_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim64_t8.cu @@ -15,18 +15,18 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is 
generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ -#include "compute_distance.hpp" +#include "compute_distance_standard.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<16, 256, float, uint32_t, float>; +template struct standard_dataset_descriptor_t<8, 64, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh index 1932f131c..f799eb8b9 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh @@ -279,79 +279,6 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, float, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<16, 256, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, float, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 512, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, float, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 1024, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, float, uint32_t, float>; -extern template struct 
standard_dataset_descriptor_t<8, 128, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, half, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<16, 256, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, half, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 512, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, half, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 1024, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, half, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<8, 128, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, int8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<16, 256, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, int8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 512, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, int8_t, uint32_t, float>; -extern template struct 
standard_dataset_descriptor_t<32, 1024, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, int8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<8, 128, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, uint8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<16, 256, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, uint8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 512, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, uint8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 1024, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, uint8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<8, 128, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, float, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<16, 256, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, float, uint64_t, 
float>; -extern template struct standard_dataset_descriptor_t<32, 512, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, float, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<32, 1024, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, float, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<8, 128, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, half, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<16, 256, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, half, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<32, 512, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, half, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<32, 1024, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, half, uint64_t, float>; - template -auto vpq_dataset_descriptor_init(const vpq_dataset& dataset, - rmm::cuda_stream_view stream) + typename DatasetT> +auto vpq_dataset_descriptor_init(const DatasetT& dataset, rmm::cuda_stream_view stream) -> dataset_descriptor_host { + using codebook_type = typename 
DatasetT::math_type; const float vq_scale = 1.0f; const float pq_scale = 1.0f; cagra_q_dataset_descriptor_t @@ -427,7 +353,7 @@ auto vpq_dataset_descriptor_init(const vpq_dataset& data DatasetBlockDim, PqBits, PqLen, - CodeBookT, + codebook_type, DataT, IndexT, DistanceT><<<1, 1, 0, stream>>>(result.dev_ptr, @@ -448,30 +374,17 @@ template -auto vpq_dataset_descriptor_init_runtime(const vpq_dataset& dataset, - rmm::cuda_stream_view stream) + typename DatasetT> +auto vpq_dataset_descriptor_init_runtime(const DatasetT& dataset, rmm::cuda_stream_view stream) { if (dataset.pq_bits() == 8) { if (dataset.pq_len() == 2) { - return vpq_dataset_descriptor_init(dataset, stream); + return vpq_dataset_descriptor_init( + dataset, stream); } else if (dataset.pq_len() == 4) { - return vpq_dataset_descriptor_init(dataset, stream); + return vpq_dataset_descriptor_init( + dataset, stream); } else { RAFT_FAIL("Subspace dimension must be 2 or 4"); } @@ -480,26 +393,27 @@ auto vpq_dataset_descriptor_init_runtime(const vpq_dataset& d } } -template -auto dataset_descriptor_init(const vpq_dataset& dataset, - rmm::cuda_stream_view stream) - -> dataset_descriptor_host +template +using enable_vpq = std::enable_if_t, ReturnT>; + +template +auto dataset_descriptor_init(const DatasetT& dataset, rmm::cuda_stream_view stream) + -> enable_vpq> { - constexpr int64_t max_dataset_block_dim = 512; - int64_t dataset_block_dim = 128; + constexpr int64_t max_dataset_block_dim = 256; + int64_t dataset_block_dim = 64; while (dataset_block_dim < dataset.dim() && dataset_block_dim < max_dataset_block_dim) { dataset_block_dim *= 2; } switch (dataset_block_dim) { + case 64: + return vpq_dataset_descriptor_init_runtime<8, 64, DataT, IndexT, DistanceT>(dataset, stream); case 128: - return vpq_dataset_descriptor_init_runtime<8, 128, DataT, IndexT, DistanceT, DatasetIdxT>( - dataset, stream); - case 256: - return vpq_dataset_descriptor_init_runtime<16, 256, DataT, IndexT, DistanceT, DatasetIdxT>( - dataset, 
stream); + return vpq_dataset_descriptor_init_runtime<16, 128, DataT, IndexT, DistanceT>(dataset, + stream); default: - return vpq_dataset_descriptor_init_runtime<32, 512, DataT, IndexT, DistanceT, DatasetIdxT>( - dataset, stream); + return vpq_dataset_descriptor_init_runtime<32, 256, DataT, IndexT, DistanceT>(dataset, + stream); } } diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim256_t16_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu index 5d2359cd3..e44fd003e 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim256_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, float, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim256_t16_8pq_4subd_half.cu rename to 
cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu index a35f988b6..ea084df2d 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim256_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, float, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim512_t32_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu index 04489fae2..79c290b06 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim512_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<32, 512, 8, 
2, half, float, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim512_t32_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu index 862c6ef5b..946865a8b 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim512_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, float, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu new file mode 100644 index 000000000..8decfb7f5 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu new file mode 100644 index 000000000..d211eb358 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim256_t16_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu index 71417b4b1..194a146bc 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim256_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, float, uint64_t, float>; +template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim256_t16_8pq_4subd_half.cu 
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu index cb9de4e7a..7f24f6913 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim256_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, float, uint64_t, float>; +template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim512_t32_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu index 007dc7a5a..354054ba5 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim512_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct 
cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, float, uint64_t, float>; +template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim512_t32_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu index 8b03a6188..423fbc7a7 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim512_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, float, uint64_t, float>; +template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu new file mode 100644 index 000000000..9c0d55356 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu new file mode 100644 index 000000000..e83addd88 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim256_t16_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu index b71e6f4d2..b50d9cdb1 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim256_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, half, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim256_t16_8pq_4subd_half.cu rename to 
cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu index d459f0807..944773c21 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim256_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, half, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim512_t32_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu index 6263bd775..266a1e09a 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim512_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, 
half, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim512_t32_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu index fc5af809f..7257b25a9 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim512_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, half, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim1024_t32_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu index 9ec47cb7c..7a59fc1cc 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim1024_t32_8pq_2subd_half.cu +++ 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, float, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim1024_t32_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu index 7f9f2d0c3..037dfd891 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim1024_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, float, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git 
a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim256_t16_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu index 9d25412d3..1865abc69 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim256_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, half, uint64_t, float>; +template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim256_t16_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu index b2379526f..6f7d226c4 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim256_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file 
is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, half, uint64_t, float>; +template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim512_t32_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu index aa5e5147d..e0afc8f13 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim512_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, half, uint64_t, float>; +template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu similarity index 82% rename from 
cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim512_t32_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu index 89aa65c78..bdc0e53aa 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim512_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, half, uint64_t, float>; +template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim128_t8_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu index d04b7fcbf..7d44b06e0 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim128_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace 
cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, float, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim128_t8_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu index 4f8148cdc..d694a2c09 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint32_dim128_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, float, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim256_t16_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu index 227192197..9555c3107 100644 --- 
a/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim256_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, int8_t, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim256_t16_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu index f87036bb2..76c2d53b7 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim256_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, int8_t, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, int8_t, uint32_t, float>; } // 
namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim512_t32_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu index e581bad72..6d659133e 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim512_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, int8_t, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim512_t32_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu index 02b621192..702fdcea5 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim512_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is 
generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, int8_t, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu new file mode 100644 index 000000000..c0602c6e5 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, int8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu new file mode 100644 index 000000000..02f8da86c --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, int8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim256_t16_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu index b51b42159..bbf543112 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim256_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, uint8_t, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim256_t16_8pq_4subd_half.cu 
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu index ac86a9489..3babbf47a 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim256_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, uint8_t, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim512_t32_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu index 414e21ece..a97337798 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim512_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct 
cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, uint8_t, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim512_t32_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu index 7b98c37b3..3e63ccca0 100644 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim512_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu @@ -15,11 +15,11 @@ */ /* - * NOTE: this file is generated by distance_core_00_generate.py + * NOTE: this file is generated by compute_distance_00_generate.py * * Make changes there and run in this directory: * - * > python distance_core_00_generate.py + * > python compute_distance_00_generate.py * */ @@ -27,6 +27,6 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, uint8_t, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu new file mode 100644 index 000000000..c8357c84e --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, uint8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu new file mode 100644 index 000000000..885127632 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, uint8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp index 4e6b808f2..1b213bc6d 100644 --- a/cpp/src/neighbors/detail/cagra/device_common.hpp +++ b/cpp/src/neighbors/detail/cagra/device_common.hpp @@ -57,7 +57,7 @@ _RAFT_HOST_DEVICE inline uint64_t xorshift64(uint64_t u) } template -_RAFT_DEVICE inline T swizzling(T x) +RAFT_DEVICE_INLINE_FUNCTION constexpr T swizzling(T x) { // Address swizzling reduces bank conflicts in shared memory, but increases // the amount of operation instead. diff --git a/cpp/src/neighbors/detail/cagra/distance_core-ext.cuh b/cpp/src/neighbors/detail/cagra/distance_core-ext.cuh deleted file mode 100644 index 2dec5aa10..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core-ext.cuh +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail { - -extern template struct standard_dataset_descriptor_t<8, 128, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, float, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<16, 256, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, float, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 512, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, float, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 1024, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, float, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<8, 128, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, half, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<16, 256, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, half, uint32_t, float>; -extern template struct 
standard_dataset_descriptor_t<32, 512, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, half, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 1024, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, half, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<8, 128, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, int8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<16, 256, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, int8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 512, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, int8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 1024, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, int8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<8, 128, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, uint8_t, uint32_t, float>; -extern 
template struct standard_dataset_descriptor_t<16, 256, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, uint8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 512, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, uint8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 1024, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, uint8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<8, 128, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, float, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<16, 256, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, float, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<32, 512, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, float, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<32, 1024, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, float, 
uint64_t, float>; -extern template struct standard_dataset_descriptor_t<8, 128, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, half, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<16, 256, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, half, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<32, 512, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, half, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<32, 1024, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, half, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core-impl.cuh b/cpp/src/neighbors/detail/cagra/distance_core-impl.cuh deleted file mode 100644 index 4e72daf63..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core-impl.cuh +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail { - -// template -// __launch_bounds__(1, 1) __global__ void standard_dataset_descriptor_init_kernel( -// dataset_descriptor_base_t* out, -// const DataT* ptr, -// IndexT size, -// uint32_t dim, -// size_t ld) -// { -// new (out) standard_dataset_descriptor_t( -// ptr, size, dim, ld); -// (void)out->set_smem_ws(out); -// } - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim1024_t32.cu deleted file mode 100644 index 47c31adfc..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_float_uint64_dim1024_t32.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<32, 1024, float, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim1024_t32.cu deleted file mode 100644 index 6d28a341e..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim1024_t32.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<32, 1024, half, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim512_t32.cu deleted file mode 100644 index f051409bb..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_half_uint64_dim512_t32.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<32, 512, half, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim128_t8.cu deleted file mode 100644 index 29c4f30bc..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_int8_uint32_dim128_t8.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<8, 128, int8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim1024_t32.cu deleted file mode 100644 index fe9de9690..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim1024_t32.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<32, 1024, uint8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim128_t8.cu deleted file mode 100644 index d664cdc64..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim128_t8.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<8, 128, uint8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim512_t32.cu deleted file mode 100644 index a14a6cfb9..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_uint8_uint32_dim512_t32.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance.hpp" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<32, 512, uint8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index cdbac6a11..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, float, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index 0bb71833a..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, float, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index 3ac72c549..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, float, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index 086219443..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_float_uint64_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, float, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index 74436b559..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, half, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index 8bcae9232..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, half, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index 971ac1e2e..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, half, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index 3c8eb14bb..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint32_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, half, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index 38ef65dac..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, half, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index 9cae97b0d..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, half, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index 91859bf6a..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, half, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index ed6435244..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_half_uint64_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, half, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index a9132d69c..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, int8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index 9c2e1b798..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, int8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index 422145374..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, int8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index 515119b54..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_int8_uint32_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, int8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim1024_t32_8pq_2subd_half.cu deleted file mode 100644 index e51d80aaa..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim1024_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 2, half, uint8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim1024_t32_8pq_4subd_half.cu deleted file mode 100644 index c6975e620..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim1024_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<32, 1024, 8, 4, half, uint8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index c55f75af0..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, uint8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index 2bf8ab622..000000000 --- a/cpp/src/neighbors/detail/cagra/distance_core_vpq_uint8_uint32_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by distance_core_00_generate.py - * - * Make changes there and run in this directory: - * - * > python distance_core_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, uint8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh index 7f4e1f316..9a855a268 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh @@ -16,7 +16,6 @@ #pragma once #include "compute_distance.hpp" -#include "compute_distance_vpq.cuh" #include "device_common.hpp" #include "hashmap.hpp" #include "search_plan.cuh" From 385a8c4ab35dafbc1646597b0dcc8e69d1242075 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 21 Aug 2024 18:51:22 +0200 Subject: [PATCH 08/41] Make the compute_distance instances controlled from a single place --- .../neighbors/detail/cagra/cagra_search.cuh | 6 +- .../detail/cagra/compute_distance-ext.cuh | 211 ++++++++++++++++++ .../detail/cagra/compute_distance-ext.hpp | 86 ------- .../detail/cagra/compute_distance.hpp | 77 +++++++ .../cagra/compute_distance_00_generate.py | 56 ++++- .../cagra/compute_distance_standard.cuh | 151 +++---------- ...stance_standard_float_uint32_dim128_t16.cu | 1 + ...stance_standard_float_uint32_dim256_t32.cu | 1 + ...distance_standard_float_uint32_dim64_t8.cu | 1 + ...stance_standard_float_uint64_dim128_t16.cu | 1 + ...stance_standard_float_uint64_dim256_t32.cu | 1 + ...distance_standard_float_uint64_dim64_t8.cu | 1 + ...istance_standard_half_uint32_dim128_t16.cu | 1 + ...istance_standard_half_uint32_dim256_t32.cu | 1 + ..._distance_standard_half_uint32_dim64_t8.cu | 1 + ...istance_standard_half_uint64_dim128_t16.cu | 1 + ...istance_standard_half_uint64_dim256_t32.cu | 1 
+ ..._distance_standard_half_uint64_dim64_t8.cu | 1 + ...istance_standard_int8_uint32_dim128_t16.cu | 1 + ...istance_standard_int8_uint32_dim256_t32.cu | 1 + ..._distance_standard_int8_uint32_dim64_t8.cu | 1 + ...stance_standard_uint8_uint32_dim128_t16.cu | 1 + ...stance_standard_uint8_uint32_dim256_t32.cu | 1 + ...distance_standard_uint8_uint32_dim64_t8.cu | 1 + .../detail/cagra/compute_distance_vpq.cuh | 161 +++++++------ ..._float_uint32_dim128_t16_8pq_2subd_half.cu | 1 + ..._float_uint32_dim128_t16_8pq_4subd_half.cu | 1 + ..._float_uint32_dim256_t32_8pq_2subd_half.cu | 1 + ..._float_uint32_dim256_t32_8pq_4subd_half.cu | 1 + ...pq_float_uint32_dim64_t8_8pq_2subd_half.cu | 1 + ...pq_float_uint32_dim64_t8_8pq_4subd_half.cu | 1 + ..._float_uint64_dim128_t16_8pq_2subd_half.cu | 1 + ..._float_uint64_dim128_t16_8pq_4subd_half.cu | 1 + ..._float_uint64_dim256_t32_8pq_2subd_half.cu | 1 + ..._float_uint64_dim256_t32_8pq_4subd_half.cu | 1 + ...pq_float_uint64_dim64_t8_8pq_2subd_half.cu | 1 + ...pq_float_uint64_dim64_t8_8pq_4subd_half.cu | 1 + ...q_half_uint32_dim128_t16_8pq_2subd_half.cu | 1 + ...q_half_uint32_dim128_t16_8pq_4subd_half.cu | 1 + ...q_half_uint32_dim256_t32_8pq_2subd_half.cu | 1 + ...q_half_uint32_dim256_t32_8pq_4subd_half.cu | 1 + ...vpq_half_uint32_dim64_t8_8pq_2subd_half.cu | 1 + ...vpq_half_uint32_dim64_t8_8pq_4subd_half.cu | 1 + ...q_half_uint64_dim128_t16_8pq_2subd_half.cu | 1 + ...q_half_uint64_dim128_t16_8pq_4subd_half.cu | 1 + ...q_half_uint64_dim256_t32_8pq_2subd_half.cu | 1 + ...q_half_uint64_dim256_t32_8pq_4subd_half.cu | 1 + ...vpq_half_uint64_dim64_t8_8pq_2subd_half.cu | 1 + ...vpq_half_uint64_dim64_t8_8pq_4subd_half.cu | 1 + ...q_int8_uint32_dim128_t16_8pq_2subd_half.cu | 1 + ...q_int8_uint32_dim128_t16_8pq_4subd_half.cu | 1 + ...q_int8_uint32_dim256_t32_8pq_2subd_half.cu | 1 + ...q_int8_uint32_dim256_t32_8pq_4subd_half.cu | 1 + ...vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu | 1 + ...vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu | 1 + 
..._uint8_uint32_dim128_t16_8pq_2subd_half.cu | 1 + ..._uint8_uint32_dim128_t16_8pq_4subd_half.cu | 1 + ..._uint8_uint32_dim256_t32_8pq_2subd_half.cu | 1 + ..._uint8_uint32_dim256_t32_8pq_4subd_half.cu | 1 + ...pq_uint8_uint32_dim64_t8_8pq_2subd_half.cu | 1 + ...pq_uint8_uint32_dim64_t8_8pq_4subd_half.cu | 1 + .../detail/cagra/search_multi_cta.cuh | 2 +- .../cagra/search_multi_cta_00_generate.py | 2 - .../cagra/search_multi_cta_float_uint32.cu | 2 - .../cagra/search_multi_cta_float_uint64.cu | 2 - .../cagra/search_multi_cta_half_uint32.cu | 2 - .../cagra/search_multi_cta_half_uint64.cu | 2 - .../cagra/search_multi_cta_int8_uint32.cu | 2 - .../cagra/search_multi_cta_kernel-inl.cuh | 2 +- .../detail/cagra/search_multi_cta_kernel.cuh | 2 +- .../cagra/search_multi_cta_uint8_uint32.cu | 2 - .../detail/cagra/search_multi_kernel.cuh | 2 +- .../neighbors/detail/cagra/search_plan.cuh | 2 +- .../detail/cagra/search_single_cta.cuh | 2 +- .../cagra/search_single_cta_00_generate.py | 2 - .../cagra/search_single_cta_float_uint32.cu | 2 - .../cagra/search_single_cta_float_uint64.cu | 2 - .../cagra/search_single_cta_half_uint32.cu | 2 - .../cagra/search_single_cta_half_uint64.cu | 2 - .../cagra/search_single_cta_int8_uint32.cu | 2 - .../cagra/search_single_cta_kernel-inl.cuh | 2 +- .../detail/cagra/search_single_cta_kernel.cuh | 2 +- .../cagra/search_single_cta_uint8_uint32.cu | 2 - 83 files changed, 511 insertions(+), 335 deletions(-) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance-ext.hpp diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index a47916cb3..0a576e849 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh @@ -16,7 +16,7 @@ #pragma once -#include "compute_distance-ext.hpp" +#include "compute_distance-ext.cuh" #include "factory.cuh" #include 
"search_plan.cuh" #include "search_single_cta_inst.cuh" @@ -195,7 +195,7 @@ void search_main(raft::resources const& res, if (auto* strided_dset = dynamic_cast*>(&index.data()); strided_dset != nullptr) { // Search using a plain (strided) row-major dataset - auto desc = dataset_descriptor_init(*strided_dset, stream); + auto desc = dataset_descriptor_init(params, *strided_dset, stream); search_main_core(res, params, desc, @@ -211,7 +211,7 @@ void search_main(raft::resources const& res, RAFT_FAIL("FP32 VPQ dataset support is coming soon"); } else if (auto* vpq_dset = dynamic_cast*>(&index.data()); vpq_dset != nullptr) { - auto desc = dataset_descriptor_init(*vpq_dset, stream); + auto desc = dataset_descriptor_init(params, *vpq_dset, stream); search_main_core(res, params, desc, diff --git a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh new file mode 100644 index 000000000..2b972b30f --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#pragma once + +#include "compute_distance_standard.cuh" +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +extern template struct standard_dataset_descriptor_t<8, 64, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<16, 128, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 256, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<8, 64, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<16, 128, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 256, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 
4, half, half, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<8, 64, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, int8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<16, 128, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, int8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 256, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, int8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<8, 64, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, uint8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<16, 128, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, uint8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 256, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, uint8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<8, 64, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint64_t, float>; +extern template struct 
cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<16, 128, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<32, 256, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<8, 64, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<16, 128, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<32, 256, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint64_t, float>; +extern template struct standard_descriptor_spec<8, 64, float, uint32_t, float>; +extern template struct vpq_descriptor_spec<8, 64, 8, 2, half, float, uint32_t, float>; +extern template struct vpq_descriptor_spec<8, 64, 8, 4, half, float, uint32_t, float>; +extern template struct standard_descriptor_spec<16, 128, float, uint32_t, float>; +extern template struct vpq_descriptor_spec<16, 128, 8, 2, half, float, uint32_t, float>; +extern template struct vpq_descriptor_spec<16, 128, 8, 4, half, float, uint32_t, 
float>; +extern template struct standard_descriptor_spec<32, 256, float, uint32_t, float>; +extern template struct vpq_descriptor_spec<32, 256, 8, 2, half, float, uint32_t, float>; +extern template struct vpq_descriptor_spec<32, 256, 8, 4, half, float, uint32_t, float>; +extern template struct standard_descriptor_spec<8, 64, half, uint32_t, float>; +extern template struct vpq_descriptor_spec<8, 64, 8, 2, half, half, uint32_t, float>; +extern template struct vpq_descriptor_spec<8, 64, 8, 4, half, half, uint32_t, float>; +extern template struct standard_descriptor_spec<16, 128, half, uint32_t, float>; +extern template struct vpq_descriptor_spec<16, 128, 8, 2, half, half, uint32_t, float>; +extern template struct vpq_descriptor_spec<16, 128, 8, 4, half, half, uint32_t, float>; +extern template struct standard_descriptor_spec<32, 256, half, uint32_t, float>; +extern template struct vpq_descriptor_spec<32, 256, 8, 2, half, half, uint32_t, float>; +extern template struct vpq_descriptor_spec<32, 256, 8, 4, half, half, uint32_t, float>; +extern template struct standard_descriptor_spec<8, 64, int8_t, uint32_t, float>; +extern template struct vpq_descriptor_spec<8, 64, 8, 2, half, int8_t, uint32_t, float>; +extern template struct vpq_descriptor_spec<8, 64, 8, 4, half, int8_t, uint32_t, float>; +extern template struct standard_descriptor_spec<16, 128, int8_t, uint32_t, float>; +extern template struct vpq_descriptor_spec<16, 128, 8, 2, half, int8_t, uint32_t, float>; +extern template struct vpq_descriptor_spec<16, 128, 8, 4, half, int8_t, uint32_t, float>; +extern template struct standard_descriptor_spec<32, 256, int8_t, uint32_t, float>; +extern template struct vpq_descriptor_spec<32, 256, 8, 2, half, int8_t, uint32_t, float>; +extern template struct vpq_descriptor_spec<32, 256, 8, 4, half, int8_t, uint32_t, float>; +extern template struct standard_descriptor_spec<8, 64, uint8_t, uint32_t, float>; +extern template struct vpq_descriptor_spec<8, 64, 8, 2, half, uint8_t, 
uint32_t, float>; +extern template struct vpq_descriptor_spec<8, 64, 8, 4, half, uint8_t, uint32_t, float>; +extern template struct standard_descriptor_spec<16, 128, uint8_t, uint32_t, float>; +extern template struct vpq_descriptor_spec<16, 128, 8, 2, half, uint8_t, uint32_t, float>; +extern template struct vpq_descriptor_spec<16, 128, 8, 4, half, uint8_t, uint32_t, float>; +extern template struct standard_descriptor_spec<32, 256, uint8_t, uint32_t, float>; +extern template struct vpq_descriptor_spec<32, 256, 8, 2, half, uint8_t, uint32_t, float>; +extern template struct vpq_descriptor_spec<32, 256, 8, 4, half, uint8_t, uint32_t, float>; +extern template struct standard_descriptor_spec<8, 64, float, uint64_t, float>; +extern template struct vpq_descriptor_spec<8, 64, 8, 2, half, float, uint64_t, float>; +extern template struct vpq_descriptor_spec<8, 64, 8, 4, half, float, uint64_t, float>; +extern template struct standard_descriptor_spec<16, 128, float, uint64_t, float>; +extern template struct vpq_descriptor_spec<16, 128, 8, 2, half, float, uint64_t, float>; +extern template struct vpq_descriptor_spec<16, 128, 8, 4, half, float, uint64_t, float>; +extern template struct standard_descriptor_spec<32, 256, float, uint64_t, float>; +extern template struct vpq_descriptor_spec<32, 256, 8, 2, half, float, uint64_t, float>; +extern template struct vpq_descriptor_spec<32, 256, 8, 4, half, float, uint64_t, float>; +extern template struct standard_descriptor_spec<8, 64, half, uint64_t, float>; +extern template struct vpq_descriptor_spec<8, 64, 8, 2, half, half, uint64_t, float>; +extern template struct vpq_descriptor_spec<8, 64, 8, 4, half, half, uint64_t, float>; +extern template struct standard_descriptor_spec<16, 128, half, uint64_t, float>; +extern template struct vpq_descriptor_spec<16, 128, 8, 2, half, half, uint64_t, float>; +extern template struct vpq_descriptor_spec<16, 128, 8, 4, half, half, uint64_t, float>; +extern template struct standard_descriptor_spec<32, 
256, half, uint64_t, float>; +extern template struct vpq_descriptor_spec<32, 256, 8, 2, half, half, uint64_t, float>; +extern template struct vpq_descriptor_spec<32, 256, 8, 4, half, half, uint64_t, float>; + +using descriptor_instances = + instance_selector, + vpq_descriptor_spec<8, 64, 8, 2, half, float, uint32_t, float>, + vpq_descriptor_spec<8, 64, 8, 4, half, float, uint32_t, float>, + standard_descriptor_spec<16, 128, float, uint32_t, float>, + vpq_descriptor_spec<16, 128, 8, 2, half, float, uint32_t, float>, + vpq_descriptor_spec<16, 128, 8, 4, half, float, uint32_t, float>, + standard_descriptor_spec<32, 256, float, uint32_t, float>, + vpq_descriptor_spec<32, 256, 8, 2, half, float, uint32_t, float>, + vpq_descriptor_spec<32, 256, 8, 4, half, float, uint32_t, float>, + standard_descriptor_spec<8, 64, half, uint32_t, float>, + vpq_descriptor_spec<8, 64, 8, 2, half, half, uint32_t, float>, + vpq_descriptor_spec<8, 64, 8, 4, half, half, uint32_t, float>, + standard_descriptor_spec<16, 128, half, uint32_t, float>, + vpq_descriptor_spec<16, 128, 8, 2, half, half, uint32_t, float>, + vpq_descriptor_spec<16, 128, 8, 4, half, half, uint32_t, float>, + standard_descriptor_spec<32, 256, half, uint32_t, float>, + vpq_descriptor_spec<32, 256, 8, 2, half, half, uint32_t, float>, + vpq_descriptor_spec<32, 256, 8, 4, half, half, uint32_t, float>, + standard_descriptor_spec<8, 64, int8_t, uint32_t, float>, + vpq_descriptor_spec<8, 64, 8, 2, half, int8_t, uint32_t, float>, + vpq_descriptor_spec<8, 64, 8, 4, half, int8_t, uint32_t, float>, + standard_descriptor_spec<16, 128, int8_t, uint32_t, float>, + vpq_descriptor_spec<16, 128, 8, 2, half, int8_t, uint32_t, float>, + vpq_descriptor_spec<16, 128, 8, 4, half, int8_t, uint32_t, float>, + standard_descriptor_spec<32, 256, int8_t, uint32_t, float>, + vpq_descriptor_spec<32, 256, 8, 2, half, int8_t, uint32_t, float>, + vpq_descriptor_spec<32, 256, 8, 4, half, int8_t, uint32_t, float>, + standard_descriptor_spec<8, 64, uint8_t, 
uint32_t, float>, + vpq_descriptor_spec<8, 64, 8, 2, half, uint8_t, uint32_t, float>, + vpq_descriptor_spec<8, 64, 8, 4, half, uint8_t, uint32_t, float>, + standard_descriptor_spec<16, 128, uint8_t, uint32_t, float>, + vpq_descriptor_spec<16, 128, 8, 2, half, uint8_t, uint32_t, float>, + vpq_descriptor_spec<16, 128, 8, 4, half, uint8_t, uint32_t, float>, + standard_descriptor_spec<32, 256, uint8_t, uint32_t, float>, + vpq_descriptor_spec<32, 256, 8, 2, half, uint8_t, uint32_t, float>, + vpq_descriptor_spec<32, 256, 8, 4, half, uint8_t, uint32_t, float>, + standard_descriptor_spec<8, 64, float, uint64_t, float>, + vpq_descriptor_spec<8, 64, 8, 2, half, float, uint64_t, float>, + vpq_descriptor_spec<8, 64, 8, 4, half, float, uint64_t, float>, + standard_descriptor_spec<16, 128, float, uint64_t, float>, + vpq_descriptor_spec<16, 128, 8, 2, half, float, uint64_t, float>, + vpq_descriptor_spec<16, 128, 8, 4, half, float, uint64_t, float>, + standard_descriptor_spec<32, 256, float, uint64_t, float>, + vpq_descriptor_spec<32, 256, 8, 2, half, float, uint64_t, float>, + vpq_descriptor_spec<32, 256, 8, 4, half, float, uint64_t, float>, + standard_descriptor_spec<8, 64, half, uint64_t, float>, + vpq_descriptor_spec<8, 64, 8, 2, half, half, uint64_t, float>, + vpq_descriptor_spec<8, 64, 8, 4, half, half, uint64_t, float>, + standard_descriptor_spec<16, 128, half, uint64_t, float>, + vpq_descriptor_spec<16, 128, 8, 2, half, half, uint64_t, float>, + vpq_descriptor_spec<16, 128, 8, 4, half, half, uint64_t, float>, + standard_descriptor_spec<32, 256, half, uint64_t, float>, + vpq_descriptor_spec<32, 256, 8, 2, half, half, uint64_t, float>, + vpq_descriptor_spec<32, 256, 8, 4, half, half, uint64_t, float>>; + +template +auto dataset_descriptor_init(const cagra::search_params& params, + const DatasetT& dataset, + rmm::cuda_stream_view stream) + -> dataset_descriptor_host +{ + auto [init, priority] = descriptor_instances::select(params, dataset); + if (init == nullptr || priority < 
0) { + RAFT_FAIL("No dataset descriptor instance compiled for this parameter combination."); + } + return init(params, dataset, stream); +} + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance-ext.hpp b/cpp/src/neighbors/detail/cagra/compute_distance-ext.hpp deleted file mode 100644 index 18e445333..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance-ext.hpp +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard.cuh" -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -extern template struct standard_dataset_descriptor_t<8, 64, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<16, 128, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 256, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<8, 64, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<16, 128, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 256, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, 
uint32_t, float>; -extern template struct standard_dataset_descriptor_t<8, 64, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, int8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<16, 128, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, int8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 256, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, int8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<8, 64, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, uint8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<16, 128, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, uint8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 256, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, uint8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<8, 64, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, 
half, float, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<16, 128, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<32, 256, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<8, 64, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<16, 128, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<32, 256, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index 542a3eb7b..e9948ec98 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -20,6 +20,7 @@ #include "utils.hpp" #include +#include #include #include #include @@ -152,4 +153,80 @@ struct dataset_descriptor_host { rmm::cuda_stream_view stream_; }; +template +using init_desc_type = dataset_descriptor_host (*)( 
+ const cagra::search_params&, const DatasetT&, rmm::cuda_stream_view); + +template +struct instance_spec { + using data_type = DataT; + using index_type = IndexT; + using distance_type = DistanceT; + using host_type = dataset_descriptor_host; + /** Use this to constrain the input dataset type. */ + template + constexpr static inline bool accepts_dataset() + { + return false; + } +}; + +template +constexpr bool spec_sound = std::is_same_v && + std::is_same_v && + std::is_same_v && + InstanceSpec::template accepts_dataset(); + +template +constexpr auto spec_match(const cagra::search_params& params, const DatasetT& dataset) + -> std::tuple, double> +{ + if constexpr (spec_sound) { + return std::make_tuple(InstanceSpec::template init, + InstanceSpec::template priority(params, dataset)); + } + return std::make_tuple(nullptr, -1.0); +} + +template +struct instance_selector { + template + static auto select(const cagra::search_params&, const DatasetT&) + -> std::tuple, double> + { + return std::make_tuple(nullptr, -1.0); + } +}; + +template +struct instance_selector { + template + static auto select(const cagra::search_params& params, const DatasetT& dataset) + -> std::enable_if_t, + std::tuple, double>> + { + auto s0 = spec_match(params, dataset); + auto ss = instance_selector::template select( + params, dataset); + return std::get(s0) >= std::get(ss) ? 
s0 : ss; + } + + template + static auto select(const cagra::search_params& params, const DatasetT& dataset) + -> std::enable_if_t, + std::tuple, double>> + { + return instance_selector::template select( + params, dataset); + } +}; + } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py index 53c8f7f28..927af8da8 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py +++ b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py @@ -43,7 +43,7 @@ namespace cuvs::neighbors::cagra::detail {{ -{instances} +{content} }} // namespace cuvs::neighbors::cagra::detail """ @@ -71,7 +71,8 @@ half_uint64=("half", "uint64_t", "float"), ) -compute_distance_ext = [] +specs = [] +descs = [] cmake_list = [] @@ -89,10 +90,16 @@ # CAGRA path = f"compute_distance_standard_{type_path}_dim{mxdim}_t{team}.cu" includes = '#include "compute_distance_standard.cuh"' - decl = f"template struct standard_dataset_descriptor_t<{team}, {mxdim}, {data_t}, {idx_t}, {distance_t}>;" - compute_distance_ext.append(f"extern {decl}") + spec = f"standard_descriptor_spec<{team}, {mxdim}, {data_t}, {idx_t}, {distance_t}>" + desc = f"standard_dataset_descriptor_t<{team}, {mxdim}, {data_t}, {idx_t}, {distance_t}>" + content = f""" +template struct {desc}; +template struct {spec}; +""" + descs.append(desc) + specs.append(spec) with open(path, "w") as f: - f.write(template.format(includes=includes, instances=decl)); + f.write(template.format(includes=includes, content=content)) cmake_list.append(f" src/neighbors/detail/cagra/{path}") # CAGRA-Q @@ -100,19 +107,48 @@ for pq_len in pq_lens: for pq_bit in pq_bits: path = f"compute_distance_vpq_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{pq_len}subd_{code_book_t}.cu" - decl = f"template struct cagra_q_dataset_descriptor_t<{team}, {mxdim}, {pq_bit}, {pq_len}, {code_book_t}, {data_t}, {idx_t}, {distance_t}>;" 
includes = '#include "compute_distance_vpq.cuh"' - compute_distance_ext.append(f"extern {decl}") + spec = f"vpq_descriptor_spec<{team}, {mxdim}, {pq_bit}, {pq_len}, {code_book_t}, {data_t}, {idx_t}, {distance_t}>" + desc = f"cagra_q_dataset_descriptor_t<{team}, {mxdim}, {pq_bit}, {pq_len}, {code_book_t}, {data_t}, {idx_t}, {distance_t}>" + content = f""" +template struct {desc}; +template struct {spec}; +""" + descs.append(desc) + specs.append(spec) with open(path, "w") as f: - f.write(template.format(includes=includes, instances=decl)); + f.write(template.format(includes=includes, content=content)) cmake_list.append(f" src/neighbors/detail/cagra/{path}") -with open("compute_distance-ext.hpp", "w") as f: +with open("compute_distance-ext.cuh", "w") as f: includes = ''' +#pragma once + #include "compute_distance_standard.cuh" #include "compute_distance_vpq.cuh" ''' - f.write(template.format(includes=includes, instances="\n".join(compute_distance_ext))) + newline = "\n" + contents = f''' +{newline.join(map(lambda s: "extern template struct " + s + ";", descs))} +{newline.join(map(lambda s: "extern template struct " + s + ";", specs))} + +using descriptor_instances = + instance_selector<{("," + newline + " ").join(specs)}>; + +template +auto dataset_descriptor_init(const cagra::search_params& params, + const DatasetT& dataset, + rmm::cuda_stream_view stream) + -> dataset_descriptor_host +{{ + auto [init, priority] = descriptor_instances::select(params, dataset); + if (init == nullptr || priority < 0) {{ + RAFT_FAIL("No dataset descriptor instance compiled for this parameter combination."); + }} + return init(params, dataset, stream); +}} +''' + f.write(template.format(includes=includes, content=contents)) cmake_list.sort() for path in cmake_list: diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh index 1760b68a2..b7b67f4f5 100644 --- 
a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh @@ -22,9 +22,6 @@ #include #include -// TODO: This shouldn't be invoking spatial/knn -#include "../ann_utils.cuh" - #include #include @@ -188,120 +185,46 @@ template -auto standard_dataset_descriptor_init(const DatasetT& dataset, rmm::cuda_stream_view stream) - -> dataset_descriptor_host -{ - standard_dataset_descriptor_t dd_host{ - dataset.view().data_handle(), IndexT(dataset.n_rows()), dataset.dim(), dataset.stride()}; - dataset_descriptor_host result{dd_host, stream, DatasetBlockDim}; - standard_dataset_descriptor_init_kernel - <<<1, 1, 0, stream>>>(result.dev_ptr, dd_host.ptr, dd_host.size, dd_host.dim, dd_host.ld); - return result; -} - -template -using enable_strided = std::enable_if_t, ReturnT>; - -template -auto dataset_descriptor_init(const DatasetT& dataset, rmm::cuda_stream_view stream) - -> enable_strided> -{ - constexpr int64_t max_dataset_block_dim = 256; - int64_t dataset_block_dim = 64; - while (dataset_block_dim < dataset.dim() && dataset_block_dim < max_dataset_block_dim) { - dataset_block_dim *= 2; - } - switch (dataset_block_dim) { - case 64: - return standard_dataset_descriptor_init<8, 64, DataT, IndexT, DistanceT>(dataset, stream); - case 128: - return standard_dataset_descriptor_init<16, 128, DataT, IndexT, DistanceT>(dataset, stream); - default: - return standard_dataset_descriptor_init<32, 256, DataT, IndexT, DistanceT>(dataset, stream); + typename DistanceT> +struct standard_descriptor_spec : public instance_spec { + using base_type = instance_spec; + using typename base_type::data_type; + using typename base_type::distance_type; + using typename base_type::host_type; + using typename base_type::index_type; + + template + constexpr static inline bool accepts_dataset() + { + return is_strided_dataset_v; } -} -// template -// struct descriptor_instance_spec { -// template -// struct standard_descriptor { -// 
template -// struct dataset_instance_spec { -// static auto init(const cagra::search_params& params, -// const DatasetT& dataset, -// rmm::cuda_stream_view stream) -// -> dataset_descriptor_host -// { -// standard_dataset_descriptor_t -// dd_host{ -// dataset.view().data_handle(), IndexT(dataset.n_rows()), dataset.dim(), -// dataset.stride()}; -// dataset_descriptor_host result{dd_host, stream, -// DatasetBlockDim}; standard_dataset_descriptor_init_kernel -// <<<1, 1, 0, stream>>>(result.dev_ptr, dd_host.ptr, dd_host.size, dd_host.dim, -// dd_host.ld); -// return result; -// } + using descriptor_type = + standard_dataset_descriptor_t; + static constexpr auto init_kernel = + standard_dataset_descriptor_init_kernel; -// static auto error(const cagra::search_params& params, -// const DatasetT& dataset, -// rmm::cuda_stream_view stream) -// -> dataset_descriptor_host -// { -// RAFT_FAIL("Invalid team_size {%u} - no kernel instance found for this value."); -// } - -// static auto priority(const cagra::search_params& params, const DatasetT& dataset) -> double -// { -// // If explicit team_size is specified and doesn't match the instance, discard it -// if (params.team_size != 0 && TeamSize != params.team_size) { return -1.0; } -// // Otherwise, favor the closest dataset dimensionality. -// return 1.0 / (0.1 + std::abs(dataset.dim() - DatasetBlockDim)); -// } -// }; -// }; -// }; - -// template -// struct instance_selector { -// template -// static auto init(const cagra::search_params& params, -// const DatasetT& dataset, -// rmm::cuda_stream_view stream) -> ReturnT = 0; -// }; - -// template -// struct instance_selector { -// template -// static auto select_worker(const cagra::search_params& params, const DatasetT& dataset) -// { -// auto p = Spec::priority(params, dataset); -// return std::make_tuple(p >= 0 ? 
&(Spec::init) : &(Spec::error), p); -// } -// } - -// template -// struct instance_selector { -// template -// static auto select_worker(const cagra::search_params& params, const DatasetT& dataset) -// { -// auto p0 = Spec::priority(params, dataset); -// auto sel = instance_selector::select_worker(params, dataset); -// return p0 > std::get(sel) ? std::make_tuple(&(Spec::init), p0) : sel; -// } -// } - -// template + template + static auto init(const cagra::search_params& params, + const DatasetT& dataset, + rmm::cuda_stream_view stream) -> host_type + { + descriptor_type dd_host{ + dataset.view().data_handle(), IndexT(dataset.n_rows()), dataset.dim(), dataset.stride()}; + host_type result{dd_host, stream, DatasetBlockDim}; + init_kernel<<<1, 1, 0, stream>>>( + result.dev_ptr, dd_host.ptr, dd_host.size, dd_host.dim, dd_host.ld); + return result; + } -// template -// auto dataset_descriptor_init(const cagra::search_params& params, -// const strided_dataset& dataset, -// rmm::cuda_stream_view stream) -// -> dataset_descriptor_host -// { -// } + template + static auto priority(const cagra::search_params& params, const DatasetT& dataset) -> double + { + // If explicit team_size is specified and doesn't match the instance, discard it + if (params.team_size != 0 && TeamSize != params.team_size) { return -1.0; } + // Otherwise, favor the closest dataset dimensionality. 
+ return 1.0 / (0.1 + std::abs(double(dataset.dim()) - double(DatasetBlockDim))); + } +}; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu index 6b0303285..887ffcc7f 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<16, 128, float, uint32_t, float>; +template struct standard_descriptor_spec<16, 128, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu index a10f67c85..3d82daf1a 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<32, 256, float, uint32_t, float>; +template struct standard_descriptor_spec<32, 256, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu index 9718aa7bc..2d7932eea 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<8, 64, float, uint32_t, float>; +template 
struct standard_descriptor_spec<8, 64, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t16.cu index 1467dad74..a7c287fcb 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t16.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<16, 128, float, uint64_t, float>; +template struct standard_descriptor_spec<16, 128, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t32.cu index c48b9cd10..e34fa5b20 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t32.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<32, 256, float, uint64_t, float>; +template struct standard_descriptor_spec<32, 256, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim64_t8.cu index 295df79ed..5bd8e8197 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim64_t8.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<8, 64, float, uint64_t, float>; +template struct 
standard_descriptor_spec<8, 64, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t16.cu index bfe636577..cf04254e7 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t16.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<16, 128, half, uint32_t, float>; +template struct standard_descriptor_spec<16, 128, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t32.cu index 45bb2a253..2d3c1bc3c 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t32.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<32, 256, half, uint32_t, float>; +template struct standard_descriptor_spec<32, 256, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu index 4ffa62df9..1924ee42d 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<8, 64, half, uint32_t, float>; +template struct standard_descriptor_spec<8, 64, half, uint32_t, 
float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t16.cu index 0bd9c3b67..9e826ac84 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t16.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<16, 128, half, uint64_t, float>; +template struct standard_descriptor_spec<16, 128, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t32.cu index 4506a783a..590927f2d 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t32.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<32, 256, half, uint64_t, float>; +template struct standard_descriptor_spec<32, 256, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu index c1d5b4746..68f065d7c 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<8, 64, half, uint64_t, float>; +template struct standard_descriptor_spec<8, 64, half, uint64_t, float>; } // namespace 
cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t16.cu index d816a76a8..1700f66db 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t16.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<16, 128, int8_t, uint32_t, float>; +template struct standard_descriptor_spec<16, 128, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t32.cu index 294717d64..a74e28be8 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t32.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<32, 256, int8_t, uint32_t, float>; +template struct standard_descriptor_spec<32, 256, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim64_t8.cu index 30041dc4b..2c5f33603 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim64_t8.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<8, 64, int8_t, uint32_t, float>; +template struct standard_descriptor_spec<8, 64, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff 
--git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t16.cu index 50fe00608..a2cb3460f 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t16.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<16, 128, uint8_t, uint32_t, float>; +template struct standard_descriptor_spec<16, 128, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t32.cu index a006b8113..e358773df 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t32.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<32, 256, uint8_t, uint32_t, float>; +template struct standard_descriptor_spec<32, 256, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim64_t8.cu index 13181abdd..0eb541853 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim64_t8.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<8, 64, uint8_t, uint32_t, float>; +template struct standard_descriptor_spec<8, 64, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git 
a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh index f799eb8b9..58730f33f 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh @@ -321,100 +321,89 @@ template -auto vpq_dataset_descriptor_init(const DatasetT& dataset, rmm::cuda_stream_view stream) - -> dataset_descriptor_host -{ - using codebook_type = typename DatasetT::math_type; - const float vq_scale = 1.0f; - const float pq_scale = 1.0f; - cagra_q_dataset_descriptor_t - dd_host{dataset.data.data_handle(), - dataset.encoded_row_length(), - dataset.pq_dim(), - dataset.vq_code_book.data_handle(), - vq_scale, - dataset.pq_code_book.data_handle(), - pq_scale, - IndexT(dataset.n_rows()), - dataset.dim()}; - dataset_descriptor_host result{dd_host, stream, DatasetBlockDim}; - vpq_dataset_descriptor_init_kernel<<<1, 1, 0, stream>>>(result.dev_ptr, - dd_host.encoded_dataset_ptr, - dd_host.encoded_dataset_dim, - dd_host.n_subspace, - dd_host.vq_code_book_ptr, - dd_host.vq_scale, - dd_host.pq_code_book_ptr, - dd_host.pq_scale, - dd_host.size, - dd_host.dim); - return result; -} + typename DistanceT> +struct vpq_descriptor_spec : public instance_spec { + using base_type = instance_spec; + using typename base_type::data_type; + using typename base_type::distance_type; + using typename base_type::host_type; + using typename base_type::index_type; -template -auto vpq_dataset_descriptor_init_runtime(const DatasetT& dataset, rmm::cuda_stream_view stream) + template + constexpr static inline auto accepts_dataset() + -> std::enable_if_t, bool> + { + return std::is_same_v; + } -{ - if (dataset.pq_bits() == 8) { - if (dataset.pq_len() == 2) { - return vpq_dataset_descriptor_init( - dataset, stream); - } else if (dataset.pq_len() == 4) { - return vpq_dataset_descriptor_init( - dataset, stream); - } else { - RAFT_FAIL("Subspace dimension must be 2 or 4"); - } - } else { - RAFT_FAIL("Only 
8-bit PQ is supported now"); + template + constexpr static inline auto accepts_dataset() + -> std::enable_if_t, bool> + { + return false; } -} -template -using enable_vpq = std::enable_if_t, ReturnT>; + using descriptor_type = cagra_q_dataset_descriptor_t; + static constexpr auto init_kernel = vpq_dataset_descriptor_init_kernel; -template -auto dataset_descriptor_init(const DatasetT& dataset, rmm::cuda_stream_view stream) - -> enable_vpq> -{ - constexpr int64_t max_dataset_block_dim = 256; - int64_t dataset_block_dim = 64; - while (dataset_block_dim < dataset.dim() && dataset_block_dim < max_dataset_block_dim) { - dataset_block_dim *= 2; + template + static auto init(const cagra::search_params& params, + const DatasetT& dataset, + rmm::cuda_stream_view stream) -> host_type + { + const float vq_scale = 1.0f; + const float pq_scale = 1.0f; + descriptor_type dd_host{dataset.data.data_handle(), + dataset.encoded_row_length(), + dataset.pq_dim(), + dataset.vq_code_book.data_handle(), + vq_scale, + dataset.pq_code_book.data_handle(), + pq_scale, + IndexT(dataset.n_rows()), + dataset.dim()}; + host_type result{dd_host, stream, DatasetBlockDim}; + init_kernel<<<1, 1, 0, stream>>>(result.dev_ptr, + dd_host.encoded_dataset_ptr, + dd_host.encoded_dataset_dim, + dd_host.n_subspace, + dd_host.vq_code_book_ptr, + dd_host.vq_scale, + dd_host.pq_code_book_ptr, + dd_host.pq_scale, + dd_host.size, + dd_host.dim); + return result; } - switch (dataset_block_dim) { - case 64: - return vpq_dataset_descriptor_init_runtime<8, 64, DataT, IndexT, DistanceT>(dataset, stream); - case 128: - return vpq_dataset_descriptor_init_runtime<16, 128, DataT, IndexT, DistanceT>(dataset, - stream); - default: - return vpq_dataset_descriptor_init_runtime<32, 256, DataT, IndexT, DistanceT>(dataset, - stream); + + template + static auto priority(const cagra::search_params& params, const DatasetT& dataset) -> double + { + // If explicit team_size is specified and doesn't match the instance, discard it + if 
(params.team_size != 0 && TeamSize != params.team_size) { return -1.0; } + // Match codebook params + if (dataset.pq_bits() != PqBits) { return -1.0; } + if (dataset.pq_len() != PqLen) { return -1.0; } + // Otherwise, favor the closest dataset dimensionality. + return 1.0 / (0.1 + std::abs(double(dataset.dim()) - double(DatasetBlockDim))); } -} +}; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu index e44fd003e..c0a861a3f 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint32_t, float>; +template struct vpq_descriptor_spec<16, 128, 8, 2, half, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu index ea084df2d..23662a14d 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint32_t, float>; +template struct vpq_descriptor_spec<16, 128, 8, 4, half, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu index 79c290b06..f07609951 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint32_t, float>; +template struct vpq_descriptor_spec<32, 256, 8, 2, half, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu index 946865a8b..1a141ce07 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint32_t, float>; +template struct vpq_descriptor_spec<32, 256, 8, 4, half, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu index 8decfb7f5..6cfc60508 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint32_t, float>; +template struct vpq_descriptor_spec<8, 64, 8, 2, half, float, uint32_t, float>; } // 
namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu index d211eb358..fd8915c7e 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint32_t, float>; +template struct vpq_descriptor_spec<8, 64, 8, 4, half, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu index 194a146bc..721dadc38 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint64_t, float>; +template struct vpq_descriptor_spec<16, 128, 8, 2, half, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu index 7f24f6913..a86f41873 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct 
cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint64_t, float>; +template struct vpq_descriptor_spec<16, 128, 8, 4, half, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu index 354054ba5..7e3f9d367 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint64_t, float>; +template struct vpq_descriptor_spec<32, 256, 8, 2, half, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu index 423fbc7a7..281306cda 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint64_t, float>; +template struct vpq_descriptor_spec<32, 256, 8, 4, half, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu index 9c0d55356..72856f713 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu +++ 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint64_t, float>; +template struct vpq_descriptor_spec<8, 64, 8, 2, half, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu index e83addd88..ed6ae9405 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint64_t, float>; +template struct vpq_descriptor_spec<8, 64, 8, 4, half, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu index b50d9cdb1..be732c8a3 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint32_t, float>; +template struct vpq_descriptor_spec<16, 128, 8, 2, half, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu index 
944773c21..647964ccc 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint32_t, float>; +template struct vpq_descriptor_spec<16, 128, 8, 4, half, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu index 266a1e09a..65d17103c 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint32_t, float>; +template struct vpq_descriptor_spec<32, 256, 8, 2, half, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu index 7257b25a9..58f098ebe 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint32_t, float>; +template struct vpq_descriptor_spec<32, 256, 8, 4, half, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git 
a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu index 7a59fc1cc..b148d86b8 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint32_t, float>; +template struct vpq_descriptor_spec<8, 64, 8, 2, half, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu index 037dfd891..52d289e14 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint32_t, float>; +template struct vpq_descriptor_spec<8, 64, 8, 4, half, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu index 1865abc69..0472729bc 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint64_t, float>; +template struct 
vpq_descriptor_spec<16, 128, 8, 2, half, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu index 6f7d226c4..6a856e26a 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint64_t, float>; +template struct vpq_descriptor_spec<16, 128, 8, 4, half, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu index e0afc8f13..ef518d065 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint64_t, float>; +template struct vpq_descriptor_spec<32, 256, 8, 2, half, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu index bdc0e53aa..89cfb217c 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu @@ -28,5 +28,6 @@ namespace 
cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint64_t, float>; +template struct vpq_descriptor_spec<32, 256, 8, 4, half, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu index 7d44b06e0..526b5b821 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint64_t, float>; +template struct vpq_descriptor_spec<8, 64, 8, 2, half, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu index d694a2c09..452895555 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint64_t, float>; +template struct vpq_descriptor_spec<8, 64, 8, 4, half, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu index 9555c3107..2fed3b534 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu +++ 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, int8_t, uint32_t, float>; +template struct vpq_descriptor_spec<16, 128, 8, 2, half, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu index 76c2d53b7..f4c723954 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, int8_t, uint32_t, float>; +template struct vpq_descriptor_spec<16, 128, 8, 4, half, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu index 6d659133e..89c75c607 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, int8_t, uint32_t, float>; +template struct vpq_descriptor_spec<32, 256, 8, 2, half, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu index 702fdcea5..133a1841b 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, int8_t, uint32_t, float>; +template struct vpq_descriptor_spec<32, 256, 8, 4, half, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu index c0602c6e5..ca3ab6bb0 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, int8_t, uint32_t, float>; +template struct vpq_descriptor_spec<8, 64, 8, 2, half, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu index 02f8da86c..1f8c37494 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, int8_t, uint32_t, float>; +template struct vpq_descriptor_spec<8, 64, 8, 4, half, int8_t, uint32_t, float>; } // namespace 
cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu index bbf543112..471a49eda 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, uint8_t, uint32_t, float>; +template struct vpq_descriptor_spec<16, 128, 8, 2, half, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu index 3babbf47a..d02f17246 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, uint8_t, uint32_t, float>; +template struct vpq_descriptor_spec<16, 128, 8, 4, half, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu index a97337798..461367fa8 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct 
cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, uint8_t, uint32_t, float>; +template struct vpq_descriptor_spec<32, 256, 8, 2, half, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu index 3e63ccca0..8563fc512 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, uint8_t, uint32_t, float>; +template struct vpq_descriptor_spec<32, 256, 8, 4, half, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu index c8357c84e..21489b293 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, uint8_t, uint32_t, float>; +template struct vpq_descriptor_spec<8, 64, 8, 2, half, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu index 885127632..1e5888c85 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu +++ 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu @@ -28,5 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, uint8_t, uint32_t, float>; +template struct vpq_descriptor_spec<8, 64, 8, 4, half, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh index 34467c916..bb9b5f647 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh @@ -16,7 +16,7 @@ #pragma once #include "bitonic.hpp" -#include "compute_distance.hpp" +#include "compute_distance-ext.cuh" #include "device_common.hpp" #include "hashmap.hpp" #include "search_multi_cta_kernel.cuh" diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py index 42d104f89..3153a3a9f 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py @@ -39,8 +39,6 @@ #include "search_multi_cta_inst.cuh" -#include "compute_distance.hpp" - namespace cuvs::neighbors::cagra::detail::multi_cta_search { """ diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu index 39064e003..fae5a9387 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu @@ -25,8 +25,6 @@ #include "search_multi_cta_inst.cuh" -#include "compute_distance.hpp" - namespace cuvs::neighbors::cagra::detail::multi_cta_search { instantiate_kernel_selection(float, uint32_t, diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64.cu index 
fc2f6ce9c..88167b843 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64.cu +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64.cu @@ -25,8 +25,6 @@ #include "search_multi_cta_inst.cuh" -#include "compute_distance.hpp" - namespace cuvs::neighbors::cagra::detail::multi_cta_search { instantiate_kernel_selection(float, uint64_t, diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu index 8a255c450..9606d510f 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu @@ -25,8 +25,6 @@ #include "search_multi_cta_inst.cuh" -#include "compute_distance.hpp" - namespace cuvs::neighbors::cagra::detail::multi_cta_search { instantiate_kernel_selection(half, uint32_t, diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64.cu index 016c8d875..dafb89cc3 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64.cu +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64.cu @@ -25,8 +25,6 @@ #include "search_multi_cta_inst.cuh" -#include "compute_distance.hpp" - namespace cuvs::neighbors::cagra::detail::multi_cta_search { instantiate_kernel_selection(half, uint64_t, diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu index 17e0c67ff..a3322c435 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu @@ -25,8 +25,6 @@ #include "search_multi_cta_inst.cuh" -#include "compute_distance.hpp" - namespace cuvs::neighbors::cagra::detail::multi_cta_search { instantiate_kernel_selection(int8_t, uint32_t, diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh 
b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh index 7cded613c..87b1c7904 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh @@ -18,7 +18,7 @@ #include "search_multi_cta_kernel.cuh" #include "bitonic.hpp" -#include "compute_distance.hpp" +#include "compute_distance-ext.cuh" #include "device_common.hpp" #include "hashmap.hpp" #include "search_plan.cuh" diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh index aa403647d..a3dc42424 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh @@ -15,7 +15,7 @@ */ #pragma once -#include "compute_distance.hpp" +#include "compute_distance-ext.cuh" #include diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu index 480f7ab45..51fc6526f 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu @@ -25,8 +25,6 @@ #include "search_multi_cta_inst.cuh" -#include "compute_distance.hpp" - namespace cuvs::neighbors::cagra::detail::multi_cta_search { instantiate_kernel_selection(uint8_t, uint32_t, diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh index 9a855a268..0247bea11 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh @@ -15,7 +15,7 @@ */ #pragma once -#include "compute_distance.hpp" +#include "compute_distance-ext.cuh" #include "device_common.hpp" #include "hashmap.hpp" #include "search_plan.cuh" diff --git a/cpp/src/neighbors/detail/cagra/search_plan.cuh b/cpp/src/neighbors/detail/cagra/search_plan.cuh index e483a33cf..293b01e4f 100644 
--- a/cpp/src/neighbors/detail/cagra/search_plan.cuh +++ b/cpp/src/neighbors/detail/cagra/search_plan.cuh @@ -18,7 +18,7 @@ #include "hashmap.hpp" -#include "compute_distance.hpp" +#include "compute_distance-ext.cuh" #include #include // #include "search_single_cta_inst.cuh" diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh index 187fe71e3..e48b03940 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh @@ -16,7 +16,7 @@ #pragma once #include "bitonic.hpp" -#include "compute_distance.hpp" +#include "compute_distance-ext.cuh" #include "device_common.hpp" #include "hashmap.hpp" #include "search_plan.cuh" diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py index b401aed1a..e37ceb1fa 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py @@ -39,8 +39,6 @@ #include "search_single_cta_inst.cuh" -#include "compute_distance.hpp" - namespace cuvs::neighbors::cagra::detail::single_cta_search { """ diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32.cu index 6cc3f1976..f8495bc01 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32.cu +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32.cu @@ -25,8 +25,6 @@ #include "search_single_cta_inst.cuh" -#include "compute_distance.hpp" - namespace cuvs::neighbors::cagra::detail::single_cta_search { instantiate_kernel_selection(float, uint32_t, diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64.cu index 0e5039733..0ef5c366f 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64.cu +++ 
b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64.cu @@ -25,8 +25,6 @@ #include "search_single_cta_inst.cuh" -#include "compute_distance.hpp" - namespace cuvs::neighbors::cagra::detail::single_cta_search { instantiate_kernel_selection(float, uint64_t, diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32.cu index 5c8dc25bd..c21e6d1f4 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32.cu +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32.cu @@ -25,8 +25,6 @@ #include "search_single_cta_inst.cuh" -#include "compute_distance.hpp" - namespace cuvs::neighbors::cagra::detail::single_cta_search { instantiate_kernel_selection(half, uint32_t, diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64.cu index 63c089850..b96ed0b22 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64.cu +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64.cu @@ -25,8 +25,6 @@ #include "search_single_cta_inst.cuh" -#include "compute_distance.hpp" - namespace cuvs::neighbors::cagra::detail::single_cta_search { instantiate_kernel_selection(half, uint64_t, diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu index c0c0e9c02..56a0d8ba9 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu @@ -25,8 +25,6 @@ #include "search_single_cta_inst.cuh" -#include "compute_distance.hpp" - namespace cuvs::neighbors::cagra::detail::single_cta_search { instantiate_kernel_selection(int8_t, uint32_t, diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index f4887a142..0b41adc40 100644 --- 
a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -18,7 +18,7 @@ #include "search_single_cta_kernel.cuh" #include "bitonic.hpp" -#include "compute_distance.hpp" +#include "compute_distance-ext.cuh" #include "device_common.hpp" #include "hashmap.hpp" #include "search_plan.cuh" diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh index d21c7f7aa..972bd6101 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh @@ -15,7 +15,7 @@ */ #pragma once -#include "compute_distance.hpp" +#include "compute_distance-ext.cuh" #include diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu index 80ea54fe5..ee6427170 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu @@ -25,8 +25,6 @@ #include "search_single_cta_inst.cuh" -#include "compute_distance.hpp" - namespace cuvs::neighbors::cagra::detail::single_cta_search { instantiate_kernel_selection(uint8_t, uint32_t, From 3f77cdac42cb55b962218f5266aabaa393675792 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 22 Aug 2024 10:58:33 +0200 Subject: [PATCH 09/41] Refactor usage of init_kernel to make sure it instantiated in the same place as compute_distance components --- cpp/CMakeLists.txt | 1 + .../detail/cagra/compute_distance-ext.cuh | 56 ++++++++++++ .../detail/cagra/compute_distance.cu | 86 +++++++++++++++++++ .../detail/cagra/compute_distance.hpp | 4 +- .../cagra/compute_distance_00_generate.py | 26 +++++- .../cagra/compute_distance_standard.cuh | 14 +-- ...stance_standard_float_uint32_dim128_t16.cu | 4 + ...stance_standard_float_uint32_dim256_t32.cu | 4 + 
...distance_standard_float_uint32_dim64_t8.cu | 4 + ...stance_standard_float_uint64_dim128_t16.cu | 4 + ...stance_standard_float_uint64_dim256_t32.cu | 4 + ...distance_standard_float_uint64_dim64_t8.cu | 4 + ...istance_standard_half_uint32_dim128_t16.cu | 4 + ...istance_standard_half_uint32_dim256_t32.cu | 4 + ..._distance_standard_half_uint32_dim64_t8.cu | 4 + ...istance_standard_half_uint64_dim128_t16.cu | 4 + ...istance_standard_half_uint64_dim256_t32.cu | 4 + ..._distance_standard_half_uint64_dim64_t8.cu | 4 + ...istance_standard_int8_uint32_dim128_t16.cu | 4 + ...istance_standard_int8_uint32_dim256_t32.cu | 4 + ..._distance_standard_int8_uint32_dim64_t8.cu | 4 + ...stance_standard_uint8_uint32_dim128_t16.cu | 4 + ...stance_standard_uint8_uint32_dim256_t32.cu | 4 + ...distance_standard_uint8_uint32_dim64_t8.cu | 4 + .../detail/cagra/compute_distance_vpq.cuh | 33 +++---- ..._float_uint32_dim128_t16_8pq_2subd_half.cu | 4 + ..._float_uint32_dim128_t16_8pq_4subd_half.cu | 4 + ..._float_uint32_dim256_t32_8pq_2subd_half.cu | 4 + ..._float_uint32_dim256_t32_8pq_4subd_half.cu | 4 + ...pq_float_uint32_dim64_t8_8pq_2subd_half.cu | 4 + ...pq_float_uint32_dim64_t8_8pq_4subd_half.cu | 4 + ..._float_uint64_dim128_t16_8pq_2subd_half.cu | 4 + ..._float_uint64_dim128_t16_8pq_4subd_half.cu | 4 + ..._float_uint64_dim256_t32_8pq_2subd_half.cu | 4 + ..._float_uint64_dim256_t32_8pq_4subd_half.cu | 4 + ...pq_float_uint64_dim64_t8_8pq_2subd_half.cu | 4 + ...pq_float_uint64_dim64_t8_8pq_4subd_half.cu | 4 + ...q_half_uint32_dim128_t16_8pq_2subd_half.cu | 4 + ...q_half_uint32_dim128_t16_8pq_4subd_half.cu | 4 + ...q_half_uint32_dim256_t32_8pq_2subd_half.cu | 4 + ...q_half_uint32_dim256_t32_8pq_4subd_half.cu | 4 + ...vpq_half_uint32_dim64_t8_8pq_2subd_half.cu | 4 + ...vpq_half_uint32_dim64_t8_8pq_4subd_half.cu | 4 + ...q_half_uint64_dim128_t16_8pq_2subd_half.cu | 4 + ...q_half_uint64_dim128_t16_8pq_4subd_half.cu | 4 + ...q_half_uint64_dim256_t32_8pq_2subd_half.cu | 4 + 
...q_half_uint64_dim256_t32_8pq_4subd_half.cu | 4 + ...vpq_half_uint64_dim64_t8_8pq_2subd_half.cu | 4 + ...vpq_half_uint64_dim64_t8_8pq_4subd_half.cu | 4 + ...q_int8_uint32_dim128_t16_8pq_2subd_half.cu | 4 + ...q_int8_uint32_dim128_t16_8pq_4subd_half.cu | 4 + ...q_int8_uint32_dim256_t32_8pq_2subd_half.cu | 4 + ...q_int8_uint32_dim256_t32_8pq_4subd_half.cu | 4 + ...vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu | 4 + ...vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu | 4 + ..._uint8_uint32_dim128_t16_8pq_2subd_half.cu | 4 + ..._uint8_uint32_dim128_t16_8pq_4subd_half.cu | 4 + ..._uint8_uint32_dim256_t32_8pq_2subd_half.cu | 4 + ..._uint8_uint32_dim256_t32_8pq_4subd_half.cu | 4 + ...pq_uint8_uint32_dim64_t8_8pq_2subd_half.cu | 4 + ...pq_uint8_uint32_dim64_t8_8pq_4subd_half.cu | 4 + .../cagra/search_single_cta_kernel-inl.cuh | 3 +- 62 files changed, 406 insertions(+), 33 deletions(-) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index dc889fbfb..65d6d789e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -204,6 +204,7 @@ add_library( src/neighbors/cagra_search_float.cu src/neighbors/cagra_search_int8.cu src/neighbors/cagra_search_uint8.cu + src/neighbors/detail/cagra/compute_distance.cu src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu diff --git a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh index 2b972b30f..52ede7aac 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh @@ -139,6 +139,62 @@ extern template struct standard_descriptor_spec<32, 256, half, uint64_t, float>; extern template struct vpq_descriptor_spec<32, 256, 8, 2, half, half, uint64_t, float>; extern 
template struct vpq_descriptor_spec<32, 256, 8, 4, half, half, uint64_t, float>; +extern template struct instance_selector< + standard_descriptor_spec<8, 64, float, uint32_t, float>, + vpq_descriptor_spec<8, 64, 8, 2, half, float, uint32_t, float>, + vpq_descriptor_spec<8, 64, 8, 4, half, float, uint32_t, float>, + standard_descriptor_spec<16, 128, float, uint32_t, float>, + vpq_descriptor_spec<16, 128, 8, 2, half, float, uint32_t, float>, + vpq_descriptor_spec<16, 128, 8, 4, half, float, uint32_t, float>, + standard_descriptor_spec<32, 256, float, uint32_t, float>, + vpq_descriptor_spec<32, 256, 8, 2, half, float, uint32_t, float>, + vpq_descriptor_spec<32, 256, 8, 4, half, float, uint32_t, float>, + standard_descriptor_spec<8, 64, half, uint32_t, float>, + vpq_descriptor_spec<8, 64, 8, 2, half, half, uint32_t, float>, + vpq_descriptor_spec<8, 64, 8, 4, half, half, uint32_t, float>, + standard_descriptor_spec<16, 128, half, uint32_t, float>, + vpq_descriptor_spec<16, 128, 8, 2, half, half, uint32_t, float>, + vpq_descriptor_spec<16, 128, 8, 4, half, half, uint32_t, float>, + standard_descriptor_spec<32, 256, half, uint32_t, float>, + vpq_descriptor_spec<32, 256, 8, 2, half, half, uint32_t, float>, + vpq_descriptor_spec<32, 256, 8, 4, half, half, uint32_t, float>, + standard_descriptor_spec<8, 64, int8_t, uint32_t, float>, + vpq_descriptor_spec<8, 64, 8, 2, half, int8_t, uint32_t, float>, + vpq_descriptor_spec<8, 64, 8, 4, half, int8_t, uint32_t, float>, + standard_descriptor_spec<16, 128, int8_t, uint32_t, float>, + vpq_descriptor_spec<16, 128, 8, 2, half, int8_t, uint32_t, float>, + vpq_descriptor_spec<16, 128, 8, 4, half, int8_t, uint32_t, float>, + standard_descriptor_spec<32, 256, int8_t, uint32_t, float>, + vpq_descriptor_spec<32, 256, 8, 2, half, int8_t, uint32_t, float>, + vpq_descriptor_spec<32, 256, 8, 4, half, int8_t, uint32_t, float>, + standard_descriptor_spec<8, 64, uint8_t, uint32_t, float>, + vpq_descriptor_spec<8, 64, 8, 2, half, uint8_t, uint32_t, 
float>, + vpq_descriptor_spec<8, 64, 8, 4, half, uint8_t, uint32_t, float>, + standard_descriptor_spec<16, 128, uint8_t, uint32_t, float>, + vpq_descriptor_spec<16, 128, 8, 2, half, uint8_t, uint32_t, float>, + vpq_descriptor_spec<16, 128, 8, 4, half, uint8_t, uint32_t, float>, + standard_descriptor_spec<32, 256, uint8_t, uint32_t, float>, + vpq_descriptor_spec<32, 256, 8, 2, half, uint8_t, uint32_t, float>, + vpq_descriptor_spec<32, 256, 8, 4, half, uint8_t, uint32_t, float>, + standard_descriptor_spec<8, 64, float, uint64_t, float>, + vpq_descriptor_spec<8, 64, 8, 2, half, float, uint64_t, float>, + vpq_descriptor_spec<8, 64, 8, 4, half, float, uint64_t, float>, + standard_descriptor_spec<16, 128, float, uint64_t, float>, + vpq_descriptor_spec<16, 128, 8, 2, half, float, uint64_t, float>, + vpq_descriptor_spec<16, 128, 8, 4, half, float, uint64_t, float>, + standard_descriptor_spec<32, 256, float, uint64_t, float>, + vpq_descriptor_spec<32, 256, 8, 2, half, float, uint64_t, float>, + vpq_descriptor_spec<32, 256, 8, 4, half, float, uint64_t, float>, + standard_descriptor_spec<8, 64, half, uint64_t, float>, + vpq_descriptor_spec<8, 64, 8, 2, half, half, uint64_t, float>, + vpq_descriptor_spec<8, 64, 8, 4, half, half, uint64_t, float>, + standard_descriptor_spec<16, 128, half, uint64_t, float>, + vpq_descriptor_spec<16, 128, 8, 2, half, half, uint64_t, float>, + vpq_descriptor_spec<16, 128, 8, 4, half, half, uint64_t, float>, + standard_descriptor_spec<32, 256, half, uint64_t, float>, + vpq_descriptor_spec<32, 256, 8, 2, half, half, uint64_t, float>, + vpq_descriptor_spec<32, 256, 8, 4, half, half, uint64_t, float>>; + using descriptor_instances = instance_selector, vpq_descriptor_spec<8, 64, 8, 2, half, float, uint32_t, float>, diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.cu b/cpp/src/neighbors/detail/cagra/compute_distance.cu new file mode 100644 index 000000000..a84fa0c97 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance.cu @@ 
-0,0 +1,86 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance-ext.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct instance_selector< + standard_descriptor_spec<8, 64, float, uint32_t, float>, + vpq_descriptor_spec<8, 64, 8, 2, half, float, uint32_t, float>, + vpq_descriptor_spec<8, 64, 8, 4, half, float, uint32_t, float>, + standard_descriptor_spec<16, 128, float, uint32_t, float>, + vpq_descriptor_spec<16, 128, 8, 2, half, float, uint32_t, float>, + vpq_descriptor_spec<16, 128, 8, 4, half, float, uint32_t, float>, + standard_descriptor_spec<32, 256, float, uint32_t, float>, + vpq_descriptor_spec<32, 256, 8, 2, half, float, uint32_t, float>, + vpq_descriptor_spec<32, 256, 8, 4, half, float, uint32_t, float>, + standard_descriptor_spec<8, 64, half, uint32_t, float>, + vpq_descriptor_spec<8, 64, 8, 2, half, half, uint32_t, float>, + vpq_descriptor_spec<8, 64, 8, 4, half, half, uint32_t, float>, + standard_descriptor_spec<16, 128, half, uint32_t, float>, + vpq_descriptor_spec<16, 128, 8, 2, half, half, uint32_t, float>, + vpq_descriptor_spec<16, 128, 8, 4, half, half, uint32_t, float>, + standard_descriptor_spec<32, 256, half, uint32_t, float>, + vpq_descriptor_spec<32, 256, 
8, 2, half, half, uint32_t, float>, + vpq_descriptor_spec<32, 256, 8, 4, half, half, uint32_t, float>, + standard_descriptor_spec<8, 64, int8_t, uint32_t, float>, + vpq_descriptor_spec<8, 64, 8, 2, half, int8_t, uint32_t, float>, + vpq_descriptor_spec<8, 64, 8, 4, half, int8_t, uint32_t, float>, + standard_descriptor_spec<16, 128, int8_t, uint32_t, float>, + vpq_descriptor_spec<16, 128, 8, 2, half, int8_t, uint32_t, float>, + vpq_descriptor_spec<16, 128, 8, 4, half, int8_t, uint32_t, float>, + standard_descriptor_spec<32, 256, int8_t, uint32_t, float>, + vpq_descriptor_spec<32, 256, 8, 2, half, int8_t, uint32_t, float>, + vpq_descriptor_spec<32, 256, 8, 4, half, int8_t, uint32_t, float>, + standard_descriptor_spec<8, 64, uint8_t, uint32_t, float>, + vpq_descriptor_spec<8, 64, 8, 2, half, uint8_t, uint32_t, float>, + vpq_descriptor_spec<8, 64, 8, 4, half, uint8_t, uint32_t, float>, + standard_descriptor_spec<16, 128, uint8_t, uint32_t, float>, + vpq_descriptor_spec<16, 128, 8, 2, half, uint8_t, uint32_t, float>, + vpq_descriptor_spec<16, 128, 8, 4, half, uint8_t, uint32_t, float>, + standard_descriptor_spec<32, 256, uint8_t, uint32_t, float>, + vpq_descriptor_spec<32, 256, 8, 2, half, uint8_t, uint32_t, float>, + vpq_descriptor_spec<32, 256, 8, 4, half, uint8_t, uint32_t, float>, + standard_descriptor_spec<8, 64, float, uint64_t, float>, + vpq_descriptor_spec<8, 64, 8, 2, half, float, uint64_t, float>, + vpq_descriptor_spec<8, 64, 8, 4, half, float, uint64_t, float>, + standard_descriptor_spec<16, 128, float, uint64_t, float>, + vpq_descriptor_spec<16, 128, 8, 2, half, float, uint64_t, float>, + vpq_descriptor_spec<16, 128, 8, 4, half, float, uint64_t, float>, + standard_descriptor_spec<32, 256, float, uint64_t, float>, + vpq_descriptor_spec<32, 256, 8, 2, half, float, uint64_t, float>, + vpq_descriptor_spec<32, 256, 8, 4, half, float, uint64_t, float>, + standard_descriptor_spec<8, 64, half, uint64_t, float>, + vpq_descriptor_spec<8, 64, 8, 2, half, half, uint64_t, 
float>, + vpq_descriptor_spec<8, 64, 8, 4, half, half, uint64_t, float>, + standard_descriptor_spec<16, 128, half, uint64_t, float>, + vpq_descriptor_spec<16, 128, 8, 2, half, half, uint64_t, float>, + vpq_descriptor_spec<16, 128, 8, 4, half, half, uint64_t, float>, + standard_descriptor_spec<32, 256, half, uint64_t, float>, + vpq_descriptor_spec<32, 256, 8, 2, half, half, uint64_t, float>, + vpq_descriptor_spec<32, 256, 8, 4, half, half, uint64_t, float>>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index e9948ec98..b4d8c391d 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -45,7 +45,7 @@ struct dataset_descriptor_base_t { * This covers all standard and VPQ descriptors; we need this to copy the descriptor from global * memory. Increase this if new fields are needed (but try to keep the descriptors small really). */ - static constexpr size_t kMaxStructSize = 72; + static constexpr size_t kMaxStructSize = 128; template static inline constexpr void assert_struct_size() @@ -216,7 +216,7 @@ struct instance_selector { auto s0 = spec_match(params, dataset); auto ss = instance_selector::template select( params, dataset); - return std::get(s0) >= std::get(ss) ? s0 : ss; + return std::get<1>(s0) >= std::get<1>(ss) ? 
s0 : ss; } template diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py index 927af8da8..b34458abb 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py +++ b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py @@ -90,10 +90,13 @@ # CAGRA path = f"compute_distance_standard_{type_path}_dim{mxdim}_t{team}.cu" includes = '#include "compute_distance_standard.cuh"' - spec = f"standard_descriptor_spec<{team}, {mxdim}, {data_t}, {idx_t}, {distance_t}>" - desc = f"standard_dataset_descriptor_t<{team}, {mxdim}, {data_t}, {idx_t}, {distance_t}>" + params = f"{team}, {mxdim}, {data_t}, {idx_t}, {distance_t}" + spec = f"standard_descriptor_spec<{params}>" + desc = f"standard_dataset_descriptor_t<{params}>" content = f""" template struct {desc}; +template <> +const void* {spec}::init_kernel = reinterpret_cast(&standard_dataset_descriptor_init_kernel<{params}>); template struct {spec}; """ descs.append(desc) @@ -108,10 +111,13 @@ for pq_bit in pq_bits: path = f"compute_distance_vpq_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{pq_len}subd_{code_book_t}.cu" includes = '#include "compute_distance_vpq.cuh"' - spec = f"vpq_descriptor_spec<{team}, {mxdim}, {pq_bit}, {pq_len}, {code_book_t}, {data_t}, {idx_t}, {distance_t}>" - desc = f"cagra_q_dataset_descriptor_t<{team}, {mxdim}, {pq_bit}, {pq_len}, {code_book_t}, {data_t}, {idx_t}, {distance_t}>" + params = f"{team}, {mxdim}, {pq_bit}, {pq_len}, {code_book_t}, {data_t}, {idx_t}, {distance_t}" + spec = f"vpq_descriptor_spec<{params}>" + desc = f"cagra_q_dataset_descriptor_t<{params}>" content = f""" template struct {desc}; +template <> +const void* {spec}::init_kernel = reinterpret_cast(&vpq_dataset_descriptor_init_kernel<{params}>); template struct {spec}; """ descs.append(desc) @@ -132,6 +138,9 @@ {newline.join(map(lambda s: "extern template struct " + s + ";", descs))} {newline.join(map(lambda s: "extern template 
struct " + s + ";", specs))} +extern template struct + instance_selector<{("," + newline + " ").join(specs)}>; + using descriptor_instances = instance_selector<{("," + newline + " ").join(specs)}>; @@ -150,6 +159,15 @@ ''' f.write(template.format(includes=includes, content=contents)) + +with open("compute_distance.cu", "w") as f: + includes = '#include "compute_distance-ext.cuh"' + newline = "\n" + contents = f''' +template struct instance_selector<{("," + newline + " ").join(specs)}>; +''' + f.write(template.format(includes=includes, content=contents)) + cmake_list.sort() for path in cmake_list: print(path) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh index b7b67f4f5..cd98ddcee 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh @@ -45,12 +45,12 @@ struct standard_dataset_descriptor_t : public dataset_descriptor_base_t(); @@ -175,7 +175,7 @@ __launch_bounds__(1, 1) __global__ void standard_dataset_descriptor_init_kernel( const DataT* ptr, IndexT size, uint32_t dim, - size_t ld) + uint32_t ld) { new (out) standard_dataset_descriptor_t( ptr, size, dim, ld); @@ -201,8 +201,7 @@ struct standard_descriptor_spec : public instance_spec using descriptor_type = standard_dataset_descriptor_t; - static constexpr auto init_kernel = - standard_dataset_descriptor_init_kernel; + static const void* init_kernel; template static auto init(const cagra::search_params& params, @@ -212,8 +211,9 @@ struct standard_descriptor_spec : public instance_spec descriptor_type dd_host{ dataset.view().data_handle(), IndexT(dataset.n_rows()), dataset.dim(), dataset.stride()}; host_type result{dd_host, stream, DatasetBlockDim}; - init_kernel<<<1, 1, 0, stream>>>( - result.dev_ptr, dd_host.ptr, dd_host.size, dd_host.dim, dd_host.ld); + void* args[] = // NOLINT + {&result.dev_ptr, &dd_host.ptr, &dd_host.size, &dd_host.dim, 
&dd_host.ld}; + RAFT_CUDA_TRY(cudaLaunchKernel(init_kernel, 1, 1, args, 0, stream)); return result; } diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu index 887ffcc7f..a0bfefff7 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<16, 128, float, uint32_t, float>; +template <> +const void* standard_descriptor_spec<16, 128, float, uint32_t, float>::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel<16, 128, float, uint32_t, float>); template struct standard_descriptor_spec<16, 128, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu index 3d82daf1a..86ff6720b 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<32, 256, float, uint32_t, float>; +template <> +const void* standard_descriptor_spec<32, 256, float, uint32_t, float>::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel<32, 256, float, uint32_t, float>); template struct standard_descriptor_spec<32, 256, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu 
index 2d7932eea..79eadbb9e 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<8, 64, float, uint32_t, float>; +template <> +const void* standard_descriptor_spec<8, 64, float, uint32_t, float>::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel<8, 64, float, uint32_t, float>); template struct standard_descriptor_spec<8, 64, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t16.cu index a7c287fcb..96bba4bbf 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t16.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<16, 128, float, uint64_t, float>; +template <> +const void* standard_descriptor_spec<16, 128, float, uint64_t, float>::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel<16, 128, float, uint64_t, float>); template struct standard_descriptor_spec<16, 128, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t32.cu index e34fa5b20..f8ca508af 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t32.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct 
standard_dataset_descriptor_t<32, 256, float, uint64_t, float>; +template <> +const void* standard_descriptor_spec<32, 256, float, uint64_t, float>::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel<32, 256, float, uint64_t, float>); template struct standard_descriptor_spec<32, 256, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim64_t8.cu index 5bd8e8197..4c47d2fe3 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim64_t8.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<8, 64, float, uint64_t, float>; +template <> +const void* standard_descriptor_spec<8, 64, float, uint64_t, float>::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel<8, 64, float, uint64_t, float>); template struct standard_descriptor_spec<8, 64, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t16.cu index cf04254e7..12afca22f 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t16.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<16, 128, half, uint32_t, float>; +template <> +const void* standard_descriptor_spec<16, 128, half, uint32_t, float>::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel<16, 128, half, uint32_t, float>); template struct standard_descriptor_spec<16, 
128, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t32.cu index 2d3c1bc3c..50b631809 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t32.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<32, 256, half, uint32_t, float>; +template <> +const void* standard_descriptor_spec<32, 256, half, uint32_t, float>::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel<32, 256, half, uint32_t, float>); template struct standard_descriptor_spec<32, 256, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu index 1924ee42d..f55dc6b69 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<8, 64, half, uint32_t, float>; +template <> +const void* standard_descriptor_spec<8, 64, half, uint32_t, float>::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel<8, 64, half, uint32_t, float>); template struct standard_descriptor_spec<8, 64, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t16.cu index 9e826ac84..62a9d0128 100644 --- 
a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t16.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<16, 128, half, uint64_t, float>; +template <> +const void* standard_descriptor_spec<16, 128, half, uint64_t, float>::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel<16, 128, half, uint64_t, float>); template struct standard_descriptor_spec<16, 128, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t32.cu index 590927f2d..f05e92c19 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t32.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<32, 256, half, uint64_t, float>; +template <> +const void* standard_descriptor_spec<32, 256, half, uint64_t, float>::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel<32, 256, half, uint64_t, float>); template struct standard_descriptor_spec<32, 256, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu index 68f065d7c..fbb9f1d05 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<8, 64, half, uint64_t, 
float>; +template <> +const void* standard_descriptor_spec<8, 64, half, uint64_t, float>::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel<8, 64, half, uint64_t, float>); template struct standard_descriptor_spec<8, 64, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t16.cu index 1700f66db..f1fcf21c1 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t16.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<16, 128, int8_t, uint32_t, float>; +template <> +const void* standard_descriptor_spec<16, 128, int8_t, uint32_t, float>::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel<16, 128, int8_t, uint32_t, float>); template struct standard_descriptor_spec<16, 128, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t32.cu index a74e28be8..9769be4b7 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t32.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<32, 256, int8_t, uint32_t, float>; +template <> +const void* standard_descriptor_spec<32, 256, int8_t, uint32_t, float>::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel<32, 256, int8_t, uint32_t, float>); template struct standard_descriptor_spec<32, 256, int8_t, uint32_t, float>; } // namespace 
cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim64_t8.cu index 2c5f33603..20b8a1cb6 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim64_t8.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<8, 64, int8_t, uint32_t, float>; +template <> +const void* standard_descriptor_spec<8, 64, int8_t, uint32_t, float>::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel<8, 64, int8_t, uint32_t, float>); template struct standard_descriptor_spec<8, 64, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t16.cu index a2cb3460f..8a560702d 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t16.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<16, 128, uint8_t, uint32_t, float>; +template <> +const void* standard_descriptor_spec<16, 128, uint8_t, uint32_t, float>::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel<16, 128, uint8_t, uint32_t, float>); template struct standard_descriptor_spec<16, 128, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t32.cu index e358773df..a43020750 100644 --- 
a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t32.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<32, 256, uint8_t, uint32_t, float>; +template <> +const void* standard_descriptor_spec<32, 256, uint8_t, uint32_t, float>::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel<32, 256, uint8_t, uint32_t, float>); template struct standard_descriptor_spec<32, 256, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim64_t8.cu index 0eb541853..ccdd1db3d 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim64_t8.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<8, 64, uint8_t, uint32_t, float>; +template <> +const void* standard_descriptor_spec<8, 64, uint8_t, uint32_t, float>::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel<8, 64, uint8_t, uint32_t, float>); template struct standard_descriptor_spec<8, 64, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh index 58730f33f..8c314bf8a 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh @@ -346,7 +346,7 @@ struct vpq_descriptor_spec : public instance_spec { return false; } - using descriptor_type = cagra_q_dataset_descriptor_t { DataT, IndexT, DistanceT>; - static constexpr auto init_kernel = 
vpq_dataset_descriptor_init_kernel; + static const void* init_kernel; template static auto init(const cagra::search_params& params, @@ -380,16 +373,18 @@ struct vpq_descriptor_spec : public instance_spec { IndexT(dataset.n_rows()), dataset.dim()}; host_type result{dd_host, stream, DatasetBlockDim}; - init_kernel<<<1, 1, 0, stream>>>(result.dev_ptr, - dd_host.encoded_dataset_ptr, - dd_host.encoded_dataset_dim, - dd_host.n_subspace, - dd_host.vq_code_book_ptr, - dd_host.vq_scale, - dd_host.pq_code_book_ptr, - dd_host.pq_scale, - dd_host.size, - dd_host.dim); + void* args[] = // NOLINT + {&result.dev_ptr, + &dd_host.encoded_dataset_ptr, + &dd_host.encoded_dataset_dim, + &dd_host.n_subspace, + &dd_host.vq_code_book_ptr, + &dd_host.vq_scale, + &dd_host.pq_code_book_ptr, + &dd_host.pq_scale, + &dd_host.size, + &dd_host.dim}; + RAFT_CUDA_TRY(cudaLaunchKernel(init_kernel, 1, 1, args, 0, stream)); return result; } diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu index c0a861a3f..c1ed84237 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<16, 128, 8, 2, half, float, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<16, 128, 8, 2, half, float, uint32_t, float>); template struct vpq_descriptor_spec<16, 128, 8, 2, half, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu index 23662a14d..bbf00dbc6 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<16, 128, 8, 4, half, float, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<16, 128, 8, 4, half, float, uint32_t, float>); template struct vpq_descriptor_spec<16, 128, 8, 4, half, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu index f07609951..8d8e362ba 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<32, 256, 8, 2, half, float, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<32, 256, 8, 2, half, float, uint32_t, float>); template struct vpq_descriptor_spec<32, 256, 8, 2, half, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu index 1a141ce07..e15768763 100644 --- 
a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<32, 256, 8, 4, half, float, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<32, 256, 8, 4, half, float, uint32_t, float>); template struct vpq_descriptor_spec<32, 256, 8, 4, half, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu index 6cfc60508..68e2778b3 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<8, 64, 8, 2, half, float, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<8, 64, 8, 2, half, float, uint32_t, float>); template struct vpq_descriptor_spec<8, 64, 8, 2, half, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu index fd8915c7e..62789ef33 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu +++ 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<8, 64, 8, 4, half, float, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<8, 64, 8, 4, half, float, uint32_t, float>); template struct vpq_descriptor_spec<8, 64, 8, 4, half, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu index 721dadc38..4ff1ca7d4 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint64_t, float>; +template <> +const void* vpq_descriptor_spec<16, 128, 8, 2, half, float, uint64_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<16, 128, 8, 2, half, float, uint64_t, float>); template struct vpq_descriptor_spec<16, 128, 8, 2, half, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu index a86f41873..a7c1b2cc7 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu @@ -28,6 +28,10 @@ namespace 
cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint64_t, float>; +template <> +const void* vpq_descriptor_spec<16, 128, 8, 4, half, float, uint64_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<16, 128, 8, 4, half, float, uint64_t, float>); template struct vpq_descriptor_spec<16, 128, 8, 4, half, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu index 7e3f9d367..cb5d0c592 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint64_t, float>; +template <> +const void* vpq_descriptor_spec<32, 256, 8, 2, half, float, uint64_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<32, 256, 8, 2, half, float, uint64_t, float>); template struct vpq_descriptor_spec<32, 256, 8, 2, half, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu index 281306cda..a48603b23 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint64_t, float>; +template <> 
+const void* vpq_descriptor_spec<32, 256, 8, 4, half, float, uint64_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<32, 256, 8, 4, half, float, uint64_t, float>); template struct vpq_descriptor_spec<32, 256, 8, 4, half, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu index 72856f713..02c88fc00 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint64_t, float>; +template <> +const void* vpq_descriptor_spec<8, 64, 8, 2, half, float, uint64_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<8, 64, 8, 2, half, float, uint64_t, float>); template struct vpq_descriptor_spec<8, 64, 8, 2, half, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu index ed6ae9405..8ca8b6de7 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint64_t, float>; +template <> +const void* vpq_descriptor_spec<8, 64, 8, 4, half, float, uint64_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<8, 64, 8, 4, half, 
float, uint64_t, float>); template struct vpq_descriptor_spec<8, 64, 8, 4, half, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu index be732c8a3..6c3b8643e 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<16, 128, 8, 2, half, half, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<16, 128, 8, 2, half, half, uint32_t, float>); template struct vpq_descriptor_spec<16, 128, 8, 2, half, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu index 647964ccc..57b50e7b0 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<16, 128, 8, 4, half, half, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<16, 128, 8, 4, half, half, uint32_t, float>); template struct vpq_descriptor_spec<16, 128, 8, 4, half, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff 
--git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu index 65d17103c..a9ed297c3 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<32, 256, 8, 2, half, half, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<32, 256, 8, 2, half, half, uint32_t, float>); template struct vpq_descriptor_spec<32, 256, 8, 2, half, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu index 58f098ebe..12696685a 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<32, 256, 8, 4, half, half, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<32, 256, 8, 4, half, half, uint32_t, float>); template struct vpq_descriptor_spec<32, 256, 8, 4, half, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu index b148d86b8..9bee50622 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<8, 64, 8, 2, half, half, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<8, 64, 8, 2, half, half, uint32_t, float>); template struct vpq_descriptor_spec<8, 64, 8, 2, half, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu index 52d289e14..3a159e041 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<8, 64, 8, 4, half, half, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<8, 64, 8, 4, half, half, uint32_t, float>); template struct vpq_descriptor_spec<8, 64, 8, 4, half, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu index 0472729bc..9b269593a 100644 --- 
a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint64_t, float>; +template <> +const void* vpq_descriptor_spec<16, 128, 8, 2, half, half, uint64_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<16, 128, 8, 2, half, half, uint64_t, float>); template struct vpq_descriptor_spec<16, 128, 8, 2, half, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu index 6a856e26a..464d0b6a2 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint64_t, float>; +template <> +const void* vpq_descriptor_spec<16, 128, 8, 4, half, half, uint64_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<16, 128, 8, 4, half, half, uint64_t, float>); template struct vpq_descriptor_spec<16, 128, 8, 4, half, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu index ef518d065..4012291e0 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu +++ 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint64_t, float>; +template <> +const void* vpq_descriptor_spec<32, 256, 8, 2, half, half, uint64_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<32, 256, 8, 2, half, half, uint64_t, float>); template struct vpq_descriptor_spec<32, 256, 8, 2, half, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu index 89cfb217c..2339a7174 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint64_t, float>; +template <> +const void* vpq_descriptor_spec<32, 256, 8, 4, half, half, uint64_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<32, 256, 8, 4, half, half, uint64_t, float>); template struct vpq_descriptor_spec<32, 256, 8, 4, half, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu index 526b5b821..b3ca6c6eb 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template 
struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint64_t, float>; +template <> +const void* vpq_descriptor_spec<8, 64, 8, 2, half, half, uint64_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<8, 64, 8, 2, half, half, uint64_t, float>); template struct vpq_descriptor_spec<8, 64, 8, 2, half, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu index 452895555..a1224a1e7 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint64_t, float>; +template <> +const void* vpq_descriptor_spec<8, 64, 8, 4, half, half, uint64_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<8, 64, 8, 4, half, half, uint64_t, float>); template struct vpq_descriptor_spec<8, 64, 8, 4, half, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu index 2fed3b534..bcc8bb81e 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, int8_t, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<16, 128, 8, 2, half, int8_t, uint32_t, 
float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<16, 128, 8, 2, half, int8_t, uint32_t, float>); template struct vpq_descriptor_spec<16, 128, 8, 2, half, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu index f4c723954..1c11d398d 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, int8_t, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<16, 128, 8, 4, half, int8_t, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<16, 128, 8, 4, half, int8_t, uint32_t, float>); template struct vpq_descriptor_spec<16, 128, 8, 4, half, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu index 89c75c607..7bf78c8d0 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, int8_t, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<32, 256, 8, 2, half, int8_t, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<32, 256, 8, 2, half, int8_t, uint32_t, float>); template 
struct vpq_descriptor_spec<32, 256, 8, 2, half, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu index 133a1841b..1934cb347 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, int8_t, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<32, 256, 8, 4, half, int8_t, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<32, 256, 8, 4, half, int8_t, uint32_t, float>); template struct vpq_descriptor_spec<32, 256, 8, 4, half, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu index ca3ab6bb0..d5b235063 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, int8_t, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<8, 64, 8, 2, half, int8_t, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<8, 64, 8, 2, half, int8_t, uint32_t, float>); template struct vpq_descriptor_spec<8, 64, 8, 2, half, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git 
a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu index 1f8c37494..7707c1d31 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, int8_t, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<8, 64, 8, 4, half, int8_t, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<8, 64, 8, 4, half, int8_t, uint32_t, float>); template struct vpq_descriptor_spec<8, 64, 8, 4, half, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu index 471a49eda..17cab7eee 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, uint8_t, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<16, 128, 8, 2, half, uint8_t, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<16, 128, 8, 2, half, uint8_t, uint32_t, float>); template struct vpq_descriptor_spec<16, 128, 8, 2, half, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu index d02f17246..9a60a2afe 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, uint8_t, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<16, 128, 8, 4, half, uint8_t, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<16, 128, 8, 4, half, uint8_t, uint32_t, float>); template struct vpq_descriptor_spec<16, 128, 8, 4, half, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu index 461367fa8..a402455b8 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, uint8_t, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<32, 256, 8, 2, half, uint8_t, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<32, 256, 8, 2, half, uint8_t, uint32_t, float>); template struct vpq_descriptor_spec<32, 256, 8, 2, half, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu index 8563fc512..5c33d098c 
100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, uint8_t, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<32, 256, 8, 4, half, uint8_t, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<32, 256, 8, 4, half, uint8_t, uint32_t, float>); template struct vpq_descriptor_spec<32, 256, 8, 4, half, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu index 21489b293..77364b65d 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, uint8_t, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<8, 64, 8, 2, half, uint8_t, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<8, 64, 8, 2, half, uint8_t, uint32_t, float>); template struct vpq_descriptor_spec<8, 64, 8, 2, half, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu index 1e5888c85..455617b72 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu +++ 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu @@ -28,6 +28,10 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, uint8_t, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<8, 64, 8, 4, half, uint8_t, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<8, 64, 8, 4, half, uint8_t, uint32_t, float>); template struct vpq_descriptor_spec<8, 64, 8, 4, half, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 0b41adc40..1149eb2f4 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -918,7 +918,8 @@ void select_and_run(const dataset_descriptor_base_t* d small_hash_reset_interval, sample_filter, metric); - RAFT_CUDA_TRY(cudaPeekAtLastError()); + // RAFT_CUDA_TRY(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } } // namespace single_cta_search } // namespace cuvs::neighbors::cagra::detail From ddb048808c6d4179c80e965c8c47fad5dfb1ae62 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 22 Aug 2024 14:57:38 +0200 Subject: [PATCH 10/41] Reduce the register usage in distance functions --- .../detail/cagra/compute_distance.hpp | 2 +- .../cagra/compute_distance_00_generate.py | 6 +- .../cagra/compute_distance_standard.cuh | 52 ++++--------- .../detail/cagra/compute_distance_vpq.cuh | 75 +++++-------------- 4 files changed, 38 insertions(+), 97 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index b4d8c391d..775057789 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -45,7 +45,7 @@ 
struct dataset_descriptor_base_t { * This covers all standard and VPQ descriptors; we need this to copy the descriptor from global * memory. Increase this if new fields are needed (but try to keep the descriptors small really). */ - static constexpr size_t kMaxStructSize = 128; + static constexpr size_t kMaxStructSize = 64; template static inline constexpr void assert_struct_size() diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py index b34458abb..e407178ad 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py +++ b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py @@ -48,11 +48,9 @@ }} // namespace cuvs::neighbors::cagra::detail """ +#mxdim_team = [(128, 8), (256, 16), (512, 32)] mxdim_team = [(64, 8), (128, 16), (256, 32)] -# block = [(64, 16), (128, 8), (256, 4), (512, 2), (1024, 1)] -# itopk_candidates = [64, 128, 256] -# itopk_size = [64, 128, 256, 512] -# mxelem = [64, 128, 256] +#mxdim_team = [(32, 8), (64, 16), (128, 32)] pq_bits = [8] pq_lens = [2, 4] diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh index cd98ddcee..3e98f77d9 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -79,41 +80,9 @@ struct standard_dataset_descriptor_t : public dataset_descriptor_base_t DISTANCE_T - { - switch (metric) { - case cuvs::distance::DistanceType::L2Expanded: - return compute_similarity( - smem_workspace, dataset_index, valid); - case cuvs::distance::DistanceType::InnerProduct: - return compute_similarity( - smem_workspace, dataset_index, valid); - default: return 0; - } - } - - private: - template - RAFT_DEVICE_INLINE_FUNCTION constexpr static auto dist_op(T a, T b) - -> std::enable_if_t - { - T diff = a - 
b; - return diff * diff; - } - - template - RAFT_DEVICE_INLINE_FUNCTION constexpr static auto dist_op(T a, T b) - -> std::enable_if_t - { - return -a * b; - } - - template - RAFT_DEVICE_INLINE_FUNCTION auto compute_similarity(ws_handle smem_workspace, - const INDEX_T dataset_i, - const bool valid) const -> DISTANCE_T { auto query_ptr = smem_query_buffer(smem_workspace); - const auto dataset_ptr = ptr + dataset_i * ld; + const auto dataset_ptr = ptr + dataset_index * ld; const unsigned lane_id = threadIdx.x % TeamSize; DISTANCE_T norm2 = 0; @@ -134,14 +103,22 @@ struct standard_dataset_descriptor_t : public dataset_descriptor_base_t= dim) break; #pragma unroll for (uint32_t v = 0; v < vlen; v++) { - const uint32_t kv = k + v; // Note this loop can go above the dataset_dim for padded arrays. This is not a problem // because: // - Above the last element (dataset_dim-1), the query array is filled with zeros. // - The data buffer has to be also padded with zeros. - DISTANCE_T d = query_ptr[device::swizzling(kv)]; - norm2 += dist_op( - d, cuvs::spatial::knn::detail::utils::mapping{}(dl_buff[e].val.data[v])); + DISTANCE_T d; + raft::lds(d, query_ptr + device::swizzling(k + v)); + constexpr cuvs::spatial::knn::detail::utils::mapping mapping{}; + switch (metric) { + case cuvs::distance::DistanceType::L2Expanded: + d -= mapping(dl_buff[e].val.data[v]); + norm2 += d * d; + break; + case cuvs::distance::DistanceType::InnerProduct: + norm2 -= d * mapping(dl_buff[e].val.data[v]); + break; + } } } } @@ -153,6 +130,7 @@ struct standard_dataset_descriptor_t : public dataset_descriptor_base_t QUERY_T* { diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh index 8c314bf8a..ac83623cb 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh @@ -30,7 +30,7 @@ template + typename DistanceT> struct cagra_q_dataset_descriptor_t : public 
dataset_descriptor_base_t { using base_type = dataset_descriptor_base_t; using CODE_BOOK_T = CodeBookT; @@ -43,18 +43,13 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t, - "Only CODE_BOOK_T = " - "`half` is supported " - "now"); + static_assert(std::is_same_v, "Only CODE_BOOK_T = `half` is supported now"); const std::uint8_t* encoded_dataset_ptr; const CODE_BOOK_T* vq_code_book_ptr; const CODE_BOOK_T* pq_code_book_ptr; std::uint32_t encoded_dataset_dim; std::uint32_t n_subspace; - float vq_scale; - float pq_scale; static constexpr std::uint32_t kSMemCodeBookSizeInBytes = (1 << PQ_BITS) * PQ_LEN * utils::size_of(); @@ -63,9 +58,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t(); } @@ -110,7 +101,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t(i / 2)] = @@ -121,36 +112,21 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t DISTANCE_T + _RAFT_DEVICE auto compute_distance( + ws_handle smem_workspace, + INDEX_T dataset_index, + cuvs::distance::DistanceType /* only L2 metric is implemented */, + bool valid) const -> DISTANCE_T { - switch (metric) { - case cuvs::distance::DistanceType::L2Expanded: - return compute_similarity( - smem_workspace, dataset_index, valid); - case cuvs::distance::DistanceType::InnerProduct: - return compute_similarity( - smem_workspace, dataset_index, valid); - default: return 0; - } - } - - private: - template - RAFT_DEVICE_INLINE_FUNCTION DISTANCE_T compute_similarity(ws_handle smem_workspace, - const INDEX_T node_id, - const bool valid) const - { - auto codebook_ptr = smem_pq_code_book_ptr(smem_workspace); - auto query_ptr = smem_query_buffer(smem_workspace); - float norm = 0; + auto* __restrict__ codebook_ptr = smem_pq_code_book_ptr(smem_workspace); + auto* __restrict__ query_ptr = smem_query_buffer(smem_workspace); + auto* __restrict__ node_ptr = + encoded_dataset_ptr + (static_cast(encoded_dataset_dim) * dataset_index); + float norm = 0; 
if (valid) { const unsigned lane_id = threadIdx.x % TeamSize; - const uint32_t vq_code = *(reinterpret_cast( - encoded_dataset_ptr + (static_cast(encoded_dataset_dim) * node_id))); - if (PQ_BITS == 8) { + const uint32_t vq_code = *reinterpret_cast(node_ptr); + if constexpr (PQ_BITS == 8) { for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += DatasetBlockDim) { constexpr unsigned vlen = 4; // **** DO NOT CHANGE **** constexpr unsigned nelem = @@ -162,9 +138,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t= n_subspace) break; // Loading 4 x 8-bit PQ-codes using 32-bit load ops (from device memory) - pq_codes[e] = *(reinterpret_cast( - encoded_dataset_ptr + (static_cast(encoded_dataset_dim) * node_id) + - 4 + k)); + pq_codes[e] = *(reinterpret_cast(node_ptr + 4 + k)); } // if constexpr (PQ_LEN % 2 == 0) { @@ -237,8 +211,8 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t= dataset_dim) break; DISTANCE_T diff = query_ptr[d]; // (from smem) - diff -= pq_scale * static_cast(pq_vals.data[m]); - diff -= vq_scale * static_cast(vq_vals[d1 / vlen].val.data[d1 % vlen]); + diff -= static_cast(pq_vals.data[m]); + diff -= static_cast(vq_vals[d1 / vlen].val.data[d1 % vlen]); norm += diff * diff; } pq_code >>= 8; @@ -255,6 +229,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t CODE_BOOK_T* { @@ -293,9 +268,7 @@ __launch_bounds__(1, 1) __global__ std::uint32_t encoded_dataset_dim, std::uint32_t n_subspace, const CodeBookT* vq_code_book_ptr, - float vq_scale, const CodeBookT* pq_code_book_ptr, - float pq_scale, std::size_t size, std::uint32_t dim) { @@ -310,9 +283,7 @@ __launch_bounds__(1, 1) __global__ encoded_dataset_dim, n_subspace, vq_code_book_ptr, - vq_scale, pq_code_book_ptr, - pq_scale, size, dim); } @@ -361,15 +332,11 @@ struct vpq_descriptor_spec : public instance_spec { const DatasetT& dataset, rmm::cuda_stream_view stream) -> host_type { - const float vq_scale = 1.0f; - const float 
pq_scale = 1.0f; descriptor_type dd_host{dataset.data.data_handle(), dataset.encoded_row_length(), dataset.pq_dim(), dataset.vq_code_book.data_handle(), - vq_scale, dataset.pq_code_book.data_handle(), - pq_scale, IndexT(dataset.n_rows()), dataset.dim()}; host_type result{dd_host, stream, DatasetBlockDim}; @@ -379,9 +346,7 @@ struct vpq_descriptor_spec : public instance_spec { &dd_host.encoded_dataset_dim, &dd_host.n_subspace, &dd_host.vq_code_book_ptr, - &dd_host.vq_scale, &dd_host.pq_code_book_ptr, - &dd_host.pq_scale, &dd_host.size, &dd_host.dim}; RAFT_CUDA_TRY(cudaLaunchKernel(init_kernel, 1, 1, args, 0, stream)); From c244eadb07b323455ede8199eafde51516880755 Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 23 Aug 2024 11:03:36 +0200 Subject: [PATCH 11/41] Partially implemented manual dispatch --- .../detail/cagra/compute_distance.hpp | 23 +- .../cagra/compute_distance_00_generate.py | 12 + .../cagra/compute_distance_standard.cuh | 180 ++++++---- ...stance_standard_float_uint32_dim128_t16.cu | 6 + ...stance_standard_float_uint32_dim256_t32.cu | 6 + ...distance_standard_float_uint32_dim64_t8.cu | 6 + ...stance_standard_float_uint64_dim128_t16.cu | 6 + ...stance_standard_float_uint64_dim256_t32.cu | 6 + ...distance_standard_float_uint64_dim64_t8.cu | 6 + ...istance_standard_half_uint32_dim128_t16.cu | 6 + ...istance_standard_half_uint32_dim256_t32.cu | 6 + ..._distance_standard_half_uint32_dim64_t8.cu | 6 + ...istance_standard_half_uint64_dim128_t16.cu | 6 + ...istance_standard_half_uint64_dim256_t32.cu | 6 + ..._distance_standard_half_uint64_dim64_t8.cu | 6 + ...istance_standard_int8_uint32_dim128_t16.cu | 6 + ...istance_standard_int8_uint32_dim256_t32.cu | 6 + ..._distance_standard_int8_uint32_dim64_t8.cu | 6 + ...stance_standard_uint8_uint32_dim128_t16.cu | 6 + ...stance_standard_uint8_uint32_dim256_t32.cu | 6 + ...distance_standard_uint8_uint32_dim64_t8.cu | 6 + .../detail/cagra/compute_distance_vpq.cuh | 314 ++++++++++-------- 
..._float_uint32_dim128_t16_8pq_2subd_half.cu | 8 + ..._float_uint32_dim128_t16_8pq_4subd_half.cu | 8 + ..._float_uint32_dim256_t32_8pq_2subd_half.cu | 8 + ..._float_uint32_dim256_t32_8pq_4subd_half.cu | 8 + ...pq_float_uint32_dim64_t8_8pq_2subd_half.cu | 7 + ...pq_float_uint32_dim64_t8_8pq_4subd_half.cu | 7 + ..._float_uint64_dim128_t16_8pq_2subd_half.cu | 8 + ..._float_uint64_dim128_t16_8pq_4subd_half.cu | 8 + ..._float_uint64_dim256_t32_8pq_2subd_half.cu | 8 + ..._float_uint64_dim256_t32_8pq_4subd_half.cu | 8 + ...pq_float_uint64_dim64_t8_8pq_2subd_half.cu | 7 + ...pq_float_uint64_dim64_t8_8pq_4subd_half.cu | 7 + ...q_half_uint32_dim128_t16_8pq_2subd_half.cu | 8 + ...q_half_uint32_dim128_t16_8pq_4subd_half.cu | 8 + ...q_half_uint32_dim256_t32_8pq_2subd_half.cu | 8 + ...q_half_uint32_dim256_t32_8pq_4subd_half.cu | 8 + ...vpq_half_uint32_dim64_t8_8pq_2subd_half.cu | 6 + ...vpq_half_uint32_dim64_t8_8pq_4subd_half.cu | 6 + ...q_half_uint64_dim128_t16_8pq_2subd_half.cu | 8 + ...q_half_uint64_dim128_t16_8pq_4subd_half.cu | 8 + ...q_half_uint64_dim256_t32_8pq_2subd_half.cu | 8 + ...q_half_uint64_dim256_t32_8pq_4subd_half.cu | 8 + ...vpq_half_uint64_dim64_t8_8pq_2subd_half.cu | 6 + ...vpq_half_uint64_dim64_t8_8pq_4subd_half.cu | 6 + ...q_int8_uint32_dim128_t16_8pq_2subd_half.cu | 8 + ...q_int8_uint32_dim128_t16_8pq_4subd_half.cu | 8 + ...q_int8_uint32_dim256_t32_8pq_2subd_half.cu | 8 + ...q_int8_uint32_dim256_t32_8pq_4subd_half.cu | 8 + ...vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu | 8 + ...vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu | 8 + ..._uint8_uint32_dim128_t16_8pq_2subd_half.cu | 9 + ..._uint8_uint32_dim128_t16_8pq_4subd_half.cu | 9 + ..._uint8_uint32_dim256_t32_8pq_2subd_half.cu | 9 + ..._uint8_uint32_dim256_t32_8pq_4subd_half.cu | 9 + ...pq_uint8_uint32_dim64_t8_8pq_2subd_half.cu | 8 + ...pq_uint8_uint32_dim64_t8_8pq_4subd_half.cu | 8 + 58 files changed, 705 insertions(+), 212 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp 
b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index 775057789..3623b06cc 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -45,7 +45,7 @@ struct dataset_descriptor_base_t { * This covers all standard and VPQ descriptors; we need this to copy the descriptor from global * memory. Increase this if new fields are needed (but try to keep the descriptors small really). */ - static constexpr size_t kMaxStructSize = 64; + static constexpr size_t kMaxStructSize = 128; template static inline constexpr void assert_struct_size() @@ -58,6 +58,11 @@ struct dataset_descriptor_base_t { struct distance_workspace; using ws_handle = distance_workspace*; + using compute_distance_type = DISTANCE_T(ws_handle, INDEX_T, cuvs::distance::DistanceType, bool); + + /** Compute the distance from the query vector (stored in the smem_workspace) and a dataset vector + * given by the dataset_index. */ + compute_distance_type* compute_distance; /** Number of records in the database. */ INDEX_T size; /** Dimensionality of the data/queries. */ @@ -67,11 +72,16 @@ struct dataset_descriptor_base_t { /** Total dynamic shared memory required by the descriptor. */ uint32_t smem_ws_size_in_bytes; - _RAFT_HOST_DEVICE dataset_descriptor_base_t(INDEX_T size, + _RAFT_HOST_DEVICE dataset_descriptor_base_t(compute_distance_type* compute_distance, + INDEX_T size, uint32_t dim, uint32_t team_size, uint32_t smem_ws_size_in_bytes) - : size(size), dim(dim), team_size(team_size), smem_ws_size_in_bytes(smem_ws_size_in_bytes) + : compute_distance(compute_distance), + size(size), + dim(dim), + team_size(team_size), + smem_ws_size_in_bytes(smem_ws_size_in_bytes) { } @@ -93,13 +103,6 @@ struct dataset_descriptor_base_t { /** Copy the query to the shared memory. 
*/ _RAFT_DEVICE virtual void copy_query(ws_handle smem_workspace, const DATA_T* query_ptr) const = 0; - - /** Compute the distance from the query vector (stored in the smem_workspace) and a dataset vector - * given by the dataset_index. */ - _RAFT_DEVICE virtual auto compute_distance(ws_handle smem_workspace, - INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) const -> DISTANCE_T = 0; }; template diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py index e407178ad..753c48a5b 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py +++ b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py @@ -93,6 +93,12 @@ desc = f"standard_dataset_descriptor_t<{params}>" content = f""" template struct {desc}; +template +_RAFT_DEVICE auto compute_distance_standard<{desc}>( + {desc}::ws_handle, + {desc}::INDEX_T, + cuvs::distance::DistanceType, + bool valid) -> {desc}::DISTANCE_T; template <> const void* {spec}::init_kernel = reinterpret_cast(&standard_dataset_descriptor_init_kernel<{params}>); template struct {spec}; @@ -114,6 +120,12 @@ desc = f"cagra_q_dataset_descriptor_t<{params}>" content = f""" template struct {desc}; +template +_RAFT_DEVICE auto compute_distance_vpq<{desc}>( + {desc}::ws_handle smem_workspace, + {desc}::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) -> {desc}::DISTANCE_T; template <> const void* {spec}::init_kernel = reinterpret_cast(&vpq_dataset_descriptor_init_kernel<{params}>); template struct {spec}; diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh index 3e98f77d9..ee5e71921 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh @@ -33,39 +33,53 @@ template + typename DistanceT> struct 
standard_dataset_descriptor_t : public dataset_descriptor_base_t { using base_type = dataset_descriptor_base_t; using LOAD_T = device::LOAD_128BIT_T; using QUERY_T = float; using base_type::dim; using base_type::smem_ws_size_in_bytes; + using typename base_type::compute_distance_type; using typename base_type::DATA_T; using typename base_type::DISTANCE_T; using typename base_type::INDEX_T; using typename base_type::ws_handle; + constexpr static inline auto kTeamSize = TeamSize; + constexpr static inline auto kDatasetBlockDim = DatasetBlockDim; const DATA_T* ptr; uint32_t ld; - _RAFT_HOST_DEVICE standard_dataset_descriptor_t(const DATA_T* ptr, + _RAFT_HOST_DEVICE standard_dataset_descriptor_t(compute_distance_type* compute_distance, + const DATA_T* ptr, INDEX_T size, uint32_t dim, uint32_t ld) - : base_type(size, dim, TeamSize, get_smem_ws_size_in_bytes(dim)), ptr(ptr), ld(ld) + : base_type(compute_distance, size, dim, TeamSize, get_smem_ws_size_in_bytes(dim)), + ptr(ptr), + ld(ld) { base_type::template assert_struct_size(); } _RAFT_DEVICE [[nodiscard]] auto set_smem_ws(void* smem_ptr) const -> ws_handle { + using word_type = uint32_t; + constexpr auto kStructWords = base_type::kMaxStructSize / sizeof(word_type); + auto* dst = reinterpret_cast(smem_ptr); + auto* src = reinterpret_cast(this); + for (unsigned i = threadIdx.x; i < kStructWords; i += blockDim.x) { + dst[i] = src[i]; + } return reinterpret_cast(smem_ptr); } _RAFT_DEVICE void copy_query(ws_handle smem_workspace, const DATA_T* query_ptr) const { - auto buf = smem_query_buffer(smem_workspace); - auto buf_len = smem_ws_size_in_bytes / sizeof(QUERY_T); + auto buf = reinterpret_cast(reinterpret_cast(smem_workspace) + + base_type::kMaxStructSize); + auto buf_len = raft::round_up_safe(dim, DatasetBlockDim); for (unsigned i = threadIdx.x; i < buf_len; i += blockDim.x) { unsigned j = device::swizzling(i); if (i < dim) { @@ -76,72 +90,103 @@ struct standard_dataset_descriptor_t : public dataset_descriptor_base_t 
DISTANCE_T + private: + RAFT_INLINE_FUNCTION constexpr static auto get_smem_ws_size_in_bytes(uint32_t dim) -> uint32_t { - auto query_ptr = smem_query_buffer(smem_workspace); - const auto dataset_ptr = ptr + dataset_index * ld; - const unsigned lane_id = threadIdx.x % TeamSize; - - DISTANCE_T norm2 = 0; - if (valid) { - for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += DatasetBlockDim) { - constexpr unsigned vlen = device::get_vlen(); - constexpr unsigned reg_nelem = raft::ceildiv(DatasetBlockDim, TeamSize * vlen); - raft::TxN_t dl_buff[reg_nelem]; + return base_type::kMaxStructSize + + raft::round_up_safe(dim, DatasetBlockDim) * sizeof(QUERY_T); + } +}; + +template +_RAFT_DEVICE __noinline__ auto compute_distance_standard( + typename DescriptorT::ws_handle smem_workspace, + typename DescriptorT::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) -> typename DescriptorT::DISTANCE_T +{ + using DATA_T = typename DescriptorT::DATA_T; + using DISTANCE_T = typename DescriptorT::DISTANCE_T; + using INDEX_T = typename DescriptorT::INDEX_T; + using LOAD_T = typename DescriptorT::LOAD_T; + using QUERY_T = typename DescriptorT::QUERY_T; + using ws_handle = typename DescriptorT::ws_handle; + constexpr auto kTeamSize = DescriptorT::kTeamSize; + constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim; + constexpr auto kMaxStructSize = DescriptorT::base_type::kMaxStructSize; + + auto* __restrict__ desc = + const_cast(reinterpret_cast(smem_workspace)); + auto* __restrict__ query_ptr = reinterpret_cast( + reinterpret_cast(smem_workspace) + kMaxStructSize); + const auto dataset_ptr = desc->ptr + (static_cast(desc->ld) * dataset_index); + const unsigned lane_id = threadIdx.x % kTeamSize; + auto dim = desc->dim; + // if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 0) { + // printf( + // "computing distance\n desc = %p, query = %p, dataset_ptr = %p\n ptr = %p, dim = %u, ld = + // " + // "%u\n", + // desc, + // query_ptr, + // 
dataset_ptr, + // desc->ptr, + // desc->dim, + // desc->ld); + // printf(" kTeamSize = %u, kDatasetBlockDim = %u, kMaxStructSize = %u\n", + // kTeamSize, + // kDatasetBlockDim, + // uint32_t(kMaxStructSize)); + // } + // return 0; + + DISTANCE_T norm2 = 0; + if (valid) { + for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += kDatasetBlockDim) { + constexpr unsigned vlen = device::get_vlen(); + constexpr unsigned reg_nelem = raft::ceildiv(kDatasetBlockDim, kTeamSize * vlen); + raft::TxN_t dl_buff[reg_nelem]; #pragma unroll - for (uint32_t e = 0; e < reg_nelem; e++) { - const uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset; - if (k >= dim) break; - dl_buff[e].load(dataset_ptr, k); - } + for (uint32_t e = 0; e < reg_nelem; e++) { + const uint32_t k = (lane_id + (kTeamSize * e)) * vlen + elem_offset; + if (k >= dim) break; + dl_buff[e].load(dataset_ptr, k); + } #pragma unroll - for (uint32_t e = 0; e < reg_nelem; e++) { - const uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset; - if (k >= dim) break; + for (uint32_t e = 0; e < reg_nelem; e++) { + const uint32_t k = (lane_id + (kTeamSize * e)) * vlen + elem_offset; + if (k >= dim) break; #pragma unroll - for (uint32_t v = 0; v < vlen; v++) { - // Note this loop can go above the dataset_dim for padded arrays. This is not a problem - // because: - // - Above the last element (dataset_dim-1), the query array is filled with zeros. - // - The data buffer has to be also padded with zeros. - DISTANCE_T d; - raft::lds(d, query_ptr + device::swizzling(k + v)); - constexpr cuvs::spatial::knn::detail::utils::mapping mapping{}; - switch (metric) { - case cuvs::distance::DistanceType::L2Expanded: - d -= mapping(dl_buff[e].val.data[v]); - norm2 += d * d; - break; - case cuvs::distance::DistanceType::InnerProduct: - norm2 -= d * mapping(dl_buff[e].val.data[v]); - break; - } + for (uint32_t v = 0; v < vlen; v++) { + // Note this loop can go above the dataset_dim for padded arrays. 
This is not a problem + // because: + // - Above the last element (dataset_dim-1), the query array is filled with zeros. + // - The data buffer has to be also padded with zeros. + DISTANCE_T d; + raft::lds(d, query_ptr + device::swizzling(k + v)); + constexpr cuvs::spatial::knn::detail::utils::mapping mapping{}; + switch (metric) { + case cuvs::distance::DistanceType::L2Expanded: + d -= mapping(dl_buff[e].val.data[v]); + norm2 += d * d; + break; + case cuvs::distance::DistanceType::InnerProduct: + norm2 -= d * mapping(dl_buff[e].val.data[v]); + break; } } } } -#pragma unroll - for (uint32_t offset = TeamSize / 2; offset > 0; offset >>= 1) { - norm2 += __shfl_xor_sync(0xffffffff, norm2, offset); - } - return norm2; } - - private: - RAFT_DEVICE_INLINE_FUNCTION constexpr auto smem_query_buffer(ws_handle smem_workspace) const - -> QUERY_T* - { - return reinterpret_cast(smem_workspace); +#pragma unroll + for (uint32_t offset = kTeamSize / 2; offset > 0; offset >>= 1) { + norm2 += __shfl_xor_sync(0xffffffff, norm2, offset); } + return norm2; +} - RAFT_INLINE_FUNCTION constexpr static auto get_smem_ws_size_in_bytes(uint32_t dim) -> uint32_t - { - return raft::round_up_safe(dim, DatasetBlockDim) * sizeof(QUERY_T); - } -}; +// template +// __device__ typename DescriptorT::compute_distance_type* compute_distance_standard_ptr; template ( - ptr, size, dim, ld); + using desc_type = + standard_dataset_descriptor_t; + new (out) desc_type(&compute_distance_standard, ptr, size, dim, ld); + // printf("compute-distance: %p, dataset: %p\n", + // out->compute_distance, + // reinterpret_cast(out)->ptr); } template const DatasetT& dataset, rmm::cuda_stream_view stream) -> host_type { - descriptor_type dd_host{ - dataset.view().data_handle(), IndexT(dataset.n_rows()), dataset.dim(), dataset.stride()}; + descriptor_type dd_host{nullptr, + dataset.view().data_handle(), + IndexT(dataset.n_rows()), + dataset.dim(), + dataset.stride()}; host_type result{dd_host, stream, DatasetBlockDim}; void* 
args[] = // NOLINT {&result.dev_ptr, &dd_host.ptr, &dd_host.size, &dd_host.dim, &dd_host.ld}; diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu index a0bfefff7..e8c0df121 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu @@ -28,6 +28,12 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<16, 128, float, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_standard>( + standard_dataset_descriptor_t<16, 128, float, uint32_t, float>::ws_handle, + standard_dataset_descriptor_t<16, 128, float, uint32_t, float>::INDEX_T, + cuvs::distance::DistanceType, + bool valid) -> standard_dataset_descriptor_t<16, 128, float, uint32_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<16, 128, float, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu index 86ff6720b..cc1ee30e4 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu @@ -28,6 +28,12 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<32, 256, float, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_standard>( + standard_dataset_descriptor_t<32, 256, float, uint32_t, float>::ws_handle, + standard_dataset_descriptor_t<32, 256, float, uint32_t, float>::INDEX_T, + cuvs::distance::DistanceType, + bool valid) -> standard_dataset_descriptor_t<32, 256, float, uint32_t, float>::DISTANCE_T; template <> const void* 
standard_descriptor_spec<32, 256, float, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu index 79eadbb9e..5f78cb106 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu @@ -28,6 +28,12 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<8, 64, float, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_standard>( + standard_dataset_descriptor_t<8, 64, float, uint32_t, float>::ws_handle, + standard_dataset_descriptor_t<8, 64, float, uint32_t, float>::INDEX_T, + cuvs::distance::DistanceType, + bool valid) -> standard_dataset_descriptor_t<8, 64, float, uint32_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<8, 64, float, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t16.cu index 96bba4bbf..ba23e1020 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t16.cu @@ -28,6 +28,12 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<16, 128, float, uint64_t, float>; +template _RAFT_DEVICE auto +compute_distance_standard>( + standard_dataset_descriptor_t<16, 128, float, uint64_t, float>::ws_handle, + standard_dataset_descriptor_t<16, 128, float, uint64_t, float>::INDEX_T, + cuvs::distance::DistanceType, + bool valid) -> standard_dataset_descriptor_t<16, 128, float, uint64_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<16, 128, float, 
uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t32.cu index f8ca508af..e1b4dcc00 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t32.cu @@ -28,6 +28,12 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<32, 256, float, uint64_t, float>; +template _RAFT_DEVICE auto +compute_distance_standard>( + standard_dataset_descriptor_t<32, 256, float, uint64_t, float>::ws_handle, + standard_dataset_descriptor_t<32, 256, float, uint64_t, float>::INDEX_T, + cuvs::distance::DistanceType, + bool valid) -> standard_dataset_descriptor_t<32, 256, float, uint64_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<32, 256, float, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim64_t8.cu index 4c47d2fe3..10eb706de 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim64_t8.cu @@ -28,6 +28,12 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<8, 64, float, uint64_t, float>; +template _RAFT_DEVICE auto +compute_distance_standard>( + standard_dataset_descriptor_t<8, 64, float, uint64_t, float>::ws_handle, + standard_dataset_descriptor_t<8, 64, float, uint64_t, float>::INDEX_T, + cuvs::distance::DistanceType, + bool valid) -> standard_dataset_descriptor_t<8, 64, float, uint64_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<8, 64, float, uint64_t, float>::init_kernel = 
reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t16.cu index 12afca22f..ea70b24b6 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t16.cu @@ -28,6 +28,12 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<16, 128, half, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_standard>( + standard_dataset_descriptor_t<16, 128, half, uint32_t, float>::ws_handle, + standard_dataset_descriptor_t<16, 128, half, uint32_t, float>::INDEX_T, + cuvs::distance::DistanceType, + bool valid) -> standard_dataset_descriptor_t<16, 128, half, uint32_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<16, 128, half, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t32.cu index 50b631809..51702ebcd 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t32.cu @@ -28,6 +28,12 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<32, 256, half, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_standard>( + standard_dataset_descriptor_t<32, 256, half, uint32_t, float>::ws_handle, + standard_dataset_descriptor_t<32, 256, half, uint32_t, float>::INDEX_T, + cuvs::distance::DistanceType, + bool valid) -> standard_dataset_descriptor_t<32, 256, half, uint32_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<32, 256, half, uint32_t, float>::init_kernel = reinterpret_cast( diff --git 
a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu index f55dc6b69..0a24dd82b 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu @@ -28,6 +28,12 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<8, 64, half, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_standard>( + standard_dataset_descriptor_t<8, 64, half, uint32_t, float>::ws_handle, + standard_dataset_descriptor_t<8, 64, half, uint32_t, float>::INDEX_T, + cuvs::distance::DistanceType, + bool valid) -> standard_dataset_descriptor_t<8, 64, half, uint32_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<8, 64, half, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t16.cu index 62a9d0128..f6dabf3c3 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t16.cu @@ -28,6 +28,12 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<16, 128, half, uint64_t, float>; +template _RAFT_DEVICE auto +compute_distance_standard>( + standard_dataset_descriptor_t<16, 128, half, uint64_t, float>::ws_handle, + standard_dataset_descriptor_t<16, 128, half, uint64_t, float>::INDEX_T, + cuvs::distance::DistanceType, + bool valid) -> standard_dataset_descriptor_t<16, 128, half, uint64_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<16, 128, half, uint64_t, float>::init_kernel = reinterpret_cast( diff --git 
a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t32.cu index f05e92c19..d43f18861 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t32.cu @@ -28,6 +28,12 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<32, 256, half, uint64_t, float>; +template _RAFT_DEVICE auto +compute_distance_standard>( + standard_dataset_descriptor_t<32, 256, half, uint64_t, float>::ws_handle, + standard_dataset_descriptor_t<32, 256, half, uint64_t, float>::INDEX_T, + cuvs::distance::DistanceType, + bool valid) -> standard_dataset_descriptor_t<32, 256, half, uint64_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<32, 256, half, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu index fbb9f1d05..ccb851c21 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu @@ -28,6 +28,12 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<8, 64, half, uint64_t, float>; +template _RAFT_DEVICE auto +compute_distance_standard>( + standard_dataset_descriptor_t<8, 64, half, uint64_t, float>::ws_handle, + standard_dataset_descriptor_t<8, 64, half, uint64_t, float>::INDEX_T, + cuvs::distance::DistanceType, + bool valid) -> standard_dataset_descriptor_t<8, 64, half, uint64_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<8, 64, half, uint64_t, float>::init_kernel = reinterpret_cast( diff --git 
a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t16.cu index f1fcf21c1..857d54beb 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t16.cu @@ -28,6 +28,12 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<16, 128, int8_t, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_standard>( + standard_dataset_descriptor_t<16, 128, int8_t, uint32_t, float>::ws_handle, + standard_dataset_descriptor_t<16, 128, int8_t, uint32_t, float>::INDEX_T, + cuvs::distance::DistanceType, + bool valid) -> standard_dataset_descriptor_t<16, 128, int8_t, uint32_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<16, 128, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t32.cu index 9769be4b7..c6315a802 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t32.cu @@ -28,6 +28,12 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<32, 256, int8_t, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_standard>( + standard_dataset_descriptor_t<32, 256, int8_t, uint32_t, float>::ws_handle, + standard_dataset_descriptor_t<32, 256, int8_t, uint32_t, float>::INDEX_T, + cuvs::distance::DistanceType, + bool valid) -> standard_dataset_descriptor_t<32, 256, int8_t, uint32_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<32, 256, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git 
a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim64_t8.cu index 20b8a1cb6..cc0ea5799 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim64_t8.cu @@ -28,6 +28,12 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<8, 64, int8_t, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_standard>( + standard_dataset_descriptor_t<8, 64, int8_t, uint32_t, float>::ws_handle, + standard_dataset_descriptor_t<8, 64, int8_t, uint32_t, float>::INDEX_T, + cuvs::distance::DistanceType, + bool valid) -> standard_dataset_descriptor_t<8, 64, int8_t, uint32_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<8, 64, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t16.cu index 8a560702d..0e67f9390 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t16.cu @@ -28,6 +28,12 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<16, 128, uint8_t, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_standard>( + standard_dataset_descriptor_t<16, 128, uint8_t, uint32_t, float>::ws_handle, + standard_dataset_descriptor_t<16, 128, uint8_t, uint32_t, float>::INDEX_T, + cuvs::distance::DistanceType, + bool valid) -> standard_dataset_descriptor_t<16, 128, uint8_t, uint32_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<16, 128, uint8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git 
a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t32.cu index a43020750..50b89cf5a 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t32.cu @@ -28,6 +28,12 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<32, 256, uint8_t, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_standard>( + standard_dataset_descriptor_t<32, 256, uint8_t, uint32_t, float>::ws_handle, + standard_dataset_descriptor_t<32, 256, uint8_t, uint32_t, float>::INDEX_T, + cuvs::distance::DistanceType, + bool valid) -> standard_dataset_descriptor_t<32, 256, uint8_t, uint32_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<32, 256, uint8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim64_t8.cu index ccdd1db3d..100defe35 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim64_t8.cu @@ -28,6 +28,12 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<8, 64, uint8_t, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_standard>( + standard_dataset_descriptor_t<8, 64, uint8_t, uint32_t, float>::ws_handle, + standard_dataset_descriptor_t<8, 64, uint8_t, uint32_t, float>::INDEX_T, + cuvs::distance::DistanceType, + bool valid) -> standard_dataset_descriptor_t<8, 64, uint8_t, uint32_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<8, 64, uint8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git 
a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh index ac83623cb..d6a459d52 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh @@ -38,10 +38,15 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t, "Only CODE_BOOK_T = `half` is supported now"); @@ -54,14 +59,15 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t(); - _RAFT_HOST_DEVICE cagra_q_dataset_descriptor_t(const std::uint8_t* encoded_dataset_ptr, + _RAFT_HOST_DEVICE cagra_q_dataset_descriptor_t(compute_distance_type* compute_distance, + const std::uint8_t* encoded_dataset_ptr, std::uint32_t encoded_dataset_dim, std::uint32_t n_subspace, const CODE_BOOK_T* vq_code_book_ptr, const CODE_BOOK_T* pq_code_book_ptr, std::size_t size, std::uint32_t dim) - : base_type(size, dim, TeamSize, get_smem_ws_size_in_bytes(dim)), + : base_type(compute_distance, size, dim, TeamSize, get_smem_ws_size_in_bytes(dim)), encoded_dataset_ptr(encoded_dataset_ptr), encoded_dataset_dim(encoded_dataset_dim), n_subspace(n_subspace), @@ -73,7 +79,16 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t ws_handle { - auto codebook_buf = reinterpret_cast(smem_ptr); + using word_type = uint32_t; + constexpr auto kStructWords = base_type::kMaxStructSize / sizeof(word_type); + auto* dst = reinterpret_cast(smem_ptr); + auto* src = reinterpret_cast(this); + for (unsigned i = threadIdx.x; i < kStructWords; i += blockDim.x) { + dst[i] = src[i]; + } + + auto codebook_buf = + reinterpret_cast(reinterpret_cast(smem_ptr) + base_type::kMaxStructSize); // Copy PQ table for (unsigned i = threadIdx.x * 2; i < (1 << PQ_BITS) * PQ_LEN; i += blockDim.x * 2) { @@ -112,135 +127,19 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t DISTANCE_T - { - auto* __restrict__ codebook_ptr = smem_pq_code_book_ptr(smem_workspace); - auto* 
__restrict__ query_ptr = smem_query_buffer(smem_workspace); - auto* __restrict__ node_ptr = - encoded_dataset_ptr + (static_cast(encoded_dataset_dim) * dataset_index); - float norm = 0; - if (valid) { - const unsigned lane_id = threadIdx.x % TeamSize; - const uint32_t vq_code = *reinterpret_cast(node_ptr); - if constexpr (PQ_BITS == 8) { - for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += DatasetBlockDim) { - constexpr unsigned vlen = 4; // **** DO NOT CHANGE **** - constexpr unsigned nelem = - raft::div_rounding_up_unsafe(DatasetBlockDim / PQ_LEN, TeamSize * vlen); - // Loading PQ codes - uint32_t pq_codes[nelem]; -#pragma unroll - for (std::uint32_t e = 0; e < nelem; e++) { - const std::uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset / PQ_LEN; - if (k >= n_subspace) break; - // Loading 4 x 8-bit PQ-codes using 32-bit load ops (from device memory) - pq_codes[e] = *(reinterpret_cast(node_ptr + 4 + k)); - } - // - if constexpr (PQ_LEN % 2 == 0) { - // **** Use half2 for distance computation **** - half2 norm2{0, 0}; -#pragma unroll - for (std::uint32_t e = 0; e < nelem; e++) { - const std::uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset / PQ_LEN; - if (k >= n_subspace) break; - // Loading VQ code-book - raft::TxN_t vq_vals[PQ_LEN]; -#pragma unroll - for (std::uint32_t m = 0; m < PQ_LEN; m += 1) { - const uint32_t d = (vlen * m) + (PQ_LEN * k); - if (d >= dim) break; - vq_vals[m].load( - reinterpret_cast(vq_code_book_ptr + d + (dim * vq_code)), 0); - } - // Compute distance - std::uint32_t pq_code = pq_codes[e]; -#pragma unroll - for (std::uint32_t v = 0; v < vlen; v++) { - if (PQ_LEN * (v + k) >= dim) break; -#pragma unroll - for (std::uint32_t m = 0; m < PQ_LEN; m += 2) { - const std::uint32_t d1 = m + (PQ_LEN * v); - const std::uint32_t d = d1 + (PQ_LEN * k); - // Loading query vector in smem - half2 diff2 = (reinterpret_cast( - query_ptr))[device::swizzling(d / 2)]; - // Loading PQ code book in smem - diff2 -= 
*(reinterpret_cast(codebook_ptr + (1 << PQ_BITS) * 2 * (m / 2) + - (2 * (pq_code & 0xff)))); - diff2 -= vq_vals[d1 / vlen].val.data[(d1 % vlen) / 2]; - norm2 += diff2 * diff2; - } - pq_code >>= 8; - } - } - norm += static_cast(norm2.x + norm2.y); - } else { - // **** Use float for distance computation **** -#pragma unroll - for (std::uint32_t e = 0; e < nelem; e++) { - const std::uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset / PQ_LEN; - if (k >= n_subspace) break; - // Loading VQ code-book - raft::TxN_t vq_vals[PQ_LEN]; -#pragma unroll - for (std::uint32_t m = 0; m < PQ_LEN; m++) { - const std::uint32_t d = (vlen * m) + (PQ_LEN * k); - if (d >= dim) break; - // Loading 4 x 8/16-bit VQ-values using 32/64-bit load ops (from L2$ or device - // memory) - vq_vals[m].load( - reinterpret_cast(vq_code_book_ptr + d + (dim * vq_code)), 0); - } - // Compute distance - std::uint32_t pq_code = pq_codes[e]; -#pragma unroll - for (std::uint32_t v = 0; v < vlen; v++) { - if (PQ_LEN * (v + k) >= dim) break; - raft::TxN_t pq_vals; - pq_vals.load( - reinterpret_cast(codebook_ptr + PQ_LEN * (pq_code & 0xff)), - 0); // (from L1$ or smem) -#pragma unroll - for (std::uint32_t m = 0; m < PQ_LEN; m++) { - const std::uint32_t d1 = m + (PQ_LEN * v); - const std::uint32_t d = d1 + (PQ_LEN * k); - // if (d >= dataset_dim) break; - DISTANCE_T diff = query_ptr[d]; // (from smem) - diff -= static_cast(pq_vals.data[m]); - diff -= static_cast(vq_vals[d1 / vlen].val.data[d1 % vlen]); - norm += diff * diff; - } - pq_code >>= 8; - } - } - } - } - } - } -#pragma unroll - for (uint32_t offset = TeamSize / 2; offset > 0; offset >>= 1) { - norm += __shfl_xor_sync(0xffffffff, norm, offset); - } - return norm; - } - private: RAFT_DEVICE_INLINE_FUNCTION constexpr auto smem_pq_code_book_ptr(ws_handle smem_workspace) const -> CODE_BOOK_T* { - return reinterpret_cast(smem_workspace); + return reinterpret_cast(reinterpret_cast(smem_workspace) + + base_type::kMaxStructSize); } 
RAFT_DEVICE_INLINE_FUNCTION constexpr auto smem_query_buffer(ws_handle smem_workspace) const -> QUERY_T* { return reinterpret_cast(reinterpret_cast(smem_workspace) + - kSMemCodeBookSizeInBytes); + base_type::kMaxStructSize + kSMemCodeBookSizeInBytes); } RAFT_INLINE_FUNCTION constexpr static auto get_smem_ws_size_in_bytes(uint32_t dim) -> uint32_t @@ -249,11 +148,147 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t(dim, DatasetBlockDim) * sizeof(QUERY_T); } }; +template +_RAFT_DEVICE __noinline__ auto compute_distance_vpq( + typename DescriptorT::ws_handle smem_workspace, + typename DescriptorT::INDEX_T dataset_index, + cuvs::distance::DistanceType /* only L2 metric is implemented */, + bool valid) -> typename DescriptorT::DISTANCE_T +{ + using DATA_T = typename DescriptorT::DATA_T; + using DISTANCE_T = typename DescriptorT::DISTANCE_T; + using INDEX_T = typename DescriptorT::INDEX_T; + using LOAD_T = typename DescriptorT::LOAD_T; + using QUERY_T = typename DescriptorT::QUERY_T; + using CODE_BOOK_T = typename DescriptorT::CODE_BOOK_T; + using ws_handle = typename DescriptorT::ws_handle; + constexpr auto TeamSize = DescriptorT::kTeamSize; + constexpr auto DatasetBlockDim = DescriptorT::kDatasetBlockDim; + constexpr auto PQ_BITS = DescriptorT::kPqBits; + constexpr auto PQ_LEN = DescriptorT::kPqLen; + constexpr auto kMaxStructSize = DescriptorT::base_type::kMaxStructSize; + + auto* __restrict__ desc = + const_cast(reinterpret_cast(smem_workspace)); + auto* __restrict__ codebook_ptr = reinterpret_cast( + reinterpret_cast(smem_workspace) + kMaxStructSize); + auto* __restrict__ query_ptr = reinterpret_cast( + reinterpret_cast(codebook_ptr) + DescriptorT::kSMemCodeBookSizeInBytes); + auto* __restrict__ node_ptr = + desc->encoded_dataset_ptr + + (static_cast(desc->encoded_dataset_dim) * dataset_index); + const auto dim = desc->dim; + float norm = 0; + if (valid) { + const unsigned lane_id = threadIdx.x % TeamSize; + const uint32_t vq_code = 
*reinterpret_cast(node_ptr); + if constexpr (PQ_BITS == 8) { + for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += DatasetBlockDim) { + constexpr unsigned vlen = 4; // **** DO NOT CHANGE **** + constexpr unsigned nelem = + raft::div_rounding_up_unsafe(DatasetBlockDim / PQ_LEN, TeamSize * vlen); + // Loading PQ codes + uint32_t pq_codes[nelem]; +#pragma unroll + for (std::uint32_t e = 0; e < nelem; e++) { + const std::uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset / PQ_LEN; + if (k >= desc->n_subspace) break; + // Loading 4 x 8-bit PQ-codes using 32-bit load ops (from device memory) + pq_codes[e] = *(reinterpret_cast(node_ptr + 4 + k)); + } + // + if constexpr (PQ_LEN % 2 == 0) { + // **** Use half2 for distance computation **** + half2 norm2{0, 0}; +#pragma unroll + for (std::uint32_t e = 0; e < nelem; e++) { + const std::uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset / PQ_LEN; + if (k >= desc->n_subspace) break; + // Loading VQ code-book + raft::TxN_t vq_vals[PQ_LEN]; +#pragma unroll + for (std::uint32_t m = 0; m < PQ_LEN; m += 1) { + const uint32_t d = (vlen * m) + (PQ_LEN * k); + if (d >= dim) break; + vq_vals[m].load( + reinterpret_cast(desc->vq_code_book_ptr + d + (dim * vq_code)), 0); + } + // Compute distance + std::uint32_t pq_code = pq_codes[e]; +#pragma unroll + for (std::uint32_t v = 0; v < vlen; v++) { + if (PQ_LEN * (v + k) >= dim) break; +#pragma unroll + for (std::uint32_t m = 0; m < PQ_LEN; m += 2) { + const std::uint32_t d1 = m + (PQ_LEN * v); + const std::uint32_t d = d1 + (PQ_LEN * k); + // Loading query vector in smem + half2 diff2 = (reinterpret_cast( + query_ptr))[device::swizzling(d / 2)]; + // Loading PQ code book in smem + diff2 -= *(reinterpret_cast( + codebook_ptr + (1 << PQ_BITS) * 2 * (m / 2) + (2 * (pq_code & 0xff)))); + diff2 -= vq_vals[d1 / vlen].val.data[(d1 % vlen) / 2]; + norm2 += diff2 * diff2; + } + pq_code >>= 8; + } + } + norm += static_cast(norm2.x + norm2.y); + } else { + // **** Use 
float for distance computation **** +#pragma unroll + for (std::uint32_t e = 0; e < nelem; e++) { + const std::uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset / PQ_LEN; + if (k >= desc->n_subspace) break; + // Loading VQ code-book + raft::TxN_t vq_vals[PQ_LEN]; +#pragma unroll + for (std::uint32_t m = 0; m < PQ_LEN; m++) { + const std::uint32_t d = (vlen * m) + (PQ_LEN * k); + if (d >= dim) break; + // Loading 4 x 8/16-bit VQ-values using 32/64-bit load ops (from L2$ or device + // memory) + vq_vals[m].load( + reinterpret_cast(desc->vq_code_book_ptr + d + (dim * vq_code)), 0); + } + // Compute distance + std::uint32_t pq_code = pq_codes[e]; +#pragma unroll + for (std::uint32_t v = 0; v < vlen; v++) { + if (PQ_LEN * (v + k) >= dim) break; + raft::TxN_t pq_vals; + pq_vals.load(reinterpret_cast(codebook_ptr + PQ_LEN * (pq_code & 0xff)), + 0); // (from L1$ or smem) +#pragma unroll + for (std::uint32_t m = 0; m < PQ_LEN; m++) { + const std::uint32_t d1 = m + (PQ_LEN * v); + const std::uint32_t d = d1 + (PQ_LEN * k); + // if (d >= dataset_dim) break; + DISTANCE_T diff = query_ptr[d]; // (from smem) + diff -= static_cast(pq_vals.data[m]); + diff -= static_cast(vq_vals[d1 / vlen].val.data[d1 % vlen]); + norm += diff * diff; + } + pq_code >>= 8; + } + } + } + } + } + } +#pragma unroll + for (uint32_t offset = TeamSize / 2; offset > 0; offset >>= 1) { + norm += __shfl_xor_sync(0xffffffff, norm, offset); + } + return norm; +} + template (encoded_dataset_ptr, - encoded_dataset_dim, - n_subspace, - vq_code_book_ptr, - pq_code_book_ptr, - size, - dim); + using desc_type = cagra_q_dataset_descriptor_t; + new (out) desc_type(&compute_distance_vpq, + encoded_dataset_ptr, + encoded_dataset_dim, + n_subspace, + vq_code_book_ptr, + pq_code_book_ptr, + size, + dim); } template { const DatasetT& dataset, rmm::cuda_stream_view stream) -> host_type { - descriptor_type dd_host{dataset.data.data_handle(), + descriptor_type dd_host{nullptr, + dataset.data.data_handle(), 
dataset.encoded_row_length(), dataset.pq_dim(), dataset.vq_code_book.data_handle(), diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu index c1ed84237..2b446bfe7 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu @@ -28,6 +28,14 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint32_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint32_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<16, 128, 8, 2, half, float, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu index bbf00dbc6..b9a4d4e24 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu @@ -28,6 +28,14 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint32_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, 
float, uint32_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<16, 128, 8, 4, half, float, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu index 8d8e362ba..ecfdd0ba1 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu @@ -28,6 +28,14 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint32_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint32_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<32, 256, 8, 2, half, float, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu index e15768763..0ca0c469f 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu @@ -28,6 +28,14 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, 
float, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint32_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint32_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<32, 256, 8, 4, half, float, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu index 68e2778b3..31fccc42d 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu @@ -28,6 +28,13 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint32_t, float>::ws_handle smem_workspace, + cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint32_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<8, 64, 8, 2, half, float, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu index 62789ef33..ca777ddc1 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu +++ 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu @@ -28,6 +28,13 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint32_t, float>::ws_handle smem_workspace, + cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint32_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<8, 64, 8, 4, half, float, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu index 4ff1ca7d4..a266c5604 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu @@ -28,6 +28,14 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint64_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint64_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint64_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint64_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<16, 128, 8, 2, half, float, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu index a7c1b2cc7..52fa99f35 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu @@ -28,6 +28,14 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint64_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint64_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint64_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint64_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<16, 128, 8, 4, half, float, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu index cb5d0c592..1feb9ec56 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu @@ -28,6 +28,14 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint64_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint64_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint64_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint64_t, float>::DISTANCE_T; 
template <> const void* vpq_descriptor_spec<32, 256, 8, 2, half, float, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu index a48603b23..510979e4f 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu @@ -28,6 +28,14 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint64_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint64_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint64_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint64_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<32, 256, 8, 4, half, float, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu index 02c88fc00..1de8799b2 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu @@ -28,6 +28,13 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint64_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint64_t, float>::ws_handle smem_workspace, + 
cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint64_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint64_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<8, 64, 8, 2, half, float, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu index 8ca8b6de7..77a706d79 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu @@ -28,6 +28,13 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint64_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint64_t, float>::ws_handle smem_workspace, + cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint64_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint64_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<8, 64, 8, 4, half, float, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu index 6c3b8643e..f8f2dab7b 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu @@ -28,6 +28,14 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 
128, 8, 2, half, half, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint32_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint32_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<16, 128, 8, 2, half, half, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu index 57b50e7b0..16ab30586 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu @@ -28,6 +28,14 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint32_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint32_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<16, 128, 8, 4, half, half, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu index a9ed297c3..a84669242 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu 
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu @@ -28,6 +28,14 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint32_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint32_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<32, 256, 8, 2, half, half, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu index 12696685a..a5370f9d5 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu @@ -28,6 +28,14 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint32_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint32_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<32, 256, 8, 4, half, half, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu index 9bee50622..c7a20cde5 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu @@ -28,6 +28,12 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint32_t, float>::ws_handle smem_workspace, + cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint32_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) -> cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<8, 64, 8, 2, half, half, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu index 3a159e041..33122652c 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu @@ -28,6 +28,12 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint32_t, float>::ws_handle smem_workspace, + cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint32_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) -> cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<8, 64, 8, 4, 
half, half, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu index 9b269593a..9512811eb 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu @@ -28,6 +28,14 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint64_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint64_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint64_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint64_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<16, 128, 8, 2, half, half, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu index 464d0b6a2..1d5adc203 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu @@ -28,6 +28,14 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint64_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint64_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint64_t, float>::INDEX_T 
dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint64_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<16, 128, 8, 4, half, half, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu index 4012291e0..e5879fa94 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu @@ -28,6 +28,14 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint64_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint64_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint64_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint64_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<32, 256, 8, 2, half, half, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu index 2339a7174..238f5bcd3 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu @@ -28,6 +28,14 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint64_t, float>; +template _RAFT_DEVICE auto 
+compute_distance_vpq>( + cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint64_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint64_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint64_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<32, 256, 8, 4, half, half, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu index b3ca6c6eb..bf909c76e 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu @@ -28,6 +28,12 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint64_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint64_t, float>::ws_handle smem_workspace, + cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint64_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) -> cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint64_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<8, 64, 8, 2, half, half, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu index a1224a1e7..b096396db 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu 
@@ -28,6 +28,12 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint64_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint64_t, float>::ws_handle smem_workspace, + cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint64_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) -> cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint64_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<8, 64, 8, 4, half, half, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu index bcc8bb81e..feabbe9a2 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu @@ -28,6 +28,14 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, int8_t, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, int8_t, uint32_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, int8_t, uint32_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, int8_t, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<16, 128, 8, 2, half, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu index 
1c11d398d..eb11f489b 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu @@ -28,6 +28,14 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, int8_t, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, int8_t, uint32_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, int8_t, uint32_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, int8_t, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<16, 128, 8, 4, half, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu index 7bf78c8d0..5b91653c6 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu @@ -28,6 +28,14 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, int8_t, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, int8_t, uint32_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, int8_t, uint32_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, int8_t, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<32, 256, 8, 2, half, int8_t, uint32_t, 
float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu index 1934cb347..46bf1111a 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu @@ -28,6 +28,14 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, int8_t, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, int8_t, uint32_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, int8_t, uint32_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, int8_t, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<32, 256, 8, 4, half, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu index d5b235063..2db7e9f71 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu @@ -28,6 +28,14 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, int8_t, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, int8_t, uint32_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, int8_t, uint32_t, float>::INDEX_T dataset_index, + 
cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, int8_t, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<8, 64, 8, 2, half, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu index 7707c1d31..a9652cfe0 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu @@ -28,6 +28,14 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, int8_t, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, int8_t, uint32_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, int8_t, uint32_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, int8_t, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<8, 64, 8, 4, half, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu index 17cab7eee..9376ce951 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu @@ -28,6 +28,15 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, uint8_t, uint32_t, float>; +template _RAFT_DEVICE auto 
+compute_distance_vpq>( + cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, uint8_t, uint32_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, uint8_t, uint32_t, float>::INDEX_T + dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, uint8_t, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<16, 128, 8, 2, half, uint8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu index 9a60a2afe..5930da566 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu @@ -28,6 +28,15 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, uint8_t, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, uint8_t, uint32_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, uint8_t, uint32_t, float>::INDEX_T + dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, uint8_t, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<16, 128, 8, 4, half, uint8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu index a402455b8..99f953908 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu +++ 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu @@ -28,6 +28,15 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, uint8_t, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, uint8_t, uint32_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, uint8_t, uint32_t, float>::INDEX_T + dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, uint8_t, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<32, 256, 8, 2, half, uint8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu index 5c33d098c..2091fe0aa 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu @@ -28,6 +28,15 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, uint8_t, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, uint8_t, uint32_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, uint8_t, uint32_t, float>::INDEX_T + dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, uint8_t, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<32, 256, 8, 4, half, uint8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git 
a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu index 77364b65d..7a976adfa 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu @@ -28,6 +28,14 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, uint8_t, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, uint8_t, uint32_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, uint8_t, uint32_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, uint8_t, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<8, 64, 8, 2, half, uint8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu index 455617b72..745d9916f 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu @@ -28,6 +28,14 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, uint8_t, uint32_t, float>; +template _RAFT_DEVICE auto +compute_distance_vpq>( + cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, uint8_t, uint32_t, float>::ws_handle + smem_workspace, + cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, uint8_t, uint32_t, float>::INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) + -> 
cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, uint8_t, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<8, 64, 8, 4, half, uint8_t, uint32_t, float>::init_kernel = reinterpret_cast( From ff2fdbe2a484f2cc31b5978ef3c9f5be11eae358 Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 23 Aug 2024 15:06:19 +0200 Subject: [PATCH 12/41] Finish manual dispatch --- .../detail/cagra/compute_distance.hpp | 71 ++++----- .../cagra/compute_distance_00_generate.py | 12 -- .../cagra/compute_distance_standard.cuh | 121 +++++++-------- ...stance_standard_float_uint32_dim128_t16.cu | 6 - ...stance_standard_float_uint32_dim256_t32.cu | 6 - ...distance_standard_float_uint32_dim64_t8.cu | 6 - ...stance_standard_float_uint64_dim128_t16.cu | 6 - ...stance_standard_float_uint64_dim256_t32.cu | 6 - ...distance_standard_float_uint64_dim64_t8.cu | 6 - ...istance_standard_half_uint32_dim128_t16.cu | 6 - ...istance_standard_half_uint32_dim256_t32.cu | 6 - ..._distance_standard_half_uint32_dim64_t8.cu | 6 - ...istance_standard_half_uint64_dim128_t16.cu | 6 - ...istance_standard_half_uint64_dim256_t32.cu | 6 - ..._distance_standard_half_uint64_dim64_t8.cu | 6 - ...istance_standard_int8_uint32_dim128_t16.cu | 6 - ...istance_standard_int8_uint32_dim256_t32.cu | 6 - ..._distance_standard_int8_uint32_dim64_t8.cu | 6 - ...stance_standard_uint8_uint32_dim128_t16.cu | 6 - ...stance_standard_uint8_uint32_dim256_t32.cu | 6 - ...distance_standard_uint8_uint32_dim64_t8.cu | 6 - .../detail/cagra/compute_distance_vpq.cuh | 141 ++++++++++-------- ..._float_uint32_dim128_t16_8pq_2subd_half.cu | 8 - ..._float_uint32_dim128_t16_8pq_4subd_half.cu | 8 - ..._float_uint32_dim256_t32_8pq_2subd_half.cu | 8 - ..._float_uint32_dim256_t32_8pq_4subd_half.cu | 8 - ...pq_float_uint32_dim64_t8_8pq_2subd_half.cu | 7 - ...pq_float_uint32_dim64_t8_8pq_4subd_half.cu | 7 - ..._float_uint64_dim128_t16_8pq_2subd_half.cu | 8 - ..._float_uint64_dim128_t16_8pq_4subd_half.cu | 8 - 
..._float_uint64_dim256_t32_8pq_2subd_half.cu | 8 - ..._float_uint64_dim256_t32_8pq_4subd_half.cu | 8 - ...pq_float_uint64_dim64_t8_8pq_2subd_half.cu | 7 - ...pq_float_uint64_dim64_t8_8pq_4subd_half.cu | 7 - ...q_half_uint32_dim128_t16_8pq_2subd_half.cu | 8 - ...q_half_uint32_dim128_t16_8pq_4subd_half.cu | 8 - ...q_half_uint32_dim256_t32_8pq_2subd_half.cu | 8 - ...q_half_uint32_dim256_t32_8pq_4subd_half.cu | 8 - ...vpq_half_uint32_dim64_t8_8pq_2subd_half.cu | 6 - ...vpq_half_uint32_dim64_t8_8pq_4subd_half.cu | 6 - ...q_half_uint64_dim128_t16_8pq_2subd_half.cu | 8 - ...q_half_uint64_dim128_t16_8pq_4subd_half.cu | 8 - ...q_half_uint64_dim256_t32_8pq_2subd_half.cu | 8 - ...q_half_uint64_dim256_t32_8pq_4subd_half.cu | 8 - ...vpq_half_uint64_dim64_t8_8pq_2subd_half.cu | 6 - ...vpq_half_uint64_dim64_t8_8pq_4subd_half.cu | 6 - ...q_int8_uint32_dim128_t16_8pq_2subd_half.cu | 8 - ...q_int8_uint32_dim128_t16_8pq_4subd_half.cu | 8 - ...q_int8_uint32_dim256_t32_8pq_2subd_half.cu | 8 - ...q_int8_uint32_dim256_t32_8pq_4subd_half.cu | 8 - ...vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu | 8 - ...vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu | 8 - ..._uint8_uint32_dim128_t16_8pq_2subd_half.cu | 9 -- ..._uint8_uint32_dim128_t16_8pq_4subd_half.cu | 9 -- ..._uint8_uint32_dim256_t32_8pq_2subd_half.cu | 9 -- ..._uint8_uint32_dim256_t32_8pq_4subd_half.cu | 9 -- ...pq_uint8_uint32_dim64_t8_8pq_2subd_half.cu | 8 - ...pq_uint8_uint32_dim64_t8_8pq_4subd_half.cu | 8 - .../neighbors/detail/cagra/device_common.hpp | 8 +- .../cagra/search_multi_cta_kernel-inl.cuh | 7 +- .../detail/cagra/search_multi_kernel.cuh | 13 +- .../cagra/search_single_cta_kernel-inl.cuh | 8 +- 62 files changed, 175 insertions(+), 594 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index 3623b06cc..bb1c70616 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -36,33 +36,23 
@@ namespace cuvs::neighbors::cagra::detail { template struct dataset_descriptor_base_t { + using base_type = dataset_descriptor_base_t; using DATA_T = DataT; using INDEX_T = IndexT; using DISTANCE_T = DistanceT; - /** - * Maximum expected size of the descriptor struct. - * This covers all standard and VPQ descriptors; we need this to copy the descriptor from global - * memory. Increase this if new fields are needed (but try to keep the descriptors small really). - */ - static constexpr size_t kMaxStructSize = 128; - - template - static inline constexpr void assert_struct_size() - { - static_assert(ActualSize <= MaximumSize, - "The maximum descriptor size is tracked in the dataset_descriptor_base_t. " - "Update this constant if implementing a new, larger descriptor."); - } - - struct distance_workspace; - using ws_handle = distance_workspace*; - - using compute_distance_type = DISTANCE_T(ws_handle, INDEX_T, cuvs::distance::DistanceType, bool); + using setup_workspace_type = const base_type*(const base_type*, void*, const DATA_T*, uint32_t); + using compute_distance_type = DISTANCE_T(const base_type*, + INDEX_T, + cuvs::distance::DistanceType, + bool); + /** Copy the descriptor and the query into shared memory and do any other work, such as + * initializing the codebook. */ + setup_workspace_type* setup_workspace_impl; /** Compute the distance from the query vector (stored in the smem_workspace) and a dataset vector * given by the dataset_index. */ - compute_distance_type* compute_distance; + compute_distance_type* compute_distance_impl; /** Number of records in the database. */ INDEX_T size; /** Dimensionality of the data/queries. */ @@ -72,12 +62,14 @@ struct dataset_descriptor_base_t { /** Total dynamic shared memory required by the descriptor. 
*/ uint32_t smem_ws_size_in_bytes; - _RAFT_HOST_DEVICE dataset_descriptor_base_t(compute_distance_type* compute_distance, - INDEX_T size, - uint32_t dim, - uint32_t team_size, - uint32_t smem_ws_size_in_bytes) - : compute_distance(compute_distance), + RAFT_INLINE_FUNCTION dataset_descriptor_base_t(setup_workspace_type* setup_workspace_impl, + compute_distance_type* compute_distance_impl, + INDEX_T size, + uint32_t dim, + uint32_t team_size, + uint32_t smem_ws_size_in_bytes) + : setup_workspace_impl(setup_workspace_impl), + compute_distance_impl(compute_distance_impl), size(size), dim(dim), team_size(team_size), @@ -85,24 +77,19 @@ struct dataset_descriptor_base_t { { } - RAFT_DEVICE_INLINE_FUNCTION void copy_descriptor_per_block( - dataset_descriptor_base_t* target) const + RAFT_DEVICE_INLINE_FUNCTION auto setup_workspace(void* smem_ptr, + const DATA_T* queries_ptr, + uint32_t query_id) const -> const base_type* { - using word_type = uint32_t; - constexpr auto kStructWords = kMaxStructSize / sizeof(word_type); - auto* dst = reinterpret_cast(target); - auto* src = reinterpret_cast(this); - for (unsigned i = threadIdx.x; i < kStructWords; i += blockDim.x) { - dst[i] = src[i]; - } - __syncthreads(); + return setup_workspace_impl(this, smem_ptr, queries_ptr, query_id); } - /** Setup the shared memory workspace (e.g. assign pointers or prepare a lookup table). */ - _RAFT_DEVICE [[nodiscard]] virtual auto set_smem_ws(void* smem_ptr) const -> ws_handle = 0; - - /** Copy the query to the shared memory. 
*/ - _RAFT_DEVICE virtual void copy_query(ws_handle smem_workspace, const DATA_T* query_ptr) const = 0; + RAFT_DEVICE_INLINE_FUNCTION auto compute_distance(INDEX_T dataset_index, + cuvs::distance::DistanceType metric, + bool valid) const -> DISTANCE_T + { + return compute_distance_impl(this, dataset_index, metric, valid); + } }; template @@ -122,7 +109,7 @@ struct dataset_descriptor_host { team_size{dd_host.team_size}, dataset_block_dim{dataset_block_dim} { - RAFT_CUDA_TRY(cudaMallocAsync(&dev_ptr, dev_descriptor_t::kMaxStructSize, stream_)); + RAFT_CUDA_TRY(cudaMallocAsync(&dev_ptr, sizeof(DescriptorImpl), stream_)); } ~dataset_descriptor_host() noexcept diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py index 753c48a5b..e407178ad 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py +++ b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py @@ -93,12 +93,6 @@ desc = f"standard_dataset_descriptor_t<{params}>" content = f""" template struct {desc}; -template -_RAFT_DEVICE auto compute_distance_standard<{desc}>( - {desc}::ws_handle, - {desc}::INDEX_T, - cuvs::distance::DistanceType, - bool valid) -> {desc}::DISTANCE_T; template <> const void* {spec}::init_kernel = reinterpret_cast(&standard_dataset_descriptor_init_kernel<{params}>); template struct {spec}; @@ -120,12 +114,6 @@ desc = f"cagra_q_dataset_descriptor_t<{params}>" content = f""" template struct {desc}; -template -_RAFT_DEVICE auto compute_distance_vpq<{desc}>( - {desc}::ws_handle smem_workspace, - {desc}::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) -> {desc}::DISTANCE_T; template <> const void* {spec}::init_kernel = reinterpret_cast(&vpq_dataset_descriptor_init_kernel<{params}>); template struct {spec}; diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh index 
ee5e71921..6e028f096 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh @@ -34,7 +34,8 @@ template -struct standard_dataset_descriptor_t : public dataset_descriptor_base_t { +struct alignas(device::LOAD_128BIT_T) standard_dataset_descriptor_t + : public dataset_descriptor_base_t { using base_type = dataset_descriptor_base_t; using LOAD_T = device::LOAD_128BIT_T; using QUERY_T = float; @@ -44,63 +45,78 @@ struct standard_dataset_descriptor_t : public dataset_descriptor_base_t(); } - _RAFT_DEVICE [[nodiscard]] auto set_smem_ws(void* smem_ptr) const -> ws_handle + private: + RAFT_INLINE_FUNCTION constexpr static auto get_smem_ws_size_in_bytes(uint32_t dim) -> uint32_t { - using word_type = uint32_t; - constexpr auto kStructWords = base_type::kMaxStructSize / sizeof(word_type); + return sizeof(standard_dataset_descriptor_t) + + raft::round_up_safe(dim, DatasetBlockDim) * sizeof(QUERY_T); + } +}; + +template +_RAFT_DEVICE __noinline__ auto setup_workspace_standard( + const typename DescriptorT::base_type* that, + void* smem_ptr, + const typename DescriptorT::DATA_T* queries_ptr, + uint32_t query_id) -> const typename DescriptorT::base_type* +{ + using descriptor_type = DescriptorT; + using base_type = typename DescriptorT::base_type; + using QUERY_T = typename descriptor_type::QUERY_T; + constexpr auto DatasetBlockDim = DescriptorT::kDatasetBlockDim; + using word_type = uint32_t; + if (((void*)that) != smem_ptr) { + constexpr auto kStructWords = sizeof(DescriptorT) / sizeof(word_type); auto* dst = reinterpret_cast(smem_ptr); - auto* src = reinterpret_cast(this); + auto* src = reinterpret_cast(that); for (unsigned i = threadIdx.x; i < kStructWords; i += blockDim.x) { dst[i] = src[i]; } - return reinterpret_cast(smem_ptr); } - _RAFT_DEVICE void copy_query(ws_handle smem_workspace, const DATA_T* query_ptr) const - { - auto buf = reinterpret_cast(reinterpret_cast(smem_workspace) + - 
base_type::kMaxStructSize); - auto buf_len = raft::round_up_safe(dim, DatasetBlockDim); - for (unsigned i = threadIdx.x; i < buf_len; i += blockDim.x) { - unsigned j = device::swizzling(i); - if (i < dim) { - buf[j] = cuvs::spatial::knn::detail::utils::mapping{}(query_ptr[i]); - } else { - buf[j] = 0.0; - } + uint32_t dim = that->dim; + auto buf = reinterpret_cast(reinterpret_cast(smem_ptr) + sizeof(DescriptorT)); + auto buf_len = raft::round_up_safe(dim, DatasetBlockDim); + queries_ptr += dim * query_id; + for (unsigned i = threadIdx.x; i < buf_len; i += blockDim.x) { + unsigned j = device::swizzling(i); + if (i < dim) { + buf[j] = cuvs::spatial::knn::detail::utils::mapping{}(queries_ptr[i]); + } else { + buf[j] = 0.0; } } - private: - RAFT_INLINE_FUNCTION constexpr static auto get_smem_ws_size_in_bytes(uint32_t dim) -> uint32_t - { - return base_type::kMaxStructSize + - raft::round_up_safe(dim, DatasetBlockDim) * sizeof(QUERY_T); - } -}; + return const_cast(reinterpret_cast(smem_ptr)); +} template _RAFT_DEVICE __noinline__ auto compute_distance_standard( - typename DescriptorT::ws_handle smem_workspace, + const typename DescriptorT::base_type* desc_, typename DescriptorT::INDEX_T dataset_index, cuvs::distance::DistanceType metric, bool valid) -> typename DescriptorT::DISTANCE_T @@ -110,35 +126,14 @@ _RAFT_DEVICE __noinline__ auto compute_distance_standard( using INDEX_T = typename DescriptorT::INDEX_T; using LOAD_T = typename DescriptorT::LOAD_T; using QUERY_T = typename DescriptorT::QUERY_T; - using ws_handle = typename DescriptorT::ws_handle; constexpr auto kTeamSize = DescriptorT::kTeamSize; constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim; - constexpr auto kMaxStructSize = DescriptorT::base_type::kMaxStructSize; - auto* __restrict__ desc = - const_cast(reinterpret_cast(smem_workspace)); - auto* __restrict__ query_ptr = reinterpret_cast( - reinterpret_cast(smem_workspace) + kMaxStructSize); - const auto dataset_ptr = desc->ptr + 
(static_cast(desc->ld) * dataset_index); - const unsigned lane_id = threadIdx.x % kTeamSize; - auto dim = desc->dim; - // if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 0) { - // printf( - // "computing distance\n desc = %p, query = %p, dataset_ptr = %p\n ptr = %p, dim = %u, ld = - // " - // "%u\n", - // desc, - // query_ptr, - // dataset_ptr, - // desc->ptr, - // desc->dim, - // desc->ld); - // printf(" kTeamSize = %u, kDatasetBlockDim = %u, kMaxStructSize = %u\n", - // kTeamSize, - // kDatasetBlockDim, - // uint32_t(kMaxStructSize)); - // } - // return 0; + auto* __restrict__ desc = reinterpret_cast(desc_); + auto* __restrict__ query_ptr = reinterpret_cast(desc + 1); + const auto dataset_ptr = desc->ptr + (static_cast(desc->ld) * dataset_index); + const unsigned lane_id = threadIdx.x % kTeamSize; + auto dim = desc->dim; DISTANCE_T norm2 = 0; if (valid) { @@ -185,9 +180,6 @@ _RAFT_DEVICE __noinline__ auto compute_distance_standard( return norm2; } -// template -// __device__ typename DescriptorT::compute_distance_type* compute_distance_standard_ptr; - template ; - new (out) desc_type(&compute_distance_standard, ptr, size, dim, ld); - // printf("compute-distance: %p, dataset: %p\n", - // out->compute_distance, - // reinterpret_cast(out)->ptr); + new (out) desc_type(&setup_workspace_standard, + &compute_distance_standard, + ptr, + size, + dim, + ld); } template rmm::cuda_stream_view stream) -> host_type { descriptor_type dd_host{nullptr, + nullptr, dataset.view().data_handle(), IndexT(dataset.n_rows()), dataset.dim(), diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu index e8c0df121..a0bfefff7 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu @@ -28,12 +28,6 @@ namespace 
cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<16, 128, float, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_standard>( - standard_dataset_descriptor_t<16, 128, float, uint32_t, float>::ws_handle, - standard_dataset_descriptor_t<16, 128, float, uint32_t, float>::INDEX_T, - cuvs::distance::DistanceType, - bool valid) -> standard_dataset_descriptor_t<16, 128, float, uint32_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<16, 128, float, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu index cc1ee30e4..86ff6720b 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu @@ -28,12 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<32, 256, float, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_standard>( - standard_dataset_descriptor_t<32, 256, float, uint32_t, float>::ws_handle, - standard_dataset_descriptor_t<32, 256, float, uint32_t, float>::INDEX_T, - cuvs::distance::DistanceType, - bool valid) -> standard_dataset_descriptor_t<32, 256, float, uint32_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<32, 256, float, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu index 5f78cb106..79eadbb9e 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu @@ -28,12 +28,6 @@ namespace cuvs::neighbors::cagra::detail 
{ template struct standard_dataset_descriptor_t<8, 64, float, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_standard>( - standard_dataset_descriptor_t<8, 64, float, uint32_t, float>::ws_handle, - standard_dataset_descriptor_t<8, 64, float, uint32_t, float>::INDEX_T, - cuvs::distance::DistanceType, - bool valid) -> standard_dataset_descriptor_t<8, 64, float, uint32_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<8, 64, float, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t16.cu index ba23e1020..96bba4bbf 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t16.cu @@ -28,12 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<16, 128, float, uint64_t, float>; -template _RAFT_DEVICE auto -compute_distance_standard>( - standard_dataset_descriptor_t<16, 128, float, uint64_t, float>::ws_handle, - standard_dataset_descriptor_t<16, 128, float, uint64_t, float>::INDEX_T, - cuvs::distance::DistanceType, - bool valid) -> standard_dataset_descriptor_t<16, 128, float, uint64_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<16, 128, float, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t32.cu index e1b4dcc00..f8ca508af 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t32.cu @@ -28,12 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct 
standard_dataset_descriptor_t<32, 256, float, uint64_t, float>; -template _RAFT_DEVICE auto -compute_distance_standard>( - standard_dataset_descriptor_t<32, 256, float, uint64_t, float>::ws_handle, - standard_dataset_descriptor_t<32, 256, float, uint64_t, float>::INDEX_T, - cuvs::distance::DistanceType, - bool valid) -> standard_dataset_descriptor_t<32, 256, float, uint64_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<32, 256, float, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim64_t8.cu index 10eb706de..4c47d2fe3 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim64_t8.cu @@ -28,12 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<8, 64, float, uint64_t, float>; -template _RAFT_DEVICE auto -compute_distance_standard>( - standard_dataset_descriptor_t<8, 64, float, uint64_t, float>::ws_handle, - standard_dataset_descriptor_t<8, 64, float, uint64_t, float>::INDEX_T, - cuvs::distance::DistanceType, - bool valid) -> standard_dataset_descriptor_t<8, 64, float, uint64_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<8, 64, float, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t16.cu index ea70b24b6..12afca22f 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t16.cu @@ -28,12 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<16, 128, half, 
uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_standard>( - standard_dataset_descriptor_t<16, 128, half, uint32_t, float>::ws_handle, - standard_dataset_descriptor_t<16, 128, half, uint32_t, float>::INDEX_T, - cuvs::distance::DistanceType, - bool valid) -> standard_dataset_descriptor_t<16, 128, half, uint32_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<16, 128, half, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t32.cu index 51702ebcd..50b631809 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t32.cu @@ -28,12 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<32, 256, half, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_standard>( - standard_dataset_descriptor_t<32, 256, half, uint32_t, float>::ws_handle, - standard_dataset_descriptor_t<32, 256, half, uint32_t, float>::INDEX_T, - cuvs::distance::DistanceType, - bool valid) -> standard_dataset_descriptor_t<32, 256, half, uint32_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<32, 256, half, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu index 0a24dd82b..f55dc6b69 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu @@ -28,12 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<8, 64, half, uint32_t, float>; -template _RAFT_DEVICE auto 
-compute_distance_standard>( - standard_dataset_descriptor_t<8, 64, half, uint32_t, float>::ws_handle, - standard_dataset_descriptor_t<8, 64, half, uint32_t, float>::INDEX_T, - cuvs::distance::DistanceType, - bool valid) -> standard_dataset_descriptor_t<8, 64, half, uint32_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<8, 64, half, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t16.cu index f6dabf3c3..62a9d0128 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t16.cu @@ -28,12 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<16, 128, half, uint64_t, float>; -template _RAFT_DEVICE auto -compute_distance_standard>( - standard_dataset_descriptor_t<16, 128, half, uint64_t, float>::ws_handle, - standard_dataset_descriptor_t<16, 128, half, uint64_t, float>::INDEX_T, - cuvs::distance::DistanceType, - bool valid) -> standard_dataset_descriptor_t<16, 128, half, uint64_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<16, 128, half, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t32.cu index d43f18861..f05e92c19 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t32.cu @@ -28,12 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<32, 256, half, uint64_t, float>; -template _RAFT_DEVICE auto -compute_distance_standard>( - 
standard_dataset_descriptor_t<32, 256, half, uint64_t, float>::ws_handle, - standard_dataset_descriptor_t<32, 256, half, uint64_t, float>::INDEX_T, - cuvs::distance::DistanceType, - bool valid) -> standard_dataset_descriptor_t<32, 256, half, uint64_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<32, 256, half, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu index ccb851c21..fbb9f1d05 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu @@ -28,12 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<8, 64, half, uint64_t, float>; -template _RAFT_DEVICE auto -compute_distance_standard>( - standard_dataset_descriptor_t<8, 64, half, uint64_t, float>::ws_handle, - standard_dataset_descriptor_t<8, 64, half, uint64_t, float>::INDEX_T, - cuvs::distance::DistanceType, - bool valid) -> standard_dataset_descriptor_t<8, 64, half, uint64_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<8, 64, half, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t16.cu index 857d54beb..f1fcf21c1 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t16.cu @@ -28,12 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<16, 128, int8_t, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_standard>( - standard_dataset_descriptor_t<16, 128, int8_t, uint32_t, 
float>::ws_handle, - standard_dataset_descriptor_t<16, 128, int8_t, uint32_t, float>::INDEX_T, - cuvs::distance::DistanceType, - bool valid) -> standard_dataset_descriptor_t<16, 128, int8_t, uint32_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<16, 128, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t32.cu index c6315a802..9769be4b7 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t32.cu @@ -28,12 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<32, 256, int8_t, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_standard>( - standard_dataset_descriptor_t<32, 256, int8_t, uint32_t, float>::ws_handle, - standard_dataset_descriptor_t<32, 256, int8_t, uint32_t, float>::INDEX_T, - cuvs::distance::DistanceType, - bool valid) -> standard_dataset_descriptor_t<32, 256, int8_t, uint32_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<32, 256, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim64_t8.cu index cc0ea5799..20b8a1cb6 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim64_t8.cu @@ -28,12 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<8, 64, int8_t, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_standard>( - standard_dataset_descriptor_t<8, 64, int8_t, uint32_t, float>::ws_handle, - 
standard_dataset_descriptor_t<8, 64, int8_t, uint32_t, float>::INDEX_T, - cuvs::distance::DistanceType, - bool valid) -> standard_dataset_descriptor_t<8, 64, int8_t, uint32_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<8, 64, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t16.cu index 0e67f9390..8a560702d 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t16.cu @@ -28,12 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<16, 128, uint8_t, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_standard>( - standard_dataset_descriptor_t<16, 128, uint8_t, uint32_t, float>::ws_handle, - standard_dataset_descriptor_t<16, 128, uint8_t, uint32_t, float>::INDEX_T, - cuvs::distance::DistanceType, - bool valid) -> standard_dataset_descriptor_t<16, 128, uint8_t, uint32_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<16, 128, uint8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t32.cu index 50b89cf5a..a43020750 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t32.cu @@ -28,12 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<32, 256, uint8_t, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_standard>( - standard_dataset_descriptor_t<32, 256, uint8_t, uint32_t, float>::ws_handle, - 
standard_dataset_descriptor_t<32, 256, uint8_t, uint32_t, float>::INDEX_T, - cuvs::distance::DistanceType, - bool valid) -> standard_dataset_descriptor_t<32, 256, uint8_t, uint32_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<32, 256, uint8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim64_t8.cu index 100defe35..ccdd1db3d 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim64_t8.cu @@ -28,12 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct standard_dataset_descriptor_t<8, 64, uint8_t, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_standard>( - standard_dataset_descriptor_t<8, 64, uint8_t, uint32_t, float>::ws_handle, - standard_dataset_descriptor_t<8, 64, uint8_t, uint32_t, float>::INDEX_T, - cuvs::distance::DistanceType, - bool valid) -> standard_dataset_descriptor_t<8, 64, uint8_t, uint32_t, float>::DISTANCE_T; template <> const void* standard_descriptor_spec<8, 64, uint8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh index d6a459d52..896814ff6 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh @@ -31,7 +31,8 @@ template -struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t { +struct alignas(device::LOAD_128BIT_T) cagra_q_dataset_descriptor_t + : public dataset_descriptor_base_t { using base_type = dataset_descriptor_base_t; using CODE_BOOK_T = CodeBookT; using LOAD_T = device::LOAD_128BIT_T; @@ -42,7 +43,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t(); - 
_RAFT_HOST_DEVICE cagra_q_dataset_descriptor_t(compute_distance_type* compute_distance, + _RAFT_HOST_DEVICE cagra_q_dataset_descriptor_t(setup_workspace_type* setup_workspace_impl, + compute_distance_type* compute_distance_impl, const std::uint8_t* encoded_dataset_ptr, std::uint32_t encoded_dataset_dim, std::uint32_t n_subspace, @@ -67,34 +69,72 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t(); } - _RAFT_DEVICE [[nodiscard]] auto set_smem_ws(void* smem_ptr) const -> ws_handle + private: + RAFT_INLINE_FUNCTION constexpr static auto get_smem_ws_size_in_bytes(uint32_t dim) -> uint32_t { - using word_type = uint32_t; - constexpr auto kStructWords = base_type::kMaxStructSize / sizeof(word_type); + /* SMEM workspace layout: + 1. The descriptor itself + 2. Codebook (kSMemCodeBookSizeInBytes bytes) + 3. Queries (smem_query_buffer_length elems) + */ + return sizeof(cagra_q_dataset_descriptor_t) + kSMemCodeBookSizeInBytes + + raft::round_up_safe(dim, DatasetBlockDim) * sizeof(QUERY_T); + } +}; + +template +_RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const typename DescriptorT::base_type* that_, + void* smem_ptr, + const typename DescriptorT::DATA_T* queries_ptr, + uint32_t query_id) -> const + typename DescriptorT::base_type* +{ + using descriptor_type = DescriptorT; + using base_type = typename DescriptorT::base_type; + using DATA_T = typename DescriptorT::DATA_T; + using DISTANCE_T = typename DescriptorT::DISTANCE_T; + using INDEX_T = typename DescriptorT::INDEX_T; + using LOAD_T = typename DescriptorT::LOAD_T; + using QUERY_T = typename DescriptorT::QUERY_T; + using CODE_BOOK_T = typename DescriptorT::CODE_BOOK_T; + constexpr auto TeamSize = DescriptorT::kTeamSize; + constexpr auto DatasetBlockDim = DescriptorT::kDatasetBlockDim; + constexpr auto PQ_BITS = DescriptorT::kPqBits; + constexpr auto PQ_LEN = DescriptorT::kPqLen; + using word_type = uint32_t; + + auto* that = reinterpret_cast(that_); + + if (((void*)that) != smem_ptr) { + 
constexpr auto kStructWords = sizeof(DescriptorT) / sizeof(word_type); auto* dst = reinterpret_cast(smem_ptr); - auto* src = reinterpret_cast(this); + auto* src = reinterpret_cast(that); for (unsigned i = threadIdx.x; i < kStructWords; i += blockDim.x) { dst[i] = src[i]; } auto codebook_buf = - reinterpret_cast(reinterpret_cast(smem_ptr) + base_type::kMaxStructSize); + reinterpret_cast(reinterpret_cast(smem_ptr) + sizeof(DescriptorT)); // Copy PQ table for (unsigned i = threadIdx.x * 2; i < (1 << PQ_BITS) * PQ_LEN; i += blockDim.x * 2) { half2 buf2; - buf2.x = pq_code_book_ptr[i]; - buf2.y = pq_code_book_ptr[i + 1]; + buf2.x = that->pq_code_book_ptr[i]; + buf2.y = that->pq_code_book_ptr[i + 1]; // Change the order of PQ code book array to reduce the // frequency of bank conflicts. @@ -105,57 +145,34 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t(smem_ptr); - } - - _RAFT_DEVICE void copy_query(ws_handle smem_workspace, const DATA_T* query_ptr) const - { - constexpr cuvs::spatial::knn::detail::utils::mapping mapping{}; - auto smem_query_ptr = smem_query_buffer(smem_workspace); - for (unsigned i = threadIdx.x * 2; i < dim; i += blockDim.x * 2) { - half2 buf2{0, 0}; - if (i < dim) { buf2.x = mapping(query_ptr[i]); } - if (i + 1 < dim) { buf2.y = mapping(query_ptr[i + 1]); } - if constexpr ((PQ_BITS == 8) && (PQ_LEN % 2 == 0)) { - // Use swizzling in the condition to reduce bank conflicts in shared - // memory, which are likely to occur when pq_code_book_dim is large. 
- ((half2*)smem_query_ptr)[device::swizzling(i / 2)] = - buf2; - } else { - (reinterpret_cast(smem_query_ptr + i))[0] = buf2; - } - } } - private: - RAFT_DEVICE_INLINE_FUNCTION constexpr auto smem_pq_code_book_ptr(ws_handle smem_workspace) const - -> CODE_BOOK_T* - { - return reinterpret_cast(reinterpret_cast(smem_workspace) + - base_type::kMaxStructSize); - } + uint32_t dim = that->dim; + queries_ptr += dim * query_id; - RAFT_DEVICE_INLINE_FUNCTION constexpr auto smem_query_buffer(ws_handle smem_workspace) const - -> QUERY_T* - { - return reinterpret_cast(reinterpret_cast(smem_workspace) + - base_type::kMaxStructSize + kSMemCodeBookSizeInBytes); + constexpr cuvs::spatial::knn::detail::utils::mapping mapping{}; + auto smem_query_ptr = + reinterpret_cast(reinterpret_cast(smem_ptr) + sizeof(DescriptorT) + + DescriptorT::kSMemCodeBookSizeInBytes); + for (unsigned i = threadIdx.x * 2; i < dim; i += blockDim.x * 2) { + half2 buf2{0, 0}; + if (i < dim) { buf2.x = mapping(queries_ptr[i]); } + if (i + 1 < dim) { buf2.y = mapping(queries_ptr[i + 1]); } + if constexpr ((PQ_BITS == 8) && (PQ_LEN % 2 == 0)) { + // Use swizzling in the condition to reduce bank conflicts in shared + // memory, which are likely to occur when pq_code_book_dim is large. + ((half2*)smem_query_ptr)[device::swizzling(i / 2)] = buf2; + } else { + (reinterpret_cast(smem_query_ptr + i))[0] = buf2; + } } - RAFT_INLINE_FUNCTION constexpr static auto get_smem_ws_size_in_bytes(uint32_t dim) -> uint32_t - { - /* SMEM workspace layout: - 1. Codebook (kSMemCodeBookSizeInBytes bytes) - 2. 
Queries (smem_query_buffer_length elems) - */ - return base_type::kMaxStructSize + kSMemCodeBookSizeInBytes + - raft::round_up_safe(dim, DatasetBlockDim) * sizeof(QUERY_T); - } -}; + return const_cast(reinterpret_cast(smem_ptr)); +} template _RAFT_DEVICE __noinline__ auto compute_distance_vpq( - typename DescriptorT::ws_handle smem_workspace, + const typename DescriptorT::base_type* desc_, typename DescriptorT::INDEX_T dataset_index, cuvs::distance::DistanceType /* only L2 metric is implemented */, bool valid) -> typename DescriptorT::DISTANCE_T @@ -166,18 +183,14 @@ _RAFT_DEVICE __noinline__ auto compute_distance_vpq( using LOAD_T = typename DescriptorT::LOAD_T; using QUERY_T = typename DescriptorT::QUERY_T; using CODE_BOOK_T = typename DescriptorT::CODE_BOOK_T; - using ws_handle = typename DescriptorT::ws_handle; constexpr auto TeamSize = DescriptorT::kTeamSize; constexpr auto DatasetBlockDim = DescriptorT::kDatasetBlockDim; constexpr auto PQ_BITS = DescriptorT::kPqBits; constexpr auto PQ_LEN = DescriptorT::kPqLen; - constexpr auto kMaxStructSize = DescriptorT::base_type::kMaxStructSize; - auto* __restrict__ desc = - const_cast(reinterpret_cast(smem_workspace)); - auto* __restrict__ codebook_ptr = reinterpret_cast( - reinterpret_cast(smem_workspace) + kMaxStructSize); - auto* __restrict__ query_ptr = reinterpret_cast( + auto* __restrict__ desc = reinterpret_cast(desc_); + auto* __restrict__ codebook_ptr = reinterpret_cast(desc + 1); + auto* __restrict__ query_ptr = reinterpret_cast( reinterpret_cast(codebook_ptr) + DescriptorT::kSMemCodeBookSizeInBytes); auto* __restrict__ node_ptr = desc->encoded_dataset_ptr + @@ -315,7 +328,8 @@ __launch_bounds__(1, 1) __global__ DataT, IndexT, DistanceT>; - new (out) desc_type(&compute_distance_vpq, + new (out) desc_type(&setup_workspace_vpq, + &compute_distance_vpq, encoded_dataset_ptr, encoded_dataset_dim, n_subspace, @@ -370,6 +384,7 @@ struct vpq_descriptor_spec : public instance_spec { rmm::cuda_stream_view stream) -> 
host_type { descriptor_type dd_host{nullptr, + nullptr, dataset.data.data_handle(), dataset.encoded_row_length(), dataset.pq_dim(), diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu index 2b446bfe7..c1ed84237 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu @@ -28,14 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint32_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint32_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<16, 128, 8, 2, half, float, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu index b9a4d4e24..bbf00dbc6 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu @@ -28,14 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint32_t, float>::ws_handle - smem_workspace, - 
cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint32_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<16, 128, 8, 4, half, float, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu index ecfdd0ba1..8d8e362ba 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu @@ -28,14 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint32_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint32_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<32, 256, 8, 2, half, float, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu index 0ca0c469f..e15768763 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu @@ -28,14 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct 
cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint32_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint32_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<32, 256, 8, 4, half, float, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu index 31fccc42d..68e2778b3 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu @@ -28,13 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint32_t, float>::ws_handle smem_workspace, - cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint32_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<8, 64, 8, 2, half, float, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu index ca777ddc1..62789ef33 100644 --- 
a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu @@ -28,13 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint32_t, float>::ws_handle smem_workspace, - cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint32_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<8, 64, 8, 4, half, float, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu index a266c5604..4ff1ca7d4 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu @@ -28,14 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint64_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint64_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint64_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint64_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<16, 128, 8, 2, half, float, uint64_t, float>::init_kernel = reinterpret_cast( diff --git 
a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu index 52fa99f35..a7c1b2cc7 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu @@ -28,14 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint64_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint64_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint64_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint64_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<16, 128, 8, 4, half, float, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu index 1feb9ec56..cb5d0c592 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu @@ -28,14 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint64_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint64_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint64_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool 
valid) - -> cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint64_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<32, 256, 8, 2, half, float, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu index 510979e4f..a48603b23 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu @@ -28,14 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint64_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint64_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint64_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint64_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<32, 256, 8, 4, half, float, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu index 1de8799b2..02c88fc00 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu @@ -28,13 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint64_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - 
cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint64_t, float>::ws_handle smem_workspace, - cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint64_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint64_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<8, 64, 8, 2, half, float, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu index 77a706d79..8ca8b6de7 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu @@ -28,13 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint64_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint64_t, float>::ws_handle smem_workspace, - cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint64_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint64_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<8, 64, 8, 4, half, float, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu index f8f2dab7b..6c3b8643e 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu @@ -28,14 
+28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint32_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint32_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<16, 128, 8, 2, half, half, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu index 16ab30586..57b50e7b0 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu @@ -28,14 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint32_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint32_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<16, 128, 8, 4, half, half, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu index a84669242..a9ed297c3 
100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu @@ -28,14 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint32_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint32_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<32, 256, 8, 2, half, half, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu index a5370f9d5..12696685a 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu @@ -28,14 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint32_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint32_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<32, 256, 8, 4, half, half, uint32_t, float>::init_kernel = reinterpret_cast( diff --git 
a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu index c7a20cde5..9bee50622 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu @@ -28,12 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint32_t, float>::ws_handle smem_workspace, - cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint32_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) -> cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<8, 64, 8, 2, half, half, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu index 33122652c..3a159e041 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu @@ -28,12 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint32_t, float>::ws_handle smem_workspace, - cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint32_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) -> cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, 
half, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<8, 64, 8, 4, half, half, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu index 9512811eb..9b269593a 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu @@ -28,14 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint64_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint64_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint64_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint64_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<16, 128, 8, 2, half, half, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu index 1d5adc203..464d0b6a2 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu @@ -28,14 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint64_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint64_t, float>::ws_handle - 
smem_workspace, - cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint64_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint64_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<16, 128, 8, 4, half, half, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu index e5879fa94..4012291e0 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu @@ -28,14 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint64_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint64_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint64_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint64_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<32, 256, 8, 2, half, half, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu index 238f5bcd3..2339a7174 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu @@ -28,14 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct 
cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint64_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint64_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint64_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint64_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<32, 256, 8, 4, half, half, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu index bf909c76e..b3ca6c6eb 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu @@ -28,12 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint64_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint64_t, float>::ws_handle smem_workspace, - cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint64_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) -> cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint64_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<8, 64, 8, 2, half, half, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu index b096396db..a1224a1e7 100644 --- 
a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu @@ -28,12 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint64_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint64_t, float>::ws_handle smem_workspace, - cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint64_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) -> cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint64_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<8, 64, 8, 4, half, half, uint64_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu index feabbe9a2..bcc8bb81e 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu @@ -28,14 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, int8_t, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, int8_t, uint32_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, int8_t, uint32_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, int8_t, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<16, 128, 8, 2, half, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git 
a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu index eb11f489b..1c11d398d 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu @@ -28,14 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, int8_t, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, int8_t, uint32_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, int8_t, uint32_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, int8_t, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<16, 128, 8, 4, half, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu index 5b91653c6..7bf78c8d0 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu @@ -28,14 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, int8_t, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, int8_t, uint32_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, int8_t, uint32_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool 
valid) - -> cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, int8_t, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<32, 256, 8, 2, half, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu index 46bf1111a..1934cb347 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu @@ -28,14 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, int8_t, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, int8_t, uint32_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, int8_t, uint32_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, int8_t, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<32, 256, 8, 4, half, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu index 2db7e9f71..d5b235063 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu @@ -28,14 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, int8_t, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - 
cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, int8_t, uint32_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, int8_t, uint32_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, int8_t, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<8, 64, 8, 2, half, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu index a9652cfe0..7707c1d31 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu @@ -28,14 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, int8_t, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, int8_t, uint32_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, int8_t, uint32_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, int8_t, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<8, 64, 8, 4, half, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu index 9376ce951..17cab7eee 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu 
@@ -28,15 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, uint8_t, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, uint8_t, uint32_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, uint8_t, uint32_t, float>::INDEX_T - dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, uint8_t, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<16, 128, 8, 2, half, uint8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu index 5930da566..9a60a2afe 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu @@ -28,15 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, uint8_t, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, uint8_t, uint32_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, uint8_t, uint32_t, float>::INDEX_T - dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, uint8_t, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<16, 128, 8, 4, half, uint8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu index 99f953908..a402455b8 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu @@ -28,15 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, uint8_t, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, uint8_t, uint32_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, uint8_t, uint32_t, float>::INDEX_T - dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, uint8_t, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<32, 256, 8, 2, half, uint8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu index 2091fe0aa..5c33d098c 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu @@ -28,15 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, uint8_t, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, uint8_t, uint32_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, uint8_t, uint32_t, float>::INDEX_T - dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, uint8_t, 
uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<32, 256, 8, 4, half, uint8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu index 7a976adfa..77364b65d 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu @@ -28,14 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, uint8_t, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, uint8_t, uint32_t, float>::ws_handle - smem_workspace, - cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, uint8_t, uint32_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, uint8_t, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<8, 64, 8, 2, half, uint8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu index 745d9916f..455617b72 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu @@ -28,14 +28,6 @@ namespace cuvs::neighbors::cagra::detail { template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, uint8_t, uint32_t, float>; -template _RAFT_DEVICE auto -compute_distance_vpq>( - cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, uint8_t, uint32_t, float>::ws_handle - smem_workspace, 
- cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, uint8_t, uint32_t, float>::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) - -> cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, uint8_t, uint32_t, float>::DISTANCE_T; template <> const void* vpq_descriptor_spec<8, 64, 8, 4, half, uint8_t, uint32_t, float>::init_kernel = reinterpret_cast( diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp index 1b213bc6d..88c2b2546 100644 --- a/cpp/src/neighbors/detail/cagra/device_common.hpp +++ b/cpp/src/neighbors/detail/cagra/device_common.hpp @@ -75,7 +75,6 @@ template RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes( IndexT* result_child_indices_ptr, DistanceT* result_child_distances_ptr, - // query - typename DATASET_DESCRIPTOR_T::ws_handle workspace, // [dataset_dim, dataset_size] const DATASET_DESCRIPTOR_T& dataset_desc, // [knn_k, dataset_size] @@ -185,8 +182,7 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes( IndexT child_id = invalid_index; if (valid_i) { child_id = result_child_indices_ptr[i]; } - auto norm2 = - dataset_desc.compute_distance(workspace, child_id, metric, child_id != invalid_index); + auto norm2 = dataset_desc.compute_distance(child_id, metric, child_id != invalid_index); // Store the distance const unsigned lane_id = threadIdx.x % team_size; diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh index 87b1c7904..56dd0d8f6 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh @@ -195,7 +195,8 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( assert(result_buffer_size_32 <= MAX_ELEMENTS); // Set smem working buffer for the distance calculation - auto distance_workspace = dataset_desc->set_smem_ws(smem); + dataset_desc = dataset_desc->setup_workspace(smem, 
queries_ptr, query_id); + __syncthreads(); auto result_indices_buffer = reinterpret_cast(smem + dataset_desc->smem_ws_size_in_bytes); @@ -212,8 +213,6 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( result_distances_buffer[i] = utils::get_max_value(); } #endif - const DATA_T* const query_ptr = queries_ptr + query_id * dataset_desc->dim; - dataset_desc->copy_query(distance_workspace, query_ptr); if (threadIdx.x == 0) { terminate_flag[0] = 0; } INDEX_T* const local_visited_hashmap_ptr = @@ -229,7 +228,6 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( device::compute_distance_to_random_nodes(result_indices_buffer, result_distances_buffer, - distance_workspace, *dataset_desc, result_buffer_size, num_distilation, @@ -272,7 +270,6 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( _CLK_START(); device::compute_distance_to_child_nodes(result_indices_buffer + itopk_size, result_distances_buffer + itopk_size, - distance_workspace, *dataset_desc, knn_graph, graph_degree, diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh index 0247bea11..7aa7cd9d3 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh @@ -118,10 +118,7 @@ RAFT_KERNEL random_pickup_kernel( const uint32_t query_id = blockIdx.y; if (global_team_index >= num_pickup) { return; } extern __shared__ uint8_t smem[]; - auto distance_workspace = dataset_desc->set_smem_ws(smem); - // Load a query - dataset_desc->copy_query(distance_workspace, queries_ptr + query_id * dataset_desc->dim); - + dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id); __syncthreads(); INDEX_T best_index_team_local; @@ -136,7 +133,7 @@ RAFT_KERNEL random_pickup_kernel( device::xorshift64((global_team_index ^ rand_xor_mask) * (i + 1)) % dataset_desc->size; } - DISTANCE_T norm2 = dataset_desc->compute_distance(distance_workspace, seed_index, metric, true); + 
DISTANCE_T norm2 = dataset_desc->compute_distance(seed_index, metric, true); if (norm2 < best_norm2_team_local) { best_norm2_team_local = norm2; best_index_team_local = seed_index; @@ -330,9 +327,8 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel( const auto query_id = blockIdx.y; extern __shared__ uint8_t smem[]; - auto distance_workspace = dataset_desc->set_smem_ws(smem); // Load a query - dataset_desc->copy_query(distance_workspace, query_ptr + query_id * dataset_desc->dim); + dataset_desc = dataset_desc->setup_workspace(smem, query_ptr, query_id); __syncthreads(); if (global_team_id >= search_width * graph_degree) { return; } @@ -358,8 +354,7 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel( const auto compute_distance_flag = hashmap::insert( team_size, visited_hashmap_ptr + (ldb * blockIdx.y), hash_bitlen, child_id); - DISTANCE_T norm2 = - dataset_desc->compute_distance(distance_workspace, child_id, metric, compute_distance_flag); + DISTANCE_T norm2 = dataset_desc->compute_distance(child_id, metric, compute_distance_flag); if (compute_distance_flag) { if (threadIdx.x % team_size == 0) { diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 1149eb2f4..f3b47b846 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -524,7 +524,8 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( const auto small_hash_size = hashmap::get_size(small_hash_bitlen); // Set smem working buffer for the distance calculation - auto distance_workspace = dataset_desc->set_smem_ws(smem); + dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id); + __syncthreads(); auto result_indices_buffer = reinterpret_cast(smem + dataset_desc->smem_ws_size_in_bytes); @@ -540,9 +541,6 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( // A flag for filtering. 
auto filter_flag = terminate_flag; - const DATA_T* const query_ptr = queries_ptr + query_id * dataset_desc->dim; - dataset_desc->copy_query(distance_workspace, query_ptr); - if (threadIdx.x == 0) { terminate_flag[0] = 0; topk_ws[0] = ~0u; @@ -564,7 +562,6 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( const INDEX_T* const local_seed_ptr = seed_ptr ? seed_ptr + (num_seeds * query_id) : nullptr; device::compute_distance_to_random_nodes(result_indices_buffer, result_distances_buffer, - distance_workspace, *dataset_desc, result_buffer_size, num_distilation, @@ -696,7 +693,6 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( _CLK_START(); device::compute_distance_to_child_nodes(result_indices_buffer + internal_topk, result_distances_buffer + internal_topk, - distance_workspace, *dataset_desc, knn_graph, graph_degree, From 78a980945350aed3089bab7a874ad5d2ab1c8146 Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 23 Aug 2024 15:30:26 +0200 Subject: [PATCH 13/41] Change instance generator to have blockdim/team_size ratio 16 --- cpp/CMakeLists.txt | 108 ++--- .../detail/cagra/compute_distance-ext.cuh | 432 +++++++++--------- .../detail/cagra/compute_distance.cu | 108 ++--- .../cagra/compute_distance_00_generate.py | 4 +- ...stance_standard_float_uint32_dim128_t8.cu} | 8 +- ...tance_standard_float_uint32_dim256_t16.cu} | 8 +- ...tance_standard_float_uint32_dim512_t32.cu} | 8 +- ...stance_standard_float_uint64_dim128_t8.cu} | 8 +- ...tance_standard_float_uint64_dim256_t16.cu} | 8 +- ...tance_standard_float_uint64_dim512_t32.cu} | 8 +- ...istance_standard_half_uint32_dim128_t8.cu} | 8 +- ...stance_standard_half_uint32_dim256_t16.cu} | 8 +- ...stance_standard_half_uint32_dim512_t32.cu} | 8 +- ..._distance_standard_half_uint32_dim64_t8.cu | 37 -- ...istance_standard_half_uint64_dim128_t8.cu} | 8 +- ...stance_standard_half_uint64_dim256_t16.cu} | 8 +- ...istance_standard_half_uint64_dim512_t32.cu | 37 ++ ..._distance_standard_half_uint64_dim64_t8.cu | 37 -- 
...istance_standard_int8_uint32_dim128_t8.cu} | 8 +- ...stance_standard_int8_uint32_dim256_t16.cu} | 8 +- ...stance_standard_int8_uint32_dim512_t32.cu} | 8 +- ...istance_standard_uint8_uint32_dim128_t8.cu | 37 ++ ...tance_standard_uint8_uint32_dim256_t16.cu} | 8 +- ...tance_standard_uint8_uint32_dim512_t32.cu} | 8 +- ..._float_uint32_dim128_t8_8pq_2subd_half.cu} | 8 +- ..._float_uint32_dim128_t8_8pq_4subd_half.cu} | 8 +- ...float_uint32_dim256_t16_8pq_2subd_half.cu} | 8 +- ...float_uint32_dim256_t16_8pq_4subd_half.cu} | 8 +- ...float_uint32_dim512_t32_8pq_2subd_half.cu} | 8 +- ...float_uint32_dim512_t32_8pq_4subd_half.cu} | 8 +- ...q_float_uint64_dim128_t8_8pq_2subd_half.cu | 37 ++ ...q_float_uint64_dim128_t8_8pq_4subd_half.cu | 37 ++ ...float_uint64_dim256_t16_8pq_2subd_half.cu} | 8 +- ...float_uint64_dim256_t16_8pq_4subd_half.cu} | 8 +- ...float_uint64_dim512_t32_8pq_2subd_half.cu} | 8 +- ...float_uint64_dim512_t32_8pq_4subd_half.cu} | 8 +- ...q_half_uint32_dim128_t8_8pq_2subd_half.cu} | 8 +- ...q_half_uint32_dim128_t8_8pq_4subd_half.cu} | 8 +- ..._half_uint32_dim256_t16_8pq_2subd_half.cu} | 8 +- ..._half_uint32_dim256_t16_8pq_4subd_half.cu} | 8 +- ..._half_uint32_dim512_t32_8pq_2subd_half.cu} | 8 +- ..._half_uint32_dim512_t32_8pq_4subd_half.cu} | 8 +- ...vpq_half_uint32_dim64_t8_8pq_2subd_half.cu | 37 -- ...vpq_half_uint32_dim64_t8_8pq_4subd_half.cu | 37 -- ...q_half_uint64_dim128_t8_8pq_2subd_half.cu} | 8 +- ...q_half_uint64_dim128_t8_8pq_4subd_half.cu} | 8 +- ..._half_uint64_dim256_t16_8pq_2subd_half.cu} | 8 +- ..._half_uint64_dim256_t16_8pq_4subd_half.cu} | 8 +- ..._half_uint64_dim512_t32_8pq_2subd_half.cu} | 8 +- ..._half_uint64_dim512_t32_8pq_4subd_half.cu} | 8 +- ...vpq_half_uint64_dim64_t8_8pq_2subd_half.cu | 37 -- ...vpq_half_uint64_dim64_t8_8pq_4subd_half.cu | 37 -- ...q_int8_uint32_dim128_t8_8pq_2subd_half.cu} | 8 +- ...q_int8_uint32_dim128_t8_8pq_4subd_half.cu} | 8 +- ..._int8_uint32_dim256_t16_8pq_2subd_half.cu} | 8 +- 
..._int8_uint32_dim256_t16_8pq_4subd_half.cu} | 8 +- ..._int8_uint32_dim512_t32_8pq_2subd_half.cu} | 8 +- ..._int8_uint32_dim512_t32_8pq_4subd_half.cu} | 8 +- ...q_uint8_uint32_dim128_t8_8pq_2subd_half.cu | 37 ++ ...q_uint8_uint32_dim128_t8_8pq_4subd_half.cu | 37 ++ ...uint8_uint32_dim256_t16_8pq_2subd_half.cu} | 8 +- ...uint8_uint32_dim256_t16_8pq_4subd_half.cu} | 8 +- ...uint8_uint32_dim512_t32_8pq_2subd_half.cu} | 8 +- ...uint8_uint32_dim512_t32_8pq_4subd_half.cu} | 8 +- 64 files changed, 740 insertions(+), 740 deletions(-) rename cpp/src/neighbors/detail/cagra/{compute_distance_standard_int8_uint32_dim64_t8.cu => compute_distance_standard_float_uint32_dim128_t8.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_standard_float_uint32_dim128_t16.cu => compute_distance_standard_float_uint32_dim256_t16.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_standard_float_uint32_dim256_t32.cu => compute_distance_standard_float_uint32_dim512_t32.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_standard_half_uint64_dim128_t16.cu => compute_distance_standard_float_uint64_dim128_t8.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_standard_float_uint64_dim256_t32.cu => compute_distance_standard_float_uint64_dim256_t16.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_standard_float_uint64_dim128_t16.cu => compute_distance_standard_float_uint64_dim512_t32.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_standard_float_uint32_dim64_t8.cu => compute_distance_standard_half_uint32_dim128_t8.cu} (79%) rename cpp/src/neighbors/detail/cagra/{compute_distance_standard_half_uint32_dim128_t16.cu => compute_distance_standard_half_uint32_dim256_t16.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_standard_half_uint32_dim256_t32.cu => compute_distance_standard_half_uint32_dim512_t32.cu} (80%) delete mode 100644 
cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_standard_float_uint64_dim64_t8.cu => compute_distance_standard_half_uint64_dim128_t8.cu} (79%) rename cpp/src/neighbors/detail/cagra/{compute_distance_standard_half_uint64_dim256_t32.cu => compute_distance_standard_half_uint64_dim256_t16.cu} (80%) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim512_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_standard_uint8_uint32_dim64_t8.cu => compute_distance_standard_int8_uint32_dim128_t8.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_standard_int8_uint32_dim128_t16.cu => compute_distance_standard_int8_uint32_dim256_t16.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_standard_int8_uint32_dim256_t32.cu => compute_distance_standard_int8_uint32_dim512_t32.cu} (80%) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t8.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_standard_uint8_uint32_dim128_t16.cu => compute_distance_standard_uint8_uint32_dim256_t16.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_standard_uint8_uint32_dim256_t32.cu => compute_distance_standard_uint8_uint32_dim512_t32.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu => compute_distance_vpq_float_uint32_dim128_t8_8pq_2subd_half.cu} (78%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu => compute_distance_vpq_float_uint32_dim128_t8_8pq_4subd_half.cu} (78%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu => compute_distance_vpq_float_uint32_dim256_t16_8pq_2subd_half.cu} (80%) rename 
cpp/src/neighbors/detail/cagra/{compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu => compute_distance_vpq_float_uint32_dim256_t16_8pq_4subd_half.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu => compute_distance_vpq_float_uint32_dim512_t32_8pq_2subd_half.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu => compute_distance_vpq_float_uint32_dim512_t32_8pq_4subd_half.cu} (80%) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t8_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t8_8pq_4subd_half.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu => compute_distance_vpq_float_uint64_dim256_t16_8pq_2subd_half.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu => compute_distance_vpq_float_uint64_dim256_t16_8pq_4subd_half.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu => compute_distance_vpq_float_uint64_dim512_t32_8pq_2subd_half.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu => compute_distance_vpq_float_uint64_dim512_t32_8pq_4subd_half.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu => compute_distance_vpq_half_uint32_dim128_t8_8pq_2subd_half.cu} (78%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu => compute_distance_vpq_half_uint32_dim128_t8_8pq_4subd_half.cu} (78%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu => compute_distance_vpq_half_uint32_dim256_t16_8pq_2subd_half.cu} (80%) rename 
cpp/src/neighbors/detail/cagra/{compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu => compute_distance_vpq_half_uint32_dim256_t16_8pq_4subd_half.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu => compute_distance_vpq_half_uint32_dim512_t32_8pq_2subd_half.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu => compute_distance_vpq_half_uint32_dim512_t32_8pq_4subd_half.cu} (80%) delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu => compute_distance_vpq_half_uint64_dim128_t8_8pq_2subd_half.cu} (78%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu => compute_distance_vpq_half_uint64_dim128_t8_8pq_4subd_half.cu} (78%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu => compute_distance_vpq_half_uint64_dim256_t16_8pq_2subd_half.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu => compute_distance_vpq_half_uint64_dim256_t16_8pq_4subd_half.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu => compute_distance_vpq_half_uint64_dim512_t32_8pq_2subd_half.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu => compute_distance_vpq_half_uint64_dim512_t32_8pq_4subd_half.cu} (80%) delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu rename 
cpp/src/neighbors/detail/cagra/{compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu => compute_distance_vpq_int8_uint32_dim128_t8_8pq_2subd_half.cu} (78%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu => compute_distance_vpq_int8_uint32_dim128_t8_8pq_4subd_half.cu} (78%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu => compute_distance_vpq_int8_uint32_dim256_t16_8pq_2subd_half.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu => compute_distance_vpq_int8_uint32_dim256_t16_8pq_4subd_half.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu => compute_distance_vpq_int8_uint32_dim512_t32_8pq_2subd_half.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu => compute_distance_vpq_int8_uint32_dim512_t32_8pq_4subd_half.cu} (80%) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t8_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t8_8pq_4subd_half.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu => compute_distance_vpq_uint8_uint32_dim256_t16_8pq_2subd_half.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu => compute_distance_vpq_uint8_uint32_dim256_t16_8pq_4subd_half.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu => compute_distance_vpq_uint8_uint32_dim512_t32_8pq_2subd_half.cu} (80%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu => compute_distance_vpq_uint8_uint32_dim512_t32_8pq_4subd_half.cu} (80%) diff --git a/cpp/CMakeLists.txt 
b/cpp/CMakeLists.txt index 65d6d789e..1843ca6e7 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -205,60 +205,60 @@ add_library( src/neighbors/cagra_search_int8.cu src/neighbors/cagra_search_uint8.cu src/neighbors/detail/cagra/compute_distance.cu - src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim64_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim64_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim64_t8.cu - src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu - 
src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu - 
src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim512_t32.cu + 
src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t16_8pq_4subd_half.cu + 
src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim512_t32_8pq_4subd_half.cu src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu 
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh index 52ede7aac..cdec569c1 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh @@ -30,226 +30,226 @@ namespace cuvs::neighbors::cagra::detail { -extern template struct standard_dataset_descriptor_t<8, 64, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<16, 128, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 256, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<8, 64, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<16, 128, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 256, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint32_t, float>; -extern template struct 
cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<8, 64, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, int8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<16, 128, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, int8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 256, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, int8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<8, 64, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, uint8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<16, 128, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, uint8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 256, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, uint8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<8, 64, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint64_t, float>; -extern 
template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<16, 128, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<32, 256, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<8, 64, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<16, 128, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<32, 256, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint64_t, float>; -extern template struct standard_descriptor_spec<8, 64, float, uint32_t, float>; -extern template struct vpq_descriptor_spec<8, 64, 8, 2, half, float, uint32_t, float>; -extern template struct vpq_descriptor_spec<8, 64, 8, 4, half, float, uint32_t, float>; -extern template struct standard_descriptor_spec<16, 128, float, uint32_t, float>; -extern template struct vpq_descriptor_spec<16, 128, 8, 2, half, float, uint32_t, float>; -extern template struct vpq_descriptor_spec<16, 128, 8, 4, half, 
float, uint32_t, float>; -extern template struct standard_descriptor_spec<32, 256, float, uint32_t, float>; -extern template struct vpq_descriptor_spec<32, 256, 8, 2, half, float, uint32_t, float>; -extern template struct vpq_descriptor_spec<32, 256, 8, 4, half, float, uint32_t, float>; -extern template struct standard_descriptor_spec<8, 64, half, uint32_t, float>; -extern template struct vpq_descriptor_spec<8, 64, 8, 2, half, half, uint32_t, float>; -extern template struct vpq_descriptor_spec<8, 64, 8, 4, half, half, uint32_t, float>; -extern template struct standard_descriptor_spec<16, 128, half, uint32_t, float>; -extern template struct vpq_descriptor_spec<16, 128, 8, 2, half, half, uint32_t, float>; -extern template struct vpq_descriptor_spec<16, 128, 8, 4, half, half, uint32_t, float>; -extern template struct standard_descriptor_spec<32, 256, half, uint32_t, float>; -extern template struct vpq_descriptor_spec<32, 256, 8, 2, half, half, uint32_t, float>; -extern template struct vpq_descriptor_spec<32, 256, 8, 4, half, half, uint32_t, float>; -extern template struct standard_descriptor_spec<8, 64, int8_t, uint32_t, float>; -extern template struct vpq_descriptor_spec<8, 64, 8, 2, half, int8_t, uint32_t, float>; -extern template struct vpq_descriptor_spec<8, 64, 8, 4, half, int8_t, uint32_t, float>; -extern template struct standard_descriptor_spec<16, 128, int8_t, uint32_t, float>; -extern template struct vpq_descriptor_spec<16, 128, 8, 2, half, int8_t, uint32_t, float>; -extern template struct vpq_descriptor_spec<16, 128, 8, 4, half, int8_t, uint32_t, float>; -extern template struct standard_descriptor_spec<32, 256, int8_t, uint32_t, float>; -extern template struct vpq_descriptor_spec<32, 256, 8, 2, half, int8_t, uint32_t, float>; -extern template struct vpq_descriptor_spec<32, 256, 8, 4, half, int8_t, uint32_t, float>; -extern template struct standard_descriptor_spec<8, 64, uint8_t, uint32_t, float>; -extern template struct vpq_descriptor_spec<8, 64, 8, 2, half, 
uint8_t, uint32_t, float>; -extern template struct vpq_descriptor_spec<8, 64, 8, 4, half, uint8_t, uint32_t, float>; -extern template struct standard_descriptor_spec<16, 128, uint8_t, uint32_t, float>; -extern template struct vpq_descriptor_spec<16, 128, 8, 2, half, uint8_t, uint32_t, float>; -extern template struct vpq_descriptor_spec<16, 128, 8, 4, half, uint8_t, uint32_t, float>; -extern template struct standard_descriptor_spec<32, 256, uint8_t, uint32_t, float>; -extern template struct vpq_descriptor_spec<32, 256, 8, 2, half, uint8_t, uint32_t, float>; -extern template struct vpq_descriptor_spec<32, 256, 8, 4, half, uint8_t, uint32_t, float>; -extern template struct standard_descriptor_spec<8, 64, float, uint64_t, float>; -extern template struct vpq_descriptor_spec<8, 64, 8, 2, half, float, uint64_t, float>; -extern template struct vpq_descriptor_spec<8, 64, 8, 4, half, float, uint64_t, float>; -extern template struct standard_descriptor_spec<16, 128, float, uint64_t, float>; -extern template struct vpq_descriptor_spec<16, 128, 8, 2, half, float, uint64_t, float>; -extern template struct vpq_descriptor_spec<16, 128, 8, 4, half, float, uint64_t, float>; -extern template struct standard_descriptor_spec<32, 256, float, uint64_t, float>; -extern template struct vpq_descriptor_spec<32, 256, 8, 2, half, float, uint64_t, float>; -extern template struct vpq_descriptor_spec<32, 256, 8, 4, half, float, uint64_t, float>; -extern template struct standard_descriptor_spec<8, 64, half, uint64_t, float>; -extern template struct vpq_descriptor_spec<8, 64, 8, 2, half, half, uint64_t, float>; -extern template struct vpq_descriptor_spec<8, 64, 8, 4, half, half, uint64_t, float>; -extern template struct standard_descriptor_spec<16, 128, half, uint64_t, float>; -extern template struct vpq_descriptor_spec<16, 128, 8, 2, half, half, uint64_t, float>; -extern template struct vpq_descriptor_spec<16, 128, 8, 4, half, half, uint64_t, float>; -extern template struct 
standard_descriptor_spec<32, 256, half, uint64_t, float>; -extern template struct vpq_descriptor_spec<32, 256, 8, 2, half, half, uint64_t, float>; -extern template struct vpq_descriptor_spec<32, 256, 8, 4, half, half, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<8, 128, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, float, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<16, 256, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, float, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 512, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, float, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, float, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<8, 128, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, half, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<16, 256, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, half, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 512, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, half, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, half, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<8, 128, 
int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, int8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<16, 256, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, int8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 512, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, int8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, int8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<8, 128, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, uint8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<16, 256, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, uint8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<32, 512, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, uint8_t, uint32_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, uint8_t, uint32_t, float>; +extern template struct standard_dataset_descriptor_t<8, 128, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, float, uint64_t, float>; +extern template struct 
standard_dataset_descriptor_t<16, 256, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, float, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<32, 512, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, float, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, float, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<8, 128, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, half, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<16, 256, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, half, uint64_t, float>; +extern template struct standard_dataset_descriptor_t<32, 512, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, half, uint64_t, float>; +extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, half, uint64_t, float>; +extern template struct standard_descriptor_spec<8, 128, float, uint32_t, float>; +extern template struct vpq_descriptor_spec<8, 128, 8, 2, half, float, uint32_t, float>; +extern template struct vpq_descriptor_spec<8, 128, 8, 4, half, float, uint32_t, float>; +extern template struct standard_descriptor_spec<16, 256, float, uint32_t, float>; +extern template struct vpq_descriptor_spec<16, 256, 8, 2, half, float, uint32_t, float>; +extern template struct vpq_descriptor_spec<16, 256, 8, 4, half, float, uint32_t, float>; +extern template struct standard_descriptor_spec<32, 512, float, uint32_t, float>; 
+extern template struct vpq_descriptor_spec<32, 512, 8, 2, half, float, uint32_t, float>; +extern template struct vpq_descriptor_spec<32, 512, 8, 4, half, float, uint32_t, float>; +extern template struct standard_descriptor_spec<8, 128, half, uint32_t, float>; +extern template struct vpq_descriptor_spec<8, 128, 8, 2, half, half, uint32_t, float>; +extern template struct vpq_descriptor_spec<8, 128, 8, 4, half, half, uint32_t, float>; +extern template struct standard_descriptor_spec<16, 256, half, uint32_t, float>; +extern template struct vpq_descriptor_spec<16, 256, 8, 2, half, half, uint32_t, float>; +extern template struct vpq_descriptor_spec<16, 256, 8, 4, half, half, uint32_t, float>; +extern template struct standard_descriptor_spec<32, 512, half, uint32_t, float>; +extern template struct vpq_descriptor_spec<32, 512, 8, 2, half, half, uint32_t, float>; +extern template struct vpq_descriptor_spec<32, 512, 8, 4, half, half, uint32_t, float>; +extern template struct standard_descriptor_spec<8, 128, int8_t, uint32_t, float>; +extern template struct vpq_descriptor_spec<8, 128, 8, 2, half, int8_t, uint32_t, float>; +extern template struct vpq_descriptor_spec<8, 128, 8, 4, half, int8_t, uint32_t, float>; +extern template struct standard_descriptor_spec<16, 256, int8_t, uint32_t, float>; +extern template struct vpq_descriptor_spec<16, 256, 8, 2, half, int8_t, uint32_t, float>; +extern template struct vpq_descriptor_spec<16, 256, 8, 4, half, int8_t, uint32_t, float>; +extern template struct standard_descriptor_spec<32, 512, int8_t, uint32_t, float>; +extern template struct vpq_descriptor_spec<32, 512, 8, 2, half, int8_t, uint32_t, float>; +extern template struct vpq_descriptor_spec<32, 512, 8, 4, half, int8_t, uint32_t, float>; +extern template struct standard_descriptor_spec<8, 128, uint8_t, uint32_t, float>; +extern template struct vpq_descriptor_spec<8, 128, 8, 2, half, uint8_t, uint32_t, float>; +extern template struct vpq_descriptor_spec<8, 128, 8, 4, half, uint8_t, 
uint32_t, float>; +extern template struct standard_descriptor_spec<16, 256, uint8_t, uint32_t, float>; +extern template struct vpq_descriptor_spec<16, 256, 8, 2, half, uint8_t, uint32_t, float>; +extern template struct vpq_descriptor_spec<16, 256, 8, 4, half, uint8_t, uint32_t, float>; +extern template struct standard_descriptor_spec<32, 512, uint8_t, uint32_t, float>; +extern template struct vpq_descriptor_spec<32, 512, 8, 2, half, uint8_t, uint32_t, float>; +extern template struct vpq_descriptor_spec<32, 512, 8, 4, half, uint8_t, uint32_t, float>; +extern template struct standard_descriptor_spec<8, 128, float, uint64_t, float>; +extern template struct vpq_descriptor_spec<8, 128, 8, 2, half, float, uint64_t, float>; +extern template struct vpq_descriptor_spec<8, 128, 8, 4, half, float, uint64_t, float>; +extern template struct standard_descriptor_spec<16, 256, float, uint64_t, float>; +extern template struct vpq_descriptor_spec<16, 256, 8, 2, half, float, uint64_t, float>; +extern template struct vpq_descriptor_spec<16, 256, 8, 4, half, float, uint64_t, float>; +extern template struct standard_descriptor_spec<32, 512, float, uint64_t, float>; +extern template struct vpq_descriptor_spec<32, 512, 8, 2, half, float, uint64_t, float>; +extern template struct vpq_descriptor_spec<32, 512, 8, 4, half, float, uint64_t, float>; +extern template struct standard_descriptor_spec<8, 128, half, uint64_t, float>; +extern template struct vpq_descriptor_spec<8, 128, 8, 2, half, half, uint64_t, float>; +extern template struct vpq_descriptor_spec<8, 128, 8, 4, half, half, uint64_t, float>; +extern template struct standard_descriptor_spec<16, 256, half, uint64_t, float>; +extern template struct vpq_descriptor_spec<16, 256, 8, 2, half, half, uint64_t, float>; +extern template struct vpq_descriptor_spec<16, 256, 8, 4, half, half, uint64_t, float>; +extern template struct standard_descriptor_spec<32, 512, half, uint64_t, float>; +extern template struct vpq_descriptor_spec<32, 512, 8, 2, 
half, half, uint64_t, float>; +extern template struct vpq_descriptor_spec<32, 512, 8, 4, half, half, uint64_t, float>; extern template struct instance_selector< - standard_descriptor_spec<8, 64, float, uint32_t, float>, - vpq_descriptor_spec<8, 64, 8, 2, half, float, uint32_t, float>, - vpq_descriptor_spec<8, 64, 8, 4, half, float, uint32_t, float>, - standard_descriptor_spec<16, 128, float, uint32_t, float>, - vpq_descriptor_spec<16, 128, 8, 2, half, float, uint32_t, float>, - vpq_descriptor_spec<16, 128, 8, 4, half, float, uint32_t, float>, - standard_descriptor_spec<32, 256, float, uint32_t, float>, - vpq_descriptor_spec<32, 256, 8, 2, half, float, uint32_t, float>, - vpq_descriptor_spec<32, 256, 8, 4, half, float, uint32_t, float>, - standard_descriptor_spec<8, 64, half, uint32_t, float>, - vpq_descriptor_spec<8, 64, 8, 2, half, half, uint32_t, float>, - vpq_descriptor_spec<8, 64, 8, 4, half, half, uint32_t, float>, - standard_descriptor_spec<16, 128, half, uint32_t, float>, - vpq_descriptor_spec<16, 128, 8, 2, half, half, uint32_t, float>, - vpq_descriptor_spec<16, 128, 8, 4, half, half, uint32_t, float>, - standard_descriptor_spec<32, 256, half, uint32_t, float>, - vpq_descriptor_spec<32, 256, 8, 2, half, half, uint32_t, float>, - vpq_descriptor_spec<32, 256, 8, 4, half, half, uint32_t, float>, - standard_descriptor_spec<8, 64, int8_t, uint32_t, float>, - vpq_descriptor_spec<8, 64, 8, 2, half, int8_t, uint32_t, float>, - vpq_descriptor_spec<8, 64, 8, 4, half, int8_t, uint32_t, float>, - standard_descriptor_spec<16, 128, int8_t, uint32_t, float>, - vpq_descriptor_spec<16, 128, 8, 2, half, int8_t, uint32_t, float>, - vpq_descriptor_spec<16, 128, 8, 4, half, int8_t, uint32_t, float>, - standard_descriptor_spec<32, 256, int8_t, uint32_t, float>, - vpq_descriptor_spec<32, 256, 8, 2, half, int8_t, uint32_t, float>, - vpq_descriptor_spec<32, 256, 8, 4, half, int8_t, uint32_t, float>, - standard_descriptor_spec<8, 64, uint8_t, uint32_t, float>, - 
vpq_descriptor_spec<8, 64, 8, 2, half, uint8_t, uint32_t, float>, - vpq_descriptor_spec<8, 64, 8, 4, half, uint8_t, uint32_t, float>, - standard_descriptor_spec<16, 128, uint8_t, uint32_t, float>, - vpq_descriptor_spec<16, 128, 8, 2, half, uint8_t, uint32_t, float>, - vpq_descriptor_spec<16, 128, 8, 4, half, uint8_t, uint32_t, float>, - standard_descriptor_spec<32, 256, uint8_t, uint32_t, float>, - vpq_descriptor_spec<32, 256, 8, 2, half, uint8_t, uint32_t, float>, - vpq_descriptor_spec<32, 256, 8, 4, half, uint8_t, uint32_t, float>, - standard_descriptor_spec<8, 64, float, uint64_t, float>, - vpq_descriptor_spec<8, 64, 8, 2, half, float, uint64_t, float>, - vpq_descriptor_spec<8, 64, 8, 4, half, float, uint64_t, float>, - standard_descriptor_spec<16, 128, float, uint64_t, float>, - vpq_descriptor_spec<16, 128, 8, 2, half, float, uint64_t, float>, - vpq_descriptor_spec<16, 128, 8, 4, half, float, uint64_t, float>, - standard_descriptor_spec<32, 256, float, uint64_t, float>, - vpq_descriptor_spec<32, 256, 8, 2, half, float, uint64_t, float>, - vpq_descriptor_spec<32, 256, 8, 4, half, float, uint64_t, float>, - standard_descriptor_spec<8, 64, half, uint64_t, float>, - vpq_descriptor_spec<8, 64, 8, 2, half, half, uint64_t, float>, - vpq_descriptor_spec<8, 64, 8, 4, half, half, uint64_t, float>, - standard_descriptor_spec<16, 128, half, uint64_t, float>, - vpq_descriptor_spec<16, 128, 8, 2, half, half, uint64_t, float>, - vpq_descriptor_spec<16, 128, 8, 4, half, half, uint64_t, float>, - standard_descriptor_spec<32, 256, half, uint64_t, float>, - vpq_descriptor_spec<32, 256, 8, 2, half, half, uint64_t, float>, - vpq_descriptor_spec<32, 256, 8, 4, half, half, uint64_t, float>>; + standard_descriptor_spec<8, 128, float, uint32_t, float>, + vpq_descriptor_spec<8, 128, 8, 2, half, float, uint32_t, float>, + vpq_descriptor_spec<8, 128, 8, 4, half, float, uint32_t, float>, + standard_descriptor_spec<16, 256, float, uint32_t, float>, + vpq_descriptor_spec<16, 256, 8, 2, half, 
float, uint32_t, float>, + vpq_descriptor_spec<16, 256, 8, 4, half, float, uint32_t, float>, + standard_descriptor_spec<32, 512, float, uint32_t, float>, + vpq_descriptor_spec<32, 512, 8, 2, half, float, uint32_t, float>, + vpq_descriptor_spec<32, 512, 8, 4, half, float, uint32_t, float>, + standard_descriptor_spec<8, 128, half, uint32_t, float>, + vpq_descriptor_spec<8, 128, 8, 2, half, half, uint32_t, float>, + vpq_descriptor_spec<8, 128, 8, 4, half, half, uint32_t, float>, + standard_descriptor_spec<16, 256, half, uint32_t, float>, + vpq_descriptor_spec<16, 256, 8, 2, half, half, uint32_t, float>, + vpq_descriptor_spec<16, 256, 8, 4, half, half, uint32_t, float>, + standard_descriptor_spec<32, 512, half, uint32_t, float>, + vpq_descriptor_spec<32, 512, 8, 2, half, half, uint32_t, float>, + vpq_descriptor_spec<32, 512, 8, 4, half, half, uint32_t, float>, + standard_descriptor_spec<8, 128, int8_t, uint32_t, float>, + vpq_descriptor_spec<8, 128, 8, 2, half, int8_t, uint32_t, float>, + vpq_descriptor_spec<8, 128, 8, 4, half, int8_t, uint32_t, float>, + standard_descriptor_spec<16, 256, int8_t, uint32_t, float>, + vpq_descriptor_spec<16, 256, 8, 2, half, int8_t, uint32_t, float>, + vpq_descriptor_spec<16, 256, 8, 4, half, int8_t, uint32_t, float>, + standard_descriptor_spec<32, 512, int8_t, uint32_t, float>, + vpq_descriptor_spec<32, 512, 8, 2, half, int8_t, uint32_t, float>, + vpq_descriptor_spec<32, 512, 8, 4, half, int8_t, uint32_t, float>, + standard_descriptor_spec<8, 128, uint8_t, uint32_t, float>, + vpq_descriptor_spec<8, 128, 8, 2, half, uint8_t, uint32_t, float>, + vpq_descriptor_spec<8, 128, 8, 4, half, uint8_t, uint32_t, float>, + standard_descriptor_spec<16, 256, uint8_t, uint32_t, float>, + vpq_descriptor_spec<16, 256, 8, 2, half, uint8_t, uint32_t, float>, + vpq_descriptor_spec<16, 256, 8, 4, half, uint8_t, uint32_t, float>, + standard_descriptor_spec<32, 512, uint8_t, uint32_t, float>, + vpq_descriptor_spec<32, 512, 8, 2, half, uint8_t, uint32_t, 
float>, + vpq_descriptor_spec<32, 512, 8, 4, half, uint8_t, uint32_t, float>, + standard_descriptor_spec<8, 128, float, uint64_t, float>, + vpq_descriptor_spec<8, 128, 8, 2, half, float, uint64_t, float>, + vpq_descriptor_spec<8, 128, 8, 4, half, float, uint64_t, float>, + standard_descriptor_spec<16, 256, float, uint64_t, float>, + vpq_descriptor_spec<16, 256, 8, 2, half, float, uint64_t, float>, + vpq_descriptor_spec<16, 256, 8, 4, half, float, uint64_t, float>, + standard_descriptor_spec<32, 512, float, uint64_t, float>, + vpq_descriptor_spec<32, 512, 8, 2, half, float, uint64_t, float>, + vpq_descriptor_spec<32, 512, 8, 4, half, float, uint64_t, float>, + standard_descriptor_spec<8, 128, half, uint64_t, float>, + vpq_descriptor_spec<8, 128, 8, 2, half, half, uint64_t, float>, + vpq_descriptor_spec<8, 128, 8, 4, half, half, uint64_t, float>, + standard_descriptor_spec<16, 256, half, uint64_t, float>, + vpq_descriptor_spec<16, 256, 8, 2, half, half, uint64_t, float>, + vpq_descriptor_spec<16, 256, 8, 4, half, half, uint64_t, float>, + standard_descriptor_spec<32, 512, half, uint64_t, float>, + vpq_descriptor_spec<32, 512, 8, 2, half, half, uint64_t, float>, + vpq_descriptor_spec<32, 512, 8, 4, half, half, uint64_t, float>>; using descriptor_instances = - instance_selector, - vpq_descriptor_spec<8, 64, 8, 2, half, float, uint32_t, float>, - vpq_descriptor_spec<8, 64, 8, 4, half, float, uint32_t, float>, - standard_descriptor_spec<16, 128, float, uint32_t, float>, - vpq_descriptor_spec<16, 128, 8, 2, half, float, uint32_t, float>, - vpq_descriptor_spec<16, 128, 8, 4, half, float, uint32_t, float>, - standard_descriptor_spec<32, 256, float, uint32_t, float>, - vpq_descriptor_spec<32, 256, 8, 2, half, float, uint32_t, float>, - vpq_descriptor_spec<32, 256, 8, 4, half, float, uint32_t, float>, - standard_descriptor_spec<8, 64, half, uint32_t, float>, - vpq_descriptor_spec<8, 64, 8, 2, half, half, uint32_t, float>, - vpq_descriptor_spec<8, 64, 8, 4, half, half, 
uint32_t, float>, - standard_descriptor_spec<16, 128, half, uint32_t, float>, - vpq_descriptor_spec<16, 128, 8, 2, half, half, uint32_t, float>, - vpq_descriptor_spec<16, 128, 8, 4, half, half, uint32_t, float>, - standard_descriptor_spec<32, 256, half, uint32_t, float>, - vpq_descriptor_spec<32, 256, 8, 2, half, half, uint32_t, float>, - vpq_descriptor_spec<32, 256, 8, 4, half, half, uint32_t, float>, - standard_descriptor_spec<8, 64, int8_t, uint32_t, float>, - vpq_descriptor_spec<8, 64, 8, 2, half, int8_t, uint32_t, float>, - vpq_descriptor_spec<8, 64, 8, 4, half, int8_t, uint32_t, float>, - standard_descriptor_spec<16, 128, int8_t, uint32_t, float>, - vpq_descriptor_spec<16, 128, 8, 2, half, int8_t, uint32_t, float>, - vpq_descriptor_spec<16, 128, 8, 4, half, int8_t, uint32_t, float>, - standard_descriptor_spec<32, 256, int8_t, uint32_t, float>, - vpq_descriptor_spec<32, 256, 8, 2, half, int8_t, uint32_t, float>, - vpq_descriptor_spec<32, 256, 8, 4, half, int8_t, uint32_t, float>, - standard_descriptor_spec<8, 64, uint8_t, uint32_t, float>, - vpq_descriptor_spec<8, 64, 8, 2, half, uint8_t, uint32_t, float>, - vpq_descriptor_spec<8, 64, 8, 4, half, uint8_t, uint32_t, float>, - standard_descriptor_spec<16, 128, uint8_t, uint32_t, float>, - vpq_descriptor_spec<16, 128, 8, 2, half, uint8_t, uint32_t, float>, - vpq_descriptor_spec<16, 128, 8, 4, half, uint8_t, uint32_t, float>, - standard_descriptor_spec<32, 256, uint8_t, uint32_t, float>, - vpq_descriptor_spec<32, 256, 8, 2, half, uint8_t, uint32_t, float>, - vpq_descriptor_spec<32, 256, 8, 4, half, uint8_t, uint32_t, float>, - standard_descriptor_spec<8, 64, float, uint64_t, float>, - vpq_descriptor_spec<8, 64, 8, 2, half, float, uint64_t, float>, - vpq_descriptor_spec<8, 64, 8, 4, half, float, uint64_t, float>, - standard_descriptor_spec<16, 128, float, uint64_t, float>, - vpq_descriptor_spec<16, 128, 8, 2, half, float, uint64_t, float>, - vpq_descriptor_spec<16, 128, 8, 4, half, float, uint64_t, float>, - 
standard_descriptor_spec<32, 256, float, uint64_t, float>, - vpq_descriptor_spec<32, 256, 8, 2, half, float, uint64_t, float>, - vpq_descriptor_spec<32, 256, 8, 4, half, float, uint64_t, float>, - standard_descriptor_spec<8, 64, half, uint64_t, float>, - vpq_descriptor_spec<8, 64, 8, 2, half, half, uint64_t, float>, - vpq_descriptor_spec<8, 64, 8, 4, half, half, uint64_t, float>, - standard_descriptor_spec<16, 128, half, uint64_t, float>, - vpq_descriptor_spec<16, 128, 8, 2, half, half, uint64_t, float>, - vpq_descriptor_spec<16, 128, 8, 4, half, half, uint64_t, float>, - standard_descriptor_spec<32, 256, half, uint64_t, float>, - vpq_descriptor_spec<32, 256, 8, 2, half, half, uint64_t, float>, - vpq_descriptor_spec<32, 256, 8, 4, half, half, uint64_t, float>>; + instance_selector, + vpq_descriptor_spec<8, 128, 8, 2, half, float, uint32_t, float>, + vpq_descriptor_spec<8, 128, 8, 4, half, float, uint32_t, float>, + standard_descriptor_spec<16, 256, float, uint32_t, float>, + vpq_descriptor_spec<16, 256, 8, 2, half, float, uint32_t, float>, + vpq_descriptor_spec<16, 256, 8, 4, half, float, uint32_t, float>, + standard_descriptor_spec<32, 512, float, uint32_t, float>, + vpq_descriptor_spec<32, 512, 8, 2, half, float, uint32_t, float>, + vpq_descriptor_spec<32, 512, 8, 4, half, float, uint32_t, float>, + standard_descriptor_spec<8, 128, half, uint32_t, float>, + vpq_descriptor_spec<8, 128, 8, 2, half, half, uint32_t, float>, + vpq_descriptor_spec<8, 128, 8, 4, half, half, uint32_t, float>, + standard_descriptor_spec<16, 256, half, uint32_t, float>, + vpq_descriptor_spec<16, 256, 8, 2, half, half, uint32_t, float>, + vpq_descriptor_spec<16, 256, 8, 4, half, half, uint32_t, float>, + standard_descriptor_spec<32, 512, half, uint32_t, float>, + vpq_descriptor_spec<32, 512, 8, 2, half, half, uint32_t, float>, + vpq_descriptor_spec<32, 512, 8, 4, half, half, uint32_t, float>, + standard_descriptor_spec<8, 128, int8_t, uint32_t, float>, + vpq_descriptor_spec<8, 128, 8, 2, 
half, int8_t, uint32_t, float>, + vpq_descriptor_spec<8, 128, 8, 4, half, int8_t, uint32_t, float>, + standard_descriptor_spec<16, 256, int8_t, uint32_t, float>, + vpq_descriptor_spec<16, 256, 8, 2, half, int8_t, uint32_t, float>, + vpq_descriptor_spec<16, 256, 8, 4, half, int8_t, uint32_t, float>, + standard_descriptor_spec<32, 512, int8_t, uint32_t, float>, + vpq_descriptor_spec<32, 512, 8, 2, half, int8_t, uint32_t, float>, + vpq_descriptor_spec<32, 512, 8, 4, half, int8_t, uint32_t, float>, + standard_descriptor_spec<8, 128, uint8_t, uint32_t, float>, + vpq_descriptor_spec<8, 128, 8, 2, half, uint8_t, uint32_t, float>, + vpq_descriptor_spec<8, 128, 8, 4, half, uint8_t, uint32_t, float>, + standard_descriptor_spec<16, 256, uint8_t, uint32_t, float>, + vpq_descriptor_spec<16, 256, 8, 2, half, uint8_t, uint32_t, float>, + vpq_descriptor_spec<16, 256, 8, 4, half, uint8_t, uint32_t, float>, + standard_descriptor_spec<32, 512, uint8_t, uint32_t, float>, + vpq_descriptor_spec<32, 512, 8, 2, half, uint8_t, uint32_t, float>, + vpq_descriptor_spec<32, 512, 8, 4, half, uint8_t, uint32_t, float>, + standard_descriptor_spec<8, 128, float, uint64_t, float>, + vpq_descriptor_spec<8, 128, 8, 2, half, float, uint64_t, float>, + vpq_descriptor_spec<8, 128, 8, 4, half, float, uint64_t, float>, + standard_descriptor_spec<16, 256, float, uint64_t, float>, + vpq_descriptor_spec<16, 256, 8, 2, half, float, uint64_t, float>, + vpq_descriptor_spec<16, 256, 8, 4, half, float, uint64_t, float>, + standard_descriptor_spec<32, 512, float, uint64_t, float>, + vpq_descriptor_spec<32, 512, 8, 2, half, float, uint64_t, float>, + vpq_descriptor_spec<32, 512, 8, 4, half, float, uint64_t, float>, + standard_descriptor_spec<8, 128, half, uint64_t, float>, + vpq_descriptor_spec<8, 128, 8, 2, half, half, uint64_t, float>, + vpq_descriptor_spec<8, 128, 8, 4, half, half, uint64_t, float>, + standard_descriptor_spec<16, 256, half, uint64_t, float>, + vpq_descriptor_spec<16, 256, 8, 2, half, half, 
uint64_t, float>, + vpq_descriptor_spec<16, 256, 8, 4, half, half, uint64_t, float>, + standard_descriptor_spec<32, 512, half, uint64_t, float>, + vpq_descriptor_spec<32, 512, 8, 2, half, half, uint64_t, float>, + vpq_descriptor_spec<32, 512, 8, 4, half, half, uint64_t, float>>; template auto dataset_descriptor_init(const cagra::search_params& params, diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.cu b/cpp/src/neighbors/detail/cagra/compute_distance.cu index a84fa0c97..d18a60dd3 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance.cu @@ -28,59 +28,59 @@ namespace cuvs::neighbors::cagra::detail { template struct instance_selector< - standard_descriptor_spec<8, 64, float, uint32_t, float>, - vpq_descriptor_spec<8, 64, 8, 2, half, float, uint32_t, float>, - vpq_descriptor_spec<8, 64, 8, 4, half, float, uint32_t, float>, - standard_descriptor_spec<16, 128, float, uint32_t, float>, - vpq_descriptor_spec<16, 128, 8, 2, half, float, uint32_t, float>, - vpq_descriptor_spec<16, 128, 8, 4, half, float, uint32_t, float>, - standard_descriptor_spec<32, 256, float, uint32_t, float>, - vpq_descriptor_spec<32, 256, 8, 2, half, float, uint32_t, float>, - vpq_descriptor_spec<32, 256, 8, 4, half, float, uint32_t, float>, - standard_descriptor_spec<8, 64, half, uint32_t, float>, - vpq_descriptor_spec<8, 64, 8, 2, half, half, uint32_t, float>, - vpq_descriptor_spec<8, 64, 8, 4, half, half, uint32_t, float>, - standard_descriptor_spec<16, 128, half, uint32_t, float>, - vpq_descriptor_spec<16, 128, 8, 2, half, half, uint32_t, float>, - vpq_descriptor_spec<16, 128, 8, 4, half, half, uint32_t, float>, - standard_descriptor_spec<32, 256, half, uint32_t, float>, - vpq_descriptor_spec<32, 256, 8, 2, half, half, uint32_t, float>, - vpq_descriptor_spec<32, 256, 8, 4, half, half, uint32_t, float>, - standard_descriptor_spec<8, 64, int8_t, uint32_t, float>, - vpq_descriptor_spec<8, 64, 8, 2, half, int8_t, uint32_t, 
float>, - vpq_descriptor_spec<8, 64, 8, 4, half, int8_t, uint32_t, float>, - standard_descriptor_spec<16, 128, int8_t, uint32_t, float>, - vpq_descriptor_spec<16, 128, 8, 2, half, int8_t, uint32_t, float>, - vpq_descriptor_spec<16, 128, 8, 4, half, int8_t, uint32_t, float>, - standard_descriptor_spec<32, 256, int8_t, uint32_t, float>, - vpq_descriptor_spec<32, 256, 8, 2, half, int8_t, uint32_t, float>, - vpq_descriptor_spec<32, 256, 8, 4, half, int8_t, uint32_t, float>, - standard_descriptor_spec<8, 64, uint8_t, uint32_t, float>, - vpq_descriptor_spec<8, 64, 8, 2, half, uint8_t, uint32_t, float>, - vpq_descriptor_spec<8, 64, 8, 4, half, uint8_t, uint32_t, float>, - standard_descriptor_spec<16, 128, uint8_t, uint32_t, float>, - vpq_descriptor_spec<16, 128, 8, 2, half, uint8_t, uint32_t, float>, - vpq_descriptor_spec<16, 128, 8, 4, half, uint8_t, uint32_t, float>, - standard_descriptor_spec<32, 256, uint8_t, uint32_t, float>, - vpq_descriptor_spec<32, 256, 8, 2, half, uint8_t, uint32_t, float>, - vpq_descriptor_spec<32, 256, 8, 4, half, uint8_t, uint32_t, float>, - standard_descriptor_spec<8, 64, float, uint64_t, float>, - vpq_descriptor_spec<8, 64, 8, 2, half, float, uint64_t, float>, - vpq_descriptor_spec<8, 64, 8, 4, half, float, uint64_t, float>, - standard_descriptor_spec<16, 128, float, uint64_t, float>, - vpq_descriptor_spec<16, 128, 8, 2, half, float, uint64_t, float>, - vpq_descriptor_spec<16, 128, 8, 4, half, float, uint64_t, float>, - standard_descriptor_spec<32, 256, float, uint64_t, float>, - vpq_descriptor_spec<32, 256, 8, 2, half, float, uint64_t, float>, - vpq_descriptor_spec<32, 256, 8, 4, half, float, uint64_t, float>, - standard_descriptor_spec<8, 64, half, uint64_t, float>, - vpq_descriptor_spec<8, 64, 8, 2, half, half, uint64_t, float>, - vpq_descriptor_spec<8, 64, 8, 4, half, half, uint64_t, float>, - standard_descriptor_spec<16, 128, half, uint64_t, float>, - vpq_descriptor_spec<16, 128, 8, 2, half, half, uint64_t, float>, - 
vpq_descriptor_spec<16, 128, 8, 4, half, half, uint64_t, float>, - standard_descriptor_spec<32, 256, half, uint64_t, float>, - vpq_descriptor_spec<32, 256, 8, 2, half, half, uint64_t, float>, - vpq_descriptor_spec<32, 256, 8, 4, half, half, uint64_t, float>>; + standard_descriptor_spec<8, 128, float, uint32_t, float>, + vpq_descriptor_spec<8, 128, 8, 2, half, float, uint32_t, float>, + vpq_descriptor_spec<8, 128, 8, 4, half, float, uint32_t, float>, + standard_descriptor_spec<16, 256, float, uint32_t, float>, + vpq_descriptor_spec<16, 256, 8, 2, half, float, uint32_t, float>, + vpq_descriptor_spec<16, 256, 8, 4, half, float, uint32_t, float>, + standard_descriptor_spec<32, 512, float, uint32_t, float>, + vpq_descriptor_spec<32, 512, 8, 2, half, float, uint32_t, float>, + vpq_descriptor_spec<32, 512, 8, 4, half, float, uint32_t, float>, + standard_descriptor_spec<8, 128, half, uint32_t, float>, + vpq_descriptor_spec<8, 128, 8, 2, half, half, uint32_t, float>, + vpq_descriptor_spec<8, 128, 8, 4, half, half, uint32_t, float>, + standard_descriptor_spec<16, 256, half, uint32_t, float>, + vpq_descriptor_spec<16, 256, 8, 2, half, half, uint32_t, float>, + vpq_descriptor_spec<16, 256, 8, 4, half, half, uint32_t, float>, + standard_descriptor_spec<32, 512, half, uint32_t, float>, + vpq_descriptor_spec<32, 512, 8, 2, half, half, uint32_t, float>, + vpq_descriptor_spec<32, 512, 8, 4, half, half, uint32_t, float>, + standard_descriptor_spec<8, 128, int8_t, uint32_t, float>, + vpq_descriptor_spec<8, 128, 8, 2, half, int8_t, uint32_t, float>, + vpq_descriptor_spec<8, 128, 8, 4, half, int8_t, uint32_t, float>, + standard_descriptor_spec<16, 256, int8_t, uint32_t, float>, + vpq_descriptor_spec<16, 256, 8, 2, half, int8_t, uint32_t, float>, + vpq_descriptor_spec<16, 256, 8, 4, half, int8_t, uint32_t, float>, + standard_descriptor_spec<32, 512, int8_t, uint32_t, float>, + vpq_descriptor_spec<32, 512, 8, 2, half, int8_t, uint32_t, float>, + vpq_descriptor_spec<32, 512, 8, 4, half, 
int8_t, uint32_t, float>, + standard_descriptor_spec<8, 128, uint8_t, uint32_t, float>, + vpq_descriptor_spec<8, 128, 8, 2, half, uint8_t, uint32_t, float>, + vpq_descriptor_spec<8, 128, 8, 4, half, uint8_t, uint32_t, float>, + standard_descriptor_spec<16, 256, uint8_t, uint32_t, float>, + vpq_descriptor_spec<16, 256, 8, 2, half, uint8_t, uint32_t, float>, + vpq_descriptor_spec<16, 256, 8, 4, half, uint8_t, uint32_t, float>, + standard_descriptor_spec<32, 512, uint8_t, uint32_t, float>, + vpq_descriptor_spec<32, 512, 8, 2, half, uint8_t, uint32_t, float>, + vpq_descriptor_spec<32, 512, 8, 4, half, uint8_t, uint32_t, float>, + standard_descriptor_spec<8, 128, float, uint64_t, float>, + vpq_descriptor_spec<8, 128, 8, 2, half, float, uint64_t, float>, + vpq_descriptor_spec<8, 128, 8, 4, half, float, uint64_t, float>, + standard_descriptor_spec<16, 256, float, uint64_t, float>, + vpq_descriptor_spec<16, 256, 8, 2, half, float, uint64_t, float>, + vpq_descriptor_spec<16, 256, 8, 4, half, float, uint64_t, float>, + standard_descriptor_spec<32, 512, float, uint64_t, float>, + vpq_descriptor_spec<32, 512, 8, 2, half, float, uint64_t, float>, + vpq_descriptor_spec<32, 512, 8, 4, half, float, uint64_t, float>, + standard_descriptor_spec<8, 128, half, uint64_t, float>, + vpq_descriptor_spec<8, 128, 8, 2, half, half, uint64_t, float>, + vpq_descriptor_spec<8, 128, 8, 4, half, half, uint64_t, float>, + standard_descriptor_spec<16, 256, half, uint64_t, float>, + vpq_descriptor_spec<16, 256, 8, 2, half, half, uint64_t, float>, + vpq_descriptor_spec<16, 256, 8, 4, half, half, uint64_t, float>, + standard_descriptor_spec<32, 512, half, uint64_t, float>, + vpq_descriptor_spec<32, 512, 8, 2, half, half, uint64_t, float>, + vpq_descriptor_spec<32, 512, 8, 4, half, half, uint64_t, float>>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py index 
e407178ad..3927b1ed7 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py +++ b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py @@ -48,8 +48,8 @@ }} // namespace cuvs::neighbors::cagra::detail """ -#mxdim_team = [(128, 8), (256, 16), (512, 32)] -mxdim_team = [(64, 8), (128, 16), (256, 32)] +mxdim_team = [(128, 8), (256, 16), (512, 32)] +#mxdim_team = [(64, 8), (128, 16), (256, 32)] #mxdim_team = [(32, 8), (64, 16), (128, 32)] pq_bits = [8] diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t8.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim64_t8.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t8.cu index 20b8a1cb6..6d21b9364 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t8.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<8, 64, int8_t, uint32_t, float>; +template struct standard_dataset_descriptor_t<8, 128, float, uint32_t, float>; template <> -const void* standard_descriptor_spec<8, 64, int8_t, uint32_t, float>::init_kernel = +const void* standard_descriptor_spec<8, 128, float, uint32_t, float>::init_kernel = reinterpret_cast( - &standard_dataset_descriptor_init_kernel<8, 64, int8_t, uint32_t, float>); -template struct standard_descriptor_spec<8, 64, int8_t, uint32_t, float>; + &standard_dataset_descriptor_init_kernel<8, 128, float, uint32_t, float>); +template struct standard_descriptor_spec<8, 128, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu 
b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t16.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t16.cu index a0bfefff7..b535aa716 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t16.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<16, 128, float, uint32_t, float>; +template struct standard_dataset_descriptor_t<16, 256, float, uint32_t, float>; template <> -const void* standard_descriptor_spec<16, 128, float, uint32_t, float>::init_kernel = +const void* standard_descriptor_spec<16, 256, float, uint32_t, float>::init_kernel = reinterpret_cast( - &standard_dataset_descriptor_init_kernel<16, 128, float, uint32_t, float>); -template struct standard_descriptor_spec<16, 128, float, uint32_t, float>; + &standard_dataset_descriptor_init_kernel<16, 256, float, uint32_t, float>); +template struct standard_descriptor_spec<16, 256, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim512_t32.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim512_t32.cu index 86ff6720b..37c804a4e 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim512_t32.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct 
standard_dataset_descriptor_t<32, 256, float, uint32_t, float>; +template struct standard_dataset_descriptor_t<32, 512, float, uint32_t, float>; template <> -const void* standard_descriptor_spec<32, 256, float, uint32_t, float>::init_kernel = +const void* standard_descriptor_spec<32, 512, float, uint32_t, float>::init_kernel = reinterpret_cast( - &standard_dataset_descriptor_init_kernel<32, 256, float, uint32_t, float>); -template struct standard_descriptor_spec<32, 256, float, uint32_t, float>; + &standard_dataset_descriptor_init_kernel<32, 512, float, uint32_t, float>); +template struct standard_descriptor_spec<32, 512, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t8.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t16.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t8.cu index 62a9d0128..ad6b12ef8 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t8.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<16, 128, half, uint64_t, float>; +template struct standard_dataset_descriptor_t<8, 128, float, uint64_t, float>; template <> -const void* standard_descriptor_spec<16, 128, half, uint64_t, float>::init_kernel = +const void* standard_descriptor_spec<8, 128, float, uint64_t, float>::init_kernel = reinterpret_cast( - &standard_dataset_descriptor_init_kernel<16, 128, half, uint64_t, float>); -template struct standard_descriptor_spec<16, 128, half, uint64_t, float>; + &standard_dataset_descriptor_init_kernel<8, 128, float, uint64_t, float>); +template struct standard_descriptor_spec<8, 
128, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t16.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t32.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t16.cu index f8ca508af..1c41899d4 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t16.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<32, 256, float, uint64_t, float>; +template struct standard_dataset_descriptor_t<16, 256, float, uint64_t, float>; template <> -const void* standard_descriptor_spec<32, 256, float, uint64_t, float>::init_kernel = +const void* standard_descriptor_spec<16, 256, float, uint64_t, float>::init_kernel = reinterpret_cast( - &standard_dataset_descriptor_init_kernel<32, 256, float, uint64_t, float>); -template struct standard_descriptor_spec<32, 256, float, uint64_t, float>; + &standard_dataset_descriptor_init_kernel<16, 256, float, uint64_t, float>); +template struct standard_descriptor_spec<16, 256, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim512_t32.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t16.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim512_t32.cu index 96bba4bbf..36f58bb68 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t16.cu +++ 
b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim512_t32.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<16, 128, float, uint64_t, float>; +template struct standard_dataset_descriptor_t<32, 512, float, uint64_t, float>; template <> -const void* standard_descriptor_spec<16, 128, float, uint64_t, float>::init_kernel = +const void* standard_descriptor_spec<32, 512, float, uint64_t, float>::init_kernel = reinterpret_cast( - &standard_dataset_descriptor_init_kernel<16, 128, float, uint64_t, float>); -template struct standard_descriptor_spec<16, 128, float, uint64_t, float>; + &standard_dataset_descriptor_init_kernel<32, 512, float, uint64_t, float>); +template struct standard_descriptor_spec<32, 512, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t8.cu similarity index 79% rename from cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t8.cu index 79eadbb9e..dd73a2363 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t8.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<8, 64, float, uint32_t, float>; +template struct standard_dataset_descriptor_t<8, 128, half, uint32_t, float>; template <> -const void* standard_descriptor_spec<8, 64, float, uint32_t, float>::init_kernel = +const void* standard_descriptor_spec<8, 128, half, uint32_t, float>::init_kernel = reinterpret_cast( - &standard_dataset_descriptor_init_kernel<8, 64, float, uint32_t, float>); -template struct 
standard_descriptor_spec<8, 64, float, uint32_t, float>; + &standard_dataset_descriptor_init_kernel<8, 128, half, uint32_t, float>); +template struct standard_descriptor_spec<8, 128, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t16.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t16.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t16.cu index 12afca22f..b431e468a 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t16.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<16, 128, half, uint32_t, float>; +template struct standard_dataset_descriptor_t<16, 256, half, uint32_t, float>; template <> -const void* standard_descriptor_spec<16, 128, half, uint32_t, float>::init_kernel = +const void* standard_descriptor_spec<16, 256, half, uint32_t, float>::init_kernel = reinterpret_cast( - &standard_dataset_descriptor_init_kernel<16, 128, half, uint32_t, float>); -template struct standard_descriptor_spec<16, 128, half, uint32_t, float>; + &standard_dataset_descriptor_init_kernel<16, 256, half, uint32_t, float>); +template struct standard_descriptor_spec<16, 256, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim512_t32.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t32.cu rename to 
cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim512_t32.cu index 50b631809..29eaf36eb 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim512_t32.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<32, 256, half, uint32_t, float>; +template struct standard_dataset_descriptor_t<32, 512, half, uint32_t, float>; template <> -const void* standard_descriptor_spec<32, 256, half, uint32_t, float>::init_kernel = +const void* standard_descriptor_spec<32, 512, half, uint32_t, float>::init_kernel = reinterpret_cast( - &standard_dataset_descriptor_init_kernel<32, 256, half, uint32_t, float>); -template struct standard_descriptor_spec<32, 256, half, uint32_t, float>; + &standard_dataset_descriptor_init_kernel<32, 512, half, uint32_t, float>); +template struct standard_descriptor_spec<32, 512, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu deleted file mode 100644 index f55dc6b69..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim64_t8.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<8, 64, half, uint32_t, float>; -template <> -const void* standard_descriptor_spec<8, 64, half, uint32_t, float>::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel<8, 64, half, uint32_t, float>); -template struct standard_descriptor_spec<8, 64, half, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t8.cu similarity index 79% rename from cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim64_t8.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t8.cu index 4c47d2fe3..066d08793 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t8.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<8, 64, float, uint64_t, float>; +template struct standard_dataset_descriptor_t<8, 128, half, uint64_t, float>; template <> -const void* standard_descriptor_spec<8, 64, float, uint64_t, float>::init_kernel = +const void* standard_descriptor_spec<8, 128, half, uint64_t, float>::init_kernel = reinterpret_cast( - &standard_dataset_descriptor_init_kernel<8, 64, float, uint64_t, float>); -template struct standard_descriptor_spec<8, 64, float, uint64_t, float>; + &standard_dataset_descriptor_init_kernel<8, 128, 
half, uint64_t, float>); +template struct standard_descriptor_spec<8, 128, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t16.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t32.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t16.cu index f05e92c19..a2ace4528 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t16.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<32, 256, half, uint64_t, float>; +template struct standard_dataset_descriptor_t<16, 256, half, uint64_t, float>; template <> -const void* standard_descriptor_spec<32, 256, half, uint64_t, float>::init_kernel = +const void* standard_descriptor_spec<16, 256, half, uint64_t, float>::init_kernel = reinterpret_cast( - &standard_dataset_descriptor_init_kernel<32, 256, half, uint64_t, float>); -template struct standard_descriptor_spec<32, 256, half, uint64_t, float>; + &standard_dataset_descriptor_init_kernel<16, 256, half, uint64_t, float>); +template struct standard_descriptor_spec<16, 256, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim512_t32.cu new file mode 100644 index 000000000..1417d3284 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim512_t32.cu @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<32, 512, half, uint64_t, float>; +template <> +const void* standard_descriptor_spec<32, 512, half, uint64_t, float>::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel<32, 512, half, uint64_t, float>); +template struct standard_descriptor_spec<32, 512, half, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu deleted file mode 100644 index fbb9f1d05..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim64_t8.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<8, 64, half, uint64_t, float>; -template <> -const void* standard_descriptor_spec<8, 64, half, uint64_t, float>::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel<8, 64, half, uint64_t, float>); -template struct standard_descriptor_spec<8, 64, half, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim64_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t8.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim64_t8.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t8.cu index ccdd1db3d..01970b374 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim64_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t8.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<8, 64, uint8_t, uint32_t, float>; +template struct standard_dataset_descriptor_t<8, 128, int8_t, uint32_t, float>; template <> -const void* standard_descriptor_spec<8, 64, uint8_t, uint32_t, 
float>::init_kernel = +const void* standard_descriptor_spec<8, 128, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( - &standard_dataset_descriptor_init_kernel<8, 64, uint8_t, uint32_t, float>); -template struct standard_descriptor_spec<8, 64, uint8_t, uint32_t, float>; + &standard_dataset_descriptor_init_kernel<8, 128, int8_t, uint32_t, float>); +template struct standard_descriptor_spec<8, 128, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t16.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t16.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t16.cu index f1fcf21c1..296070314 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t16.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<16, 128, int8_t, uint32_t, float>; +template struct standard_dataset_descriptor_t<16, 256, int8_t, uint32_t, float>; template <> -const void* standard_descriptor_spec<16, 128, int8_t, uint32_t, float>::init_kernel = +const void* standard_descriptor_spec<16, 256, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( - &standard_dataset_descriptor_init_kernel<16, 128, int8_t, uint32_t, float>); -template struct standard_descriptor_spec<16, 128, int8_t, uint32_t, float>; + &standard_dataset_descriptor_init_kernel<16, 256, int8_t, uint32_t, float>); +template struct standard_descriptor_spec<16, 256, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t32.cu 
b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim512_t32.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t32.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim512_t32.cu index 9769be4b7..95f3c94d1 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim512_t32.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<32, 256, int8_t, uint32_t, float>; +template struct standard_dataset_descriptor_t<32, 512, int8_t, uint32_t, float>; template <> -const void* standard_descriptor_spec<32, 256, int8_t, uint32_t, float>::init_kernel = +const void* standard_descriptor_spec<32, 512, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( - &standard_dataset_descriptor_init_kernel<32, 256, int8_t, uint32_t, float>); -template struct standard_descriptor_spec<32, 256, int8_t, uint32_t, float>; + &standard_dataset_descriptor_init_kernel<32, 512, int8_t, uint32_t, float>); +template struct standard_descriptor_spec<32, 512, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t8.cu new file mode 100644 index 000000000..c5fe8e28e --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t8.cu @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t<8, 128, uint8_t, uint32_t, float>; +template <> +const void* standard_descriptor_spec<8, 128, uint8_t, uint32_t, float>::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel<8, 128, uint8_t, uint32_t, float>); +template struct standard_descriptor_spec<8, 128, uint8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t16.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t16.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t16.cu index 8a560702d..a6fc25350 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t16.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<16, 128, uint8_t, uint32_t, float>; +template struct standard_dataset_descriptor_t<16, 256, uint8_t, uint32_t, float>; template <> -const void* 
standard_descriptor_spec<16, 128, uint8_t, uint32_t, float>::init_kernel = +const void* standard_descriptor_spec<16, 256, uint8_t, uint32_t, float>::init_kernel = reinterpret_cast( - &standard_dataset_descriptor_init_kernel<16, 128, uint8_t, uint32_t, float>); -template struct standard_descriptor_spec<16, 128, uint8_t, uint32_t, float>; + &standard_dataset_descriptor_init_kernel<16, 256, uint8_t, uint32_t, float>); +template struct standard_descriptor_spec<16, 256, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim512_t32.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t32.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim512_t32.cu index a43020750..fb86dc8d4 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim512_t32.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t<32, 256, uint8_t, uint32_t, float>; +template struct standard_dataset_descriptor_t<32, 512, uint8_t, uint32_t, float>; template <> -const void* standard_descriptor_spec<32, 256, uint8_t, uint32_t, float>::init_kernel = +const void* standard_descriptor_spec<32, 512, uint8_t, uint32_t, float>::init_kernel = reinterpret_cast( - &standard_dataset_descriptor_init_kernel<32, 256, uint8_t, uint32_t, float>); -template struct standard_descriptor_spec<32, 256, uint8_t, uint32_t, float>; + &standard_dataset_descriptor_init_kernel<32, 512, uint8_t, uint32_t, float>); +template struct standard_descriptor_spec<32, 512, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git 
a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t8_8pq_2subd_half.cu similarity index 78% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t8_8pq_2subd_half.cu index d5b235063..d6831560f 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t8_8pq_2subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, int8_t, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, float, uint32_t, float>; template <> -const void* vpq_descriptor_spec<8, 64, 8, 2, half, int8_t, uint32_t, float>::init_kernel = +const void* vpq_descriptor_spec<8, 128, 8, 2, half, float, uint32_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<8, 64, 8, 2, half, int8_t, uint32_t, float>); -template struct vpq_descriptor_spec<8, 64, 8, 2, half, int8_t, uint32_t, float>; + &vpq_dataset_descriptor_init_kernel<8, 128, 8, 2, half, float, uint32_t, float>); +template struct vpq_descriptor_spec<8, 128, 8, 2, half, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t8_8pq_4subd_half.cu similarity index 78% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t8_8pq_4subd_half.cu index 7707c1d31..548a9b75a 100644 --- 
a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim64_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t8_8pq_4subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, int8_t, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, float, uint32_t, float>; template <> -const void* vpq_descriptor_spec<8, 64, 8, 4, half, int8_t, uint32_t, float>::init_kernel = +const void* vpq_descriptor_spec<8, 128, 8, 4, half, float, uint32_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<8, 64, 8, 4, half, int8_t, uint32_t, float>); -template struct vpq_descriptor_spec<8, 64, 8, 4, half, int8_t, uint32_t, float>; + &vpq_dataset_descriptor_init_kernel<8, 128, 8, 4, half, float, uint32_t, float>); +template struct vpq_descriptor_spec<8, 128, 8, 4, half, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t16_8pq_2subd_half.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t16_8pq_2subd_half.cu index 8d8e362ba..828008555 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t16_8pq_2subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, float, uint32_t, float>; template <> -const void* vpq_descriptor_spec<32, 256, 
8, 2, half, float, uint32_t, float>::init_kernel = +const void* vpq_descriptor_spec<16, 256, 8, 2, half, float, uint32_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<32, 256, 8, 2, half, float, uint32_t, float>); -template struct vpq_descriptor_spec<32, 256, 8, 2, half, float, uint32_t, float>; + &vpq_dataset_descriptor_init_kernel<16, 256, 8, 2, half, float, uint32_t, float>); +template struct vpq_descriptor_spec<16, 256, 8, 2, half, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t16_8pq_4subd_half.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t16_8pq_4subd_half.cu index e15768763..49a449384 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t16_8pq_4subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, float, uint32_t, float>; template <> -const void* vpq_descriptor_spec<32, 256, 8, 4, half, float, uint32_t, float>::init_kernel = +const void* vpq_descriptor_spec<16, 256, 8, 4, half, float, uint32_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<32, 256, 8, 4, half, float, uint32_t, float>); -template struct vpq_descriptor_spec<32, 256, 8, 4, half, float, uint32_t, float>; + &vpq_dataset_descriptor_init_kernel<16, 256, 8, 4, half, float, uint32_t, float>); +template struct vpq_descriptor_spec<16, 256, 8, 4, half, float, 
uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim512_t32_8pq_2subd_half.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim512_t32_8pq_2subd_half.cu index c1ed84237..2d40ae3dd 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim512_t32_8pq_2subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, float, uint32_t, float>; template <> -const void* vpq_descriptor_spec<16, 128, 8, 2, half, float, uint32_t, float>::init_kernel = +const void* vpq_descriptor_spec<32, 512, 8, 2, half, float, uint32_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<16, 128, 8, 2, half, float, uint32_t, float>); -template struct vpq_descriptor_spec<16, 128, 8, 2, half, float, uint32_t, float>; + &vpq_dataset_descriptor_init_kernel<32, 512, 8, 2, half, float, uint32_t, float>); +template struct vpq_descriptor_spec<32, 512, 8, 2, half, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim512_t32_8pq_4subd_half.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu rename to 
cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim512_t32_8pq_4subd_half.cu index bbf00dbc6..2dfad4f28 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim512_t32_8pq_4subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, float, uint32_t, float>; template <> -const void* vpq_descriptor_spec<16, 128, 8, 4, half, float, uint32_t, float>::init_kernel = +const void* vpq_descriptor_spec<32, 512, 8, 4, half, float, uint32_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<16, 128, 8, 4, half, float, uint32_t, float>); -template struct vpq_descriptor_spec<16, 128, 8, 4, half, float, uint32_t, float>; + &vpq_dataset_descriptor_init_kernel<32, 512, 8, 4, half, float, uint32_t, float>); +template struct vpq_descriptor_spec<32, 512, 8, 4, half, float, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t8_8pq_2subd_half.cu new file mode 100644 index 000000000..e97a4a840 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t8_8pq_2subd_half.cu @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, float, uint64_t, float>; +template <> +const void* vpq_descriptor_spec<8, 128, 8, 2, half, float, uint64_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<8, 128, 8, 2, half, float, uint64_t, float>); +template struct vpq_descriptor_spec<8, 128, 8, 2, half, float, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t8_8pq_4subd_half.cu new file mode 100644 index 000000000..2b20e7af3 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t8_8pq_4subd_half.cu @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, float, uint64_t, float>; +template <> +const void* vpq_descriptor_spec<8, 128, 8, 4, half, float, uint64_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<8, 128, 8, 4, half, float, uint64_t, float>); +template struct vpq_descriptor_spec<8, 128, 8, 4, half, float, uint64_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t16_8pq_2subd_half.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t16_8pq_2subd_half.cu index 4ff1ca7d4..b624e4cc9 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t16_8pq_2subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, float, uint64_t, float>; +template struct 
cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, float, uint64_t, float>; template <> -const void* vpq_descriptor_spec<16, 128, 8, 2, half, float, uint64_t, float>::init_kernel = +const void* vpq_descriptor_spec<16, 256, 8, 2, half, float, uint64_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<16, 128, 8, 2, half, float, uint64_t, float>); -template struct vpq_descriptor_spec<16, 128, 8, 2, half, float, uint64_t, float>; + &vpq_dataset_descriptor_init_kernel<16, 256, 8, 2, half, float, uint64_t, float>); +template struct vpq_descriptor_spec<16, 256, 8, 2, half, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t16_8pq_4subd_half.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t16_8pq_4subd_half.cu index a7c1b2cc7..8efd32c96 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t16_8pq_4subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, float, uint64_t, float>; +template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, float, uint64_t, float>; template <> -const void* vpq_descriptor_spec<16, 128, 8, 4, half, float, uint64_t, float>::init_kernel = +const void* vpq_descriptor_spec<16, 256, 8, 4, half, float, uint64_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<16, 128, 8, 4, half, float, uint64_t, float>); -template struct vpq_descriptor_spec<16, 128, 8, 4, half, float, uint64_t, float>; + 
&vpq_dataset_descriptor_init_kernel<16, 256, 8, 4, half, float, uint64_t, float>); +template struct vpq_descriptor_spec<16, 256, 8, 4, half, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim512_t32_8pq_2subd_half.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim512_t32_8pq_2subd_half.cu index cb5d0c592..9a62f74f0 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim512_t32_8pq_2subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, float, uint64_t, float>; +template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, float, uint64_t, float>; template <> -const void* vpq_descriptor_spec<32, 256, 8, 2, half, float, uint64_t, float>::init_kernel = +const void* vpq_descriptor_spec<32, 512, 8, 2, half, float, uint64_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<32, 256, 8, 2, half, float, uint64_t, float>); -template struct vpq_descriptor_spec<32, 256, 8, 2, half, float, uint64_t, float>; + &vpq_dataset_descriptor_init_kernel<32, 512, 8, 2, half, float, uint64_t, float>); +template struct vpq_descriptor_spec<32, 512, 8, 2, half, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim512_t32_8pq_4subd_half.cu similarity index 80% rename from 
cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim512_t32_8pq_4subd_half.cu index a48603b23..7b344cd07 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim512_t32_8pq_4subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, float, uint64_t, float>; +template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, float, uint64_t, float>; template <> -const void* vpq_descriptor_spec<32, 256, 8, 4, half, float, uint64_t, float>::init_kernel = +const void* vpq_descriptor_spec<32, 512, 8, 4, half, float, uint64_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<32, 256, 8, 4, half, float, uint64_t, float>); -template struct vpq_descriptor_spec<32, 256, 8, 4, half, float, uint64_t, float>; + &vpq_dataset_descriptor_init_kernel<32, 512, 8, 4, half, float, uint64_t, float>); +template struct vpq_descriptor_spec<32, 512, 8, 4, half, float, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t8_8pq_2subd_half.cu similarity index 78% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t8_8pq_2subd_half.cu index 68e2778b3..a4f9676d8 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t8_8pq_2subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { 
-template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, half, uint32_t, float>; template <> -const void* vpq_descriptor_spec<8, 64, 8, 2, half, float, uint32_t, float>::init_kernel = +const void* vpq_descriptor_spec<8, 128, 8, 2, half, half, uint32_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<8, 64, 8, 2, half, float, uint32_t, float>); -template struct vpq_descriptor_spec<8, 64, 8, 2, half, float, uint32_t, float>; + &vpq_dataset_descriptor_init_kernel<8, 128, 8, 2, half, half, uint32_t, float>); +template struct vpq_descriptor_spec<8, 128, 8, 2, half, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t8_8pq_4subd_half.cu similarity index 78% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t8_8pq_4subd_half.cu index 62789ef33..eccd180d2 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim64_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t8_8pq_4subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, half, uint32_t, float>; template <> -const void* vpq_descriptor_spec<8, 64, 8, 4, half, float, uint32_t, float>::init_kernel = +const void* vpq_descriptor_spec<8, 128, 8, 4, half, half, uint32_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<8, 64, 8, 4, half, float, uint32_t, float>); -template struct vpq_descriptor_spec<8, 
64, 8, 4, half, float, uint32_t, float>; + &vpq_dataset_descriptor_init_kernel<8, 128, 8, 4, half, half, uint32_t, float>); +template struct vpq_descriptor_spec<8, 128, 8, 4, half, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t16_8pq_2subd_half.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t16_8pq_2subd_half.cu index 6c3b8643e..ebda2c92d 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t16_8pq_2subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, half, uint32_t, float>; template <> -const void* vpq_descriptor_spec<16, 128, 8, 2, half, half, uint32_t, float>::init_kernel = +const void* vpq_descriptor_spec<16, 256, 8, 2, half, half, uint32_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<16, 128, 8, 2, half, half, uint32_t, float>); -template struct vpq_descriptor_spec<16, 128, 8, 2, half, half, uint32_t, float>; + &vpq_dataset_descriptor_init_kernel<16, 256, 8, 2, half, half, uint32_t, float>); +template struct vpq_descriptor_spec<16, 256, 8, 2, half, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t16_8pq_4subd_half.cu similarity index 80% rename from 
cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t16_8pq_4subd_half.cu index 57b50e7b0..88b1e1678 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t16_8pq_4subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, half, uint32_t, float>; template <> -const void* vpq_descriptor_spec<16, 128, 8, 4, half, half, uint32_t, float>::init_kernel = +const void* vpq_descriptor_spec<16, 256, 8, 4, half, half, uint32_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<16, 128, 8, 4, half, half, uint32_t, float>); -template struct vpq_descriptor_spec<16, 128, 8, 4, half, half, uint32_t, float>; + &vpq_dataset_descriptor_init_kernel<16, 256, 8, 4, half, half, uint32_t, float>); +template struct vpq_descriptor_spec<16, 256, 8, 4, half, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim512_t32_8pq_2subd_half.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim512_t32_8pq_2subd_half.cu index a9ed297c3..94d4f1f84 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim512_t32_8pq_2subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template 
struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, half, uint32_t, float>; template <> -const void* vpq_descriptor_spec<32, 256, 8, 2, half, half, uint32_t, float>::init_kernel = +const void* vpq_descriptor_spec<32, 512, 8, 2, half, half, uint32_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<32, 256, 8, 2, half, half, uint32_t, float>); -template struct vpq_descriptor_spec<32, 256, 8, 2, half, half, uint32_t, float>; + &vpq_dataset_descriptor_init_kernel<32, 512, 8, 2, half, half, uint32_t, float>); +template struct vpq_descriptor_spec<32, 512, 8, 2, half, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim512_t32_8pq_4subd_half.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim512_t32_8pq_4subd_half.cu index 12696685a..e8249238d 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim512_t32_8pq_4subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, half, uint32_t, float>; template <> -const void* vpq_descriptor_spec<32, 256, 8, 4, half, half, uint32_t, float>::init_kernel = +const void* vpq_descriptor_spec<32, 512, 8, 4, half, half, uint32_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<32, 256, 8, 4, half, half, uint32_t, float>); -template struct 
vpq_descriptor_spec<32, 256, 8, 4, half, half, uint32_t, float>; + &vpq_dataset_descriptor_init_kernel<32, 512, 8, 4, half, half, uint32_t, float>); +template struct vpq_descriptor_spec<32, 512, 8, 4, half, half, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu deleted file mode 100644 index 9bee50622..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<8, 64, 8, 2, half, half, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<8, 64, 8, 2, half, half, uint32_t, float>); -template struct vpq_descriptor_spec<8, 64, 8, 2, half, half, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu deleted file mode 100644 index 3a159e041..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim64_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<8, 64, 8, 4, half, half, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<8, 64, 8, 4, half, half, uint32_t, float>); -template struct vpq_descriptor_spec<8, 64, 8, 4, half, half, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t8_8pq_2subd_half.cu similarity index 78% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t8_8pq_2subd_half.cu index 02c88fc00..cee5f07a3 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t8_8pq_2subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, float, uint64_t, float>; +template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, half, uint64_t, float>; template <> -const void* vpq_descriptor_spec<8, 64, 8, 2, half, float, uint64_t, float>::init_kernel = +const void* vpq_descriptor_spec<8, 128, 8, 2, half, half, uint64_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<8, 64, 8, 2, half, float, uint64_t, float>); -template struct vpq_descriptor_spec<8, 64, 8, 2, half, float, uint64_t, float>; + 
&vpq_dataset_descriptor_init_kernel<8, 128, 8, 2, half, half, uint64_t, float>); +template struct vpq_descriptor_spec<8, 128, 8, 2, half, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t8_8pq_4subd_half.cu similarity index 78% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t8_8pq_4subd_half.cu index 8ca8b6de7..9b1daa3e3 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim64_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t8_8pq_4subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, float, uint64_t, float>; +template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, half, uint64_t, float>; template <> -const void* vpq_descriptor_spec<8, 64, 8, 4, half, float, uint64_t, float>::init_kernel = +const void* vpq_descriptor_spec<8, 128, 8, 4, half, half, uint64_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<8, 64, 8, 4, half, float, uint64_t, float>); -template struct vpq_descriptor_spec<8, 64, 8, 4, half, float, uint64_t, float>; + &vpq_dataset_descriptor_init_kernel<8, 128, 8, 4, half, half, uint64_t, float>); +template struct vpq_descriptor_spec<8, 128, 8, 4, half, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t16_8pq_2subd_half.cu similarity index 80% rename from 
cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t16_8pq_2subd_half.cu index 9b269593a..7fb295f55 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t16_8pq_2subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, half, uint64_t, float>; +template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, half, uint64_t, float>; template <> -const void* vpq_descriptor_spec<16, 128, 8, 2, half, half, uint64_t, float>::init_kernel = +const void* vpq_descriptor_spec<16, 256, 8, 2, half, half, uint64_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<16, 128, 8, 2, half, half, uint64_t, float>); -template struct vpq_descriptor_spec<16, 128, 8, 2, half, half, uint64_t, float>; + &vpq_dataset_descriptor_init_kernel<16, 256, 8, 2, half, half, uint64_t, float>); +template struct vpq_descriptor_spec<16, 256, 8, 2, half, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t16_8pq_4subd_half.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t16_8pq_4subd_half.cu index 464d0b6a2..712d28082 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t16_8pq_4subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template 
struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, half, uint64_t, float>; +template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, half, uint64_t, float>; template <> -const void* vpq_descriptor_spec<16, 128, 8, 4, half, half, uint64_t, float>::init_kernel = +const void* vpq_descriptor_spec<16, 256, 8, 4, half, half, uint64_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<16, 128, 8, 4, half, half, uint64_t, float>); -template struct vpq_descriptor_spec<16, 128, 8, 4, half, half, uint64_t, float>; + &vpq_dataset_descriptor_init_kernel<16, 256, 8, 4, half, half, uint64_t, float>); +template struct vpq_descriptor_spec<16, 256, 8, 4, half, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim512_t32_8pq_2subd_half.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim512_t32_8pq_2subd_half.cu index 4012291e0..307991526 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim512_t32_8pq_2subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, half, uint64_t, float>; +template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, half, uint64_t, float>; template <> -const void* vpq_descriptor_spec<32, 256, 8, 2, half, half, uint64_t, float>::init_kernel = +const void* vpq_descriptor_spec<32, 512, 8, 2, half, half, uint64_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<32, 256, 8, 2, half, half, uint64_t, float>); -template struct 
vpq_descriptor_spec<32, 256, 8, 2, half, half, uint64_t, float>; + &vpq_dataset_descriptor_init_kernel<32, 512, 8, 2, half, half, uint64_t, float>); +template struct vpq_descriptor_spec<32, 512, 8, 2, half, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim512_t32_8pq_4subd_half.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim512_t32_8pq_4subd_half.cu index 2339a7174..341f70bc0 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim512_t32_8pq_4subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, half, uint64_t, float>; +template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, half, uint64_t, float>; template <> -const void* vpq_descriptor_spec<32, 256, 8, 4, half, half, uint64_t, float>::init_kernel = +const void* vpq_descriptor_spec<32, 512, 8, 4, half, half, uint64_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<32, 256, 8, 4, half, half, uint64_t, float>); -template struct vpq_descriptor_spec<32, 256, 8, 4, half, half, uint64_t, float>; + &vpq_dataset_descriptor_init_kernel<32, 512, 8, 4, half, half, uint64_t, float>); +template struct vpq_descriptor_spec<32, 512, 8, 4, half, half, uint64_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu deleted file mode 100644 
index b3ca6c6eb..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, half, uint64_t, float>; -template <> -const void* vpq_descriptor_spec<8, 64, 8, 2, half, half, uint64_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<8, 64, 8, 2, half, half, uint64_t, float>); -template struct vpq_descriptor_spec<8, 64, 8, 2, half, half, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu deleted file mode 100644 index a1224a1e7..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim64_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, half, uint64_t, float>; -template <> -const void* vpq_descriptor_spec<8, 64, 8, 4, half, half, uint64_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<8, 64, 8, 4, half, half, uint64_t, float>); -template struct vpq_descriptor_spec<8, 64, 8, 4, half, half, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t8_8pq_2subd_half.cu similarity index 78% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t8_8pq_2subd_half.cu index 77364b65d..f17e58da7 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t8_8pq_2subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template 
struct cagra_q_dataset_descriptor_t<8, 64, 8, 2, half, uint8_t, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, int8_t, uint32_t, float>; template <> -const void* vpq_descriptor_spec<8, 64, 8, 2, half, uint8_t, uint32_t, float>::init_kernel = +const void* vpq_descriptor_spec<8, 128, 8, 2, half, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<8, 64, 8, 2, half, uint8_t, uint32_t, float>); -template struct vpq_descriptor_spec<8, 64, 8, 2, half, uint8_t, uint32_t, float>; + &vpq_dataset_descriptor_init_kernel<8, 128, 8, 2, half, int8_t, uint32_t, float>); +template struct vpq_descriptor_spec<8, 128, 8, 2, half, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t8_8pq_4subd_half.cu similarity index 78% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t8_8pq_4subd_half.cu index 455617b72..21568247a 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim64_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t8_8pq_4subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<8, 64, 8, 4, half, uint8_t, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, int8_t, uint32_t, float>; template <> -const void* vpq_descriptor_spec<8, 64, 8, 4, half, uint8_t, uint32_t, float>::init_kernel = +const void* vpq_descriptor_spec<8, 128, 8, 4, half, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<8, 64, 8, 4, half, uint8_t, uint32_t, float>); -template struct 
vpq_descriptor_spec<8, 64, 8, 4, half, uint8_t, uint32_t, float>; + &vpq_dataset_descriptor_init_kernel<8, 128, 8, 4, half, int8_t, uint32_t, float>); +template struct vpq_descriptor_spec<8, 128, 8, 4, half, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t16_8pq_2subd_half.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t16_8pq_2subd_half.cu index bcc8bb81e..e164c976f 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t16_8pq_2subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, int8_t, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, int8_t, uint32_t, float>; template <> -const void* vpq_descriptor_spec<16, 128, 8, 2, half, int8_t, uint32_t, float>::init_kernel = +const void* vpq_descriptor_spec<16, 256, 8, 2, half, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<16, 128, 8, 2, half, int8_t, uint32_t, float>); -template struct vpq_descriptor_spec<16, 128, 8, 2, half, int8_t, uint32_t, float>; + &vpq_dataset_descriptor_init_kernel<16, 256, 8, 2, half, int8_t, uint32_t, float>); +template struct vpq_descriptor_spec<16, 256, 8, 2, half, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t16_8pq_4subd_half.cu 
similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t16_8pq_4subd_half.cu index 1c11d398d..4880d6718 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t16_8pq_4subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, int8_t, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, int8_t, uint32_t, float>; template <> -const void* vpq_descriptor_spec<16, 128, 8, 4, half, int8_t, uint32_t, float>::init_kernel = +const void* vpq_descriptor_spec<16, 256, 8, 4, half, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<16, 128, 8, 4, half, int8_t, uint32_t, float>); -template struct vpq_descriptor_spec<16, 128, 8, 4, half, int8_t, uint32_t, float>; + &vpq_dataset_descriptor_init_kernel<16, 256, 8, 4, half, int8_t, uint32_t, float>); +template struct vpq_descriptor_spec<16, 256, 8, 4, half, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim512_t32_8pq_2subd_half.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim512_t32_8pq_2subd_half.cu index 7bf78c8d0..7c9c44911 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim512_t32_8pq_2subd_half.cu @@ -27,11 +27,11 @@ 
namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, int8_t, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, int8_t, uint32_t, float>; template <> -const void* vpq_descriptor_spec<32, 256, 8, 2, half, int8_t, uint32_t, float>::init_kernel = +const void* vpq_descriptor_spec<32, 512, 8, 2, half, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<32, 256, 8, 2, half, int8_t, uint32_t, float>); -template struct vpq_descriptor_spec<32, 256, 8, 2, half, int8_t, uint32_t, float>; + &vpq_dataset_descriptor_init_kernel<32, 512, 8, 2, half, int8_t, uint32_t, float>); +template struct vpq_descriptor_spec<32, 512, 8, 2, half, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim512_t32_8pq_4subd_half.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim512_t32_8pq_4subd_half.cu index 1934cb347..c44f82c2e 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim512_t32_8pq_4subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, int8_t, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, int8_t, uint32_t, float>; template <> -const void* vpq_descriptor_spec<32, 256, 8, 4, half, int8_t, uint32_t, float>::init_kernel = +const void* vpq_descriptor_spec<32, 512, 8, 4, half, int8_t, uint32_t, float>::init_kernel = reinterpret_cast( - 
&vpq_dataset_descriptor_init_kernel<32, 256, 8, 4, half, int8_t, uint32_t, float>); -template struct vpq_descriptor_spec<32, 256, 8, 4, half, int8_t, uint32_t, float>; + &vpq_dataset_descriptor_init_kernel<32, 512, 8, 4, half, int8_t, uint32_t, float>); +template struct vpq_descriptor_spec<32, 512, 8, 4, half, int8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t8_8pq_2subd_half.cu new file mode 100644 index 000000000..dac083b05 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t8_8pq_2subd_half.cu @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, uint8_t, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<8, 128, 8, 2, half, uint8_t, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<8, 128, 8, 2, half, uint8_t, uint32_t, float>); +template struct vpq_descriptor_spec<8, 128, 8, 2, half, uint8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t8_8pq_4subd_half.cu new file mode 100644 index 000000000..15fe73593 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t8_8pq_4subd_half.cu @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, uint8_t, uint32_t, float>; +template <> +const void* vpq_descriptor_spec<8, 128, 8, 4, half, uint8_t, uint32_t, float>::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel<8, 128, 8, 4, half, uint8_t, uint32_t, float>); +template struct vpq_descriptor_spec<8, 128, 8, 4, half, uint8_t, uint32_t, float>; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t16_8pq_2subd_half.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t16_8pq_2subd_half.cu index 17cab7eee..df5b01e2a 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t16_8pq_2subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<16, 128, 8, 2, half, uint8_t, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, uint8_t, uint32_t, float>; template <> -const void* vpq_descriptor_spec<16, 128, 8, 2, half, uint8_t, uint32_t, float>::init_kernel = +const void* vpq_descriptor_spec<16, 256, 8, 2, half, uint8_t, uint32_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<16, 128, 8, 2, half, uint8_t, uint32_t, float>); -template struct vpq_descriptor_spec<16, 128, 8, 2, 
half, uint8_t, uint32_t, float>; + &vpq_dataset_descriptor_init_kernel<16, 256, 8, 2, half, uint8_t, uint32_t, float>); +template struct vpq_descriptor_spec<16, 256, 8, 2, half, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t16_8pq_4subd_half.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t16_8pq_4subd_half.cu index 9a60a2afe..edf8361a3 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t16_8pq_4subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<16, 128, 8, 4, half, uint8_t, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, uint8_t, uint32_t, float>; template <> -const void* vpq_descriptor_spec<16, 128, 8, 4, half, uint8_t, uint32_t, float>::init_kernel = +const void* vpq_descriptor_spec<16, 256, 8, 4, half, uint8_t, uint32_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<16, 128, 8, 4, half, uint8_t, uint32_t, float>); -template struct vpq_descriptor_spec<16, 128, 8, 4, half, uint8_t, uint32_t, float>; + &vpq_dataset_descriptor_init_kernel<16, 256, 8, 4, half, uint8_t, uint32_t, float>); +template struct vpq_descriptor_spec<16, 256, 8, 4, half, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim512_t32_8pq_2subd_half.cu similarity index 
80% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim512_t32_8pq_2subd_half.cu index a402455b8..fc40634d5 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim512_t32_8pq_2subd_half.cu @@ -27,11 +27,11 @@ namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<32, 256, 8, 2, half, uint8_t, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, uint8_t, uint32_t, float>; template <> -const void* vpq_descriptor_spec<32, 256, 8, 2, half, uint8_t, uint32_t, float>::init_kernel = +const void* vpq_descriptor_spec<32, 512, 8, 2, half, uint8_t, uint32_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<32, 256, 8, 2, half, uint8_t, uint32_t, float>); -template struct vpq_descriptor_spec<32, 256, 8, 2, half, uint8_t, uint32_t, float>; + &vpq_dataset_descriptor_init_kernel<32, 512, 8, 2, half, uint8_t, uint32_t, float>); +template struct vpq_descriptor_spec<32, 512, 8, 2, half, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim512_t32_8pq_4subd_half.cu similarity index 80% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim512_t32_8pq_4subd_half.cu index 5c33d098c..0b0d269f7 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim512_t32_8pq_4subd_half.cu @@ -27,11 +27,11 @@ 
namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t<32, 256, 8, 4, half, uint8_t, uint32_t, float>; +template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, uint8_t, uint32_t, float>; template <> -const void* vpq_descriptor_spec<32, 256, 8, 4, half, uint8_t, uint32_t, float>::init_kernel = +const void* vpq_descriptor_spec<32, 512, 8, 4, half, uint8_t, uint32_t, float>::init_kernel = reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<32, 256, 8, 4, half, uint8_t, uint32_t, float>); -template struct vpq_descriptor_spec<32, 256, 8, 4, half, uint8_t, uint32_t, float>; + &vpq_dataset_descriptor_init_kernel<32, 512, 8, 4, half, uint8_t, uint32_t, float>); +template struct vpq_descriptor_spec<32, 512, 8, 4, half, uint8_t, uint32_t, float>; } // namespace cuvs::neighbors::cagra::detail From 6082bf72becc016a44ffe239494caeb7ccbaddae Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 23 Aug 2024 17:30:12 +0200 Subject: [PATCH 14/41] Trying various minor things to reduce register spilling --- cpp/CMakeLists.txt | 4 +- .../cagra/compute_distance_standard.cuh | 17 +++--- .../detail/cagra/compute_distance_vpq.cuh | 16 ++--- .../neighbors/detail/cagra/device_common.hpp | 59 ++++++++++--------- cpp/src/neighbors/detail/cagra/utils.hpp | 10 ++-- 5 files changed, 54 insertions(+), 52 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1843ca6e7..8bd706ac8 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -273,8 +273,8 @@ add_library( src/neighbors/detail/cagra/search_single_cta_half_uint64.cu ) -file(GLOB_RECURSE distance_core_sources "src/neighbors/detail/cagra/compute_distance_*.cu") -set_source_files_properties(${distance_core_sources} PROPERTIES COMPILE_FLAGS -maxrregcount=64) +file(GLOB_RECURSE compute_distance_sources "src/neighbors/detail/cagra/compute_distance_*.cu") +set_source_files_properties(${compute_distance_sources} PROPERTIES COMPILE_FLAGS -maxrregcount=64) set_target_properties( 
cuvs-cagra-search diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh index 6e028f096..4b3bca1f2 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh @@ -117,9 +117,9 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_standard( template _RAFT_DEVICE __noinline__ auto compute_distance_standard( const typename DescriptorT::base_type* desc_, - typename DescriptorT::INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) -> typename DescriptorT::DISTANCE_T + const typename DescriptorT::INDEX_T dataset_index, + const cuvs::distance::DistanceType metric, + const bool valid) -> typename DescriptorT::DISTANCE_T { using DATA_T = typename DescriptorT::DATA_T; using DISTANCE_T = typename DescriptorT::DISTANCE_T; @@ -129,11 +129,12 @@ _RAFT_DEVICE __noinline__ auto compute_distance_standard( constexpr auto kTeamSize = DescriptorT::kTeamSize; constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim; - auto* __restrict__ desc = reinterpret_cast(desc_); - auto* __restrict__ query_ptr = reinterpret_cast(desc + 1); - const auto dataset_ptr = desc->ptr + (static_cast(desc->ld) * dataset_index); - const unsigned lane_id = threadIdx.x % kTeamSize; - auto dim = desc->dim; + const auto* __restrict__ desc = reinterpret_cast(desc_); + const auto* __restrict__ query_ptr = reinterpret_cast(desc + 1); + const auto* __restrict__ dataset_ptr = + desc->ptr + (static_cast(desc->ld) * dataset_index); + const auto lane_id = threadIdx.x % kTeamSize; + const auto dim = desc->dim; DISTANCE_T norm2 = 0; if (valid) { diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh index 896814ff6..84c3617a9 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh @@ -173,9 
+173,9 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const typename DescriptorT::b template _RAFT_DEVICE __noinline__ auto compute_distance_vpq( const typename DescriptorT::base_type* desc_, - typename DescriptorT::INDEX_T dataset_index, - cuvs::distance::DistanceType /* only L2 metric is implemented */, - bool valid) -> typename DescriptorT::DISTANCE_T + const typename DescriptorT::INDEX_T dataset_index, + const cuvs::distance::DistanceType /* only L2 metric is implemented */, + const bool valid) -> typename DescriptorT::DISTANCE_T { using DATA_T = typename DescriptorT::DATA_T; using DISTANCE_T = typename DescriptorT::DISTANCE_T; @@ -188,11 +188,11 @@ _RAFT_DEVICE __noinline__ auto compute_distance_vpq( constexpr auto PQ_BITS = DescriptorT::kPqBits; constexpr auto PQ_LEN = DescriptorT::kPqLen; - auto* __restrict__ desc = reinterpret_cast(desc_); - auto* __restrict__ codebook_ptr = reinterpret_cast(desc + 1); - auto* __restrict__ query_ptr = reinterpret_cast( + const auto* __restrict__ desc = reinterpret_cast(desc_); + const auto* __restrict__ codebook_ptr = reinterpret_cast(desc + 1); + const auto* __restrict__ query_ptr = reinterpret_cast( reinterpret_cast(codebook_ptr) + DescriptorT::kSMemCodeBookSizeInBytes); - auto* __restrict__ node_ptr = + const auto* __restrict__ node_ptr = desc->encoded_dataset_ptr + (static_cast(desc->encoded_dataset_dim) * dataset_index); const auto dim = desc->dim; @@ -207,7 +207,7 @@ _RAFT_DEVICE __noinline__ auto compute_distance_vpq( raft::div_rounding_up_unsafe(DatasetBlockDim / PQ_LEN, TeamSize * vlen); // Loading PQ codes uint32_t pq_codes[nelem]; -#pragma unroll +#pragma unroll 1 for (std::uint32_t e = 0; e < nelem; e++) { const std::uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset / PQ_LEN; if (k >= desc->n_subspace) break; diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp index 88c2b2546..ef372c16d 100644 --- 
a/cpp/src/neighbors/detail/cagra/device_common.hpp +++ b/cpp/src/neighbors/detail/cagra/device_common.hpp @@ -22,6 +22,7 @@ // TODO: This shouldn't be invoking anything in detail APIs outside of cuvs/neighbors #include +#include #include @@ -73,19 +74,19 @@ template RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes( - IndexT* result_indices_ptr, // [num_pickup] - DistanceT* result_distances_ptr, // [num_pickup] - const DATASET_DESCRIPTOR_T& dataset_desc, - size_t num_pickup, - unsigned num_distilation, - uint64_t rand_xor_mask, - const IndexT* seed_ptr, // [num_seeds] - uint32_t num_seeds, - IndexT* visited_hash_ptr, - uint32_t hash_bitlen, - cuvs::distance::DistanceType metric, - uint32_t block_id = 0, - uint32_t num_blocks = 1) + IndexT* __restrict__ result_indices_ptr, // [num_pickup] + DistanceT* __restrict__ result_distances_ptr, // [num_pickup] + const DATASET_DESCRIPTOR_T& __restrict__ dataset_desc, + const size_t num_pickup, + const unsigned num_distilation, + const uint64_t rand_xor_mask, + const IndexT* __restrict__ seed_ptr, // [num_seeds] + const uint32_t num_seeds, + IndexT* __restrict__ visited_hash_ptr, + const uint32_t hash_bitlen, + const cuvs::distance::DistanceType metric, + const uint32_t block_id = 0, + const uint32_t num_blocks = 1) { const auto team_size = dataset_desc.team_size; uint32_t max_i = num_pickup; @@ -97,7 +98,7 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes( const bool valid_i = (i < num_pickup); IndexT best_index_team_local; - DistanceT best_norm2_team_local = utils::get_max_value(); + DistanceT best_norm2_team_local = raft::upper_bound(); for (uint32_t j = 0; j < num_distilation; j++) { // Select a node randomly and compute the distance to it IndexT seed_index; @@ -125,8 +126,8 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes( result_distances_ptr[i] = best_norm2_team_local; result_indices_ptr[i] = best_index_team_local; } else { - result_distances_ptr[i] = 
utils::get_max_value(); - result_indices_ptr[i] = utils::get_max_value(); + result_distances_ptr[i] = raft::upper_bound(); + result_indices_ptr[i] = raft::upper_bound(); } } } @@ -134,23 +135,23 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes( template RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes( - IndexT* result_child_indices_ptr, - DistanceT* result_child_distances_ptr, + IndexT* __restrict__ result_child_indices_ptr, + DistanceT* __restrict__ result_child_distances_ptr, // [dataset_dim, dataset_size] - const DATASET_DESCRIPTOR_T& dataset_desc, + const DATASET_DESCRIPTOR_T& __restrict__ dataset_desc, // [knn_k, dataset_size] - const IndexT* knn_graph, - uint32_t knn_k, + const IndexT* __restrict__ knn_graph, + const uint32_t knn_k, // hashmap - IndexT* visited_hashmap_ptr, - uint32_t hash_bitlen, - const IndexT* parent_indices, - const IndexT* internal_topk_list, - uint32_t search_width, - cuvs::distance::DistanceType metric) + IndexT* __restrict__ visited_hashmap_ptr, + const uint32_t hash_bitlen, + const IndexT* __restrict__ parent_indices, + const IndexT* __restrict__ internal_topk_list, + const uint32_t search_width, + const cuvs::distance::DistanceType metric) { constexpr IndexT index_msb_1_mask = utils::gen_index_msb_1_mask::value; - const IndexT invalid_index = utils::get_max_value(); + constexpr IndexT invalid_index = raft::upper_bound(); // Read child indices of parents from knn graph and check if the distance // computaiton is necessary. 
@@ -190,7 +191,7 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes( if (child_id != invalid_index) { result_child_distances_ptr[i] = norm2; } else { - result_child_distances_ptr[i] = utils::get_max_value(); + result_child_distances_ptr[i] = raft::upper_bound(); } } } diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp index 8ce20ec5c..0f8309328 100644 --- a/cpp/src/neighbors/detail/cagra/utils.hpp +++ b/cpp/src/neighbors/detail/cagra/utils.hpp @@ -125,24 +125,24 @@ union fp_conv { FP_T fp; }; template -_RAFT_HOST_DEVICE inline T get_max_value(); +_RAFT_HOST_DEVICE constexpr inline T get_max_value(); template <> -_RAFT_HOST_DEVICE inline float get_max_value() +_RAFT_HOST_DEVICE constexpr inline float get_max_value() { return FLT_MAX; }; template <> -_RAFT_HOST_DEVICE inline half get_max_value() +_RAFT_HOST_DEVICE constexpr inline half get_max_value() { return fp_conv{.bs = 0x7aff}.fp; }; template <> -_RAFT_HOST_DEVICE inline std::uint32_t get_max_value() +_RAFT_HOST_DEVICE constexpr inline std::uint32_t get_max_value() { return 0xffffffffu; }; template <> -_RAFT_HOST_DEVICE inline std::uint64_t get_max_value() +_RAFT_HOST_DEVICE constexpr inline std::uint64_t get_max_value() { return 0xfffffffffffffffflu; }; From fc7d83248ffb2dc6042850dbd8c1a4075a67320d Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 26 Aug 2024 15:24:44 +0200 Subject: [PATCH 15/41] Move the metric parameter to the compute_distance template --- cpp/CMakeLists.txt | 126 +- .../neighbors/detail/cagra/cagra_search.cuh | 48 +- .../detail/cagra/compute_distance-ext.cuh | 2364 +++++++++++++++-- .../detail/cagra/compute_distance.cu | 584 +++- .../detail/cagra/compute_distance.hpp | 41 +- .../cagra/compute_distance_00_generate.py | 53 +- .../cagra/compute_distance_standard.cuh | 49 +- ...ard_InnerProduct_float_uint32_dim128_t8.cu | 57 + ...rd_InnerProduct_float_uint32_dim256_t16.cu | 57 + ...rd_InnerProduct_float_uint32_dim512_t32.cu | 57 + 
...ard_InnerProduct_float_uint64_dim128_t8.cu | 57 + ...rd_InnerProduct_float_uint64_dim256_t16.cu | 57 + ...rd_InnerProduct_float_uint64_dim512_t32.cu | 57 + ...dard_InnerProduct_half_uint32_dim128_t8.cu | 57 + ...ard_InnerProduct_half_uint32_dim256_t16.cu | 57 + ...ard_InnerProduct_half_uint32_dim512_t32.cu | 57 + ...dard_InnerProduct_half_uint64_dim128_t8.cu | 57 + ...ard_InnerProduct_half_uint64_dim256_t16.cu | 57 + ...ard_InnerProduct_half_uint64_dim512_t32.cu | 57 + ...dard_InnerProduct_int8_uint32_dim128_t8.cu | 57 + ...ard_InnerProduct_int8_uint32_dim256_t16.cu | 57 + ...ard_InnerProduct_int8_uint32_dim512_t32.cu | 57 + ...ard_InnerProduct_uint8_uint32_dim128_t8.cu | 57 + ...rd_InnerProduct_uint8_uint32_dim256_t16.cu | 57 + ...rd_InnerProduct_uint8_uint32_dim512_t32.cu | 57 + ...ndard_L2Expanded_float_uint32_dim128_t8.cu | 57 + ...dard_L2Expanded_float_uint32_dim256_t16.cu | 57 + ...dard_L2Expanded_float_uint32_dim512_t32.cu | 57 + ...ndard_L2Expanded_float_uint64_dim128_t8.cu | 57 + ...dard_L2Expanded_float_uint64_dim256_t16.cu | 57 + ...dard_L2Expanded_float_uint64_dim512_t32.cu | 57 + ...andard_L2Expanded_half_uint32_dim128_t8.cu | 57 + ...ndard_L2Expanded_half_uint32_dim256_t16.cu | 57 + ...ndard_L2Expanded_half_uint32_dim512_t32.cu | 57 + ...andard_L2Expanded_half_uint64_dim128_t8.cu | 57 + ...ndard_L2Expanded_half_uint64_dim256_t16.cu | 57 + ...ndard_L2Expanded_half_uint64_dim512_t32.cu | 57 + ...andard_L2Expanded_int8_uint32_dim128_t8.cu | 57 + ...ndard_L2Expanded_int8_uint32_dim256_t16.cu | 57 + ...ndard_L2Expanded_int8_uint32_dim512_t32.cu | 57 + ...ndard_L2Expanded_uint8_uint32_dim128_t8.cu | 57 + ...dard_L2Expanded_uint8_uint32_dim256_t16.cu | 57 + ...dard_L2Expanded_uint8_uint32_dim512_t32.cu | 57 + ...istance_standard_float_uint32_dim128_t8.cu | 37 - ...stance_standard_float_uint32_dim256_t16.cu | 37 - ...stance_standard_float_uint32_dim512_t32.cu | 37 - ...istance_standard_float_uint64_dim128_t8.cu | 37 - 
...stance_standard_float_uint64_dim256_t16.cu | 37 - ...stance_standard_float_uint64_dim512_t32.cu | 37 - ...distance_standard_half_uint32_dim128_t8.cu | 37 - ...istance_standard_half_uint32_dim256_t16.cu | 37 - ...istance_standard_half_uint32_dim512_t32.cu | 37 - ...distance_standard_half_uint64_dim128_t8.cu | 37 - ...istance_standard_half_uint64_dim256_t16.cu | 37 - ...istance_standard_half_uint64_dim512_t32.cu | 37 - ...distance_standard_int8_uint32_dim128_t8.cu | 37 - ...istance_standard_int8_uint32_dim256_t16.cu | 37 - ...istance_standard_int8_uint32_dim512_t32.cu | 37 - ...istance_standard_uint8_uint32_dim128_t8.cu | 37 - ...stance_standard_uint8_uint32_dim256_t16.cu | 37 - ...stance_standard_uint8_uint32_dim512_t32.cu | 37 - .../detail/cagra/compute_distance_vpq.cuh | 23 +- ...d_float_uint32_dim128_t8_8pq_2subd_half.cu | 69 + ...d_float_uint32_dim128_t8_8pq_4subd_half.cu | 69 + ..._float_uint32_dim256_t16_8pq_2subd_half.cu | 69 + ..._float_uint32_dim256_t16_8pq_4subd_half.cu | 69 + ..._float_uint32_dim512_t32_8pq_2subd_half.cu | 69 + ..._float_uint32_dim512_t32_8pq_4subd_half.cu | 69 + ...d_float_uint64_dim128_t8_8pq_2subd_half.cu | 69 + ...d_float_uint64_dim128_t8_8pq_4subd_half.cu | 69 + ..._float_uint64_dim256_t16_8pq_2subd_half.cu | 69 + ..._float_uint64_dim256_t16_8pq_4subd_half.cu | 69 + ..._float_uint64_dim512_t32_8pq_2subd_half.cu | 69 + ..._float_uint64_dim512_t32_8pq_4subd_half.cu | 69 + ...ed_half_uint32_dim128_t8_8pq_2subd_half.cu | 69 + ...ed_half_uint32_dim128_t8_8pq_4subd_half.cu | 69 + ...d_half_uint32_dim256_t16_8pq_2subd_half.cu | 69 + ...d_half_uint32_dim256_t16_8pq_4subd_half.cu | 69 + ...d_half_uint32_dim512_t32_8pq_2subd_half.cu | 69 + ...d_half_uint32_dim512_t32_8pq_4subd_half.cu | 69 + ...ed_half_uint64_dim128_t8_8pq_2subd_half.cu | 69 + ...ed_half_uint64_dim128_t8_8pq_4subd_half.cu | 69 + ...d_half_uint64_dim256_t16_8pq_2subd_half.cu | 69 + ...d_half_uint64_dim256_t16_8pq_4subd_half.cu | 69 + 
...d_half_uint64_dim512_t32_8pq_2subd_half.cu | 69 + ...d_half_uint64_dim512_t32_8pq_4subd_half.cu | 69 + ...ed_int8_uint32_dim128_t8_8pq_2subd_half.cu | 69 + ...ed_int8_uint32_dim128_t8_8pq_4subd_half.cu | 69 + ...d_int8_uint32_dim256_t16_8pq_2subd_half.cu | 69 + ...d_int8_uint32_dim256_t16_8pq_4subd_half.cu | 69 + ...d_int8_uint32_dim512_t32_8pq_2subd_half.cu | 69 + ...d_int8_uint32_dim512_t32_8pq_4subd_half.cu | 69 + ...d_uint8_uint32_dim128_t8_8pq_2subd_half.cu | 69 + ...d_uint8_uint32_dim128_t8_8pq_4subd_half.cu | 69 + ..._uint8_uint32_dim256_t16_8pq_2subd_half.cu | 69 + ..._uint8_uint32_dim256_t16_8pq_4subd_half.cu | 69 + ..._uint8_uint32_dim512_t32_8pq_2subd_half.cu | 69 + ..._uint8_uint32_dim512_t32_8pq_4subd_half.cu | 69 + ...q_float_uint32_dim128_t8_8pq_2subd_half.cu | 37 - ...q_float_uint32_dim128_t8_8pq_4subd_half.cu | 37 - ..._float_uint32_dim256_t16_8pq_2subd_half.cu | 37 - ..._float_uint32_dim256_t16_8pq_4subd_half.cu | 37 - ..._float_uint32_dim512_t32_8pq_2subd_half.cu | 37 - ..._float_uint32_dim512_t32_8pq_4subd_half.cu | 37 - ...q_float_uint64_dim128_t8_8pq_2subd_half.cu | 37 - ...q_float_uint64_dim128_t8_8pq_4subd_half.cu | 37 - ..._float_uint64_dim256_t16_8pq_2subd_half.cu | 37 - ..._float_uint64_dim256_t16_8pq_4subd_half.cu | 37 - ..._float_uint64_dim512_t32_8pq_2subd_half.cu | 37 - ..._float_uint64_dim512_t32_8pq_4subd_half.cu | 37 - ...pq_half_uint32_dim128_t8_8pq_2subd_half.cu | 37 - ...pq_half_uint32_dim128_t8_8pq_4subd_half.cu | 37 - ...q_half_uint32_dim256_t16_8pq_2subd_half.cu | 37 - ...q_half_uint32_dim256_t16_8pq_4subd_half.cu | 37 - ...q_half_uint32_dim512_t32_8pq_2subd_half.cu | 37 - ...q_half_uint32_dim512_t32_8pq_4subd_half.cu | 37 - ...pq_half_uint64_dim128_t8_8pq_2subd_half.cu | 37 - ...pq_half_uint64_dim128_t8_8pq_4subd_half.cu | 37 - ...q_half_uint64_dim256_t16_8pq_2subd_half.cu | 37 - ...q_half_uint64_dim256_t16_8pq_4subd_half.cu | 37 - ...q_half_uint64_dim512_t32_8pq_2subd_half.cu | 37 - 
...q_half_uint64_dim512_t32_8pq_4subd_half.cu | 37 - ...pq_int8_uint32_dim128_t8_8pq_2subd_half.cu | 37 - ...pq_int8_uint32_dim128_t8_8pq_4subd_half.cu | 37 - ...q_int8_uint32_dim256_t16_8pq_2subd_half.cu | 37 - ...q_int8_uint32_dim256_t16_8pq_4subd_half.cu | 37 - ...q_int8_uint32_dim512_t32_8pq_2subd_half.cu | 37 - ...q_int8_uint32_dim512_t32_8pq_4subd_half.cu | 37 - ...q_uint8_uint32_dim128_t8_8pq_2subd_half.cu | 37 - ...q_uint8_uint32_dim128_t8_8pq_4subd_half.cu | 37 - ..._uint8_uint32_dim256_t16_8pq_2subd_half.cu | 37 - ..._uint8_uint32_dim256_t16_8pq_4subd_half.cu | 37 - ..._uint8_uint32_dim512_t32_8pq_2subd_half.cu | 37 - ..._uint8_uint32_dim512_t32_8pq_4subd_half.cu | 37 - .../neighbors/detail/cagra/device_common.hpp | 8 +- cpp/src/neighbors/detail/cagra/factory.cuh | 11 +- .../detail/cagra/search_multi_cta.cuh | 6 +- .../detail/cagra/search_multi_cta_inst.cuh | 1 - .../cagra/search_multi_cta_kernel-inl.cuh | 11 +- .../detail/cagra/search_multi_cta_kernel.cuh | 1 - .../detail/cagra/search_multi_kernel.cuh | 25 +- .../neighbors/detail/cagra/search_plan.cuh | 14 +- .../detail/cagra/search_single_cta.cuh | 6 +- .../detail/cagra/search_single_cta_inst.cuh | 1 - .../cagra/search_single_cta_kernel-inl.cuh | 13 +- .../detail/cagra/search_single_cta_kernel.cuh | 1 - 146 files changed, 7431 insertions(+), 2489 deletions(-) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim256_t16.cu create mode 100644 
cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim128_t8.cu create mode 100644 
cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t8.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t16.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim512_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t8.cu delete mode 100644 
cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t16.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim512_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t8.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t16.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim512_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t8.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t16.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim512_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t8.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t16.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim512_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t8.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t16.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu create mode 100644 
cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim128_t8_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim128_t8_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim256_t16_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim256_t16_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim512_t32_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim512_t32_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim128_t8_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim128_t8_8pq_4subd_half.cu create mode 100644 
cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim256_t16_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim256_t16_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim512_t32_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim512_t32_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu delete mode 100644 
cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t8_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t8_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t16_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t16_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim512_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim512_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t8_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t8_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t16_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t16_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim512_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim512_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t8_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t8_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t16_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t16_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim512_t32_8pq_2subd_half.cu delete mode 100644 
cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim512_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t8_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t8_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t16_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t16_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim512_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim512_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t8_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t8_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t16_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t16_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim512_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim512_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t8_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t8_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t16_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t16_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim512_t32_8pq_2subd_half.cu 
delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim512_t32_8pq_4subd_half.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8bd706ac8..ad0303486 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -205,60 +205,78 @@ add_library( src/neighbors/cagra_search_int8.cu src/neighbors/cagra_search_uint8.cu src/neighbors/detail/cagra/compute_distance.cu - src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim512_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim512_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim512_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim512_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim512_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim512_t32.cu - src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t8_8pq_2subd_half.cu - 
src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t16_8pq_4subd_half.cu - 
src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim128_t8.cu + 
src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim128_t8.cu + 
src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim128_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim512_t32_8pq_4subd_half.cu + 
src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim128_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu + 
src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index 0a576e849..ed2122a50 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh @@ -86,16 +86,14 @@ inline } template -void search_main_core( - raft::resources const& res, - search_params params, - const dataset_descriptor_host& dataset_desc, - raft::device_matrix_view graph, - raft::device_matrix_view queries, - raft::device_matrix_view neighbors, - raft::device_matrix_view distances, - CagraSampleFilterT sample_filter = CagraSampleFilterT(), - cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded) +void search_main_core(raft::resources const& res, + search_params params, + const dataset_descriptor_host& dataset_desc, + raft::device_matrix_view graph, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances, + CagraSampleFilterT sample_filter = CagraSampleFilterT()) { RAFT_LOG_DEBUG("# dataset size = %lu, dim = %lu\n", static_cast(graph.extent(0)), @@ -119,7 +117,7 @@ void search_main_core( using CagraSampleFilterT_s = typename CagraSampleFilterT_Selector::type; std::unique_ptr> plan = factory::create( - res, params, dataset_desc, 
queries.extent(1), graph.extent(1), topk, metric); + res, params, dataset_desc, queries.extent(1), graph.extent(1), topk); plan->check(topk); @@ -195,32 +193,20 @@ void search_main(raft::resources const& res, if (auto* strided_dset = dynamic_cast*>(&index.data()); strided_dset != nullptr) { // Search using a plain (strided) row-major dataset - auto desc = dataset_descriptor_init(params, *strided_dset, stream); - search_main_core(res, - params, - desc, - graph_internal, - queries, - neighbors, - distances, - sample_filter, - index.metric()); + auto desc = dataset_descriptor_init( + params, *strided_dset, index.metric(), stream); + search_main_core( + res, params, desc, graph_internal, queries, neighbors, distances, sample_filter); } else if (auto* vpq_dset = dynamic_cast*>(&index.data()); vpq_dset != nullptr) { // Search using a compressed dataset RAFT_FAIL("FP32 VPQ dataset support is coming soon"); } else if (auto* vpq_dset = dynamic_cast*>(&index.data()); vpq_dset != nullptr) { - auto desc = dataset_descriptor_init(params, *vpq_dset, stream); - search_main_core(res, - params, - desc, - graph_internal, - queries, - neighbors, - distances, - sample_filter, - index.metric()); + auto desc = dataset_descriptor_init( + params, *vpq_dset, index.metric(), stream); + search_main_core( + res, params, desc, graph_internal, queries, neighbors, distances, sample_filter); } else if (auto* empty_dset = dynamic_cast*>(&index.data()); empty_dset != nullptr) { // Forgot to add a dataset. 
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh index cdec569c1..1dbc843d0 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh @@ -30,238 +30,2164 @@ namespace cuvs::neighbors::cagra::detail { -extern template struct standard_dataset_descriptor_t<8, 128, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, float, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<16, 256, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, float, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 512, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, float, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, float, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<8, 128, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, half, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<16, 256, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, half, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 512, half, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, half, uint32_t, float>; -extern template struct 
cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, half, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<8, 128, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, int8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<16, 256, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, int8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 512, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, int8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, int8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<8, 128, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, uint8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<16, 256, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, uint8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<32, 512, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, uint8_t, uint32_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, uint8_t, uint32_t, float>; -extern template struct standard_dataset_descriptor_t<8, 128, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, float, uint64_t, float>; 
-extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, float, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<16, 256, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, float, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<32, 512, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, float, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, float, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<8, 128, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, half, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<16, 256, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, half, uint64_t, float>; -extern template struct standard_dataset_descriptor_t<32, 512, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, half, uint64_t, float>; -extern template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, half, uint64_t, float>; -extern template struct standard_descriptor_spec<8, 128, float, uint32_t, float>; -extern template struct vpq_descriptor_spec<8, 128, 8, 2, half, float, uint32_t, float>; -extern template struct vpq_descriptor_spec<8, 128, 8, 4, half, float, uint32_t, float>; -extern template struct standard_descriptor_spec<16, 256, float, uint32_t, float>; -extern template struct vpq_descriptor_spec<16, 256, 8, 2, half, float, uint32_t, float>; -extern template struct vpq_descriptor_spec<16, 
256, 8, 4, half, float, uint32_t, float>; -extern template struct standard_descriptor_spec<32, 512, float, uint32_t, float>; -extern template struct vpq_descriptor_spec<32, 512, 8, 2, half, float, uint32_t, float>; -extern template struct vpq_descriptor_spec<32, 512, 8, 4, half, float, uint32_t, float>; -extern template struct standard_descriptor_spec<8, 128, half, uint32_t, float>; -extern template struct vpq_descriptor_spec<8, 128, 8, 2, half, half, uint32_t, float>; -extern template struct vpq_descriptor_spec<8, 128, 8, 4, half, half, uint32_t, float>; -extern template struct standard_descriptor_spec<16, 256, half, uint32_t, float>; -extern template struct vpq_descriptor_spec<16, 256, 8, 2, half, half, uint32_t, float>; -extern template struct vpq_descriptor_spec<16, 256, 8, 4, half, half, uint32_t, float>; -extern template struct standard_descriptor_spec<32, 512, half, uint32_t, float>; -extern template struct vpq_descriptor_spec<32, 512, 8, 2, half, half, uint32_t, float>; -extern template struct vpq_descriptor_spec<32, 512, 8, 4, half, half, uint32_t, float>; -extern template struct standard_descriptor_spec<8, 128, int8_t, uint32_t, float>; -extern template struct vpq_descriptor_spec<8, 128, 8, 2, half, int8_t, uint32_t, float>; -extern template struct vpq_descriptor_spec<8, 128, 8, 4, half, int8_t, uint32_t, float>; -extern template struct standard_descriptor_spec<16, 256, int8_t, uint32_t, float>; -extern template struct vpq_descriptor_spec<16, 256, 8, 2, half, int8_t, uint32_t, float>; -extern template struct vpq_descriptor_spec<16, 256, 8, 4, half, int8_t, uint32_t, float>; -extern template struct standard_descriptor_spec<32, 512, int8_t, uint32_t, float>; -extern template struct vpq_descriptor_spec<32, 512, 8, 2, half, int8_t, uint32_t, float>; -extern template struct vpq_descriptor_spec<32, 512, 8, 4, half, int8_t, uint32_t, float>; -extern template struct standard_descriptor_spec<8, 128, uint8_t, uint32_t, float>; -extern template struct 
vpq_descriptor_spec<8, 128, 8, 2, half, uint8_t, uint32_t, float>; -extern template struct vpq_descriptor_spec<8, 128, 8, 4, half, uint8_t, uint32_t, float>; -extern template struct standard_descriptor_spec<16, 256, uint8_t, uint32_t, float>; -extern template struct vpq_descriptor_spec<16, 256, 8, 2, half, uint8_t, uint32_t, float>; -extern template struct vpq_descriptor_spec<16, 256, 8, 4, half, uint8_t, uint32_t, float>; -extern template struct standard_descriptor_spec<32, 512, uint8_t, uint32_t, float>; -extern template struct vpq_descriptor_spec<32, 512, 8, 2, half, uint8_t, uint32_t, float>; -extern template struct vpq_descriptor_spec<32, 512, 8, 4, half, uint8_t, uint32_t, float>; -extern template struct standard_descriptor_spec<8, 128, float, uint64_t, float>; -extern template struct vpq_descriptor_spec<8, 128, 8, 2, half, float, uint64_t, float>; -extern template struct vpq_descriptor_spec<8, 128, 8, 4, half, float, uint64_t, float>; -extern template struct standard_descriptor_spec<16, 256, float, uint64_t, float>; -extern template struct vpq_descriptor_spec<16, 256, 8, 2, half, float, uint64_t, float>; -extern template struct vpq_descriptor_spec<16, 256, 8, 4, half, float, uint64_t, float>; -extern template struct standard_descriptor_spec<32, 512, float, uint64_t, float>; -extern template struct vpq_descriptor_spec<32, 512, 8, 2, half, float, uint64_t, float>; -extern template struct vpq_descriptor_spec<32, 512, 8, 4, half, float, uint64_t, float>; -extern template struct standard_descriptor_spec<8, 128, half, uint64_t, float>; -extern template struct vpq_descriptor_spec<8, 128, 8, 2, half, half, uint64_t, float>; -extern template struct vpq_descriptor_spec<8, 128, 8, 4, half, half, uint64_t, float>; -extern template struct standard_descriptor_spec<16, 256, half, uint64_t, float>; -extern template struct vpq_descriptor_spec<16, 256, 8, 2, half, half, uint64_t, float>; -extern template struct vpq_descriptor_spec<16, 256, 8, 4, half, half, uint64_t, float>; 
-extern template struct standard_descriptor_spec<32, 512, half, uint64_t, float>; -extern template struct vpq_descriptor_spec<32, 512, 8, 2, half, half, uint64_t, float>; -extern template struct vpq_descriptor_spec<32, 512, 8, 4, half, half, uint64_t, float>; +extern template struct standard_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct 
cagra_q_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern 
template struct standard_dataset_descriptor_t; +extern template struct standard_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct cagra_q_dataset_descriptor_t; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct 
standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct instance_selector< - standard_descriptor_spec<8, 128, float, uint32_t, float>, - vpq_descriptor_spec<8, 128, 8, 2, half, float, uint32_t, float>, - vpq_descriptor_spec<8, 128, 8, 4, half, float, uint32_t, float>, - standard_descriptor_spec<16, 256, float, uint32_t, float>, - vpq_descriptor_spec<16, 256, 8, 2, half, float, uint32_t, 
float>, - vpq_descriptor_spec<16, 256, 8, 4, half, float, uint32_t, float>, - standard_descriptor_spec<32, 512, float, uint32_t, float>, - vpq_descriptor_spec<32, 512, 8, 2, half, float, uint32_t, float>, - vpq_descriptor_spec<32, 512, 8, 4, half, float, uint32_t, float>, - standard_descriptor_spec<8, 128, half, uint32_t, float>, - vpq_descriptor_spec<8, 128, 8, 2, half, half, uint32_t, float>, - vpq_descriptor_spec<8, 128, 8, 4, half, half, uint32_t, float>, - standard_descriptor_spec<16, 256, half, uint32_t, float>, - vpq_descriptor_spec<16, 256, 8, 2, half, half, uint32_t, float>, - vpq_descriptor_spec<16, 256, 8, 4, half, half, uint32_t, float>, - standard_descriptor_spec<32, 512, half, uint32_t, float>, - vpq_descriptor_spec<32, 512, 8, 2, half, half, uint32_t, float>, - vpq_descriptor_spec<32, 512, 8, 4, half, half, uint32_t, float>, - standard_descriptor_spec<8, 128, int8_t, uint32_t, float>, - vpq_descriptor_spec<8, 128, 8, 2, half, int8_t, uint32_t, float>, - vpq_descriptor_spec<8, 128, 8, 4, half, int8_t, uint32_t, float>, - standard_descriptor_spec<16, 256, int8_t, uint32_t, float>, - vpq_descriptor_spec<16, 256, 8, 2, half, int8_t, uint32_t, float>, - vpq_descriptor_spec<16, 256, 8, 4, half, int8_t, uint32_t, float>, - standard_descriptor_spec<32, 512, int8_t, uint32_t, float>, - vpq_descriptor_spec<32, 512, 8, 2, half, int8_t, uint32_t, float>, - vpq_descriptor_spec<32, 512, 8, 4, half, int8_t, uint32_t, float>, - standard_descriptor_spec<8, 128, uint8_t, uint32_t, float>, - vpq_descriptor_spec<8, 128, 8, 2, half, uint8_t, uint32_t, float>, - vpq_descriptor_spec<8, 128, 8, 4, half, uint8_t, uint32_t, float>, - standard_descriptor_spec<16, 256, uint8_t, uint32_t, float>, - vpq_descriptor_spec<16, 256, 8, 2, half, uint8_t, uint32_t, float>, - vpq_descriptor_spec<16, 256, 8, 4, half, uint8_t, uint32_t, float>, - standard_descriptor_spec<32, 512, uint8_t, uint32_t, float>, - vpq_descriptor_spec<32, 512, 8, 2, half, uint8_t, uint32_t, float>, - 
vpq_descriptor_spec<32, 512, 8, 4, half, uint8_t, uint32_t, float>, - standard_descriptor_spec<8, 128, float, uint64_t, float>, - vpq_descriptor_spec<8, 128, 8, 2, half, float, uint64_t, float>, - vpq_descriptor_spec<8, 128, 8, 4, half, float, uint64_t, float>, - standard_descriptor_spec<16, 256, float, uint64_t, float>, - vpq_descriptor_spec<16, 256, 8, 2, half, float, uint64_t, float>, - vpq_descriptor_spec<16, 256, 8, 4, half, float, uint64_t, float>, - standard_descriptor_spec<32, 512, float, uint64_t, float>, - vpq_descriptor_spec<32, 512, 8, 2, half, float, uint64_t, float>, - vpq_descriptor_spec<32, 512, 8, 4, half, float, uint64_t, float>, - standard_descriptor_spec<8, 128, half, uint64_t, float>, - vpq_descriptor_spec<8, 128, 8, 2, half, half, uint64_t, float>, - vpq_descriptor_spec<8, 128, 8, 4, half, half, uint64_t, float>, - standard_descriptor_spec<16, 256, half, uint64_t, float>, - vpq_descriptor_spec<16, 256, 8, 2, half, half, uint64_t, float>, - vpq_descriptor_spec<16, 256, 8, 4, half, half, uint64_t, float>, - standard_descriptor_spec<32, 512, half, uint64_t, float>, - vpq_descriptor_spec<32, 512, 8, 2, half, half, uint64_t, float>, - vpq_descriptor_spec<32, 512, 8, 4, half, half, uint64_t, float>>; + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + 
standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec>; -using descriptor_instances = - instance_selector, - vpq_descriptor_spec<8, 128, 8, 2, half, float, uint32_t, float>, - vpq_descriptor_spec<8, 128, 8, 4, half, float, uint32_t, float>, - standard_descriptor_spec<16, 256, float, uint32_t, float>, - vpq_descriptor_spec<16, 256, 8, 2, half, float, uint32_t, float>, - vpq_descriptor_spec<16, 256, 8, 4, half, float, uint32_t, float>, - standard_descriptor_spec<32, 512, float, uint32_t, float>, - vpq_descriptor_spec<32, 512, 8, 2, half, float, uint32_t, float>, - vpq_descriptor_spec<32, 512, 8, 4, half, float, uint32_t, float>, - standard_descriptor_spec<8, 128, half, uint32_t, float>, - vpq_descriptor_spec<8, 128, 8, 2, half, half, uint32_t, float>, - vpq_descriptor_spec<8, 128, 8, 4, half, half, uint32_t, float>, - standard_descriptor_spec<16, 256, half, uint32_t, float>, - vpq_descriptor_spec<16, 256, 8, 2, half, half, uint32_t, float>, - 
vpq_descriptor_spec<16, 256, 8, 4, half, half, uint32_t, float>, - standard_descriptor_spec<32, 512, half, uint32_t, float>, - vpq_descriptor_spec<32, 512, 8, 2, half, half, uint32_t, float>, - vpq_descriptor_spec<32, 512, 8, 4, half, half, uint32_t, float>, - standard_descriptor_spec<8, 128, int8_t, uint32_t, float>, - vpq_descriptor_spec<8, 128, 8, 2, half, int8_t, uint32_t, float>, - vpq_descriptor_spec<8, 128, 8, 4, half, int8_t, uint32_t, float>, - standard_descriptor_spec<16, 256, int8_t, uint32_t, float>, - vpq_descriptor_spec<16, 256, 8, 2, half, int8_t, uint32_t, float>, - vpq_descriptor_spec<16, 256, 8, 4, half, int8_t, uint32_t, float>, - standard_descriptor_spec<32, 512, int8_t, uint32_t, float>, - vpq_descriptor_spec<32, 512, 8, 2, half, int8_t, uint32_t, float>, - vpq_descriptor_spec<32, 512, 8, 4, half, int8_t, uint32_t, float>, - standard_descriptor_spec<8, 128, uint8_t, uint32_t, float>, - vpq_descriptor_spec<8, 128, 8, 2, half, uint8_t, uint32_t, float>, - vpq_descriptor_spec<8, 128, 8, 4, half, uint8_t, uint32_t, float>, - standard_descriptor_spec<16, 256, uint8_t, uint32_t, float>, - vpq_descriptor_spec<16, 256, 8, 2, half, uint8_t, uint32_t, float>, - vpq_descriptor_spec<16, 256, 8, 4, half, uint8_t, uint32_t, float>, - standard_descriptor_spec<32, 512, uint8_t, uint32_t, float>, - vpq_descriptor_spec<32, 512, 8, 2, half, uint8_t, uint32_t, float>, - vpq_descriptor_spec<32, 512, 8, 4, half, uint8_t, uint32_t, float>, - standard_descriptor_spec<8, 128, float, uint64_t, float>, - vpq_descriptor_spec<8, 128, 8, 2, half, float, uint64_t, float>, - vpq_descriptor_spec<8, 128, 8, 4, half, float, uint64_t, float>, - standard_descriptor_spec<16, 256, float, uint64_t, float>, - vpq_descriptor_spec<16, 256, 8, 2, half, float, uint64_t, float>, - vpq_descriptor_spec<16, 256, 8, 4, half, float, uint64_t, float>, - standard_descriptor_spec<32, 512, float, uint64_t, float>, - vpq_descriptor_spec<32, 512, 8, 2, half, float, uint64_t, float>, - 
vpq_descriptor_spec<32, 512, 8, 4, half, float, uint64_t, float>, - standard_descriptor_spec<8, 128, half, uint64_t, float>, - vpq_descriptor_spec<8, 128, 8, 2, half, half, uint64_t, float>, - vpq_descriptor_spec<8, 128, 8, 4, half, half, uint64_t, float>, - standard_descriptor_spec<16, 256, half, uint64_t, float>, - vpq_descriptor_spec<16, 256, 8, 2, half, half, uint64_t, float>, - vpq_descriptor_spec<16, 256, 8, 4, half, half, uint64_t, float>, - standard_descriptor_spec<32, 512, half, uint64_t, float>, - vpq_descriptor_spec<32, 512, 8, 2, half, half, uint64_t, float>, - vpq_descriptor_spec<32, 512, 8, 4, half, half, uint64_t, float>>; +using descriptor_instances = instance_selector< + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + 
vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec>; template auto dataset_descriptor_init(const cagra::search_params& params, const DatasetT& dataset, + cuvs::distance::DistanceType metric, rmm::cuda_stream_view stream) -> dataset_descriptor_host { - auto [init, priority] = descriptor_instances::select(params, dataset); + auto [init, priority] = + descriptor_instances::select(params, dataset, metric); if (init == nullptr || priority < 0) { RAFT_FAIL("No dataset descriptor instance compiled for this parameter combination."); } - return init(params, dataset, stream); + return init(params, dataset, metric, stream); } } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.cu b/cpp/src/neighbors/detail/cagra/compute_distance.cu index d18a60dd3..5d480f57a 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance.cu @@ -28,59 +28,535 @@ namespace cuvs::neighbors::cagra::detail { template struct instance_selector< - standard_descriptor_spec<8, 128, float, uint32_t, float>, - vpq_descriptor_spec<8, 128, 8, 2, half, float, uint32_t, float>, - vpq_descriptor_spec<8, 128, 8, 4, half, float, uint32_t, float>, - standard_descriptor_spec<16, 256, float, uint32_t, float>, - vpq_descriptor_spec<16, 256, 8, 2, half, float, uint32_t, float>, - vpq_descriptor_spec<16, 256, 8, 4, half, float, uint32_t, float>, - standard_descriptor_spec<32, 512, float, uint32_t, float>, - vpq_descriptor_spec<32, 512, 8, 
2, half, float, uint32_t, float>, - vpq_descriptor_spec<32, 512, 8, 4, half, float, uint32_t, float>, - standard_descriptor_spec<8, 128, half, uint32_t, float>, - vpq_descriptor_spec<8, 128, 8, 2, half, half, uint32_t, float>, - vpq_descriptor_spec<8, 128, 8, 4, half, half, uint32_t, float>, - standard_descriptor_spec<16, 256, half, uint32_t, float>, - vpq_descriptor_spec<16, 256, 8, 2, half, half, uint32_t, float>, - vpq_descriptor_spec<16, 256, 8, 4, half, half, uint32_t, float>, - standard_descriptor_spec<32, 512, half, uint32_t, float>, - vpq_descriptor_spec<32, 512, 8, 2, half, half, uint32_t, float>, - vpq_descriptor_spec<32, 512, 8, 4, half, half, uint32_t, float>, - standard_descriptor_spec<8, 128, int8_t, uint32_t, float>, - vpq_descriptor_spec<8, 128, 8, 2, half, int8_t, uint32_t, float>, - vpq_descriptor_spec<8, 128, 8, 4, half, int8_t, uint32_t, float>, - standard_descriptor_spec<16, 256, int8_t, uint32_t, float>, - vpq_descriptor_spec<16, 256, 8, 2, half, int8_t, uint32_t, float>, - vpq_descriptor_spec<16, 256, 8, 4, half, int8_t, uint32_t, float>, - standard_descriptor_spec<32, 512, int8_t, uint32_t, float>, - vpq_descriptor_spec<32, 512, 8, 2, half, int8_t, uint32_t, float>, - vpq_descriptor_spec<32, 512, 8, 4, half, int8_t, uint32_t, float>, - standard_descriptor_spec<8, 128, uint8_t, uint32_t, float>, - vpq_descriptor_spec<8, 128, 8, 2, half, uint8_t, uint32_t, float>, - vpq_descriptor_spec<8, 128, 8, 4, half, uint8_t, uint32_t, float>, - standard_descriptor_spec<16, 256, uint8_t, uint32_t, float>, - vpq_descriptor_spec<16, 256, 8, 2, half, uint8_t, uint32_t, float>, - vpq_descriptor_spec<16, 256, 8, 4, half, uint8_t, uint32_t, float>, - standard_descriptor_spec<32, 512, uint8_t, uint32_t, float>, - vpq_descriptor_spec<32, 512, 8, 2, half, uint8_t, uint32_t, float>, - vpq_descriptor_spec<32, 512, 8, 4, half, uint8_t, uint32_t, float>, - standard_descriptor_spec<8, 128, float, uint64_t, float>, - vpq_descriptor_spec<8, 128, 8, 2, half, float, 
uint64_t, float>, - vpq_descriptor_spec<8, 128, 8, 4, half, float, uint64_t, float>, - standard_descriptor_spec<16, 256, float, uint64_t, float>, - vpq_descriptor_spec<16, 256, 8, 2, half, float, uint64_t, float>, - vpq_descriptor_spec<16, 256, 8, 4, half, float, uint64_t, float>, - standard_descriptor_spec<32, 512, float, uint64_t, float>, - vpq_descriptor_spec<32, 512, 8, 2, half, float, uint64_t, float>, - vpq_descriptor_spec<32, 512, 8, 4, half, float, uint64_t, float>, - standard_descriptor_spec<8, 128, half, uint64_t, float>, - vpq_descriptor_spec<8, 128, 8, 2, half, half, uint64_t, float>, - vpq_descriptor_spec<8, 128, 8, 4, half, half, uint64_t, float>, - standard_descriptor_spec<16, 256, half, uint64_t, float>, - vpq_descriptor_spec<16, 256, 8, 2, half, half, uint64_t, float>, - vpq_descriptor_spec<16, 256, 8, 4, half, half, uint64_t, float>, - standard_descriptor_spec<32, 512, half, uint64_t, float>, - vpq_descriptor_spec<32, 512, 8, 2, half, half, uint64_t, float>, - vpq_descriptor_spec<32, 512, 8, 4, half, half, uint64_t, float>>; + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + 
standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index bb1c70616..2f8e801c9 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -42,10 +42,7 @@ struct dataset_descriptor_base_t { using DISTANCE_T = DistanceT; using setup_workspace_type = const base_type*(const base_type*, void*, const DATA_T*, uint32_t); - using compute_distance_type = DISTANCE_T(const base_type*, - INDEX_T, - cuvs::distance::DistanceType, - bool); + using compute_distance_type = DISTANCE_T(const base_type*, INDEX_T, bool); /** Copy the descriptor and the query into shared memory and do any other work, such as * initializing the codebook. 
*/ @@ -84,11 +81,10 @@ struct dataset_descriptor_base_t { return setup_workspace_impl(this, smem_ptr, queries_ptr, query_id); } - RAFT_DEVICE_INLINE_FUNCTION auto compute_distance(INDEX_T dataset_index, - cuvs::distance::DistanceType metric, - bool valid) const -> DISTANCE_T + RAFT_DEVICE_INLINE_FUNCTION auto compute_distance(INDEX_T dataset_index, bool valid) const + -> DISTANCE_T { - return compute_distance_impl(this, dataset_index, metric, valid); + return compute_distance_impl(this, dataset_index, valid); } }; @@ -144,8 +140,11 @@ struct dataset_descriptor_host { }; template -using init_desc_type = dataset_descriptor_host (*)( - const cagra::search_params&, const DatasetT&, rmm::cuda_stream_view); +using init_desc_type = + dataset_descriptor_host (*)(const cagra::search_params&, + const DatasetT&, + cuvs::distance::DistanceType, + rmm::cuda_stream_view); template struct instance_spec { @@ -176,12 +175,14 @@ template -constexpr auto spec_match(const cagra::search_params& params, const DatasetT& dataset) +constexpr auto spec_match(const cagra::search_params& params, + const DatasetT& dataset, + cuvs::distance::DistanceType metric) -> std::tuple, double> { if constexpr (spec_sound) { return std::make_tuple(InstanceSpec::template init, - InstanceSpec::template priority(params, dataset)); + InstanceSpec::template priority(params, dataset, metric)); } return std::make_tuple(nullptr, -1.0); } @@ -189,7 +190,7 @@ constexpr auto spec_match(const cagra::search_params& params, const DatasetT& da template struct instance_selector { template - static auto select(const cagra::search_params&, const DatasetT&) + static auto select(const cagra::search_params&, const DatasetT&, cuvs::distance::DistanceType) -> std::tuple, double> { return std::make_tuple(nullptr, -1.0); @@ -199,23 +200,27 @@ struct instance_selector { template struct instance_selector { template - static auto select(const cagra::search_params& params, const DatasetT& dataset) + static auto select(const 
cagra::search_params& params, + const DatasetT& dataset, + cuvs::distance::DistanceType metric) -> std::enable_if_t, std::tuple, double>> { - auto s0 = spec_match(params, dataset); + auto s0 = spec_match(params, dataset, metric); auto ss = instance_selector::template select( - params, dataset); + params, dataset, metric); return std::get<1>(s0) >= std::get<1>(ss) ? s0 : ss; } template - static auto select(const cagra::search_params& params, const DatasetT& dataset) + static auto select(const cagra::search_params& params, + const DatasetT& dataset, + cuvs::distance::DistanceType metric) -> std::enable_if_t, std::tuple, double>> { return instance_selector::template select( - params, dataset); + params, dataset, metric); } }; diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py index 3927b1ed7..1f2b24e10 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py +++ b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py @@ -69,6 +69,8 @@ half_uint64=("half", "uint64_t", "float"), ) +metric_prefix = 'cuvs::distance::DistanceType::' + specs = [] descs = [] cmake_list = [] @@ -86,43 +88,45 @@ for type_path, (data_t, idx_t, distance_t) in search_types.items(): for (mxdim, team) in mxdim_team: # CAGRA - path = f"compute_distance_standard_{type_path}_dim{mxdim}_t{team}.cu" - includes = '#include "compute_distance_standard.cuh"' - params = f"{team}, {mxdim}, {data_t}, {idx_t}, {distance_t}" - spec = f"standard_descriptor_spec<{params}>" - desc = f"standard_dataset_descriptor_t<{params}>" - content = f""" + for metric in ['L2Expanded', 'InnerProduct']: + path = f"compute_distance_standard_{metric}_{type_path}_dim{mxdim}_t{team}.cu" + includes = '#include "compute_distance_standard.cuh"' + params = f"{metric_prefix}{metric}, {team}, {mxdim}, {data_t}, {idx_t}, {distance_t}" + spec = f"standard_descriptor_spec<{params}>" + desc = 
f"standard_dataset_descriptor_t<{params}>" + content = f""" template struct {desc}; template <> const void* {spec}::init_kernel = reinterpret_cast(&standard_dataset_descriptor_init_kernel<{params}>); template struct {spec}; """ - descs.append(desc) - specs.append(spec) - with open(path, "w") as f: - f.write(template.format(includes=includes, content=content)) - cmake_list.append(f" src/neighbors/detail/cagra/{path}") + descs.append(desc) + specs.append(spec) + with open(path, "w") as f: + f.write(template.format(includes=includes, content=content)) + cmake_list.append(f" src/neighbors/detail/cagra/{path}") # CAGRA-Q for code_book_t in code_book_types: for pq_len in pq_lens: for pq_bit in pq_bits: - path = f"compute_distance_vpq_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{pq_len}subd_{code_book_t}.cu" - includes = '#include "compute_distance_vpq.cuh"' - params = f"{team}, {mxdim}, {pq_bit}, {pq_len}, {code_book_t}, {data_t}, {idx_t}, {distance_t}" - spec = f"vpq_descriptor_spec<{params}>" - desc = f"cagra_q_dataset_descriptor_t<{params}>" - content = f""" + for metric in ['L2Expanded']: + path = f"compute_distance_vpq_{metric}_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{pq_len}subd_{code_book_t}.cu" + includes = '#include "compute_distance_vpq.cuh"' + params = f"{metric_prefix}{metric}, {team}, {mxdim}, {pq_bit}, {pq_len}, {code_book_t}, {data_t}, {idx_t}, {distance_t}" + spec = f"vpq_descriptor_spec<{params}>" + desc = f"cagra_q_dataset_descriptor_t<{params}>" + content = f""" template struct {desc}; template <> const void* {spec}::init_kernel = reinterpret_cast(&vpq_dataset_descriptor_init_kernel<{params}>); template struct {spec}; """ - descs.append(desc) - specs.append(spec) - with open(path, "w") as f: - f.write(template.format(includes=includes, content=content)) - cmake_list.append(f" src/neighbors/detail/cagra/{path}") + descs.append(desc) + specs.append(spec) + with open(path, "w") as f: + f.write(template.format(includes=includes, content=content)) + 
cmake_list.append(f" src/neighbors/detail/cagra/{path}") with open("compute_distance-ext.cuh", "w") as f: includes = ''' @@ -145,14 +149,15 @@ template auto dataset_descriptor_init(const cagra::search_params& params, const DatasetT& dataset, + cuvs::distance::DistanceType metric, rmm::cuda_stream_view stream) -> dataset_descriptor_host {{ - auto [init, priority] = descriptor_instances::select(params, dataset); + auto [init, priority] = descriptor_instances::select(params, dataset, metric); if (init == nullptr || priority < 0) {{ RAFT_FAIL("No dataset descriptor instance compiled for this parameter combination."); }} - return init(params, dataset, stream); + return init(params, dataset, metric, stream); }} ''' f.write(template.format(includes=includes, content=contents)) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh index 4b3bca1f2..701f1d574 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh @@ -28,8 +28,25 @@ #include namespace cuvs::neighbors::cagra::detail { +namespace { +template +RAFT_DEVICE_INLINE_FUNCTION constexpr auto dist_op(T a, T b) + -> std::enable_if_t +{ + T diff = a - b; + return diff * diff; +} + +template +RAFT_DEVICE_INLINE_FUNCTION constexpr auto dist_op(T a, T b) + -> std::enable_if_t +{ + return -a * b; +} +} // namespace -template _RAFT_DEVICE __noinline__ auto compute_distance_standard( const typename DescriptorT::base_type* desc_, const typename DescriptorT::INDEX_T dataset_index, - const cuvs::distance::DistanceType metric, const bool valid) -> typename DescriptorT::DISTANCE_T { using DATA_T = typename DescriptorT::DATA_T; @@ -160,16 +177,8 @@ _RAFT_DEVICE __noinline__ auto compute_distance_standard( // - The data buffer has to be also padded with zeros. 
DISTANCE_T d; raft::lds(d, query_ptr + device::swizzling(k + v)); - constexpr cuvs::spatial::knn::detail::utils::mapping mapping{}; - switch (metric) { - case cuvs::distance::DistanceType::L2Expanded: - d -= mapping(dl_buff[e].val.data[v]); - norm2 += d * d; - break; - case cuvs::distance::DistanceType::InnerProduct: - norm2 -= d * mapping(dl_buff[e].val.data[v]); - break; - } + norm2 += dist_op( + d, cuvs::spatial::knn::detail::utils::mapping{}(dl_buff[e].val.data[v])); } } } @@ -181,7 +190,8 @@ _RAFT_DEVICE __noinline__ auto compute_distance_standard( return norm2; } -template ; + standard_dataset_descriptor_t; new (out) desc_type(&setup_workspace_standard, &compute_distance_standard, ptr, @@ -203,7 +213,8 @@ __launch_bounds__(1, 1) __global__ void standard_dataset_descriptor_init_kernel( ld); } -template } using descriptor_type = - standard_dataset_descriptor_t; + standard_dataset_descriptor_t; static const void* init_kernel; template static auto init(const cagra::search_params& params, const DatasetT& dataset, + cuvs::distance::DistanceType metric, rmm::cuda_stream_view stream) -> host_type { descriptor_type dd_host{nullptr, @@ -244,10 +256,13 @@ struct standard_descriptor_spec : public instance_spec } template - static auto priority(const cagra::search_params& params, const DatasetT& dataset) -> double + static auto priority(const cagra::search_params& params, + const DatasetT& dataset, + cuvs::distance::DistanceType metric) -> double { // If explicit team_size is specified and doesn't match the instance, discard it if (params.team_size != 0 && TeamSize != params.team_size) { return -1.0; } + if (Metric != metric) { return -1.0; } // Otherwise, favor the closest dataset dimensionality. 
return 1.0 / (0.1 + std::abs(double(dataset.dim()) - double(DatasetBlockDim))); } diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu new file mode 100644 index 000000000..bc1900856 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu new file mode 100644 index 000000000..aaf339c51 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu new file mode 100644 index 000000000..0a1b4c8dc --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim128_t8.cu new file mode 100644 index 000000000..134f1efd8 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim128_t8.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim256_t16.cu new file mode 100644 index 000000000..1350b74ad --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim256_t16.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim512_t32.cu new file mode 100644 index 000000000..ceab122fb --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim512_t32.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim128_t8.cu new file mode 100644 index 000000000..e94500dad --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim128_t8.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu new file mode 100644 index 000000000..f1ace30cc --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim512_t32.cu new file mode 100644 index 000000000..a5f2aae13 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim512_t32.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim128_t8.cu new file mode 100644 index 000000000..800debf5d --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim128_t8.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim256_t16.cu new file mode 100644 index 000000000..4528426c7 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim256_t16.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim512_t32.cu new file mode 100644 index 000000000..45782089c --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim512_t32.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim128_t8.cu new file mode 100644 index 000000000..95725e7ee --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim128_t8.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim256_t16.cu new file mode 100644 index 000000000..17fa88635 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim256_t16.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim512_t32.cu new file mode 100644 index 000000000..d6f1b03b7 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim512_t32.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim128_t8.cu new file mode 100644 index 000000000..9afca54ea --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim128_t8.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim256_t16.cu new file mode 100644 index 000000000..97c6489a9 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim256_t16.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim512_t32.cu new file mode 100644 index 000000000..43d0a9958 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim512_t32.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim128_t8.cu new file mode 100644 index 000000000..645032af9 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim128_t8.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu new file mode 100644 index 000000000..cdb315bac --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu new file mode 100644 index 000000000..49053a2d6 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim128_t8.cu new file mode 100644 index 000000000..5a534718b --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim128_t8.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim256_t16.cu new file mode 100644 index 000000000..7e85fa349 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim256_t16.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim512_t32.cu new file mode 100644 index 000000000..4bc254679 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim512_t32.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu new file mode 100644 index 000000000..c0fe52caf --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu new file mode 100644 index 000000000..b585e1f80 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu new file mode 100644 index 000000000..91de967e8 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim128_t8.cu new file mode 100644 index 000000000..b77b84793 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim128_t8.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim256_t16.cu new file mode 100644 index 000000000..7ce86c034 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim256_t16.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim512_t32.cu new file mode 100644 index 000000000..507d709eb --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim512_t32.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu new file mode 100644 index 000000000..c5c7a7b4c --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu new file mode 100644 index 000000000..8d237f58b --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim512_t32.cu new file mode 100644 index 000000000..cf3ea2eda --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim512_t32.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim128_t8.cu new file mode 100644 index 000000000..b8c80709d --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim128_t8.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim256_t16.cu new file mode 100644 index 000000000..2c1fb61c2 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim256_t16.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu new file mode 100644 index 000000000..fb4ad12d1 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_standard.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct standard_dataset_descriptor_t; +template <> +const void* standard_descriptor_spec::init_kernel = + reinterpret_cast( + &standard_dataset_descriptor_init_kernel); +template struct standard_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t8.cu deleted file mode 100644 index 6d21b9364..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim128_t8.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<8, 128, float, uint32_t, float>; -template <> -const void* standard_descriptor_spec<8, 128, float, uint32_t, float>::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel<8, 128, float, uint32_t, float>); -template struct standard_descriptor_spec<8, 128, float, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t16.cu deleted file mode 100644 index b535aa716..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<16, 256, float, uint32_t, float>; -template <> -const void* standard_descriptor_spec<16, 256, float, uint32_t, float>::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel<16, 256, float, uint32_t, float>); -template struct standard_descriptor_spec<16, 256, float, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim512_t32.cu deleted file mode 100644 index 37c804a4e..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint32_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<32, 512, float, uint32_t, float>; -template <> -const void* standard_descriptor_spec<32, 512, float, uint32_t, float>::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel<32, 512, float, uint32_t, float>); -template struct standard_descriptor_spec<32, 512, float, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t8.cu deleted file mode 100644 index ad6b12ef8..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim128_t8.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<8, 128, float, uint64_t, float>; -template <> -const void* standard_descriptor_spec<8, 128, float, uint64_t, float>::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel<8, 128, float, uint64_t, float>); -template struct standard_descriptor_spec<8, 128, float, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t16.cu deleted file mode 100644 index 1c41899d4..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<16, 256, float, uint64_t, float>; -template <> -const void* standard_descriptor_spec<16, 256, float, uint64_t, float>::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel<16, 256, float, uint64_t, float>); -template struct standard_descriptor_spec<16, 256, float, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim512_t32.cu deleted file mode 100644 index 36f58bb68..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_float_uint64_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<32, 512, float, uint64_t, float>; -template <> -const void* standard_descriptor_spec<32, 512, float, uint64_t, float>::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel<32, 512, float, uint64_t, float>); -template struct standard_descriptor_spec<32, 512, float, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t8.cu deleted file mode 100644 index dd73a2363..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim128_t8.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<8, 128, half, uint32_t, float>; -template <> -const void* standard_descriptor_spec<8, 128, half, uint32_t, float>::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel<8, 128, half, uint32_t, float>); -template struct standard_descriptor_spec<8, 128, half, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t16.cu deleted file mode 100644 index b431e468a..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<16, 256, half, uint32_t, float>; -template <> -const void* standard_descriptor_spec<16, 256, half, uint32_t, float>::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel<16, 256, half, uint32_t, float>); -template struct standard_descriptor_spec<16, 256, half, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim512_t32.cu deleted file mode 100644 index 29eaf36eb..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint32_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<32, 512, half, uint32_t, float>; -template <> -const void* standard_descriptor_spec<32, 512, half, uint32_t, float>::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel<32, 512, half, uint32_t, float>); -template struct standard_descriptor_spec<32, 512, half, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t8.cu deleted file mode 100644 index 066d08793..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim128_t8.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<8, 128, half, uint64_t, float>; -template <> -const void* standard_descriptor_spec<8, 128, half, uint64_t, float>::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel<8, 128, half, uint64_t, float>); -template struct standard_descriptor_spec<8, 128, half, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t16.cu deleted file mode 100644 index a2ace4528..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<16, 256, half, uint64_t, float>; -template <> -const void* standard_descriptor_spec<16, 256, half, uint64_t, float>::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel<16, 256, half, uint64_t, float>); -template struct standard_descriptor_spec<16, 256, half, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim512_t32.cu deleted file mode 100644 index 1417d3284..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_half_uint64_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<32, 512, half, uint64_t, float>; -template <> -const void* standard_descriptor_spec<32, 512, half, uint64_t, float>::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel<32, 512, half, uint64_t, float>); -template struct standard_descriptor_spec<32, 512, half, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t8.cu deleted file mode 100644 index 01970b374..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim128_t8.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<8, 128, int8_t, uint32_t, float>; -template <> -const void* standard_descriptor_spec<8, 128, int8_t, uint32_t, float>::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel<8, 128, int8_t, uint32_t, float>); -template struct standard_descriptor_spec<8, 128, int8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t16.cu deleted file mode 100644 index 296070314..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<16, 256, int8_t, uint32_t, float>; -template <> -const void* standard_descriptor_spec<16, 256, int8_t, uint32_t, float>::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel<16, 256, int8_t, uint32_t, float>); -template struct standard_descriptor_spec<16, 256, int8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim512_t32.cu deleted file mode 100644 index 95f3c94d1..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_int8_uint32_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<32, 512, int8_t, uint32_t, float>; -template <> -const void* standard_descriptor_spec<32, 512, int8_t, uint32_t, float>::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel<32, 512, int8_t, uint32_t, float>); -template struct standard_descriptor_spec<32, 512, int8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t8.cu deleted file mode 100644 index c5fe8e28e..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim128_t8.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<8, 128, uint8_t, uint32_t, float>; -template <> -const void* standard_descriptor_spec<8, 128, uint8_t, uint32_t, float>::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel<8, 128, uint8_t, uint32_t, float>); -template struct standard_descriptor_spec<8, 128, uint8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t16.cu deleted file mode 100644 index a6fc25350..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim256_t16.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<16, 256, uint8_t, uint32_t, float>; -template <> -const void* standard_descriptor_spec<16, 256, uint8_t, uint32_t, float>::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel<16, 256, uint8_t, uint32_t, float>); -template struct standard_descriptor_spec<16, 256, uint8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim512_t32.cu deleted file mode 100644 index fb86dc8d4..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_uint8_uint32_dim512_t32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct standard_dataset_descriptor_t<32, 512, uint8_t, uint32_t, float>; -template <> -const void* standard_descriptor_spec<32, 512, uint8_t, uint32_t, float>::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel<32, 512, uint8_t, uint32_t, float>); -template struct standard_descriptor_spec<32, 512, uint8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh index 84c3617a9..08b673614 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh @@ -23,7 +23,8 @@ namespace cuvs::neighbors::cagra::detail { -template _RAFT_DEVICE __noinline__ auto compute_distance_vpq( const typename DescriptorT::base_type* desc_, const typename DescriptorT::INDEX_T dataset_index, - const cuvs::distance::DistanceType /* only L2 metric is implemented */, const bool valid) -> typename DescriptorT::DISTANCE_T { using DATA_T = typename DescriptorT::DATA_T; @@ -302,7 +303,8 @@ _RAFT_DEVICE __noinline__ auto compute_distance_vpq( return norm; } -template { return false; } - using descriptor_type = cagra_q_dataset_descriptor_t { template static auto init(const cagra::search_params& params, const DatasetT& dataset, + cuvs::distance::DistanceType metric, rmm::cuda_stream_view stream) -> host_type { descriptor_type dd_host{nullptr, @@ -407,10 +413,13 @@ struct vpq_descriptor_spec : public instance_spec { } template - static auto priority(const cagra::search_params& params, const DatasetT& dataset) -> double + static auto priority(const cagra::search_params& params, + const 
DatasetT& dataset, + cuvs::distance::DistanceType metric) -> double { // If explicit team_size is specified and doesn't match the instance, discard it if (params.team_size != 0 && TeamSize != params.team_size) { return -1.0; } + if (cuvs::distance::DistanceType::L2Expanded != metric) { return -1.0; } // Match codebook params if (dataset.pq_bits() != PqBits) { return -1.0; } if (dataset.pq_len() != PqLen) { return -1.0; } diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu new file mode 100644 index 000000000..7abc27bda --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu new file mode 100644 index 000000000..6407d4b3b --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu new file mode 100644 index 000000000..f8d19f4ea --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu new file mode 100644 index 000000000..e649829c1 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu new file mode 100644 index 000000000..ce47d2c63 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu new file mode 100644 index 000000000..9e8159e13 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim128_t8_8pq_2subd_half.cu new file mode 100644 index 000000000..b24552f5e --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim128_t8_8pq_2subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim128_t8_8pq_4subd_half.cu new file mode 100644 index 000000000..f34f3d6ce --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim128_t8_8pq_4subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim256_t16_8pq_2subd_half.cu new file mode 100644 index 000000000..f1bbfdc49 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim256_t16_8pq_2subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim256_t16_8pq_4subd_half.cu new file mode 100644 index 000000000..9f41de84d --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim256_t16_8pq_4subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim512_t32_8pq_2subd_half.cu new file mode 100644 index 000000000..2eed65e94 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim512_t32_8pq_2subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim512_t32_8pq_4subd_half.cu new file mode 100644 index 000000000..54f8e58e4 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim512_t32_8pq_4subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu new file mode 100644 index 000000000..e63fc260a --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu new file mode 100644 index 000000000..3f89f4fb0 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu new file mode 100644 index 000000000..7bd75f5fc --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu new file mode 100644 index 000000000..761df5fdc --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu new file mode 100644 index 000000000..ed13b8730 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu new file mode 100644 index 000000000..49b834d3a --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim128_t8_8pq_2subd_half.cu new file mode 100644 index 000000000..38fdc4e66 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim128_t8_8pq_2subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim128_t8_8pq_4subd_half.cu new file mode 100644 index 000000000..6a38f2cfe --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim128_t8_8pq_4subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim256_t16_8pq_2subd_half.cu new file mode 100644 index 000000000..c730ae78c --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim256_t16_8pq_2subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim256_t16_8pq_4subd_half.cu new file mode 100644 index 000000000..abe38cceb --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim256_t16_8pq_4subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim512_t32_8pq_2subd_half.cu new file mode 100644 index 000000000..871e9410b --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim512_t32_8pq_2subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim512_t32_8pq_4subd_half.cu new file mode 100644 index 000000000..6fc02d616 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim512_t32_8pq_4subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu new file mode 100644 index 000000000..ad0c1ea21 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu new file mode 100644 index 000000000..493c5d799 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu new file mode 100644 index 000000000..09b4dcd1e --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu new file mode 100644 index 000000000..50369096b --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu new file mode 100644 index 000000000..247b52bda --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu new file mode 100644 index 000000000..dccd08910 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu new file mode 100644 index 000000000..75c74e704 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu new file mode 100644 index 000000000..d87fd66e8 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu new file mode 100644 index 000000000..d6d651bc3 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu new file mode 100644 index 000000000..e36faf918 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu new file mode 100644 index 000000000..bc3e8e524 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu new file mode 100644 index 000000000..3c63592b6 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq.cuh" + +namespace cuvs::neighbors::cagra::detail { + +template struct cagra_q_dataset_descriptor_t; +template <> +const void* vpq_descriptor_spec::init_kernel = + reinterpret_cast( + &vpq_dataset_descriptor_init_kernel); +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index d6831560f..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, float, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<8, 128, 8, 2, half, float, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<8, 128, 8, 2, half, float, uint32_t, float>); -template struct vpq_descriptor_spec<8, 128, 8, 2, half, float, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index 548a9b75a..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, float, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<8, 128, 8, 4, half, float, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<8, 128, 8, 4, half, float, uint32_t, float>); -template struct vpq_descriptor_spec<8, 128, 8, 4, half, float, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index 828008555..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, float, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<16, 256, 8, 2, half, float, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<16, 256, 8, 2, half, float, uint32_t, float>); -template struct vpq_descriptor_spec<16, 256, 8, 2, half, float, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index 49a449384..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, float, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<16, 256, 8, 4, half, float, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<16, 256, 8, 4, half, float, uint32_t, float>); -template struct vpq_descriptor_spec<16, 256, 8, 4, half, float, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index 2d40ae3dd..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, float, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<32, 512, 8, 2, half, float, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<32, 512, 8, 2, half, float, uint32_t, float>); -template struct vpq_descriptor_spec<32, 512, 8, 2, half, float, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index 2dfad4f28..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint32_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, float, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<32, 512, 8, 4, half, float, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<32, 512, 8, 4, half, float, uint32_t, float>); -template struct vpq_descriptor_spec<32, 512, 8, 4, half, float, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index e97a4a840..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, float, uint64_t, float>; -template <> -const void* vpq_descriptor_spec<8, 128, 8, 2, half, float, uint64_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<8, 128, 8, 2, half, float, uint64_t, float>); -template struct vpq_descriptor_spec<8, 128, 8, 2, half, float, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index 2b20e7af3..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, float, uint64_t, float>; -template <> -const void* vpq_descriptor_spec<8, 128, 8, 4, half, float, uint64_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<8, 128, 8, 4, half, float, uint64_t, float>); -template struct vpq_descriptor_spec<8, 128, 8, 4, half, float, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index b624e4cc9..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, float, uint64_t, float>; -template <> -const void* vpq_descriptor_spec<16, 256, 8, 2, half, float, uint64_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<16, 256, 8, 2, half, float, uint64_t, float>); -template struct vpq_descriptor_spec<16, 256, 8, 2, half, float, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index 8efd32c96..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, float, uint64_t, float>; -template <> -const void* vpq_descriptor_spec<16, 256, 8, 4, half, float, uint64_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<16, 256, 8, 4, half, float, uint64_t, float>); -template struct vpq_descriptor_spec<16, 256, 8, 4, half, float, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index 9a62f74f0..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, float, uint64_t, float>; -template <> -const void* vpq_descriptor_spec<32, 512, 8, 2, half, float, uint64_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<32, 512, 8, 2, half, float, uint64_t, float>); -template struct vpq_descriptor_spec<32, 512, 8, 2, half, float, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index 7b344cd07..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_float_uint64_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, float, uint64_t, float>; -template <> -const void* vpq_descriptor_spec<32, 512, 8, 4, half, float, uint64_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<32, 512, 8, 4, half, float, uint64_t, float>); -template struct vpq_descriptor_spec<32, 512, 8, 4, half, float, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index a4f9676d8..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, half, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<8, 128, 8, 2, half, half, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<8, 128, 8, 2, half, half, uint32_t, float>); -template struct vpq_descriptor_spec<8, 128, 8, 2, half, half, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index eccd180d2..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, half, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<8, 128, 8, 4, half, half, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<8, 128, 8, 4, half, half, uint32_t, float>); -template struct vpq_descriptor_spec<8, 128, 8, 4, half, half, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index ebda2c92d..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, half, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<16, 256, 8, 2, half, half, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<16, 256, 8, 2, half, half, uint32_t, float>); -template struct vpq_descriptor_spec<16, 256, 8, 2, half, half, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index 88b1e1678..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, half, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<16, 256, 8, 4, half, half, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<16, 256, 8, 4, half, half, uint32_t, float>); -template struct vpq_descriptor_spec<16, 256, 8, 4, half, half, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index 94d4f1f84..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, half, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<32, 512, 8, 2, half, half, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<32, 512, 8, 2, half, half, uint32_t, float>); -template struct vpq_descriptor_spec<32, 512, 8, 2, half, half, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index e8249238d..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint32_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, half, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<32, 512, 8, 4, half, half, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<32, 512, 8, 4, half, half, uint32_t, float>); -template struct vpq_descriptor_spec<32, 512, 8, 4, half, half, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index cee5f07a3..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, half, uint64_t, float>; -template <> -const void* vpq_descriptor_spec<8, 128, 8, 2, half, half, uint64_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<8, 128, 8, 2, half, half, uint64_t, float>); -template struct vpq_descriptor_spec<8, 128, 8, 2, half, half, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index 9b1daa3e3..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, half, uint64_t, float>; -template <> -const void* vpq_descriptor_spec<8, 128, 8, 4, half, half, uint64_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<8, 128, 8, 4, half, half, uint64_t, float>); -template struct vpq_descriptor_spec<8, 128, 8, 4, half, half, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index 7fb295f55..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, half, uint64_t, float>; -template <> -const void* vpq_descriptor_spec<16, 256, 8, 2, half, half, uint64_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<16, 256, 8, 2, half, half, uint64_t, float>); -template struct vpq_descriptor_spec<16, 256, 8, 2, half, half, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index 712d28082..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, half, uint64_t, float>; -template <> -const void* vpq_descriptor_spec<16, 256, 8, 4, half, half, uint64_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<16, 256, 8, 4, half, half, uint64_t, float>); -template struct vpq_descriptor_spec<16, 256, 8, 4, half, half, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index 307991526..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, half, uint64_t, float>; -template <> -const void* vpq_descriptor_spec<32, 512, 8, 2, half, half, uint64_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<32, 512, 8, 2, half, half, uint64_t, float>); -template struct vpq_descriptor_spec<32, 512, 8, 2, half, half, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index 341f70bc0..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_half_uint64_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, half, uint64_t, float>; -template <> -const void* vpq_descriptor_spec<32, 512, 8, 4, half, half, uint64_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<32, 512, 8, 4, half, half, uint64_t, float>); -template struct vpq_descriptor_spec<32, 512, 8, 4, half, half, uint64_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index f17e58da7..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, int8_t, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<8, 128, 8, 2, half, int8_t, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<8, 128, 8, 2, half, int8_t, uint32_t, float>); -template struct vpq_descriptor_spec<8, 128, 8, 2, half, int8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index 21568247a..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, int8_t, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<8, 128, 8, 4, half, int8_t, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<8, 128, 8, 4, half, int8_t, uint32_t, float>); -template struct vpq_descriptor_spec<8, 128, 8, 4, half, int8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index e164c976f..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, int8_t, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<16, 256, 8, 2, half, int8_t, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<16, 256, 8, 2, half, int8_t, uint32_t, float>); -template struct vpq_descriptor_spec<16, 256, 8, 2, half, int8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index 4880d6718..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, int8_t, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<16, 256, 8, 4, half, int8_t, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<16, 256, 8, 4, half, int8_t, uint32_t, float>); -template struct vpq_descriptor_spec<16, 256, 8, 4, half, int8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index 7c9c44911..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, int8_t, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<32, 512, 8, 2, half, int8_t, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<32, 512, 8, 2, half, int8_t, uint32_t, float>); -template struct vpq_descriptor_spec<32, 512, 8, 2, half, int8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index c44f82c2e..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_int8_uint32_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, int8_t, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<32, 512, 8, 4, half, int8_t, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<32, 512, 8, 4, half, int8_t, uint32_t, float>); -template struct vpq_descriptor_spec<32, 512, 8, 4, half, int8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index dac083b05..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 128, 8, 2, half, uint8_t, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<8, 128, 8, 2, half, uint8_t, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<8, 128, 8, 2, half, uint8_t, uint32_t, float>); -template struct vpq_descriptor_spec<8, 128, 8, 2, half, uint8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index 15fe73593..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<8, 128, 8, 4, half, uint8_t, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<8, 128, 8, 4, half, uint8_t, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<8, 128, 8, 4, half, uint8_t, uint32_t, float>); -template struct vpq_descriptor_spec<8, 128, 8, 4, half, uint8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index df5b01e2a..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<16, 256, 8, 2, half, uint8_t, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<16, 256, 8, 2, half, uint8_t, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<16, 256, 8, 2, half, uint8_t, uint32_t, float>); -template struct vpq_descriptor_spec<16, 256, 8, 2, half, uint8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index edf8361a3..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<16, 256, 8, 4, half, uint8_t, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<16, 256, 8, 4, half, uint8_t, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<16, 256, 8, 4, half, uint8_t, uint32_t, float>); -template struct vpq_descriptor_spec<16, 256, 8, 4, half, uint8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index fc40634d5..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<32, 512, 8, 2, half, uint8_t, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<32, 512, 8, 2, half, uint8_t, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<32, 512, 8, 2, half, uint8_t, uint32_t, float>); -template struct vpq_descriptor_spec<32, 512, 8, 2, half, uint8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index 0b0d269f7..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_uint8_uint32_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq.cuh" - -namespace cuvs::neighbors::cagra::detail { - -template struct cagra_q_dataset_descriptor_t<32, 512, 8, 4, half, uint8_t, uint32_t, float>; -template <> -const void* vpq_descriptor_spec<32, 512, 8, 4, half, uint8_t, uint32_t, float>::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel<32, 512, 8, 4, half, uint8_t, uint32_t, float>); -template struct vpq_descriptor_spec<32, 512, 8, 4, half, uint8_t, uint32_t, float>; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp index ef372c16d..ca8e777a7 100644 --- a/cpp/src/neighbors/detail/cagra/device_common.hpp +++ b/cpp/src/neighbors/detail/cagra/device_common.hpp @@ -84,7 +84,6 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes( const uint32_t num_seeds, IndexT* __restrict__ visited_hash_ptr, const uint32_t hash_bitlen, - const cuvs::distance::DistanceType metric, const uint32_t block_id = 0, const uint32_t num_blocks = 1) { @@ -112,7 +111,7 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes( } } - auto norm2 = dataset_desc.compute_distance(seed_index, metric, valid_i); + auto norm2 = dataset_desc.compute_distance(seed_index, valid_i); if (valid_i && (norm2 < best_norm2_team_local)) { best_norm2_team_local = norm2; @@ -147,8 +146,7 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes( const uint32_t hash_bitlen, const IndexT* __restrict__ parent_indices, const IndexT* __restrict__ internal_topk_list, - const uint32_t search_width, - const cuvs::distance::DistanceType metric) + const uint32_t search_width) { constexpr IndexT index_msb_1_mask = utils::gen_index_msb_1_mask::value; constexpr IndexT invalid_index = 
raft::upper_bound(); @@ -183,7 +181,7 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes( IndexT child_id = invalid_index; if (valid_i) { child_id = result_child_indices_ptr[i]; } - auto norm2 = dataset_desc.compute_distance(child_id, metric, child_id != invalid_index); + auto norm2 = dataset_desc.compute_distance(child_id, child_id != invalid_index); // Store the distance const unsigned lane_id = threadIdx.x % team_size; diff --git a/cpp/src/neighbors/detail/cagra/factory.cuh b/cpp/src/neighbors/detail/cagra/factory.cuh index 2a6d9add0..65a744a8f 100644 --- a/cpp/src/neighbors/detail/cagra/factory.cuh +++ b/cpp/src/neighbors/detail/cagra/factory.cuh @@ -40,10 +40,9 @@ class factory { const dataset_descriptor_host& dataset_desc, int64_t dim, int64_t graph_degree, - uint32_t topk, - const cuvs::distance::DistanceType metric) + uint32_t topk) { - search_plan_impl_base plan(params, dim, graph_degree, topk, metric); + search_plan_impl_base plan(params, dim, graph_degree, topk); return dispatch_kernel(res, plan, dataset_desc); } @@ -56,15 +55,15 @@ class factory { if (plan.algo == search_algo::SINGLE_CTA) { return std::make_unique< single_cta_search::search>( - res, plan, dataset_desc, plan.dim, plan.graph_degree, plan.topk, plan.metric); + res, plan, dataset_desc, plan.dim, plan.graph_degree, plan.topk); } else if (plan.algo == search_algo::MULTI_CTA) { return std::make_unique< multi_cta_search::search>( - res, plan, dataset_desc, plan.dim, plan.graph_degree, plan.topk, plan.metric); + res, plan, dataset_desc, plan.dim, plan.graph_degree, plan.topk); } else { return std::make_unique< multi_kernel_search::search>( - res, plan, dataset_desc, plan.dim, plan.graph_degree, plan.topk, plan.metric); + res, plan, dataset_desc, plan.dim, plan.graph_degree, plan.topk); } } }; diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh index bb9b5f647..c0234407b 100644 --- 
a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh @@ -103,9 +103,8 @@ struct search : public search_plan_impl& dataset_desc, int64_t dim, int64_t graph_degree, - uint32_t topk, - cuvs::distance::DistanceType metric) - : base_type(res, params, dataset_desc, dim, graph_degree, topk, metric), + uint32_t topk) + : base_type(res, params, dataset_desc, dim, graph_degree, topk), intermediate_indices(0, raft::resource::get_cuda_stream(res)), intermediate_distances(0, raft::resource::get_cuda_stream(res)), topk_workspace(0, raft::resource::get_cuda_stream(res)) @@ -228,7 +227,6 @@ struct search : public search_plan_implmetric, stream); RAFT_CUDA_TRY(cudaPeekAtLastError()); diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_inst.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_inst.cuh index 30bbd60aa..036a4e414 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_inst.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_inst.cuh @@ -41,7 +41,6 @@ namespace cuvs::neighbors::cagra::detail::multi_cta_search { uint32_t num_cta_per_query, \ uint32_t num_seeds, \ SampleFilterT sample_filter, \ - cuvs::distance::DistanceType metric, \ cudaStream_t stream); } // namespace cuvs::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh index 56dd0d8f6..9b6a6e299 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh @@ -154,8 +154,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( const uint32_t min_iteration, const uint32_t max_iteration, uint32_t* const num_executed_iterations, /* stats */ - SAMPLE_FILTER_T sample_filter, - const cuvs::distance::DistanceType metric) + SAMPLE_FILTER_T sample_filter) { using DATA_T = typename DATASET_DESCRIPTOR_T::DATA_T; using INDEX_T = 
typename DATASET_DESCRIPTOR_T::INDEX_T; @@ -236,7 +235,6 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( num_seeds, local_visited_hashmap_ptr, hash_bitlen, - metric, block_id, num_blocks); __syncthreads(); @@ -277,8 +275,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( hash_bitlen, parent_indices_buffer, result_indices_buffer, - search_width, - metric); + search_width); _CLK_REC(clk_compute_distance); __syncthreads(); @@ -434,7 +431,6 @@ void select_and_run(const dataset_descriptor_base_t* d uint32_t num_cta_per_query, uint32_t num_seeds, SampleFilterT sample_filter, - cuvs::distance::DistanceType metric, cudaStream_t stream) { auto kernel = @@ -473,8 +469,7 @@ void select_and_run(const dataset_descriptor_base_t* d ps.min_iterations, ps.max_iterations, num_executed_iterations, - sample_filter, - metric); + sample_filter); } } // namespace multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh index a3dc42424..1ef35f947 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh @@ -41,7 +41,6 @@ void select_and_run(const dataset_descriptor_base_t* d uint32_t num_cta_per_query, uint32_t num_seeds, SampleFilterT sample_filter, - cuvs::distance::DistanceType metric, cudaStream_t stream); } diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh index 7aa7cd9d3..59c3e2ee3 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh @@ -105,8 +105,7 @@ RAFT_KERNEL random_pickup_kernel( typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr, // [num_queries, ldr] const std::uint32_t ldr, // (*) ldr >= num_pickup typename DATASET_DESCRIPTOR_T::INDEX_T* const visited_hashmap_ptr, // [num_queries, 1 << bitlen] - const std::uint32_t 
hash_bitlen, - const cuvs::distance::DistanceType metric) + const std::uint32_t hash_bitlen) { using DATA_T = typename DATASET_DESCRIPTOR_T::DATA_T; using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; @@ -133,7 +132,7 @@ RAFT_KERNEL random_pickup_kernel( device::xorshift64((global_team_index ^ rand_xor_mask) * (i + 1)) % dataset_desc->size; } - DISTANCE_T norm2 = dataset_desc->compute_distance(seed_index, metric, true); + DISTANCE_T norm2 = dataset_desc->compute_distance(seed_index, true); if (norm2 < best_norm2_team_local) { best_norm2_team_local = norm2; best_index_team_local = seed_index; @@ -168,7 +167,6 @@ void random_pickup(const dataset_descriptor_host& data std::size_t ldr, // (*) ldr >= num_pickup IndexT* visited_hashmap_ptr, // [num_queries, 1 << bitlen] std::uint32_t hash_bitlen, - cuvs::distance::DistanceType metric, cudaStream_t cuda_stream) { const auto block_size = 256u; @@ -188,8 +186,7 @@ void random_pickup(const dataset_descriptor_host& data result_distances_ptr, ldr, visited_hashmap_ptr, - hash_bitlen, - metric); + hash_bitlen); } template @@ -314,8 +311,7 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel( typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr, // [num_queries, ldd] typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr, // [num_queries, ldd] const std::uint32_t ldd, // (*) ldd >= search_width * graph_degree - SAMPLE_FILTER_T sample_filter, - const cuvs::distance::DistanceType metric) + SAMPLE_FILTER_T sample_filter) { using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; @@ -354,7 +350,7 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel( const auto compute_distance_flag = hashmap::insert( team_size, visited_hashmap_ptr + (ldb * blockIdx.y), hash_bitlen, child_id); - DISTANCE_T norm2 = dataset_desc->compute_distance(child_id, metric, compute_distance_flag); + DISTANCE_T norm2 = dataset_desc->compute_distance(child_id, 
compute_distance_flag); if (compute_distance_flag) { if (threadIdx.x % team_size == 0) { @@ -398,7 +394,6 @@ void compute_distance_to_child_nodes( DistanceT* result_distances_ptr, // [num_queries, ldd] std::uint32_t ldd, // (*) ldd >= search_width * graph_degree SAMPLE_FILTER_T sample_filter, - cuvs::distance::DistanceType metric, cudaStream_t cuda_stream) { const auto block_size = 128; @@ -423,8 +418,7 @@ void compute_distance_to_child_nodes( result_indices_ptr, result_distances_ptr, ldd, - sample_filter, - metric); + sample_filter); } template @@ -632,9 +626,8 @@ struct search : search_plan_impl { const dataset_descriptor_host& dataset_desc, int64_t dim, int64_t graph_degree, - uint32_t topk, - cuvs::distance::DistanceType metric) - : base_type(res, params, dataset_desc, dim, graph_degree, topk, metric), + uint32_t topk) + : base_type(res, params, dataset_desc, dim, graph_degree, topk), result_indices(0, raft::resource::get_cuda_stream(res)), result_distances(0, raft::resource::get_cuda_stream(res)), parent_node_list(0, raft::resource::get_cuda_stream(res)), @@ -807,7 +800,6 @@ struct search : search_plan_impl { result_buffer_allocation_size, hashmap.data(), hash_bitlen, - this->metric, stream); unsigned iter = 0; @@ -877,7 +869,6 @@ struct search : search_plan_impl { result_distances.data() + itopk_size, result_buffer_allocation_size, sample_filter, - this->metric, stream); iter++; diff --git a/cpp/src/neighbors/detail/cagra/search_plan.cuh b/cpp/src/neighbors/detail/cagra/search_plan.cuh index 293b01e4f..16864ed19 100644 --- a/cpp/src/neighbors/detail/cagra/search_plan.cuh +++ b/cpp/src/neighbors/detail/cagra/search_plan.cuh @@ -38,13 +38,8 @@ struct search_plan_impl_base : public search_params { int64_t dim; int64_t graph_degree; uint32_t topk; - cuvs::distance::DistanceType metric; - search_plan_impl_base(search_params params, - int64_t dim, - int64_t graph_degree, - uint32_t topk, - cuvs::distance::DistanceType metric) - : search_params(params), dim(dim), 
graph_degree(graph_degree), topk(topk), metric(metric) + search_plan_impl_base(search_params params, int64_t dim, int64_t graph_degree, uint32_t topk) + : search_params(params), dim(dim), graph_degree(graph_degree), topk(topk) { if (algo == search_algo::AUTO) { const size_t num_sm = raft::getMultiProcessorCount(); @@ -90,9 +85,8 @@ struct search_plan_impl : public search_plan_impl_base { const dataset_descriptor_host& dataset_desc, int64_t dim, int64_t graph_degree, - uint32_t topk, - cuvs::distance::DistanceType metric) - : search_plan_impl_base(params, dim, graph_degree, topk, metric), + uint32_t topk) + : search_plan_impl_base(params, dim, graph_degree, topk), hashmap(0, raft::resource::get_cuda_stream(res)), num_executed_iterations(0, raft::resource::get_cuda_stream(res)), dev_seed(0, raft::resource::get_cuda_stream(res)), diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh index e48b03940..aefadf643 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh @@ -97,9 +97,8 @@ struct search : search_plan_impl { const dataset_descriptor_host& dataset_desc, int64_t dim, int64_t graph_degree, - uint32_t topk, - cuvs::distance::DistanceType metric) - : base_type(res, params, dataset_desc, dim, graph_degree, topk, metric) + uint32_t topk) + : base_type(res, params, dataset_desc, dim, graph_degree, topk) { set_params(res); } @@ -238,7 +237,6 @@ struct search : search_plan_impl { small_hash_reset_interval, num_seeds, sample_filter, - this->metric, stream); } }; diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh index 2a9974575..26ca7b672 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh @@ -42,7 +42,6 @@ namespace cuvs::neighbors::cagra::detail::single_cta_search { size_t 
small_hash_reset_interval, \ uint32_t num_seeds, \ SampleFilterT sample_filter, \ - cuvs::distance::DistanceType metric, \ cudaStream_t stream); } // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index f3b47b846..972f314c5 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -482,8 +482,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( const std::uint32_t hash_bitlen, const std::uint32_t small_hash_bitlen, const std::uint32_t small_hash_reset_interval, - SAMPLE_FILTER_T sample_filter, - cuvs::distance::DistanceType metric) + SAMPLE_FILTER_T sample_filter) { using LOAD_T = device::LOAD_128BIT_T; @@ -569,8 +568,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( local_seed_ptr, num_seeds, local_visited_hashmap_ptr, - hash_bitlen, - metric); + hash_bitlen); __syncthreads(); _CLK_REC(clk_compute_1st_distance); @@ -700,8 +698,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( hash_bitlen, parent_list_buffer, result_indices_buffer, - search_width, - metric); + search_width); __syncthreads(); _CLK_REC(clk_compute_distance); @@ -878,7 +875,6 @@ void select_and_run(const dataset_descriptor_base_t* d size_t small_hash_reset_interval, uint32_t num_seeds, SampleFilterT sample_filter, - cuvs::distance::DistanceType metric, cudaStream_t stream) { auto kernel = @@ -912,8 +908,7 @@ void select_and_run(const dataset_descriptor_base_t* d hash_bitlen, small_hash_bitlen, small_hash_reset_interval, - sample_filter, - metric); + sample_filter); // RAFT_CUDA_TRY(cudaPeekAtLastError()); RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh index 972bd6101..7b7f44db7 100644 --- 
a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh @@ -41,7 +41,6 @@ void select_and_run(const dataset_descriptor_base_t* d size_t small_hash_reset_interval, uint32_t num_seeds, SampleFilterT sample_filter, - cuvs::distance::DistanceType metric, cudaStream_t stream); } From 118808ecea11865712dd6c7d07ef2cb1c57abaf1 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 27 Aug 2024 13:13:20 +0200 Subject: [PATCH 16/41] Further reduce register pressure by moving code out of the non-inlinable compute_distance_impl and being more explicit about the memory spaces (using lds/ldg) --- .../detail/cagra/compute_distance.hpp | 5 +- .../cagra/compute_distance_standard.cuh | 58 +++--- .../detail/cagra/compute_distance_vpq.cuh | 195 +++++++++--------- .../neighbors/detail/cagra/device_common.hpp | 24 +++ .../cagra/search_single_cta_kernel-inl.cuh | 71 ++++--- 5 files changed, 185 insertions(+), 168 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index 2f8e801c9..ee56a5d8d 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -42,7 +42,7 @@ struct dataset_descriptor_base_t { using DISTANCE_T = DistanceT; using setup_workspace_type = const base_type*(const base_type*, void*, const DATA_T*, uint32_t); - using compute_distance_type = DISTANCE_T(const base_type*, INDEX_T, bool); + using compute_distance_type = DISTANCE_T(const base_type*, INDEX_T); /** Copy the descriptor and the query into shared memory and do any other work, such as * initializing the codebook. */ @@ -84,7 +84,8 @@ struct dataset_descriptor_base_t { RAFT_DEVICE_INLINE_FUNCTION auto compute_distance(INDEX_T dataset_index, bool valid) const -> DISTANCE_T { - return compute_distance_impl(this, dataset_index, valid); + auto per_thread_distances = valid ? 
compute_distance_impl(this, dataset_index) : 0; + return device::team_sum(per_thread_distances, this->team_size); } }; diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh index 701f1d574..f56fc4328 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh @@ -134,13 +134,11 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_standard( template _RAFT_DEVICE __noinline__ auto compute_distance_standard( - const typename DescriptorT::base_type* desc_, - const typename DescriptorT::INDEX_T dataset_index, - const bool valid) -> typename DescriptorT::DISTANCE_T + const typename DescriptorT::base_type* desc_, const typename DescriptorT::INDEX_T dataset_index) + -> typename DescriptorT::DISTANCE_T { using DATA_T = typename DescriptorT::DATA_T; using DISTANCE_T = typename DescriptorT::DISTANCE_T; - using INDEX_T = typename DescriptorT::INDEX_T; using LOAD_T = typename DescriptorT::LOAD_T; using QUERY_T = typename DescriptorT::QUERY_T; constexpr auto kTeamSize = DescriptorT::kTeamSize; @@ -153,41 +151,35 @@ _RAFT_DEVICE __noinline__ auto compute_distance_standard( const auto lane_id = threadIdx.x % kTeamSize; const auto dim = desc->dim; - DISTANCE_T norm2 = 0; - if (valid) { - for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += kDatasetBlockDim) { - constexpr unsigned vlen = device::get_vlen(); - constexpr unsigned reg_nelem = raft::ceildiv(kDatasetBlockDim, kTeamSize * vlen); - raft::TxN_t dl_buff[reg_nelem]; + DISTANCE_T r = 0; + for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += kDatasetBlockDim) { + constexpr unsigned vlen = device::get_vlen(); + constexpr unsigned reg_nelem = raft::ceildiv(kDatasetBlockDim, kTeamSize * vlen); + raft::TxN_t dl_buff[reg_nelem]; #pragma unroll - for (uint32_t e = 0; e < reg_nelem; e++) { - const uint32_t k = (lane_id + (kTeamSize * e)) * vlen + 
elem_offset; - if (k >= dim) break; - dl_buff[e].load(dataset_ptr, k); - } + for (uint32_t e = 0; e < reg_nelem; e++) { + const uint32_t k = (lane_id + (kTeamSize * e)) * vlen + elem_offset; + if (k >= dim) break; + dl_buff[e].load(dataset_ptr, k); + } #pragma unroll - for (uint32_t e = 0; e < reg_nelem; e++) { - const uint32_t k = (lane_id + (kTeamSize * e)) * vlen + elem_offset; - if (k >= dim) break; + for (uint32_t e = 0; e < reg_nelem; e++) { + const uint32_t k = (lane_id + (kTeamSize * e)) * vlen + elem_offset; + if (k >= dim) break; #pragma unroll - for (uint32_t v = 0; v < vlen; v++) { - // Note this loop can go above the dataset_dim for padded arrays. This is not a problem - // because: - // - Above the last element (dataset_dim-1), the query array is filled with zeros. - // - The data buffer has to be also padded with zeros. - DISTANCE_T d; - raft::lds(d, query_ptr + device::swizzling(k + v)); - norm2 += dist_op( - d, cuvs::spatial::knn::detail::utils::mapping{}(dl_buff[e].val.data[v])); - } + for (uint32_t v = 0; v < vlen; v++) { + // Note this loop can go above the dataset_dim for padded arrays. This is not a problem + // because: + // - Above the last element (dataset_dim-1), the query array is filled with zeros. + // - The data buffer has to be also padded with zeros. 
+ DISTANCE_T d; + raft::lds(d, query_ptr + device::swizzling(k + v)); + r += dist_op( + d, cuvs::spatial::knn::detail::utils::mapping{}(dl_buff[e].val.data[v])); } } } -#pragma unroll - for (uint32_t offset = kTeamSize / 2; offset > 0; offset >>= 1) { - norm2 += __shfl_xor_sync(0xffffffff, norm2, offset); - } - return norm2; + return r; } template +#include #include namespace cuvs::neighbors::cagra::detail { @@ -174,13 +175,10 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const typename DescriptorT::b template _RAFT_DEVICE __noinline__ auto compute_distance_vpq( - const typename DescriptorT::base_type* desc_, - const typename DescriptorT::INDEX_T dataset_index, - const bool valid) -> typename DescriptorT::DISTANCE_T + const typename DescriptorT::base_type* desc_, const typename DescriptorT::INDEX_T dataset_index) + -> typename DescriptorT::DISTANCE_T { - using DATA_T = typename DescriptorT::DATA_T; using DISTANCE_T = typename DescriptorT::DISTANCE_T; - using INDEX_T = typename DescriptorT::INDEX_T; using LOAD_T = typename DescriptorT::LOAD_T; using QUERY_T = typename DescriptorT::QUERY_T; using CODE_BOOK_T = typename DescriptorT::CODE_BOOK_T; @@ -189,117 +187,116 @@ _RAFT_DEVICE __noinline__ auto compute_distance_vpq( constexpr auto PQ_BITS = DescriptorT::kPqBits; constexpr auto PQ_LEN = DescriptorT::kPqLen; - const auto* __restrict__ desc = reinterpret_cast(desc_); - const auto* __restrict__ codebook_ptr = reinterpret_cast(desc + 1); - const auto* __restrict__ query_ptr = reinterpret_cast( - reinterpret_cast(codebook_ptr) + DescriptorT::kSMemCodeBookSizeInBytes); + const auto* __restrict__ desc = reinterpret_cast(desc_); + const auto* __restrict__ pq_codebook_ptr = reinterpret_cast(desc + 1); + const auto* __restrict__ query_ptr = reinterpret_cast( + reinterpret_cast(pq_codebook_ptr) + DescriptorT::kSMemCodeBookSizeInBytes); const auto* __restrict__ node_ptr = desc->encoded_dataset_ptr + (static_cast(desc->encoded_dataset_dim) * dataset_index); - const 
auto dim = desc->dim; - float norm = 0; - if (valid) { - const unsigned lane_id = threadIdx.x % TeamSize; - const uint32_t vq_code = *reinterpret_cast(node_ptr); - if constexpr (PQ_BITS == 8) { - for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += DatasetBlockDim) { - constexpr unsigned vlen = 4; // **** DO NOT CHANGE **** - constexpr unsigned nelem = - raft::div_rounding_up_unsafe(DatasetBlockDim / PQ_LEN, TeamSize * vlen); - // Loading PQ codes - uint32_t pq_codes[nelem]; -#pragma unroll 1 - for (std::uint32_t e = 0; e < nelem; e++) { - const std::uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset / PQ_LEN; - if (k >= desc->n_subspace) break; - // Loading 4 x 8-bit PQ-codes using 32-bit load ops (from device memory) - pq_codes[e] = *(reinterpret_cast(node_ptr + 4 + k)); - } - // - if constexpr (PQ_LEN % 2 == 0) { - // **** Use half2 for distance computation **** - half2 norm2{0, 0}; + const auto* __restrict__ vq_codebook_ptr = desc->vq_code_book_ptr; + const auto dim = desc->dim; + const auto n_subspace = desc->n_subspace; + const unsigned lane_id = threadIdx.x % TeamSize; + // const uint32_t& vq_code = *reinterpret_cast(node_ptr); + uint32_t vq_code; + raft::ldg(vq_code, reinterpret_cast(node_ptr)); + static_assert(PQ_BITS == 8, "Only pq_bits == 8 is supported at the moment."); + DISTANCE_T norm = 0; + for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += DatasetBlockDim) { + constexpr unsigned vlen = 4; // **** DO NOT CHANGE **** + constexpr unsigned nelem = + raft::div_rounding_up_unsafe(DatasetBlockDim / PQ_LEN, TeamSize * vlen); + // Loading PQ codes + uint32_t pq_codes[nelem]; #pragma unroll - for (std::uint32_t e = 0; e < nelem; e++) { - const std::uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset / PQ_LEN; - if (k >= desc->n_subspace) break; - // Loading VQ code-book - raft::TxN_t vq_vals[PQ_LEN]; + for (std::uint32_t e = 0; e < nelem; e++) { + const std::uint32_t k = (lane_id + (TeamSize * e)) * vlen + 
elem_offset / PQ_LEN; + if (k >= n_subspace) break; + // Loading 4 x 8-bit PQ-codes using 32-bit load ops (from device memory) + raft::ldg(pq_codes[e], reinterpret_cast(node_ptr + 4 + k)); + } + // + if constexpr (PQ_LEN % 2 == 0) { + // **** Use half2 for distance computation **** +#pragma unroll 1 + for (std::uint32_t e = 0; e < nelem; e++) { + const std::uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset / PQ_LEN; + if (k >= n_subspace) break; + // Loading VQ code-book + raft::TxN_t vq_vals[PQ_LEN]; #pragma unroll - for (std::uint32_t m = 0; m < PQ_LEN; m += 1) { - const uint32_t d = (vlen * m) + (PQ_LEN * k); - if (d >= dim) break; - vq_vals[m].load( - reinterpret_cast(desc->vq_code_book_ptr + d + (dim * vq_code)), 0); - } - // Compute distance - std::uint32_t pq_code = pq_codes[e]; + for (std::uint32_t m = 0; m < PQ_LEN; m += 1) { + const uint32_t d = (vlen * m) + (PQ_LEN * k); + if (d >= dim) break; + vq_vals[m].load(reinterpret_cast(vq_codebook_ptr + d + (dim * vq_code)), 0); + } + // Compute distance + std::uint32_t pq_code = pq_codes[e]; #pragma unroll - for (std::uint32_t v = 0; v < vlen; v++) { - if (PQ_LEN * (v + k) >= dim) break; + for (std::uint32_t v = 0; v < vlen; v++) { + if (PQ_LEN * (v + k) >= dim) break; #pragma unroll - for (std::uint32_t m = 0; m < PQ_LEN; m += 2) { - const std::uint32_t d1 = m + (PQ_LEN * v); - const std::uint32_t d = d1 + (PQ_LEN * k); - // Loading query vector in smem - half2 diff2 = (reinterpret_cast( - query_ptr))[device::swizzling(d / 2)]; - // Loading PQ code book in smem - diff2 -= *(reinterpret_cast( - codebook_ptr + (1 << PQ_BITS) * 2 * (m / 2) + (2 * (pq_code & 0xff)))); - diff2 -= vq_vals[d1 / vlen].val.data[(d1 % vlen) / 2]; - norm2 += diff2 * diff2; - } - pq_code >>= 8; - } + for (std::uint32_t m = 0; m < PQ_LEN; m += 2) { + const std::uint32_t d1 = m + (PQ_LEN * v); + const std::uint32_t d = d1 + (PQ_LEN * k); + half2 q2, c2; + // Loading query vector from smem + raft::lds(reinterpret_cast(q2), + 
reinterpret_cast(query_ptr) + + device::swizzling(d / 2)); + // Loading PQ code book from smem + raft::lds(reinterpret_cast(c2), + reinterpret_cast( + pq_codebook_ptr + (1 << PQ_BITS) * 2 * (m / 2) + (2 * (pq_code & 0xff)))); + // L2 distance + auto dist = q2 - c2 - vq_vals[d1 / vlen].val.data[(d1 % vlen) / 2]; + dist = dist * dist; + norm += static_cast(dist.x + dist.y); } - norm += static_cast(norm2.x + norm2.y); - } else { - // **** Use float for distance computation **** + pq_code >>= 8; + } + } + } else { + // **** Use float for distance computation **** #pragma unroll - for (std::uint32_t e = 0; e < nelem; e++) { - const std::uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset / PQ_LEN; - if (k >= desc->n_subspace) break; - // Loading VQ code-book - raft::TxN_t vq_vals[PQ_LEN]; + for (std::uint32_t e = 0; e < nelem; e++) { + const std::uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset / PQ_LEN; + if (k >= n_subspace) break; + // Loading VQ code-book + raft::TxN_t vq_vals[PQ_LEN]; #pragma unroll - for (std::uint32_t m = 0; m < PQ_LEN; m++) { - const std::uint32_t d = (vlen * m) + (PQ_LEN * k); - if (d >= dim) break; - // Loading 4 x 8/16-bit VQ-values using 32/64-bit load ops (from L2$ or device - // memory) - vq_vals[m].load( - reinterpret_cast(desc->vq_code_book_ptr + d + (dim * vq_code)), 0); - } - // Compute distance - std::uint32_t pq_code = pq_codes[e]; + for (std::uint32_t m = 0; m < PQ_LEN; m++) { + const std::uint32_t d = (vlen * m) + (PQ_LEN * k); + if (d >= dim) break; + // Loading 4 x 8/16-bit VQ-values using 32/64-bit load ops (from L2$ or device + // memory) + vq_vals[m].load(reinterpret_cast(vq_codebook_ptr + d + (dim * vq_code)), 0); + } + // Compute distance + std::uint32_t pq_code = pq_codes[e]; #pragma unroll - for (std::uint32_t v = 0; v < vlen; v++) { - if (PQ_LEN * (v + k) >= dim) break; - raft::TxN_t pq_vals; - pq_vals.load(reinterpret_cast(codebook_ptr + PQ_LEN * (pq_code & 0xff)), - 0); // (from L1$ or smem) + for 
(std::uint32_t v = 0; v < vlen; v++) { + if (PQ_LEN * (v + k) >= dim) break; + raft::TxN_t pq_vals; + pq_vals.load(reinterpret_cast(pq_codebook_ptr + PQ_LEN * (pq_code & 0xff)), + 0); // (from L1$ or smem) #pragma unroll - for (std::uint32_t m = 0; m < PQ_LEN; m++) { - const std::uint32_t d1 = m + (PQ_LEN * v); - const std::uint32_t d = d1 + (PQ_LEN * k); - // if (d >= dataset_dim) break; - DISTANCE_T diff = query_ptr[d]; // (from smem) - diff -= static_cast(pq_vals.data[m]); - diff -= static_cast(vq_vals[d1 / vlen].val.data[d1 % vlen]); - norm += diff * diff; - } - pq_code >>= 8; - } + for (std::uint32_t m = 0; m < PQ_LEN; m++) { + const std::uint32_t d1 = m + (PQ_LEN * v); + const std::uint32_t d = d1 + (PQ_LEN * k); + // if (d >= dataset_dim) break; + DISTANCE_T diff; + raft::lds(diff, query_ptr + d); + diff -= static_cast(pq_vals.data[m]); + diff -= static_cast(vq_vals[d1 / vlen].val.data[d1 % vlen]); + norm += diff * diff; } + pq_code >>= 8; } } } } -#pragma unroll - for (uint32_t offset = TeamSize / 2; offset > 0; offset >>= 1) { - norm += __shfl_xor_sync(0xffffffff, norm, offset); - } return norm; } diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp index ca8e777a7..20545a225 100644 --- a/cpp/src/neighbors/detail/cagra/device_common.hpp +++ b/cpp/src/neighbors/detail/cagra/device_common.hpp @@ -23,6 +23,7 @@ // TODO: This shouldn't be invoking anything in detail APIs outside of cuvs/neighbors #include #include +#include #include @@ -195,5 +196,28 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes( } } +template +RAFT_DEVICE_INLINE_FUNCTION auto team_sum(T x) -> T +{ +#pragma unroll + for (uint32_t stride = TeamSize >> 1; stride > 0; stride >>= 1) { + x += raft::shfl_xor(x, stride, TeamSize); + } + return x; +} + +template +RAFT_DEVICE_INLINE_FUNCTION auto team_sum(T x, uint32_t team_size) -> T +{ + switch (team_size) { + case 1: return team_sum<1>(x); + case 2: return 
team_sum<2>(x); + case 4: return team_sum<4>(x); + case 8: return team_sum<8>(x); + case 16: return team_sum<16>(x); + default: return team_sum<32>(x); + } +} + } // namespace device } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 972f314c5..af99ca7ca 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -58,11 +58,11 @@ namespace single_cta_search { // #define _CLK_BREAKDOWN template -__device__ void pickup_next_parents(std::uint32_t* const terminate_flag, - INDEX_T* const next_parent_indices, - INDEX_T* const internal_topk_indices, - const std::size_t internal_topk_size, - const std::uint32_t search_width) +RAFT_DEVICE_INLINE_FUNCTION void pickup_next_parents(std::uint32_t* const terminate_flag, + INDEX_T* const next_parent_indices, + INDEX_T* const internal_topk_indices, + const std::size_t internal_topk_size, + const std::uint32_t search_width) { constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask::value; // if (threadIdx.x >= 32) return; @@ -100,11 +100,12 @@ __device__ void pickup_next_parents(std::uint32_t* const terminate_flag, } template -__device__ inline void topk_by_bitonic_sort_1st(float* candidate_distances, // [num_candidates] - IdxT* candidate_indices, // [num_candidates] - const std::uint32_t num_candidates, - const std::uint32_t num_itopk, - unsigned MULTI_WARPS = 0) +RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_1st( + float* candidate_distances, // [num_candidates] + IdxT* candidate_indices, // [num_candidates] + const std::uint32_t num_candidates, + const std::uint32_t num_itopk, + unsigned MULTI_WARPS = 0) { const unsigned lane_id = threadIdx.x % 32; const unsigned warp_id = threadIdx.x / 32; @@ -203,15 +204,16 @@ __device__ inline void topk_by_bitonic_sort_1st(float* 
candidate_distances, // } template -__device__ inline void topk_by_bitonic_sort_2nd(float* itopk_distances, // [num_itopk] - IdxT* itopk_indices, // [num_itopk] - const std::uint32_t num_itopk, - float* candidate_distances, // [num_candidates] - IdxT* candidate_indices, // [num_candidates] - const std::uint32_t num_candidates, - std::uint32_t* work_buf, - const bool first, - unsigned MULTI_WARPS = 0) +RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_2nd( + float* itopk_distances, // [num_itopk] + IdxT* itopk_indices, // [num_itopk] + const std::uint32_t num_itopk, + float* candidate_distances, // [num_candidates] + IdxT* candidate_indices, // [num_candidates] + const std::uint32_t num_candidates, + std::uint32_t* work_buf, + const bool first, + unsigned MULTI_WARPS = 0) { const unsigned lane_id = threadIdx.x % 32; const unsigned warp_id = threadIdx.x / 32; @@ -411,16 +413,17 @@ __device__ inline void topk_by_bitonic_sort_2nd(float* itopk_distances, // [num template -__device__ void topk_by_bitonic_sort(float* itopk_distances, // [num_itopk] - IdxT* itopk_indices, // [num_itopk] - const std::uint32_t num_itopk, - float* candidate_distances, // [num_candidates] - IdxT* candidate_indices, // [num_candidates] - const std::uint32_t num_candidates, - std::uint32_t* work_buf, - const bool first, - const unsigned MULTI_WARPS_1, - const unsigned MULTI_WARPS_2) +RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort( + float* itopk_distances, // [num_itopk] + IdxT* itopk_indices, // [num_itopk] + const std::uint32_t num_itopk, + float* candidate_distances, // [num_candidates] + IdxT* candidate_indices, // [num_candidates] + const std::uint32_t num_candidates, + std::uint32_t* work_buf, + const bool first, + const unsigned MULTI_WARPS_1, + const unsigned MULTI_WARPS_2) { // The results in candidate_distances/indices are sorted by bitonic sort. 
topk_by_bitonic_sort_1st( @@ -440,11 +443,11 @@ __device__ void topk_by_bitonic_sort(float* itopk_distances, // [num_itopk] } template -__device__ inline void hashmap_restore(INDEX_T* const hashmap_ptr, - const size_t hashmap_bitlen, - const INDEX_T* itopk_indices, - const uint32_t itopk_size, - const uint32_t first_tid = 0) +RAFT_DEVICE_INLINE_FUNCTION void hashmap_restore(INDEX_T* const hashmap_ptr, + const size_t hashmap_bitlen, + const INDEX_T* itopk_indices, + const uint32_t itopk_size, + const uint32_t first_tid = 0) { constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask::value; if (threadIdx.x < first_tid) return; From abec1256adb280e6cb4a8ba6034e6f4a8ade3a17 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 27 Aug 2024 16:12:12 +0200 Subject: [PATCH 17/41] Manually unroll device::team_sum --- cpp/src/neighbors/detail/cagra/device_common.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp index 20545a225..73787ec90 100644 --- a/cpp/src/neighbors/detail/cagra/device_common.hpp +++ b/cpp/src/neighbors/detail/cagra/device_common.hpp @@ -210,12 +210,12 @@ template RAFT_DEVICE_INLINE_FUNCTION auto team_sum(T x, uint32_t team_size) -> T { switch (team_size) { - case 1: return team_sum<1>(x); - case 2: return team_sum<2>(x); - case 4: return team_sum<4>(x); - case 8: return team_sum<8>(x); - case 16: return team_sum<16>(x); - default: return team_sum<32>(x); + case 32: x += raft::shfl_xor(x, 16); + case 16: x += raft::shfl_xor(x, 8); + case 8: x += raft::shfl_xor(x, 4); + case 4: x += raft::shfl_xor(x, 2); + case 2: x += raft::shfl_xor(x, 1); + default: return x; } } From cf0101cee0ebd188fc19c4b91c7463fb34fa3b83 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 28 Aug 2024 13:04:35 +0200 Subject: [PATCH 18/41] Remove the test of a compute_distance instance that is not compiled (team_size = 4) --- 
cpp/test/neighbors/ann_cagra.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh index 9d2f9c175..4ce0849fd 100644 --- a/cpp/test/neighbors/ann_cagra.cuh +++ b/cpp/test/neighbors/ann_cagra.cuh @@ -706,7 +706,7 @@ inline std::vector generate_inputs() {graph_build_algo::IVF_PQ, graph_build_algo::NN_DESCENT}, {search_algo::AUTO}, {10}, - {0, 4, 8, 16, 32}, // team_size + {0, 8, 16, 32}, // team_size {64}, {1}, {cuvs::distance::DistanceType::L2Expanded}, From b3e6d26b15b6f12b3d8204f249606ee68c7bc2b3 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 28 Aug 2024 13:37:21 +0200 Subject: [PATCH 19/41] Hide previously not hidden kernels --- cpp/src/neighbors/detail/ann_utils.cuh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/neighbors/detail/ann_utils.cuh b/cpp/src/neighbors/detail/ann_utils.cuh index 000de576b..29f790ec5 100644 --- a/cpp/src/neighbors/detail/ann_utils.cuh +++ b/cpp/src/neighbors/detail/ann_utils.cuh @@ -224,7 +224,7 @@ inline void memzero(T* ptr, IdxT n_elems, rmm::cuda_stream_view stream) } template -__global__ void outer_add_kernel(const T* a, IdxT len_a, const T* b, IdxT len_b, T* c) +static __global__ void outer_add_kernel(const T* a, IdxT len_a, const T* b, IdxT len_b, T* c) { IdxT gid = threadIdx.x + blockDim.x * static_cast(blockIdx.x); IdxT i = gid / len_b; @@ -234,12 +234,12 @@ __global__ void outer_add_kernel(const T* a, IdxT len_a, const T* b, IdxT len_b, } template -__global__ void block_copy_kernel(const IdxT* in_offsets, - const IdxT* out_offsets, - IdxT n_blocks, - const T* in_data, - T* out_data, - IdxT n_mult) +static __global__ void block_copy_kernel(const IdxT* in_offsets, + const IdxT* out_offsets, + IdxT n_blocks, + const T* in_data, + T* out_data, + IdxT n_mult) { IdxT i = static_cast(blockDim.x) * static_cast(blockIdx.x) + threadIdx.x; // find the source offset using the binary search. 
@@ -317,7 +317,7 @@ void outer_add(const T* a, IdxT len_a, const T* b, IdxT len_b, T* c, rmm::cuda_s } template -__global__ void copy_selected_kernel( +static __global__ void copy_selected_kernel( IdxT n_rows, IdxT n_cols, const S* src, const LabelT* row_ids, IdxT ld_src, T* dst, IdxT ld_dst) { IdxT gid = threadIdx.x + blockDim.x * static_cast(blockIdx.x); From dc75f7ab92bd4b4052e0a9c41f6a1ed6ca2db49d Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 2 Sep 2024 11:47:40 +0200 Subject: [PATCH 20/41] Reduce register usage by minimizing the part of descriptor struct passed to the compute_distance function --- cpp/CMakeLists.txt | 2 +- cpp/src/neighbors/detail/cagra/bitonic.hpp | 37 ++- .../detail/cagra/compute_distance.hpp | 46 +++- .../cagra/compute_distance_standard.cuh | 117 +++++++--- .../detail/cagra/compute_distance_vpq.cuh | 215 +++++++++++------- .../neighbors/detail/cagra/device_common.hpp | 52 ++++- .../cagra/search_multi_cta_kernel-inl.cuh | 3 +- .../cagra/search_single_cta_kernel-inl.cuh | 3 +- 8 files changed, 325 insertions(+), 150 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ad0303486..3b414c38c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -291,7 +291,7 @@ add_library( src/neighbors/detail/cagra/search_single_cta_half_uint64.cu ) -file(GLOB_RECURSE compute_distance_sources "src/neighbors/detail/cagra/compute_distance_*.cu") +file(GLOB_RECURSE compute_distance_sources "src/neighbors/detail/cagra/*.cu") set_source_files_properties(${compute_distance_sources} PROPERTIES COMPILE_FLAGS -maxrregcount=64) set_target_properties( diff --git a/cpp/src/neighbors/detail/cagra/bitonic.hpp b/cpp/src/neighbors/detail/cagra/bitonic.hpp index 26195bd9c..ed609d6fd 100644 --- a/cpp/src/neighbors/detail/cagra/bitonic.hpp +++ b/cpp/src/neighbors/detail/cagra/bitonic.hpp @@ -26,7 +26,7 @@ namespace bitonic { namespace detail { template -_RAFT_DEVICE inline void swap_if_needed(K& k0, V& v0, K& k1, V& v1, const bool asc) 
+RAFT_DEVICE_INLINE_FUNCTION void swap_if_needed(K& k0, V& v0, K& k1, V& v1, const bool asc) { if ((k0 != k1) && ((k0 < k1) != asc)) { const auto tmp_k = k0; @@ -39,7 +39,10 @@ _RAFT_DEVICE inline void swap_if_needed(K& k0, V& v0, K& k1, V& v1, const bool a } template -_RAFT_DEVICE inline void swap_if_needed(K& k0, V& v0, const unsigned lane_offset, const bool asc) +RAFT_DEVICE_INLINE_FUNCTION void swap_if_needed(K& k0, + V& v0, + const unsigned lane_offset, + const bool asc) { auto k1 = __shfl_xor_sync(~0u, k0, lane_offset); auto v1 = __shfl_xor_sync(~0u, v0, lane_offset); @@ -51,7 +54,10 @@ _RAFT_DEVICE inline void swap_if_needed(K& k0, V& v0, const unsigned lane_offset template struct warp_merge_core { - _RAFT_DEVICE inline void operator()(K k[N], V v[N], const std::uint32_t range, const bool asc) + RAFT_DEVICE_INLINE_FUNCTION void operator()(K k[N], + V v[N], + const std::uint32_t range, + const bool asc) { const auto lane_id = threadIdx.x % warp_size; @@ -93,7 +99,10 @@ struct warp_merge_core { template struct warp_merge_core { - _RAFT_DEVICE inline void operator()(K k[6], V v[6], const std::uint32_t range, const bool asc) + RAFT_DEVICE_INLINE_FUNCTION void operator()(K k[6], + V v[6], + const std::uint32_t range, + const bool asc) { constexpr unsigned N = 6; const auto lane_id = threadIdx.x % warp_size; @@ -141,7 +150,10 @@ struct warp_merge_core { template struct warp_merge_core { - _RAFT_DEVICE inline void operator()(K k[3], V v[3], const std::uint32_t range, const bool asc) + RAFT_DEVICE_INLINE_FUNCTION void operator()(K k[3], + V v[3], + const std::uint32_t range, + const bool asc) { constexpr unsigned N = 3; const auto lane_id = threadIdx.x % warp_size; @@ -171,7 +183,10 @@ struct warp_merge_core { template struct warp_merge_core { - _RAFT_DEVICE inline void operator()(K k[2], V v[2], const std::uint32_t range, const bool asc) + RAFT_DEVICE_INLINE_FUNCTION void operator()(K k[2], + V v[2], + const std::uint32_t range, + const bool asc) { constexpr 
unsigned N = 2; const auto lane_id = threadIdx.x % warp_size; @@ -197,7 +212,10 @@ struct warp_merge_core { template struct warp_merge_core { - _RAFT_DEVICE inline void operator()(K k[1], V v[1], const std::uint32_t range, const bool asc) + RAFT_DEVICE_INLINE_FUNCTION void operator()(K k[1], + V v[1], + const std::uint32_t range, + const bool asc) { const auto lane_id = threadIdx.x % warp_size; const std::uint32_t b = range; @@ -211,14 +229,15 @@ struct warp_merge_core { } // namespace detail template -__device__ void warp_merge(K k[N], V v[N], unsigned range, const bool asc = true) +RAFT_DEVICE_INLINE_FUNCTION void warp_merge(K k[N], V v[N], unsigned range, const bool asc = true) { detail::warp_merge_core{}(k, v, range, asc); } template -__device__ void warp_sort(K k[N], V v[N], const bool asc = true) +RAFT_DEVICE_INLINE_FUNCTION void warp_sort(K k[N], V v[N], const bool asc = true) { +#pragma unroll for (std::uint32_t range = 1; range <= warp_size; range <<= 1) { warp_merge(k, v, range, asc); } diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index ee56a5d8d..f174a40a2 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -35,14 +35,42 @@ namespace cuvs::neighbors::cagra::detail { template -struct dataset_descriptor_base_t { +struct alignas(device::LOAD_128BIT_T) dataset_descriptor_base_t { using base_type = dataset_descriptor_base_t; + using LOAD_T = device::LOAD_128BIT_T; using DATA_T = DataT; using INDEX_T = IndexT; using DISTANCE_T = DistanceT; + struct alignas(LOAD_T) args_t { + void* extra_ptr1; + void* extra_ptr2; + /** Pointer to the workspace in the shared memory (filled in every copy by a thread block). */ + uint32_t smem_ws_ptr; + /** Dimensionality of the data/queries. 
*/ + uint32_t dim; + uint32_t extra_word1; + uint32_t extra_word2; + + RAFT_DEVICE_INLINE_FUNCTION auto load() const -> args_t + { + constexpr int kCount = sizeof(*this) / sizeof(LOAD_T); + using blob_type = LOAD_T[kCount]; + args_t r; + auto& src = reinterpret_cast(*this); + auto& dst = reinterpret_cast(r); +#pragma unroll + for (int i = 0; i < kCount; i++) { + device::lds(dst[i], src + i); + } + return r; + } + }; + using setup_workspace_type = const base_type*(const base_type*, void*, const DATA_T*, uint32_t); - using compute_distance_type = DISTANCE_T(const base_type*, INDEX_T); + using compute_distance_type = DISTANCE_T(const args_t, const INDEX_T); + + args_t args; /** Copy the descriptor and the query into shared memory and do any other work, such as * initializing the codebook. */ @@ -50,15 +78,15 @@ struct dataset_descriptor_base_t { /** Compute the distance from the query vector (stored in the smem_workspace) and a dataset vector * given by the dataset_index. */ compute_distance_type* compute_distance_impl; - /** Number of records in the database. */ - INDEX_T size; - /** Dimensionality of the data/queries. */ - uint32_t dim; + void* extra_ptr3; /** How many threads are involved in computing a single distance. */ uint32_t team_size; /** Total dynamic shared memory required by the descriptor. */ uint32_t smem_ws_size_in_bytes; + /** Number of records in the database. 
*/ + INDEX_T size; + RAFT_INLINE_FUNCTION dataset_descriptor_base_t(setup_workspace_type* setup_workspace_impl, compute_distance_type* compute_distance_impl, INDEX_T size, @@ -68,9 +96,9 @@ struct dataset_descriptor_base_t { : setup_workspace_impl(setup_workspace_impl), compute_distance_impl(compute_distance_impl), size(size), - dim(dim), team_size(team_size), - smem_ws_size_in_bytes(smem_ws_size_in_bytes) + smem_ws_size_in_bytes(smem_ws_size_in_bytes), + args{nullptr, nullptr, 0, dim, 0, 0} { } @@ -84,7 +112,7 @@ struct dataset_descriptor_base_t { RAFT_DEVICE_INLINE_FUNCTION auto compute_distance(INDEX_T dataset_index, bool valid) const -> DISTANCE_T { - auto per_thread_distances = valid ? compute_distance_impl(this, dataset_index) : 0; + auto per_thread_distances = valid ? compute_distance_impl(args.load(), dataset_index) : 0; return device::team_sum(per_thread_distances, this->team_size); } }; diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh index f56fc4328..46230c624 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh @@ -51,24 +51,57 @@ template -struct alignas(device::LOAD_128BIT_T) standard_dataset_descriptor_t - : public dataset_descriptor_base_t { +struct standard_dataset_descriptor_t : public dataset_descriptor_base_t { using base_type = dataset_descriptor_base_t; - using LOAD_T = device::LOAD_128BIT_T; using QUERY_T = float; - using base_type::dim; + using base_type::args; using base_type::smem_ws_size_in_bytes; + using typename base_type::args_t; using typename base_type::compute_distance_type; using typename base_type::DATA_T; using typename base_type::DISTANCE_T; using typename base_type::INDEX_T; + using typename base_type::LOAD_T; using typename base_type::setup_workspace_type; constexpr static inline auto kMetric = Metric; constexpr static inline auto kTeamSize = TeamSize; 
constexpr static inline auto kDatasetBlockDim = DatasetBlockDim; - const DATA_T* ptr; - uint32_t ld; + // const DATA_T* ptr; + // uint32_t ld; + + // RAFT_INLINE_FUNCTION constexpr auto ptr() noexcept -> const DATA_T*& + // { + // return (const DATA_T*&)(extra_ptr1); + // } + + // RAFT_INLINE_FUNCTION constexpr auto ptr() const noexcept -> const DATA_T* const& + // { + // return (const DATA_T* const&)(extra_ptr1); + // } + + // RAFT_INLINE_FUNCTION constexpr auto ld() noexcept -> uint32_t& { return extra_word1; } + // RAFT_INLINE_FUNCTION constexpr auto ld() const noexcept -> const uint32_t& { return + // extra_word1; } + + static constexpr RAFT_INLINE_FUNCTION auto ptr(const args_t& args) noexcept + -> const DATA_T* const& + { + return (const DATA_T* const&)(args.extra_ptr1); + } + static constexpr RAFT_INLINE_FUNCTION auto ptr(args_t& args) noexcept -> const DATA_T*& + { + return (const DATA_T*&)(args.extra_ptr1); + } + + static constexpr RAFT_INLINE_FUNCTION auto ld(const args_t& args) noexcept -> const uint32_t& + { + return args.extra_word1; + } + static constexpr RAFT_INLINE_FUNCTION auto ld(args_t& args) noexcept -> uint32_t& + { + return args.extra_word1; + } _RAFT_HOST_DEVICE standard_dataset_descriptor_t(setup_workspace_type* setup_workspace_impl, compute_distance_type* compute_distance_impl, @@ -81,10 +114,12 @@ struct alignas(device::LOAD_128BIT_T) standard_dataset_descriptor_t size, dim, TeamSize, - get_smem_ws_size_in_bytes(dim)), - ptr(ptr), - ld(ld) + get_smem_ws_size_in_bytes(dim)) { + standard_dataset_descriptor_t::ptr(args) = ptr; + standard_dataset_descriptor_t::ld(args) = ld; + static_assert(sizeof(*this) == sizeof(base_type)); + static_assert(alignof(standard_dataset_descriptor_t) == alignof(base_type)); } private: @@ -97,27 +132,34 @@ struct alignas(device::LOAD_128BIT_T) standard_dataset_descriptor_t template _RAFT_DEVICE __noinline__ auto setup_workspace_standard( - const typename DescriptorT::base_type* that, + const DescriptorT* that, 
void* smem_ptr, const typename DescriptorT::DATA_T* queries_ptr, - uint32_t query_id) -> const typename DescriptorT::base_type* + uint32_t query_id) -> const DescriptorT* { - using descriptor_type = DescriptorT; using base_type = typename DescriptorT::base_type; - using QUERY_T = typename descriptor_type::QUERY_T; + using QUERY_T = typename DescriptorT::QUERY_T; + using LOAD_T = typename DescriptorT::LOAD_T; constexpr auto DatasetBlockDim = DescriptorT::kDatasetBlockDim; - using word_type = uint32_t; - if (((void*)that) != smem_ptr) { - constexpr auto kStructWords = sizeof(DescriptorT) / sizeof(word_type); - auto* dst = reinterpret_cast(smem_ptr); - auto* src = reinterpret_cast(that); - for (unsigned i = threadIdx.x; i < kStructWords; i += blockDim.x) { + auto* r = reinterpret_cast(smem_ptr); + auto* buf = reinterpret_cast(r + 1); + if (r != that) { + constexpr uint32_t kCount = sizeof(DescriptorT) / sizeof(LOAD_T); + using blob_type = LOAD_T[kCount]; + auto& src = reinterpret_cast(*that); + auto& dst = reinterpret_cast(*r); + for (uint32_t i = threadIdx.x; i < kCount; i += blockDim.x) { dst[i] = src[i]; } + const auto smem_ptr_offset = + reinterpret_cast(&(r->args.smem_ws_ptr)) - reinterpret_cast(r); + if (threadIdx.x == uint32_t(smem_ptr_offset / sizeof(LOAD_T))) { + r->args.smem_ws_ptr = uint32_t(__cvta_generic_to_shared(buf)); + } + __syncthreads(); } - uint32_t dim = that->dim; - auto buf = reinterpret_cast(reinterpret_cast(smem_ptr) + sizeof(DescriptorT)); + uint32_t dim = r->args.dim; auto buf_len = raft::round_up_safe(dim, DatasetBlockDim); queries_ptr += dim * query_id; for (unsigned i = threadIdx.x; i < buf_len; i += blockDim.x) { @@ -129,13 +171,13 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_standard( } } - return const_cast(reinterpret_cast(smem_ptr)); + return const_cast(r); } template _RAFT_DEVICE __noinline__ auto compute_distance_standard( - const typename DescriptorT::base_type* desc_, const typename DescriptorT::INDEX_T dataset_index) - -> 
typename DescriptorT::DISTANCE_T + const typename DescriptorT::args_t args, const typename DescriptorT::INDEX_T dataset_index) -> + typename DescriptorT::DISTANCE_T { using DATA_T = typename DescriptorT::DATA_T; using DISTANCE_T = typename DescriptorT::DISTANCE_T; @@ -144,28 +186,26 @@ _RAFT_DEVICE __noinline__ auto compute_distance_standard( constexpr auto kTeamSize = DescriptorT::kTeamSize; constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim; - const auto* __restrict__ desc = reinterpret_cast(desc_); - const auto* __restrict__ query_ptr = reinterpret_cast(desc + 1); + // const auto* __restrict__ query_ptr = reinterpret_cast(args.smem_ws_ptr); const auto* __restrict__ dataset_ptr = - desc->ptr + (static_cast(desc->ld) * dataset_index); + DescriptorT::ptr(args) + (static_cast(DescriptorT::ld(args)) * dataset_index); const auto lane_id = threadIdx.x % kTeamSize; - const auto dim = desc->dim; DISTANCE_T r = 0; - for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += kDatasetBlockDim) { + for (uint32_t elem_offset = 0; elem_offset < args.dim; elem_offset += kDatasetBlockDim) { constexpr unsigned vlen = device::get_vlen(); constexpr unsigned reg_nelem = raft::ceildiv(kDatasetBlockDim, kTeamSize * vlen); raft::TxN_t dl_buff[reg_nelem]; #pragma unroll for (uint32_t e = 0; e < reg_nelem; e++) { const uint32_t k = (lane_id + (kTeamSize * e)) * vlen + elem_offset; - if (k >= dim) break; + if (k >= args.dim) break; dl_buff[e].load(dataset_ptr, k); } #pragma unroll for (uint32_t e = 0; e < reg_nelem; e++) { const uint32_t k = (lane_id + (kTeamSize * e)) * vlen + elem_offset; - if (k >= dim) break; + if (k >= args.dim) break; #pragma unroll for (uint32_t v = 0; v < vlen; v++) { // Note this loop can go above the dataset_dim for padded arrays. This is not a problem @@ -173,7 +213,7 @@ _RAFT_DEVICE __noinline__ auto compute_distance_standard( // - Above the last element (dataset_dim-1), the query array is filled with zeros. 
// - The data buffer has to be also padded with zeros. DISTANCE_T d; - raft::lds(d, query_ptr + device::swizzling(k + v)); + device::lds(d, args.smem_ws_ptr + sizeof(QUERY_T) * device::swizzling(k + v)); r += dist_op( d, cuvs::spatial::knn::detail::utils::mapping{}(dl_buff[e].val.data[v])); } @@ -197,8 +237,11 @@ __launch_bounds__(1, 1) __global__ void standard_dataset_descriptor_init_kernel( { using desc_type = standard_dataset_descriptor_t; - new (out) desc_type(&setup_workspace_standard, - &compute_distance_standard, + using base_type = typename desc_type::base_type; + new (out) desc_type(reinterpret_cast( + &setup_workspace_standard), + reinterpret_cast( + &compute_distance_standard), ptr, size, dim, @@ -242,7 +285,11 @@ struct standard_descriptor_spec : public instance_spec dataset.stride()}; host_type result{dd_host, stream, DatasetBlockDim}; void* args[] = // NOLINT - {&result.dev_ptr, &dd_host.ptr, &dd_host.size, &dd_host.dim, &dd_host.ld}; + {&result.dev_ptr, + &descriptor_type::ptr(dd_host.args), + &dd_host.size, + &dd_host.args.dim, + &descriptor_type::ld(dd_host.args)}; RAFT_CUDA_TRY(cudaLaunchKernel(init_kernel, 1, 1, args, 0, stream)); return result; } diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh index 3dc4ed683..055aeeabf 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh @@ -33,18 +33,18 @@ template -struct alignas(device::LOAD_128BIT_T) cagra_q_dataset_descriptor_t - : public dataset_descriptor_base_t { +struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t { using base_type = dataset_descriptor_base_t; using CODE_BOOK_T = CodeBookT; - using LOAD_T = device::LOAD_128BIT_T; using QUERY_T = half; - using base_type::dim; - using base_type::smem_ws_size_in_bytes; + using base_type::args; + using base_type::extra_ptr3; + using typename base_type::args_t; using typename 
base_type::compute_distance_type; using typename base_type::DATA_T; using typename base_type::DISTANCE_T; using typename base_type::INDEX_T; + using typename base_type::LOAD_T; using typename base_type::setup_workspace_type; constexpr static inline auto kMetric = Metric; constexpr static inline auto kTeamSize = TeamSize; @@ -54,11 +54,59 @@ struct alignas(device::LOAD_128BIT_T) cagra_q_dataset_descriptor_t static_assert(std::is_same_v, "Only CODE_BOOK_T = `half` is supported now"); - const std::uint8_t* encoded_dataset_ptr; - const CODE_BOOK_T* vq_code_book_ptr; - const CODE_BOOK_T* pq_code_book_ptr; - std::uint32_t encoded_dataset_dim; - std::uint32_t n_subspace; + // alignas(LOAD_T) const std::uint8_t* encoded_dataset_ptr; + // const CODE_BOOK_T* vq_code_book_ptr; + // const CODE_BOOK_T* pq_code_book_ptr; + // std::uint32_t encoded_dataset_dim; + // std::uint32_t n_subspace; + + RAFT_INLINE_FUNCTION static constexpr auto encoded_dataset_ptr(args_t& args) noexcept + -> const uint8_t*& + { + return (const uint8_t*&)args.extra_ptr1; + } + RAFT_INLINE_FUNCTION static constexpr auto vq_code_book_ptr(args_t& args) noexcept + -> const CODE_BOOK_T*& + { + return (const CODE_BOOK_T*&)args.extra_ptr2; + } + RAFT_INLINE_FUNCTION constexpr auto pq_code_book_ptr() noexcept -> const CODE_BOOK_T*& + { + return (const CODE_BOOK_T*&)extra_ptr3; + } + RAFT_INLINE_FUNCTION static constexpr auto encoded_dataset_dim(args_t& args) noexcept -> uint32_t& + { + return args.extra_word1; + } + RAFT_INLINE_FUNCTION static constexpr auto n_subspace(args_t& args) noexcept -> uint32_t& + { + return args.extra_word2; + } + + RAFT_INLINE_FUNCTION static constexpr auto encoded_dataset_ptr(const args_t& args) noexcept + -> const uint8_t* const& + { + return (const uint8_t*&)args.extra_ptr1; + } + RAFT_INLINE_FUNCTION static constexpr auto vq_code_book_ptr(const args_t& args) noexcept + -> const CODE_BOOK_T* const& + { + return (const CODE_BOOK_T*&)args.extra_ptr2; + } + RAFT_INLINE_FUNCTION 
constexpr auto pq_code_book_ptr() const noexcept -> const CODE_BOOK_T* const& + { + return (const CODE_BOOK_T*&)extra_ptr3; + } + RAFT_INLINE_FUNCTION static constexpr auto encoded_dataset_dim(const args_t& args) noexcept + -> const uint32_t& + { + return args.extra_word1; + } + RAFT_INLINE_FUNCTION static constexpr auto n_subspace(const args_t& args) noexcept + -> const uint32_t& + { + return args.extra_word2; + } static constexpr std::uint32_t kSMemCodeBookSizeInBytes = (1 << PQ_BITS) * PQ_LEN * utils::size_of(); @@ -77,13 +125,15 @@ struct alignas(device::LOAD_128BIT_T) cagra_q_dataset_descriptor_t size, dim, TeamSize, - get_smem_ws_size_in_bytes(dim)), - encoded_dataset_ptr(encoded_dataset_ptr), - encoded_dataset_dim(encoded_dataset_dim), - n_subspace(n_subspace), - vq_code_book_ptr(vq_code_book_ptr), - pq_code_book_ptr(pq_code_book_ptr) + get_smem_ws_size_in_bytes(dim)) { + cagra_q_dataset_descriptor_t::encoded_dataset_ptr(args) = encoded_dataset_ptr; + cagra_q_dataset_descriptor_t::vq_code_book_ptr(args) = vq_code_book_ptr; + this->pq_code_book_ptr() = pq_code_book_ptr; + cagra_q_dataset_descriptor_t::encoded_dataset_dim(args) = encoded_dataset_dim; + cagra_q_dataset_descriptor_t::n_subspace(args) = n_subspace; + static_assert(sizeof(*this) == sizeof(base_type)); + static_assert(alignof(cagra_q_dataset_descriptor_t) == alignof(base_type)); } private: @@ -100,13 +150,11 @@ struct alignas(device::LOAD_128BIT_T) cagra_q_dataset_descriptor_t }; template -_RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const typename DescriptorT::base_type* that_, +_RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, void* smem_ptr, const typename DescriptorT::DATA_T* queries_ptr, - uint32_t query_id) -> const - typename DescriptorT::base_type* + uint32_t query_id) -> const DescriptorT* { - using descriptor_type = DescriptorT; using base_type = typename DescriptorT::base_type; using DATA_T = typename DescriptorT::DATA_T; using DISTANCE_T = typename 
DescriptorT::DISTANCE_T; @@ -118,26 +166,31 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const typename DescriptorT::b constexpr auto DatasetBlockDim = DescriptorT::kDatasetBlockDim; constexpr auto PQ_BITS = DescriptorT::kPqBits; constexpr auto PQ_LEN = DescriptorT::kPqLen; - using word_type = uint32_t; - auto* that = reinterpret_cast(that_); + auto* r = reinterpret_cast(smem_ptr); - if (((void*)that) != smem_ptr) { - constexpr auto kStructWords = sizeof(DescriptorT) / sizeof(word_type); - auto* dst = reinterpret_cast(smem_ptr); - auto* src = reinterpret_cast(that); - for (unsigned i = threadIdx.x; i < kStructWords; i += blockDim.x) { + if (r != that) { + constexpr uint32_t kCount = sizeof(DescriptorT) / sizeof(LOAD_T); + using blob_type = LOAD_T[kCount]; + auto& src = reinterpret_cast(*that); + auto& dst = reinterpret_cast(*r); + for (uint32_t i = threadIdx.x; i < kCount; i += blockDim.x) { dst[i] = src[i]; } - auto codebook_buf = - reinterpret_cast(reinterpret_cast(smem_ptr) + sizeof(DescriptorT)); + auto codebook_buf = uint32_t(__cvta_generic_to_shared(r + 1)); + const auto smem_ptr_offset = + reinterpret_cast(&(r->args.smem_ws_ptr)) - reinterpret_cast(r); + if (threadIdx.x == uint32_t(smem_ptr_offset / sizeof(LOAD_T))) { + r->args.smem_ws_ptr = codebook_buf; + } + __syncthreads(); // Copy PQ table for (unsigned i = threadIdx.x * 2; i < (1 << PQ_BITS) * PQ_LEN; i += blockDim.x * 2) { half2 buf2; - buf2.x = that->pq_code_book_ptr[i]; - buf2.y = that->pq_code_book_ptr[i + 1]; + buf2.x = r->pq_code_book_ptr()[i]; + buf2.y = r->pq_code_book_ptr()[i + 1]; // Change the order of PQ code book array to reduce the // frequency of bank conflicts. 
@@ -146,11 +199,12 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const typename DescriptorT::b const auto j = i / num_elements_per_bank; const auto smem_index = (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS); - codebook_buf[smem_index] = buf2; + + device::sts(codebook_buf + smem_index * sizeof(half2), buf2); } } - uint32_t dim = that->dim; + uint32_t dim = r->args.dim; queries_ptr += dim * query_id; constexpr cuvs::spatial::knn::detail::utils::mapping mapping{}; @@ -170,13 +224,13 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const typename DescriptorT::b } } - return const_cast(reinterpret_cast(smem_ptr)); + return const_cast(r); } template _RAFT_DEVICE __noinline__ auto compute_distance_vpq( - const typename DescriptorT::base_type* desc_, const typename DescriptorT::INDEX_T dataset_index) - -> typename DescriptorT::DISTANCE_T + const typename DescriptorT::args_t args, const typename DescriptorT::INDEX_T dataset_index) -> + typename DescriptorT::DISTANCE_T { using DISTANCE_T = typename DescriptorT::DISTANCE_T; using LOAD_T = typename DescriptorT::LOAD_T; @@ -187,23 +241,18 @@ _RAFT_DEVICE __noinline__ auto compute_distance_vpq( constexpr auto PQ_BITS = DescriptorT::kPqBits; constexpr auto PQ_LEN = DescriptorT::kPqLen; - const auto* __restrict__ desc = reinterpret_cast(desc_); - const auto* __restrict__ pq_codebook_ptr = reinterpret_cast(desc + 1); - const auto* __restrict__ query_ptr = reinterpret_cast( - reinterpret_cast(pq_codebook_ptr) + DescriptorT::kSMemCodeBookSizeInBytes); + const uint32_t pq_codebook_ptr = args.smem_ws_ptr; + const uint32_t query_ptr = pq_codebook_ptr + DescriptorT::kSMemCodeBookSizeInBytes; const auto* __restrict__ node_ptr = - desc->encoded_dataset_ptr + - (static_cast(desc->encoded_dataset_dim) * dataset_index); - const auto* __restrict__ vq_codebook_ptr = desc->vq_code_book_ptr; - const auto dim = desc->dim; - const auto n_subspace = desc->n_subspace; - const unsigned lane_id = threadIdx.x 
% TeamSize; + DescriptorT::encoded_dataset_ptr(args) + + (static_cast(DescriptorT::encoded_dataset_dim(args)) * dataset_index); + const unsigned lane_id = threadIdx.x % TeamSize; // const uint32_t& vq_code = *reinterpret_cast(node_ptr); uint32_t vq_code; raft::ldg(vq_code, reinterpret_cast(node_ptr)); static_assert(PQ_BITS == 8, "Only pq_bits == 8 is supported at the moment."); DISTANCE_T norm = 0; - for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += DatasetBlockDim) { + for (uint32_t elem_offset = 0; elem_offset < args.dim; elem_offset += DatasetBlockDim) { constexpr unsigned vlen = 4; // **** DO NOT CHANGE **** constexpr unsigned nelem = raft::div_rounding_up_unsafe(DatasetBlockDim / PQ_LEN, TeamSize * vlen); @@ -212,7 +261,7 @@ _RAFT_DEVICE __noinline__ auto compute_distance_vpq( #pragma unroll for (std::uint32_t e = 0; e < nelem; e++) { const std::uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset / PQ_LEN; - if (k >= n_subspace) break; + if (k >= DescriptorT::n_subspace(args)) break; // Loading 4 x 8-bit PQ-codes using 32-bit load ops (from device memory) raft::ldg(pq_codes[e], reinterpret_cast(node_ptr + 4 + k)); } @@ -222,33 +271,35 @@ _RAFT_DEVICE __noinline__ auto compute_distance_vpq( #pragma unroll 1 for (std::uint32_t e = 0; e < nelem; e++) { const std::uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset / PQ_LEN; - if (k >= n_subspace) break; + if (k >= DescriptorT::n_subspace(args)) break; // Loading VQ code-book raft::TxN_t vq_vals[PQ_LEN]; #pragma unroll for (std::uint32_t m = 0; m < PQ_LEN; m += 1) { const uint32_t d = (vlen * m) + (PQ_LEN * k); - if (d >= dim) break; - vq_vals[m].load(reinterpret_cast(vq_codebook_ptr + d + (dim * vq_code)), 0); + if (d >= args.dim) break; + vq_vals[m].load(reinterpret_cast(DescriptorT::vq_code_book_ptr(args) + d + + (args.dim * vq_code)), + 0); } // Compute distance std::uint32_t pq_code = pq_codes[e]; #pragma unroll for (std::uint32_t v = 0; v < vlen; v++) { - if (PQ_LEN * (v + k) 
>= dim) break; + if (PQ_LEN * (v + k) >= args.dim) break; #pragma unroll for (std::uint32_t m = 0; m < PQ_LEN; m += 2) { const std::uint32_t d1 = m + (PQ_LEN * v); const std::uint32_t d = d1 + (PQ_LEN * k); half2 q2, c2; // Loading query vector from smem - raft::lds(reinterpret_cast(q2), - reinterpret_cast(query_ptr) + - device::swizzling(d / 2)); + device::lds(q2, + query_ptr + sizeof(uint32_t) * + device::swizzling(d / 2)); // Loading PQ code book from smem - raft::lds(reinterpret_cast(c2), - reinterpret_cast( - pq_codebook_ptr + (1 << PQ_BITS) * 2 * (m / 2) + (2 * (pq_code & 0xff)))); + device::lds(c2, + pq_codebook_ptr + sizeof(CODE_BOOK_T) * ((1 << PQ_BITS) * 2 * (m / 2) + + (2 * (pq_code & 0xff)))); // L2 distance auto dist = q2 - c2 - vq_vals[d1 / vlen].val.data[(d1 % vlen) / 2]; dist = dist * dist; @@ -262,32 +313,34 @@ _RAFT_DEVICE __noinline__ auto compute_distance_vpq( #pragma unroll for (std::uint32_t e = 0; e < nelem; e++) { const std::uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset / PQ_LEN; - if (k >= n_subspace) break; + if (k >= DescriptorT::n_subspace(args)) break; // Loading VQ code-book raft::TxN_t vq_vals[PQ_LEN]; #pragma unroll for (std::uint32_t m = 0; m < PQ_LEN; m++) { const std::uint32_t d = (vlen * m) + (PQ_LEN * k); - if (d >= dim) break; + if (d >= args.dim) break; // Loading 4 x 8/16-bit VQ-values using 32/64-bit load ops (from L2$ or device // memory) - vq_vals[m].load(reinterpret_cast(vq_codebook_ptr + d + (dim * vq_code)), 0); + vq_vals[m].load(reinterpret_cast(DescriptorT::vq_code_book_ptr(args) + d + + (args.dim * vq_code)), + 0); } // Compute distance std::uint32_t pq_code = pq_codes[e]; #pragma unroll for (std::uint32_t v = 0; v < vlen; v++) { - if (PQ_LEN * (v + k) >= dim) break; + if (PQ_LEN * (v + k) >= args.dim) break; raft::TxN_t pq_vals; - pq_vals.load(reinterpret_cast(pq_codebook_ptr + PQ_LEN * (pq_code & 0xff)), - 0); // (from L1$ or smem) + device::lds(*pq_vals.vectorized_data(), + pq_codebook_ptr + 
sizeof(CODE_BOOK_T) * PQ_LEN * (pq_code & 0xff)); #pragma unroll for (std::uint32_t m = 0; m < PQ_LEN; m++) { const std::uint32_t d1 = m + (PQ_LEN * v); const std::uint32_t d = d1 + (PQ_LEN * k); // if (d >= dataset_dim) break; DISTANCE_T diff; - raft::lds(diff, query_ptr + d); + device::lds(diff, query_ptr + sizeof(QUERY_T) * d); diff -= static_cast(pq_vals.data[m]); diff -= static_cast(vq_vals[d1 / vlen].val.data[d1 % vlen]); norm += diff * diff; @@ -328,15 +381,17 @@ __launch_bounds__(1, 1) __global__ DataT, IndexT, DistanceT>; - new (out) desc_type(&setup_workspace_vpq, - &compute_distance_vpq, - encoded_dataset_ptr, - encoded_dataset_dim, - n_subspace, - vq_code_book_ptr, - pq_code_book_ptr, - size, - dim); + using base_type = typename desc_type::base_type; + new (out) desc_type( + reinterpret_cast(&setup_workspace_vpq), + reinterpret_cast(&compute_distance_vpq), + encoded_dataset_ptr, + encoded_dataset_dim, + n_subspace, + vq_code_book_ptr, + pq_code_book_ptr, + size, + dim); } template { host_type result{dd_host, stream, DatasetBlockDim}; void* args[] = // NOLINT {&result.dev_ptr, - &dd_host.encoded_dataset_ptr, - &dd_host.encoded_dataset_dim, - &dd_host.n_subspace, - &dd_host.vq_code_book_ptr, - &dd_host.pq_code_book_ptr, + &descriptor_type::encoded_dataset_ptr(dd_host.args), + &descriptor_type::encoded_dataset_dim(dd_host.args), + &descriptor_type::n_subspace(dd_host.args), + &descriptor_type::vq_code_book_ptr(dd_host.args), + &dd_host.pq_code_book_ptr(), &dd_host.size, - &dd_host.dim}; + &dd_host.args.dim}; RAFT_CUDA_TRY(cudaLaunchKernel(init_kernel, 1, 1, args, 0, stream)); return result; } diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp index 73787ec90..34c00e414 100644 --- a/cpp/src/neighbors/detail/cagra/device_common.hpp +++ b/cpp/src/neighbors/detail/cagra/device_common.hpp @@ -77,9 +77,9 @@ template (num_pickup, warp_size / team_size); for (uint32_t i = threadIdx.x / team_size; i < 
max_i; i += blockDim.x / team_size) { const bool valid_i = (i < num_pickup); @@ -138,7 +135,7 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes( IndexT* __restrict__ result_child_indices_ptr, DistanceT* __restrict__ result_child_distances_ptr, // [dataset_dim, dataset_size] - const DATASET_DESCRIPTOR_T& __restrict__ dataset_desc, + const DATASET_DESCRIPTOR_T& dataset_desc, // [knn_k, dataset_size] const IndexT* __restrict__ knn_graph, const uint32_t knn_k, @@ -171,11 +168,8 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes( __syncthreads(); // Compute the distance to child nodes - uint32_t max_i = knn_k * search_width; const auto team_size = dataset_desc.team_size; - if (max_i % (warp_size / team_size)) { - max_i += (warp_size / team_size) - (max_i % (warp_size / team_size)); - } + const auto max_i = raft::round_up_safe(knn_k * search_width, warp_size / team_size); for (uint32_t tid = threadIdx.x; tid < max_i * team_size; tid += blockDim.x) { const auto i = tid / team_size; const bool valid_i = (i < (knn_k * search_width)); @@ -219,5 +213,39 @@ RAFT_DEVICE_INLINE_FUNCTION auto team_sum(T x, uint32_t team_size) -> T } } +RAFT_DEVICE_INLINE_FUNCTION void lds(float& x, uint32_t addr) +{ + asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x) : "r"(addr)); +} +RAFT_DEVICE_INLINE_FUNCTION void lds(half& x, uint32_t addr) +{ + asm volatile("ld.shared.u16 {%0}, [%1];" : "=h"(reinterpret_cast(x)) : "r"(addr)); +} +RAFT_DEVICE_INLINE_FUNCTION void lds(half2& x, uint32_t addr) +{ + asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(reinterpret_cast(x)) : "r"(addr)); +} + +RAFT_DEVICE_INLINE_FUNCTION void lds(uint4& x, uint32_t addr) +{ + asm volatile("ld.shared.v4.u32 {%0, %1, %2, %3}, [%4];" + : "=r"(x.x), "=r"(x.y), "=r"(x.z), "=r"(x.w) + : "r"(addr)); +} + +RAFT_DEVICE_INLINE_FUNCTION void lds(uint4& x, const uint4* addr) +{ + lds(x, uint32_t(__cvta_generic_to_shared(addr))); +} + +RAFT_DEVICE_INLINE_FUNCTION void sts(uint32_t addr, const 
half2& x) +{ + asm volatile("st.shared.v2.u16 [%0], {%1, %2};" + : + : "r"(addr), + "h"(reinterpret_cast(x.x)), + "h"(reinterpret_cast(x.y))); +} + } // namespace device } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh index 9b6a6e299..17da14b1c 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh @@ -133,7 +133,7 @@ __device__ inline void topk_by_bitonic_sort(float* distances, // [num_elements] // multiple CTAs per single query // template -__launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( +RAFT_KERNEL search_kernel( typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr, // [num_queries, num_cta_per_query, itopk_size] typename DATASET_DESCRIPTOR_T::DISTANCE_T* const @@ -195,7 +195,6 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( // Set smem working buffer for the distance calculation dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id); - __syncthreads(); auto result_indices_buffer = reinterpret_cast(smem + dataset_desc->smem_ws_size_in_bytes); diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index af99ca7ca..20ade6de9 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -463,7 +463,7 @@ template -__launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( +RAFT_KERNEL search_kernel( typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr, // [num_queries, top_k] typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr, // [num_queries, top_k] const std::uint32_t top_k, @@ -527,7 +527,6 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( // Set smem working buffer for the distance 
calculation dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id); - __syncthreads(); auto result_indices_buffer = reinterpret_cast(smem + dataset_desc->smem_ws_size_in_bytes); From 6630a9934a9c4568cc0f87f0a13872e57687be5d Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 2 Sep 2024 16:10:05 +0200 Subject: [PATCH 21/41] Further reduce the size of the dataset descriptor and add explicit loading from shmem for more of its members --- .../detail/cagra/compute_distance.hpp | 64 ++++++++++++--- .../cagra/compute_distance_standard.cuh | 4 +- .../detail/cagra/compute_distance_vpq.cuh | 3 +- .../neighbors/detail/cagra/device_common.hpp | 81 +++++++++++-------- .../cagra/search_multi_cta_kernel-inl.cuh | 2 +- .../detail/cagra/search_multi_kernel.cuh | 15 ++-- .../cagra/search_single_cta_kernel-inl.cuh | 2 +- 7 files changed, 115 insertions(+), 56 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index f174a40a2..81f002c61 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -28,6 +28,7 @@ // TODO: This shouldn't be invoking spatial/knn #include "../ann_utils.cuh" +#include #include #include @@ -67,6 +68,30 @@ struct alignas(device::LOAD_128BIT_T) dataset_descriptor_base_t { } }; + struct smem_and_team_size_t { + uint32_t value; + RAFT_INLINE_FUNCTION constexpr smem_and_team_size_t(uint32_t smem_size_bytes, + uint32_t team_size_bitshift) + : value{(team_size_bitshift << 24) | smem_size_bytes} + { + } + /** Total dynamic shared memory required by the descriptor. */ + RAFT_INLINE_FUNCTION constexpr auto smem_ws_size_in_bytes() const noexcept -> uint32_t + { + return value & 0xffffffu; + } + RAFT_INLINE_FUNCTION constexpr auto team_size_bitshift() const noexcept -> uint32_t + { + return (value >> 24) & 0xffu; + } + /** How many threads are involved in computing a single distance.
*/ + RAFT_INLINE_FUNCTION constexpr auto team_size() const noexcept -> uint32_t + { + return 1u << team_size_bitshift(); + } + }; + static_assert(sizeof(smem_and_team_size_t) == sizeof(uint32_t)); + using setup_workspace_type = const base_type*(const base_type*, void*, const DATA_T*, uint32_t); using compute_distance_type = DISTANCE_T(const args_t, const INDEX_T); @@ -79,10 +104,7 @@ struct alignas(device::LOAD_128BIT_T) dataset_descriptor_base_t { * given by the dataset_index. */ compute_distance_type* compute_distance_impl; void* extra_ptr3; - /** How many threads are involved in computing a single distance. */ - uint32_t team_size; - /** Total dynamic shared memory required by the descriptor. */ - uint32_t smem_ws_size_in_bytes; + smem_and_team_size_t smem_and_team_size; /** Number of records in the database. */ INDEX_T size; @@ -91,17 +113,39 @@ struct alignas(device::LOAD_128BIT_T) dataset_descriptor_base_t { compute_distance_type* compute_distance_impl, INDEX_T size, uint32_t dim, - uint32_t team_size, + uint32_t team_size_bitshift, uint32_t smem_ws_size_in_bytes) : setup_workspace_impl(setup_workspace_impl), compute_distance_impl(compute_distance_impl), size(size), - team_size(team_size), - smem_ws_size_in_bytes(smem_ws_size_in_bytes), + smem_and_team_size(smem_ws_size_in_bytes, team_size_bitshift), args{nullptr, nullptr, 0, dim, 0, 0} { } + /** Total dynamic shared memory required by the descriptor. 
*/ + RAFT_INLINE_FUNCTION constexpr auto smem_ws_size_in_bytes() const noexcept -> uint32_t + { + return smem_and_team_size.smem_ws_size_in_bytes(); + } + RAFT_INLINE_FUNCTION constexpr auto team_size_bitshift() const noexcept -> uint32_t + { + return smem_and_team_size.team_size_bitshift(); + } + RAFT_DEVICE_INLINE_FUNCTION constexpr auto team_size_bitshift_from_smem() const noexcept + -> uint32_t + { + uint32_t sts; + raft::lds(sts, reinterpret_cast(&smem_and_team_size)); + return reinterpret_cast(sts).team_size_bitshift(); + } + + /** How many threads are involved in computing a single distance. */ + RAFT_INLINE_FUNCTION constexpr auto team_size() const noexcept -> uint32_t + { + return smem_and_team_size.team_size(); + } + RAFT_DEVICE_INLINE_FUNCTION auto setup_workspace(void* smem_ptr, const DATA_T* queries_ptr, uint32_t query_id) const -> const base_type* @@ -113,7 +157,7 @@ struct alignas(device::LOAD_128BIT_T) dataset_descriptor_base_t { -> DISTANCE_T { auto per_thread_distances = valid ? 
compute_distance_impl(args.load(), dataset_index) : 0; - return device::team_sum(per_thread_distances, this->team_size); + return device::team_sum(per_thread_distances, team_size_bitshift_from_smem()); } }; @@ -130,8 +174,8 @@ struct dataset_descriptor_host { rmm::cuda_stream_view stream, uint32_t dataset_block_dim) : stream_{stream}, - smem_ws_size_in_bytes{dd_host.smem_ws_size_in_bytes}, - team_size{dd_host.team_size}, + smem_ws_size_in_bytes{dd_host.smem_ws_size_in_bytes()}, + team_size{dd_host.team_size()}, dataset_block_dim{dataset_block_dim} { RAFT_CUDA_TRY(cudaMallocAsync(&dev_ptr, sizeof(DescriptorImpl), stream_)); diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh index 46230c624..2a0439bea 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh @@ -22,7 +22,7 @@ #include #include #include - +#include #include #include @@ -113,7 +113,7 @@ struct standard_dataset_descriptor_t : public dataset_descriptor_base_t::Log2, get_smem_ws_size_in_bytes(dim)) { standard_dataset_descriptor_t::ptr(args) = ptr; diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh index 055aeeabf..69c570417 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh @@ -21,6 +21,7 @@ #include #include #include +#include namespace cuvs::neighbors::cagra::detail { @@ -124,7 +125,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t::Log2, get_smem_ws_size_in_bytes(dim)) { cagra_q_dataset_descriptor_t::encoded_dataset_ptr(args) = encoded_dataset_ptr; diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp index 34c00e414..09fce8cbf 100644 --- a/cpp/src/neighbors/detail/cagra/device_common.hpp +++ 
b/cpp/src/neighbors/detail/cagra/device_common.hpp @@ -71,6 +71,29 @@ RAFT_DEVICE_INLINE_FUNCTION constexpr T swizzling(T x) } } +template +RAFT_DEVICE_INLINE_FUNCTION auto team_sum(T x) -> T +{ +#pragma unroll + for (uint32_t stride = TeamSize >> 1; stride > 0; stride >>= 1) { + x += raft::shfl_xor(x, stride, TeamSize); + } + return x; +} + +template +RAFT_DEVICE_INLINE_FUNCTION auto team_sum(T x, uint32_t team_size_bitshift) -> T +{ + switch (team_size_bitshift) { + case 5: x += raft::shfl_xor(x, 16); + case 4: x += raft::shfl_xor(x, 8); + case 3: x += raft::shfl_xor(x, 4); + case 2: x += raft::shfl_xor(x, 2); + case 1: x += raft::shfl_xor(x, 1); + default: return x; + } +} + template @@ -88,10 +111,12 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes( const uint32_t block_id = 0, const uint32_t num_blocks = 1) { - const auto team_size = dataset_desc.team_size; - const auto max_i = raft::round_up_safe(num_pickup, warp_size / team_size); + const auto team_size_bits = dataset_desc.team_size_bitshift_from_smem(); + const auto max_i = raft::round_up_safe(num_pickup, warp_size >> team_size_bits); + const auto compute_distance = dataset_desc.compute_distance_impl; + const auto args = dataset_desc.args.load(); - for (uint32_t i = threadIdx.x / team_size; i < max_i; i += blockDim.x / team_size) { + for (uint32_t i = threadIdx.x >> team_size_bits; i < max_i; i += (blockDim.x >> team_size_bits)) { const bool valid_i = (i < num_pickup); IndexT best_index_team_local; @@ -109,7 +134,11 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes( } } - auto norm2 = dataset_desc.compute_distance(seed_index, valid_i); + // This is the `dataset_desc.compute_distance` manually inlined to move the fetching of + // dataset_desc from smem out of the loop. + // const auto norm2 = dataset_desc.compute_distance(seed_index, valid_i); + const auto norm2 = + device::team_sum(valid_i ? 
compute_distance(args, seed_index) : 0, team_size_bits); if (valid_i && (norm2 < best_norm2_team_local)) { best_norm2_team_local = norm2; @@ -117,7 +146,7 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes( } } - const unsigned lane_id = threadIdx.x % team_size; + const unsigned lane_id = threadIdx.x & ((1u << team_size_bits) - 1u); if (valid_i && lane_id == 0) { if (hashmap::insert(visited_hash_ptr, hash_bitlen, best_index_team_local)) { result_distances_ptr[i] = best_norm2_team_local; @@ -168,18 +197,25 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes( __syncthreads(); // Compute the distance to child nodes - const auto team_size = dataset_desc.team_size; - const auto max_i = raft::round_up_safe(knn_k * search_width, warp_size / team_size); - for (uint32_t tid = threadIdx.x; tid < max_i * team_size; tid += blockDim.x) { - const auto i = tid / team_size; + const auto team_size_bits = dataset_desc.team_size_bitshift_from_smem(); + const auto max_i = raft::round_up_safe(knn_k * search_width, warp_size >> team_size_bits) + << team_size_bits; + const auto compute_distance = dataset_desc.compute_distance_impl; + const auto args = dataset_desc.args.load(); + for (uint32_t tid = threadIdx.x; tid < max_i; tid += blockDim.x) { + const auto i = tid >> team_size_bits; const bool valid_i = (i < (knn_k * search_width)); IndexT child_id = invalid_index; if (valid_i) { child_id = result_child_indices_ptr[i]; } - auto norm2 = dataset_desc.compute_distance(child_id, child_id != invalid_index); + // This is the `dataset_desc.compute_distance` manually inlined to move the fetching of + // dataset_desc from smem out of the loop. + // const auto norm2 = dataset_desc.compute_distance(child_id, child_id != invalid_index); + const auto norm2 = device::team_sum( + (child_id != invalid_index) ? 
compute_distance(args, child_id) : 0, team_size_bits); // Store the distance - const unsigned lane_id = threadIdx.x % team_size; + const unsigned lane_id = threadIdx.x & ((1u << team_size_bits) - 1u); if (valid_i && lane_id == 0) { if (child_id != invalid_index) { result_child_distances_ptr[i] = norm2; @@ -190,29 +226,6 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes( } } -template -RAFT_DEVICE_INLINE_FUNCTION auto team_sum(T x) -> T -{ -#pragma unroll - for (uint32_t stride = TeamSize >> 1; stride > 0; stride >>= 1) { - x += raft::shfl_xor(x, stride, TeamSize); - } - return x; -} - -template -RAFT_DEVICE_INLINE_FUNCTION auto team_sum(T x, uint32_t team_size) -> T -{ - switch (team_size) { - case 32: x += raft::shfl_xor(x, 16); - case 16: x += raft::shfl_xor(x, 8); - case 8: x += raft::shfl_xor(x, 4); - case 4: x += raft::shfl_xor(x, 2); - case 2: x += raft::shfl_xor(x, 1); - default: return x; - } -} - RAFT_DEVICE_INLINE_FUNCTION void lds(float& x, uint32_t addr) { asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x) : "r"(addr)); diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh index 17da14b1c..b04ef4dc2 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh @@ -197,7 +197,7 @@ RAFT_KERNEL search_kernel( dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id); auto result_indices_buffer = - reinterpret_cast(smem + dataset_desc->smem_ws_size_in_bytes); + reinterpret_cast(smem + dataset_desc->smem_ws_size_in_bytes()); auto result_distances_buffer = reinterpret_cast(result_indices_buffer + result_buffer_size_32); auto parent_indices_buffer = diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh index 59c3e2ee3..48299b683 100644 --- 
a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh @@ -111,9 +111,9 @@ RAFT_KERNEL random_pickup_kernel( using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; - const auto team_size = dataset_desc->team_size; + const auto team_size_bits = dataset_desc->team_size_bitshift(); const auto ldb = hashmap::get_size(hash_bitlen); - const auto global_team_index = (blockIdx.x * blockDim.x + threadIdx.x) / team_size; + const auto global_team_index = (blockIdx.x * blockDim.x + threadIdx.x) >> team_size_bits; const uint32_t query_id = blockIdx.y; if (global_team_index >= num_pickup) { return; } extern __shared__ uint8_t smem[]; @@ -140,7 +140,7 @@ RAFT_KERNEL random_pickup_kernel( } const auto store_gmem_index = global_team_index + (ldr * query_id); - if (threadIdx.x % team_size == 0) { + if ((threadIdx.x & ((1u << team_size_bits) - 1u)) == 0) { if (hashmap::insert( visited_hashmap_ptr + (ldb * query_id), hash_bitlen, best_index_team_local)) { result_distances_ptr[store_gmem_index] = best_norm2_team_local; @@ -316,10 +316,11 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel( using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; - const auto team_size = dataset_desc->team_size; + const auto team_size_bits = dataset_desc->team_size_bitshift(); + const auto team_size = 1u << team_size_bits; const uint32_t ldb = hashmap::get_size(hash_bitlen); const auto tid = threadIdx.x + blockDim.x * blockIdx.x; - const auto global_team_id = tid / team_size; + const auto global_team_id = tid >> team_size_bits; const auto query_id = blockIdx.y; extern __shared__ uint8_t smem[]; @@ -353,12 +354,12 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel( DISTANCE_T norm2 = dataset_desc->compute_distance(child_id, compute_distance_flag); if (compute_distance_flag) { - if (threadIdx.x % team_size == 0) { + 
if ((threadIdx.x & (team_size - 1)) == 0) { result_indices_ptr[ldd * blockIdx.y + global_team_id] = child_id; result_distances_ptr[ldd * blockIdx.y + global_team_id] = norm2; } } else { - if (threadIdx.x % team_size == 0) { + if ((threadIdx.x & (team_size - 1)) == 0) { result_distances_ptr[ldd * blockIdx.y + global_team_id] = utils::get_max_value(); } } diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 20ade6de9..cdb2578b8 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -529,7 +529,7 @@ RAFT_KERNEL search_kernel( dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id); auto result_indices_buffer = - reinterpret_cast(smem + dataset_desc->smem_ws_size_in_bytes); + reinterpret_cast(smem + dataset_desc->smem_ws_size_in_bytes()); auto result_distances_buffer = reinterpret_cast(result_indices_buffer + result_buffer_size_32); auto visited_hash_buffer = From 790e79ca5e7561b05e22029009158b3cf191f5bc Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 2 Sep 2024 17:11:02 +0200 Subject: [PATCH 22/41] Cache dataset descriptors to recover small batch performance --- .../neighbors/detail/cagra/cagra_search.cuh | 9 +- cpp/src/neighbors/detail/cagra/factory.cuh | 86 +++++++++++++++++++ 2 files changed, 90 insertions(+), 5 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index ed2122a50..6dc601f32 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh @@ -16,7 +16,6 @@ #pragma once -#include "compute_distance-ext.cuh" #include "factory.cuh" #include "search_plan.cuh" #include "search_single_cta_inst.cuh" @@ -193,8 +192,8 @@ void search_main(raft::resources const& res, if (auto* strided_dset = dynamic_cast*>(&index.data()); strided_dset 
!= nullptr) { // Search using a plain (strided) row-major dataset - auto desc = dataset_descriptor_init( - params, *strided_dset, index.metric(), stream); + auto& desc = dataset_descriptor_init_with_cache( + res, params, *strided_dset, index.metric()); search_main_core( res, params, desc, graph_internal, queries, neighbors, distances, sample_filter); } else if (auto* vpq_dset = dynamic_cast*>(&index.data()); @@ -203,8 +202,8 @@ void search_main(raft::resources const& res, RAFT_FAIL("FP32 VPQ dataset support is coming soon"); } else if (auto* vpq_dset = dynamic_cast*>(&index.data()); vpq_dset != nullptr) { - auto desc = dataset_descriptor_init( - params, *vpq_dset, index.metric(), stream); + auto& desc = dataset_descriptor_init_with_cache( + res, params, *vpq_dset, index.metric()); search_main_core( res, params, desc, graph_internal, queries, neighbors, distances, sample_filter); } else if (auto* empty_dset = dynamic_cast*>(&index.data()); diff --git a/cpp/src/neighbors/detail/cagra/factory.cuh b/cpp/src/neighbors/detail/cagra/factory.cuh index 65a744a8f..05118472b 100644 --- a/cpp/src/neighbors/detail/cagra/factory.cuh +++ b/cpp/src/neighbors/detail/cagra/factory.cuh @@ -16,6 +16,7 @@ #pragma once +#include "compute_distance-ext.cuh" #include "search_multi_cta.cuh" #include "search_multi_kernel.cuh" #include "search_plan.cuh" @@ -67,4 +68,89 @@ class factory { } } }; + +struct dataset_descriptor_key { + uint64_t data_ptr; + uint64_t n_rows; + uint32_t dim; + uint32_t extra_val; + uint32_t team_size; + uint32_t metric; +}; + +template +auto make_key(const cagra::search_params& params, + const DatasetT& dataset, + cuvs::distance::DistanceType metric) + -> std::enable_if_t, dataset_descriptor_key> +{ + return dataset_descriptor_key{reinterpret_cast(dataset.view().data_handle()), + uint64_t(dataset.n_rows()), + dataset.dim(), + dataset.stride(), + uint32_t(params.team_size), + uint32_t(metric)}; +} + +template +auto make_key(const cagra::search_params& params, + const 
DatasetT& dataset, + cuvs::distance::DistanceType metric) + -> std::enable_if_t, dataset_descriptor_key> +{ + return dataset_descriptor_key{ + reinterpret_cast(dataset.data.data_handle()), + uint64_t(dataset.n_rows()), + dataset.dim(), + uint32_t(reinterpret_cast(dataset.pq_code_book.data_handle()) >> 6), + uint32_t(params.team_size), + uint32_t(metric)}; +} + +inline auto operator==(const dataset_descriptor_key& a, const dataset_descriptor_key& b) -> bool +{ + return a.data_ptr == b.data_ptr && a.n_rows == b.n_rows && a.dim == b.dim && + a.extra_val == b.extra_val && a.team_size == b.team_size && a.metric == b.metric; +} + +struct dataset_descriptor_key_hash { + inline auto operator()(const dataset_descriptor_key& x) const noexcept -> std::size_t + { + return size_t{x.data_ptr} + size_t{x.n_rows} * size_t{x.dim} * size_t{x.extra_val} + + (size_t{x.team_size} ^ size_t{x.metric}); + } +}; + +template +struct dataset_descriptor_cache { + /** Number of descriptors to cache. */ + static constexpr size_t kDefaultSize = 100; + raft::cache::lru, + std::shared_ptr>> + value{kDefaultSize}; +}; + +template +auto dataset_descriptor_init_with_cache(const raft::resources& res, + const cagra::search_params& params, + const DatasetT& dataset, + cuvs::distance::DistanceType metric) + -> const dataset_descriptor_host& +{ + using desc_t = dataset_descriptor_host; + auto key = make_key(params, dataset, metric); + auto& cache = + raft::resource::get_custom_resource>(res) + ->value; + std::shared_ptr desc{nullptr}; + if (!cache.get(key, &desc)) { + desc = std::make_shared(std::move(dataset_descriptor_init( + params, dataset, metric, raft::resource::get_cuda_stream(res)))); + cache.set(key, desc); + } + return *desc; +} + }; // namespace cuvs::neighbors::cagra::detail From 7599331730b81500902b1cc634955d8836e328ec Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 3 Sep 2024 11:35:19 +0200 Subject: [PATCH 23/41] Reduce the register usage in compute_distance_standard further --- 
.../cagra/compute_distance_standard.cuh | 40 +++++++++++-------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh index 2a0439bea..c1d38ead1 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh @@ -175,9 +175,10 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_standard( } template -_RAFT_DEVICE __noinline__ auto compute_distance_standard( - const typename DescriptorT::args_t args, const typename DescriptorT::INDEX_T dataset_index) -> - typename DescriptorT::DISTANCE_T +RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_standard_worker( + const typename DescriptorT::DATA_T* __restrict__ dataset_ptr, + uint32_t dim, + uint32_t query_smem_ptr) -> typename DescriptorT::DISTANCE_T { using DATA_T = typename DescriptorT::DATA_T; using DISTANCE_T = typename DescriptorT::DISTANCE_T; @@ -185,27 +186,23 @@ _RAFT_DEVICE __noinline__ auto compute_distance_standard( using QUERY_T = typename DescriptorT::QUERY_T; constexpr auto kTeamSize = DescriptorT::kTeamSize; constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim; - - // const auto* __restrict__ query_ptr = reinterpret_cast(args.smem_ws_ptr); - const auto* __restrict__ dataset_ptr = - DescriptorT::ptr(args) + (static_cast(DescriptorT::ld(args)) * dataset_index); - const auto lane_id = threadIdx.x % kTeamSize; + constexpr auto vlen = device::get_vlen(); + constexpr auto reg_nelem = raft::ceildiv(kDatasetBlockDim, kTeamSize * vlen); DISTANCE_T r = 0; - for (uint32_t elem_offset = 0; elem_offset < args.dim; elem_offset += kDatasetBlockDim) { - constexpr unsigned vlen = device::get_vlen(); - constexpr unsigned reg_nelem = raft::ceildiv(kDatasetBlockDim, kTeamSize * vlen); + for (uint32_t elem_offset = (threadIdx.x % kTeamSize) * vlen; elem_offset < dim; + elem_offset += kDatasetBlockDim) { 
raft::TxN_t dl_buff[reg_nelem]; #pragma unroll for (uint32_t e = 0; e < reg_nelem; e++) { - const uint32_t k = (lane_id + (kTeamSize * e)) * vlen + elem_offset; - if (k >= args.dim) break; + const uint32_t k = e * (kTeamSize * vlen) + elem_offset; + if (k >= dim) break; dl_buff[e].load(dataset_ptr, k); } #pragma unroll for (uint32_t e = 0; e < reg_nelem; e++) { - const uint32_t k = (lane_id + (kTeamSize * e)) * vlen + elem_offset; - if (k >= args.dim) break; + const uint32_t k = e * (kTeamSize * vlen) + elem_offset; + if (k >= dim) break; #pragma unroll for (uint32_t v = 0; v < vlen; v++) { // Note this loop can go above the dataset_dim for padded arrays. This is not a problem @@ -213,7 +210,7 @@ _RAFT_DEVICE __noinline__ auto compute_distance_standard( // - Above the last element (dataset_dim-1), the query array is filled with zeros. // - The data buffer has to be also padded with zeros. DISTANCE_T d; - device::lds(d, args.smem_ws_ptr + sizeof(QUERY_T) * device::swizzling(k + v)); + device::lds(d, query_smem_ptr + sizeof(QUERY_T) * device::swizzling(k + v)); r += dist_op( d, cuvs::spatial::knn::detail::utils::mapping{}(dl_buff[e].val.data[v])); } @@ -222,6 +219,17 @@ _RAFT_DEVICE __noinline__ auto compute_distance_standard( return r; } +template +_RAFT_DEVICE __noinline__ auto compute_distance_standard( + const typename DescriptorT::args_t args, const typename DescriptorT::INDEX_T dataset_index) -> + typename DescriptorT::DISTANCE_T +{ + return compute_distance_standard_worker( + DescriptorT::ptr(args) + (static_cast(DescriptorT::ld(args)) * dataset_index), + args.dim, + args.smem_ws_ptr); +} + template Date: Tue, 3 Sep 2024 14:39:46 +0200 Subject: [PATCH 24/41] Reduce the generated code volume --- .../detail/cagra/compute_distance-ext.cuh | 1894 ++--------------- .../detail/cagra/compute_distance.cu | 604 +----- .../cagra/compute_distance_00_generate.py | 30 +- ...cuh => compute_distance_standard-impl.cuh} | 75 +- .../cagra/compute_distance_standard.hpp | 80 + 
...ard_InnerProduct_float_uint32_dim128_t8.cu | 25 +- ...rd_InnerProduct_float_uint32_dim256_t16.cu | 25 +- ...rd_InnerProduct_float_uint32_dim512_t32.cu | 25 +- ...ard_InnerProduct_float_uint64_dim128_t8.cu | 25 +- ...rd_InnerProduct_float_uint64_dim256_t16.cu | 25 +- ...rd_InnerProduct_float_uint64_dim512_t32.cu | 25 +- ...dard_InnerProduct_half_uint32_dim128_t8.cu | 30 +- ...ard_InnerProduct_half_uint32_dim256_t16.cu | 25 +- ...ard_InnerProduct_half_uint32_dim512_t32.cu | 25 +- ...dard_InnerProduct_half_uint64_dim128_t8.cu | 30 +- ...ard_InnerProduct_half_uint64_dim256_t16.cu | 25 +- ...ard_InnerProduct_half_uint64_dim512_t32.cu | 25 +- ...dard_InnerProduct_int8_uint32_dim128_t8.cu | 25 +- ...ard_InnerProduct_int8_uint32_dim256_t16.cu | 25 +- ...ard_InnerProduct_int8_uint32_dim512_t32.cu | 25 +- ...ard_InnerProduct_uint8_uint32_dim128_t8.cu | 25 +- ...rd_InnerProduct_uint8_uint32_dim256_t16.cu | 25 +- ...rd_InnerProduct_uint8_uint32_dim512_t32.cu | 25 +- ...ndard_L2Expanded_float_uint32_dim128_t8.cu | 30 +- ...dard_L2Expanded_float_uint32_dim256_t16.cu | 30 +- ...dard_L2Expanded_float_uint32_dim512_t32.cu | 30 +- ...ndard_L2Expanded_float_uint64_dim128_t8.cu | 30 +- ...dard_L2Expanded_float_uint64_dim256_t16.cu | 30 +- ...dard_L2Expanded_float_uint64_dim512_t32.cu | 30 +- ...andard_L2Expanded_half_uint32_dim128_t8.cu | 30 +- ...ndard_L2Expanded_half_uint32_dim256_t16.cu | 30 +- ...ndard_L2Expanded_half_uint32_dim512_t32.cu | 30 +- ...andard_L2Expanded_half_uint64_dim128_t8.cu | 30 +- ...ndard_L2Expanded_half_uint64_dim256_t16.cu | 30 +- ...ndard_L2Expanded_half_uint64_dim512_t32.cu | 30 +- ...andard_L2Expanded_int8_uint32_dim128_t8.cu | 30 +- ...ndard_L2Expanded_int8_uint32_dim256_t16.cu | 25 +- ...ndard_L2Expanded_int8_uint32_dim512_t32.cu | 25 +- ...ndard_L2Expanded_uint8_uint32_dim128_t8.cu | 25 +- ...dard_L2Expanded_uint8_uint32_dim256_t16.cu | 25 +- ...dard_L2Expanded_uint8_uint32_dim512_t32.cu | 25 +- ..._vpq.cuh => compute_distance_vpq-impl.cuh} | 157 +- 
.../detail/cagra/compute_distance_vpq.hpp | 102 + ...d_float_uint32_dim128_t8_8pq_2subd_half.cu | 34 +- ...d_float_uint32_dim128_t8_8pq_4subd_half.cu | 34 +- ..._float_uint32_dim256_t16_8pq_2subd_half.cu | 34 +- ..._float_uint32_dim256_t16_8pq_4subd_half.cu | 34 +- ..._float_uint32_dim512_t32_8pq_2subd_half.cu | 34 +- ..._float_uint32_dim512_t32_8pq_4subd_half.cu | 34 +- ...d_float_uint64_dim128_t8_8pq_2subd_half.cu | 34 +- ...d_float_uint64_dim128_t8_8pq_4subd_half.cu | 34 +- ..._float_uint64_dim256_t16_8pq_2subd_half.cu | 34 +- ..._float_uint64_dim256_t16_8pq_4subd_half.cu | 34 +- ..._float_uint64_dim512_t32_8pq_2subd_half.cu | 34 +- ..._float_uint64_dim512_t32_8pq_4subd_half.cu | 34 +- ...ed_half_uint32_dim128_t8_8pq_2subd_half.cu | 34 +- ...ed_half_uint32_dim128_t8_8pq_4subd_half.cu | 34 +- ...d_half_uint32_dim256_t16_8pq_2subd_half.cu | 34 +- ...d_half_uint32_dim256_t16_8pq_4subd_half.cu | 34 +- ...d_half_uint32_dim512_t32_8pq_2subd_half.cu | 34 +- ...d_half_uint32_dim512_t32_8pq_4subd_half.cu | 34 +- ...ed_half_uint64_dim128_t8_8pq_2subd_half.cu | 34 +- ...ed_half_uint64_dim128_t8_8pq_4subd_half.cu | 34 +- ...d_half_uint64_dim256_t16_8pq_2subd_half.cu | 34 +- ...d_half_uint64_dim256_t16_8pq_4subd_half.cu | 34 +- ...d_half_uint64_dim512_t32_8pq_2subd_half.cu | 34 +- ...d_half_uint64_dim512_t32_8pq_4subd_half.cu | 34 +- ...ed_int8_uint32_dim128_t8_8pq_2subd_half.cu | 34 +- ...ed_int8_uint32_dim128_t8_8pq_4subd_half.cu | 34 +- ...d_int8_uint32_dim256_t16_8pq_2subd_half.cu | 34 +- ...d_int8_uint32_dim256_t16_8pq_4subd_half.cu | 34 +- ...d_int8_uint32_dim512_t32_8pq_2subd_half.cu | 34 +- ...d_int8_uint32_dim512_t32_8pq_4subd_half.cu | 34 +- ...d_uint8_uint32_dim128_t8_8pq_2subd_half.cu | 34 +- ...d_uint8_uint32_dim128_t8_8pq_4subd_half.cu | 34 +- ..._uint8_uint32_dim256_t16_8pq_2subd_half.cu | 34 +- ..._uint8_uint32_dim256_t16_8pq_4subd_half.cu | 34 +- ..._uint8_uint32_dim512_t32_8pq_2subd_half.cu | 34 +- ..._uint8_uint32_dim512_t32_8pq_4subd_half.cu | 34 +- 79 
files changed, 794 insertions(+), 4347 deletions(-) rename cpp/src/neighbors/detail/cagra/{compute_distance_standard.cuh => compute_distance_standard-impl.cuh} (83%) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard.hpp rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq.cuh => compute_distance_vpq-impl.cuh} (81%) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp diff --git a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh index 1dbc843d0..86de55db6 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh @@ -25,564 +25,26 @@ #pragma once -#include "compute_distance_standard.cuh" -#include "compute_distance_vpq.cuh" +#include "compute_distance_standard.hpp" +#include "compute_distance_vpq.hpp" namespace cuvs::neighbors::cagra::detail { -extern template struct standard_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template 
struct cagra_q_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; 
-extern template struct standard_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct standard_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct cagra_q_dataset_descriptor_t; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct 
vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct 
standard_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct vpq_descriptor_spec; extern template struct instance_selector< - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - 
standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec>; + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + 
vpq_descriptor_spec, + vpq_descriptor_spec>; using descriptor_instances = instance_selector< - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec>; + standard_descriptor_spec, + standard_descriptor_spec, + 
vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec>; template auto dataset_descriptor_init(const cagra::search_params& params, diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.cu b/cpp/src/neighbors/detail/cagra/compute_distance.cu index 5d480f57a..387b4c71b 
100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance.cu @@ -27,536 +27,80 @@ namespace cuvs::neighbors::cagra::detail { +using namespace cuvs::distance; + template struct instance_selector< - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - 
standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec>; + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec>; } // namespace cuvs::neighbors::cagra::detail diff --git 
a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py index 1f2b24e10..1b0743901 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py +++ b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py @@ -43,6 +43,7 @@ namespace cuvs::neighbors::cagra::detail {{ +using namespace cuvs::distance; {content} }} // namespace cuvs::neighbors::cagra::detail @@ -69,7 +70,7 @@ half_uint64=("half", "uint64_t", "float"), ) -metric_prefix = 'cuvs::distance::DistanceType::' +metric_prefix = 'DistanceType::' specs = [] descs = [] @@ -90,17 +91,10 @@ # CAGRA for metric in ['L2Expanded', 'InnerProduct']: path = f"compute_distance_standard_{metric}_{type_path}_dim{mxdim}_t{team}.cu" - includes = '#include "compute_distance_standard.cuh"' + includes = '#include "compute_distance_standard-impl.cuh"' params = f"{metric_prefix}{metric}, {team}, {mxdim}, {data_t}, {idx_t}, {distance_t}" spec = f"standard_descriptor_spec<{params}>" - desc = f"standard_dataset_descriptor_t<{params}>" - content = f""" -template struct {desc}; -template <> -const void* {spec}::init_kernel = reinterpret_cast(&standard_dataset_descriptor_init_kernel<{params}>); -template struct {spec}; -""" - descs.append(desc) + content = f"""template struct {spec};""" specs.append(spec) with open(path, "w") as f: f.write(template.format(includes=includes, content=content)) @@ -112,17 +106,10 @@ for pq_bit in pq_bits: for metric in ['L2Expanded']: path = f"compute_distance_vpq_{metric}_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{pq_len}subd_{code_book_t}.cu" - includes = '#include "compute_distance_vpq.cuh"' + includes = '#include "compute_distance_vpq-impl.cuh"' params = f"{metric_prefix}{metric}, {team}, {mxdim}, {pq_bit}, {pq_len}, {code_book_t}, {data_t}, {idx_t}, {distance_t}" spec = f"vpq_descriptor_spec<{params}>" - desc = f"cagra_q_dataset_descriptor_t<{params}>" - content = f""" -template struct {desc}; -template <> 
-const void* {spec}::init_kernel = reinterpret_cast(&vpq_dataset_descriptor_init_kernel<{params}>); -template struct {spec}; -""" - descs.append(desc) + content = f"""template struct {spec};""" specs.append(spec) with open(path, "w") as f: f.write(template.format(includes=includes, content=content)) @@ -132,12 +119,11 @@ includes = ''' #pragma once -#include "compute_distance_standard.cuh" -#include "compute_distance_vpq.cuh" +#include "compute_distance_standard.hpp" +#include "compute_distance_vpq.hpp" ''' newline = "\n" contents = f''' -{newline.join(map(lambda s: "extern template struct " + s + ";", descs))} {newline.join(map(lambda s: "extern template struct " + s + ";", specs))} extern template struct diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh similarity index 83% rename from cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh rename to cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh index c1d38ead1..7fe5242a9 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh @@ -15,7 +15,7 @@ */ #pragma once -#include "compute_distance.hpp" +#include "compute_distance_standard.hpp" #include #include @@ -262,57 +262,30 @@ template -struct standard_descriptor_spec : public instance_spec { - using base_type = instance_spec; - using typename base_type::data_type; - using typename base_type::distance_type; - using typename base_type::host_type; - using typename base_type::index_type; - - template - constexpr static inline bool accepts_dataset() - { - return is_strided_dataset_v; - } - - using descriptor_type = +dataset_descriptor_host +standard_descriptor_spec::init_( + const cagra::search_params& params, + const DataT* ptr, + IndexT size, + uint32_t dim, + uint32_t ld, + rmm::cuda_stream_view stream) +{ + using desc_type = standard_dataset_descriptor_t; - static 
const void* init_kernel; - - template - static auto init(const cagra::search_params& params, - const DatasetT& dataset, - cuvs::distance::DistanceType metric, - rmm::cuda_stream_view stream) -> host_type - { - descriptor_type dd_host{nullptr, - nullptr, - dataset.view().data_handle(), - IndexT(dataset.n_rows()), - dataset.dim(), - dataset.stride()}; - host_type result{dd_host, stream, DatasetBlockDim}; - void* args[] = // NOLINT - {&result.dev_ptr, - &descriptor_type::ptr(dd_host.args), - &dd_host.size, - &dd_host.args.dim, - &descriptor_type::ld(dd_host.args)}; - RAFT_CUDA_TRY(cudaLaunchKernel(init_kernel, 1, 1, args, 0, stream)); - return result; - } + using base_type = typename desc_type::base_type; + desc_type dd_host{nullptr, nullptr, ptr, size, dim, ld}; + host_type result{dd_host, stream, DatasetBlockDim}; - template - static auto priority(const cagra::search_params& params, - const DatasetT& dataset, - cuvs::distance::DistanceType metric) -> double - { - // If explicit team_size is specified and doesn't match the instance, discard it - if (params.team_size != 0 && TeamSize != params.team_size) { return -1.0; } - if (Metric != metric) { return -1.0; } - // Otherwise, favor the closest dataset dimensionality. - return 1.0 / (0.1 + std::abs(double(dataset.dim()) - double(DatasetBlockDim))); - } -}; + standard_dataset_descriptor_init_kernel + <<<1, 1, 0, stream>>>(result.dev_ptr, ptr, size, dim, desc_type::ld(dd_host.args)); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + return result; +} } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_standard.hpp new file mode 100644 index 000000000..df1b77e86 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard.hpp @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "compute_distance.hpp" + +#include + +#include + +namespace cuvs::neighbors::cagra::detail { + +template +struct standard_descriptor_spec : public instance_spec { + using base_type = instance_spec; + using typename base_type::data_type; + using typename base_type::distance_type; + using typename base_type::host_type; + using typename base_type::index_type; + + template + constexpr static inline bool accepts_dataset() + { + return is_strided_dataset_v; + } + + template + static auto init(const cagra::search_params& params, + const DatasetT& dataset, + cuvs::distance::DistanceType metric, + rmm::cuda_stream_view stream) -> host_type + { + return init_(params, + dataset.view().data_handle(), + IndexT(dataset.n_rows()), + dataset.dim(), + dataset.stride(), + stream); + } + + template + static auto priority(const cagra::search_params& params, + const DatasetT& dataset, + cuvs::distance::DistanceType metric) -> double + { + // If explicit team_size is specified and doesn't match the instance, discard it + if (params.team_size != 0 && TeamSize != params.team_size) { return -1.0; } + if (Metric != metric) { return -1.0; } + // Otherwise, favor the closest dataset dimensionality. 
+ return 1.0 / (0.1 + std::abs(double(dataset.dim()) - double(DatasetBlockDim))); + } + + private: + static dataset_descriptor_host init_(const cagra::search_params& params, + const DataT* ptr, + IndexT size, + uint32_t dim, + uint32_t ld, + rmm::cuda_stream_view stream); +}; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu index bc1900856..af5e89a76 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu @@ -23,31 +23,12 @@ * */ -#include "compute_distance_standard.cuh" +#include "compute_distance_standard-impl.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct 
standard_descriptor_spec; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; +using namespace cuvs::distance; +template struct standard_descriptor_spec; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu index f1ace30cc..cfad79f3a 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu @@ -23,31 +23,12 @@ * */ -#include "compute_distance_standard.cuh" +#include "compute_distance_standard-impl.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; +using namespace cuvs::distance; +template struct standard_descriptor_spec; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim256_t16.cu index 4528426c7..32a18ff3e 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim256_t16.cu +++ 
b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim256_t16.cu @@ -23,31 +23,12 @@ * */ -#include "compute_distance_standard.cuh" +#include "compute_distance_standard-impl.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; +using namespace cuvs::distance; +template struct standard_descriptor_spec; } // namespace cuvs::neighbors::cagra::detail diff 
--git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu index cdb315bac..7d1206c37 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu @@ -23,35 +23,11 @@ * */ -#include "compute_distance_standard.cuh" +#include "compute_distance_standard-impl.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; +using namespace cuvs::distance; +template struct standard_descriptor_spec; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu index 49053a2d6..251316b2c 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu @@ -23,35 +23,11 @@ * */ -#include "compute_distance_standard.cuh" +#include "compute_distance_standard-impl.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; +using namespace cuvs::distance; +template struct standard_descriptor_spec; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim128_t8.cu 
b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim128_t8.cu index 5a534718b..7a8c4059c 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim128_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim128_t8.cu @@ -23,35 +23,11 @@ * */ -#include "compute_distance_standard.cuh" +#include "compute_distance_standard-impl.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; +using namespace cuvs::distance; +template struct standard_descriptor_spec; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim256_t16.cu index 7e85fa349..fcc65a48e 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim256_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim256_t16.cu @@ -23,35 +23,11 @@ * */ -#include "compute_distance_standard.cuh" +#include "compute_distance_standard-impl.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; +using namespace cuvs::distance; +template struct standard_descriptor_spec; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim512_t32.cu index 
4bc254679..833dac9c4 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim512_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim512_t32.cu @@ -23,35 +23,11 @@ * */ -#include "compute_distance_standard.cuh" +#include "compute_distance_standard-impl.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; +using namespace cuvs::distance; +template struct standard_descriptor_spec; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu index c0fe52caf..e3870df40 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu @@ -23,35 +23,11 @@ * */ -#include "compute_distance_standard.cuh" +#include "compute_distance_standard-impl.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; +using namespace cuvs::distance; +template struct standard_descriptor_spec; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu index b585e1f80..1253d7cd4 100644 --- 
a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu @@ -23,35 +23,11 @@ * */ -#include "compute_distance_standard.cuh" +#include "compute_distance_standard-impl.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; +using namespace cuvs::distance; +template struct standard_descriptor_spec; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu index 91de967e8..792532c2c 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu @@ -23,35 +23,11 @@ * */ -#include "compute_distance_standard.cuh" +#include "compute_distance_standard-impl.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; +using namespace cuvs::distance; +template struct standard_descriptor_spec; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim128_t8.cu index b77b84793..b3a466f46 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim128_t8.cu +++ 
b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim128_t8.cu @@ -23,35 +23,11 @@ * */ -#include "compute_distance_standard.cuh" +#include "compute_distance_standard-impl.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; +using namespace cuvs::distance; +template struct standard_descriptor_spec; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim256_t16.cu index 7ce86c034..a11701e5a 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim256_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim256_t16.cu @@ -23,35 +23,11 @@ * */ -#include "compute_distance_standard.cuh" +#include "compute_distance_standard-impl.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; +using namespace cuvs::distance; +template struct standard_descriptor_spec; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim512_t32.cu index 507d709eb..9ed0a32ee 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim512_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim512_t32.cu @@ -23,35 +23,11 @@ * */ 
-#include "compute_distance_standard.cuh" +#include "compute_distance_standard-impl.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; +using namespace cuvs::distance; +template struct standard_descriptor_spec; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu index c5c7a7b4c..c9c960cf9 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu @@ -23,35 +23,11 @@ * */ -#include "compute_distance_standard.cuh" +#include "compute_distance_standard-impl.cuh" namespace cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; +using namespace cuvs::distance; +template struct standard_descriptor_spec; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu index 8d237f58b..d7a12804b 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu @@ -23,31 +23,12 @@ * */ -#include "compute_distance_standard.cuh" +#include "compute_distance_standard-impl.cuh" namespace 
cuvs::neighbors::cagra::detail { -template struct standard_dataset_descriptor_t; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec; -template <> -const void* standard_descriptor_spec::init_kernel = - reinterpret_cast( - &standard_dataset_descriptor_init_kernel); -template struct standard_descriptor_spec #include @@ -30,13 +30,13 @@ template struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t { using base_type = dataset_descriptor_base_t; - using CODE_BOOK_T = CodeBookT; + using CODE_BOOK_T = CodebookT; using QUERY_T = half; using base_type::args; using base_type::extra_ptr3; @@ -119,7 +119,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t __launch_bounds__(1, 1) __global__ void vpq_dataset_descriptor_init_kernel(dataset_descriptor_base_t* out, const std::uint8_t* encoded_dataset_ptr, - std::uint32_t encoded_dataset_dim, - std::uint32_t n_subspace, - const CodeBookT* vq_code_book_ptr, - const CodeBookT* pq_code_book_ptr, - std::size_t size, - std::uint32_t dim) + uint32_t encoded_dataset_dim, + uint32_t n_subspace, + const CodebookT* vq_code_book_ptr, + const CodebookT* pq_code_book_ptr, + IndexT size, + uint32_t dim) { using desc_type = cagra_q_dataset_descriptor_t; @@ -404,81 +404,64 @@ template -struct vpq_descriptor_spec : public instance_spec { - using base_type = instance_spec; - 
using typename base_type::data_type; - using typename base_type::distance_type; - using typename base_type::host_type; - using typename base_type::index_type; - - template - constexpr static inline auto accepts_dataset() - -> std::enable_if_t, bool> - { - return std::is_same_v; - } - - template - constexpr static inline auto accepts_dataset() - -> std::enable_if_t, bool> - { - return false; - } - - using descriptor_type = cagra_q_dataset_descriptor_t; - static const void* init_kernel; - - template - static auto init(const cagra::search_params& params, - const DatasetT& dataset, - cuvs::distance::DistanceType metric, - rmm::cuda_stream_view stream) -> host_type - { - descriptor_type dd_host{nullptr, - nullptr, - dataset.data.data_handle(), - dataset.encoded_row_length(), - dataset.pq_dim(), - dataset.vq_code_book.data_handle(), - dataset.pq_code_book.data_handle(), - IndexT(dataset.n_rows()), - dataset.dim()}; - host_type result{dd_host, stream, DatasetBlockDim}; - void* args[] = // NOLINT - {&result.dev_ptr, - &descriptor_type::encoded_dataset_ptr(dd_host.args), - &descriptor_type::encoded_dataset_dim(dd_host.args), - &descriptor_type::n_subspace(dd_host.args), - &descriptor_type::vq_code_book_ptr(dd_host.args), - &dd_host.pq_code_book_ptr(), - &dd_host.size, - &dd_host.args.dim}; - RAFT_CUDA_TRY(cudaLaunchKernel(init_kernel, 1, 1, args, 0, stream)); - return result; - } +dataset_descriptor_host +vpq_descriptor_spec::init_(const cagra::search_params& params, + const std::uint8_t* encoded_dataset_ptr, + uint32_t encoded_dataset_dim, + uint32_t n_subspace, + const CodebookT* vq_code_book_ptr, + const CodebookT* pq_code_book_ptr, + IndexT size, + uint32_t dim, + rmm::cuda_stream_view stream) +{ + using desc_type = cagra_q_dataset_descriptor_t; + using base_type = typename desc_type::base_type; - template - static auto priority(const cagra::search_params& params, - const DatasetT& dataset, - cuvs::distance::DistanceType metric) -> double - { - // If explicit team_size 
is specified and doesn't match the instance, discard it - if (params.team_size != 0 && TeamSize != params.team_size) { return -1.0; } - if (cuvs::distance::DistanceType::L2Expanded != metric) { return -1.0; } - // Match codebook params - if (dataset.pq_bits() != PqBits) { return -1.0; } - if (dataset.pq_len() != PqLen) { return -1.0; } - // Otherwise, favor the closest dataset dimensionality. - return 1.0 / (0.1 + std::abs(double(dataset.dim()) - double(DatasetBlockDim))); - } -}; + desc_type dd_host{nullptr, + nullptr, + encoded_dataset_ptr, + encoded_dataset_dim, + n_subspace, + vq_code_book_ptr, + pq_code_book_ptr, + size, + dim}; + host_type result{dd_host, stream, DatasetBlockDim}; + vpq_dataset_descriptor_init_kernel<<<1, 1, 0, stream>>>(result.dev_ptr, + encoded_dataset_ptr, + encoded_dataset_dim, + n_subspace, + vq_code_book_ptr, + pq_code_book_ptr, + size, + dim); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + return result; +} } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp new file mode 100644 index 000000000..9d5b0b6c0 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "compute_distance.hpp" + +#include + +#include + +namespace cuvs::neighbors::cagra::detail { + +template +struct vpq_descriptor_spec : public instance_spec { + using base_type = instance_spec; + using typename base_type::data_type; + using typename base_type::distance_type; + using typename base_type::host_type; + using typename base_type::index_type; + + template + constexpr static inline auto accepts_dataset() + -> std::enable_if_t, bool> + { + return std::is_same_v; + } + + template + constexpr static inline auto accepts_dataset() + -> std::enable_if_t, bool> + { + return false; + } + + template + static auto init(const cagra::search_params& params, + const DatasetT& dataset, + cuvs::distance::DistanceType metric, + rmm::cuda_stream_view stream) -> host_type + { + return init_(params, + dataset.data.data_handle(), + dataset.encoded_row_length(), + dataset.pq_dim(), + dataset.vq_code_book.data_handle(), + dataset.pq_code_book.data_handle(), + IndexT(dataset.n_rows()), + dataset.dim(), + stream); + } + + template + static auto priority(const cagra::search_params& params, + const DatasetT& dataset, + cuvs::distance::DistanceType metric) -> double + { + // If explicit team_size is specified and doesn't match the instance, discard it + if (params.team_size != 0 && TeamSize != params.team_size) { return -1.0; } + if (cuvs::distance::DistanceType::L2Expanded != metric) { return -1.0; } + // Match codebook params + if (dataset.pq_bits() != PqBits) { return -1.0; } + if (dataset.pq_len() != PqLen) { return -1.0; } + // Otherwise, favor the closest dataset dimensionality. 
+ return 1.0 / (0.1 + std::abs(double(dataset.dim()) - double(DatasetBlockDim))); + } + + private: + static dataset_descriptor_host init_( + const cagra::search_params& params, + const std::uint8_t* encoded_dataset_ptr, + uint32_t encoded_dataset_dim, + uint32_t n_subspace, + const CodebookT* vq_code_book_ptr, + const CodebookT* pq_code_book_ptr, + IndexT size, + uint32_t dim, + rmm::cuda_stream_view stream); +}; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu index 7abc27bda..a56a5a9df 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu @@ -23,40 +23,12 @@ * */ -#include "compute_distance_vpq.cuh" +#include "compute_distance_vpq-impl.cuh" namespace cuvs::neighbors::cagra::detail { -template struct cagra_q_dataset_descriptor_t; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - 
reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct 
vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - 
reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec; -template <> -const void* vpq_descriptor_spec::init_kernel = - reinterpret_cast( - &vpq_dataset_descriptor_init_kernel); -template struct vpq_descriptor_spec Date: Wed, 4 Sep 2024 14:21:44 +0200 Subject: [PATCH 25/41] More explicit ldg cache behavior and a few smaller things --- cpp/CMakeLists.txt | 2 +- .../cagra/compute_distance_standard-impl.cuh | 26 ++-- .../cagra/compute_distance_vpq-impl.cuh | 120 ++++++++++-------- .../neighbors/detail/cagra/device_common.hpp | 94 ++++++++++++++ .../cagra/search_multi_cta_kernel-inl.cuh | 2 +- .../cagra/search_single_cta_kernel-inl.cuh | 2 +- 6 files changed, 174 insertions(+), 72 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3b414c38c..ad0303486 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -291,7 +291,7 @@ add_library( src/neighbors/detail/cagra/search_single_cta_half_uint64.cu ) -file(GLOB_RECURSE compute_distance_sources "src/neighbors/detail/cagra/*.cu") +file(GLOB_RECURSE compute_distance_sources "src/neighbors/detail/cagra/compute_distance_*.cu") set_source_files_properties(${compute_distance_sources} PROPERTIES COMPILE_FLAGS -maxrregcount=64) set_target_properties( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh 
b/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh index 7fe5242a9..e17562e48 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh @@ -18,12 +18,8 @@ #include "compute_distance_standard.hpp" #include -#include -#include #include -#include #include -#include #include @@ -187,17 +183,19 @@ RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_standard_worker( constexpr auto kTeamSize = DescriptorT::kTeamSize; constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim; constexpr auto vlen = device::get_vlen(); - constexpr auto reg_nelem = raft::ceildiv(kDatasetBlockDim, kTeamSize * vlen); + constexpr auto reg_nelem = + raft::div_rounding_up_unsafe(kDatasetBlockDim, kTeamSize * vlen); DISTANCE_T r = 0; for (uint32_t elem_offset = (threadIdx.x % kTeamSize) * vlen; elem_offset < dim; elem_offset += kDatasetBlockDim) { - raft::TxN_t dl_buff[reg_nelem]; + DATA_T data[reg_nelem][vlen]; #pragma unroll for (uint32_t e = 0; e < reg_nelem; e++) { const uint32_t k = e * (kTeamSize * vlen) + elem_offset; if (k >= dim) break; - dl_buff[e].load(dataset_ptr, k); + device::ldg_cg(reinterpret_cast(data[e]), + reinterpret_cast(dataset_ptr + k)); } #pragma unroll for (uint32_t e = 0; e < reg_nelem; e++) { @@ -212,7 +210,7 @@ RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_standard_worker( DISTANCE_T d; device::lds(d, query_smem_ptr + sizeof(QUERY_T) * device::swizzling(k + v)); r += dist_op( - d, cuvs::spatial::knn::detail::utils::mapping{}(dl_buff[e].val.data[v])); + d, cuvs::spatial::knn::detail::utils::mapping{}(data[e][v])); } } } @@ -236,12 +234,12 @@ template -__launch_bounds__(1, 1) __global__ void standard_dataset_descriptor_init_kernel( - dataset_descriptor_base_t* out, - const DataT* ptr, - IndexT size, - uint32_t dim, - uint32_t ld) +RAFT_KERNEL __launch_bounds__(1, 1) + standard_dataset_descriptor_init_kernel(dataset_descriptor_base_t* out, + const DataT* 
ptr, + IndexT size, + uint32_t dim, + uint32_t ld) { using desc_type = standard_dataset_descriptor_t; diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 22b543994..7d6f2f838 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -19,10 +19,11 @@ #include "compute_distance_vpq.hpp" #include -#include #include #include +#include + namespace cuvs::neighbors::cagra::detail { template -_RAFT_DEVICE __noinline__ auto compute_distance_vpq( - const typename DescriptorT::args_t args, const typename DescriptorT::INDEX_T dataset_index) -> - typename DescriptorT::DISTANCE_T +_RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( + const uint8_t* __restrict__ dataset_ptr, + const typename DescriptorT::CODE_BOOK_T* __restrict__ vq_code_book_ptr, + uint32_t dim, + uint32_t pq_codebook_ptr, + uint32_t n_subspace) -> typename DescriptorT::DISTANCE_T { using DISTANCE_T = typename DescriptorT::DISTANCE_T; using LOAD_T = typename DescriptorT::LOAD_T; @@ -242,52 +246,48 @@ _RAFT_DEVICE __noinline__ auto compute_distance_vpq( constexpr auto PQ_BITS = DescriptorT::kPqBits; constexpr auto PQ_LEN = DescriptorT::kPqLen; - const uint32_t pq_codebook_ptr = args.smem_ws_ptr; - const uint32_t query_ptr = pq_codebook_ptr + DescriptorT::kSMemCodeBookSizeInBytes; - const auto* __restrict__ node_ptr = - DescriptorT::encoded_dataset_ptr(args) + - (static_cast(DescriptorT::encoded_dataset_dim(args)) * dataset_index); - const unsigned lane_id = threadIdx.x % TeamSize; - // const uint32_t& vq_code = *reinterpret_cast(node_ptr); - uint32_t vq_code; - raft::ldg(vq_code, reinterpret_cast(node_ptr)); + const uint32_t query_ptr = pq_codebook_ptr + DescriptorT::kSMemCodeBookSizeInBytes; + { + uint32_t vq_code; + device::ldg_cg(vq_code, reinterpret_cast(dataset_ptr)); + vq_code_book_ptr += dim * vq_code; + } 
static_assert(PQ_BITS == 8, "Only pq_bits == 8 is supported at the moment."); + constexpr uint32_t vlen = 4; // **** DO NOT CHANGE **** + constexpr uint32_t nelem = + raft::div_rounding_up_unsafe(DatasetBlockDim / PQ_LEN, TeamSize * vlen); DISTANCE_T norm = 0; - for (uint32_t elem_offset = 0; elem_offset < args.dim; elem_offset += DatasetBlockDim) { - constexpr unsigned vlen = 4; // **** DO NOT CHANGE **** - constexpr unsigned nelem = - raft::div_rounding_up_unsafe(DatasetBlockDim / PQ_LEN, TeamSize * vlen); + for (uint32_t elem_offset = (threadIdx.x % TeamSize) * (vlen * PQ_LEN); elem_offset < dim; + elem_offset += DatasetBlockDim) { // Loading PQ codes uint32_t pq_codes[nelem]; #pragma unroll for (std::uint32_t e = 0; e < nelem; e++) { - const std::uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset / PQ_LEN; - if (k >= DescriptorT::n_subspace(args)) break; + const std::uint32_t k = e * (TeamSize * vlen) + elem_offset / PQ_LEN; + if (k >= n_subspace) break; // Loading 4 x 8-bit PQ-codes using 32-bit load ops (from device memory) - raft::ldg(pq_codes[e], reinterpret_cast(node_ptr + 4 + k)); + device::ldg_cg(pq_codes[e], reinterpret_cast(dataset_ptr + 4 + k)); } // if constexpr (PQ_LEN % 2 == 0) { // **** Use half2 for distance computation **** -#pragma unroll 1 +#pragma unroll for (std::uint32_t e = 0; e < nelem; e++) { - const std::uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset / PQ_LEN; - if (k >= DescriptorT::n_subspace(args)) break; + const std::uint32_t k = e * (TeamSize * vlen) + elem_offset / PQ_LEN; + if (k >= n_subspace) break; // Loading VQ code-book - raft::TxN_t vq_vals[PQ_LEN]; + half2 vq_vals[PQ_LEN][vlen / 2]; #pragma unroll for (std::uint32_t m = 0; m < PQ_LEN; m += 1) { const uint32_t d = (vlen * m) + (PQ_LEN * k); - if (d >= args.dim) break; - vq_vals[m].load(reinterpret_cast(DescriptorT::vq_code_book_ptr(args) + d + - (args.dim * vq_code)), - 0); + if (d >= dim) break; + device::ldg_ca(vq_vals[m], vq_code_book_ptr + d); } 
// Compute distance std::uint32_t pq_code = pq_codes[e]; #pragma unroll for (std::uint32_t v = 0; v < vlen; v++) { - if (PQ_LEN * (v + k) >= args.dim) break; + if (PQ_LEN * (v + k) >= dim) break; #pragma unroll for (std::uint32_t m = 0; m < PQ_LEN; m += 2) { const std::uint32_t d1 = m + (PQ_LEN * v); @@ -300,9 +300,9 @@ _RAFT_DEVICE __noinline__ auto compute_distance_vpq( // Loading PQ code book from smem device::lds(c2, pq_codebook_ptr + sizeof(CODE_BOOK_T) * ((1 << PQ_BITS) * 2 * (m / 2) + - (2 * (pq_code & 0xff)))); + (2 * (pq_codes[e] & 0xff)))); // L2 distance - auto dist = q2 - c2 - vq_vals[d1 / vlen].val.data[(d1 % vlen) / 2]; + auto dist = q2 - c2 - vq_vals[d1 / vlen][(d1 % vlen) / 2]; dist = dist * dist; norm += static_cast(dist.x + dist.y); } @@ -313,28 +313,24 @@ _RAFT_DEVICE __noinline__ auto compute_distance_vpq( // **** Use float for distance computation **** #pragma unroll for (std::uint32_t e = 0; e < nelem; e++) { - const std::uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset / PQ_LEN; - if (k >= DescriptorT::n_subspace(args)) break; + const std::uint32_t k = e * (TeamSize * vlen) + elem_offset / PQ_LEN; + if (k >= n_subspace) break; // Loading VQ code-book - raft::TxN_t vq_vals[PQ_LEN]; + CODE_BOOK_T vq_vals[PQ_LEN][vlen]; #pragma unroll for (std::uint32_t m = 0; m < PQ_LEN; m++) { const std::uint32_t d = (vlen * m) + (PQ_LEN * k); - if (d >= args.dim) break; - // Loading 4 x 8/16-bit VQ-values using 32/64-bit load ops (from L2$ or device - // memory) - vq_vals[m].load(reinterpret_cast(DescriptorT::vq_code_book_ptr(args) + d + - (args.dim * vq_code)), - 0); + if (d >= dim) break; + // Loading 4 x 8/16-bit VQ-values using 32/64-bit load ops (from L2$ or device memory) + device::ldg_ca(vq_vals[m], vq_code_book_ptr + d); } // Compute distance std::uint32_t pq_code = pq_codes[e]; #pragma unroll for (std::uint32_t v = 0; v < vlen; v++) { - if (PQ_LEN * (v + k) >= args.dim) break; - raft::TxN_t pq_vals; - 
device::lds(*pq_vals.vectorized_data(), - pq_codebook_ptr + sizeof(CODE_BOOK_T) * PQ_LEN * (pq_code & 0xff)); + if (PQ_LEN * (v + k) >= dim) break; + CODE_BOOK_T pq_vals[PQ_LEN]; + device::lds(pq_vals, pq_codebook_ptr + sizeof(CODE_BOOK_T) * PQ_LEN * (pq_code & 0xff)); #pragma unroll for (std::uint32_t m = 0; m < PQ_LEN; m++) { const std::uint32_t d1 = m + (PQ_LEN * v); @@ -342,8 +338,8 @@ _RAFT_DEVICE __noinline__ auto compute_distance_vpq( // if (d >= dataset_dim) break; DISTANCE_T diff; device::lds(diff, query_ptr + sizeof(QUERY_T) * d); - diff -= static_cast(pq_vals.data[m]); - diff -= static_cast(vq_vals[d1 / vlen].val.data[d1 % vlen]); + diff -= static_cast(pq_vals[m]); + diff -= static_cast(vq_vals[d1 / vlen][d1 % vlen]); norm += diff * diff; } pq_code >>= 8; @@ -354,6 +350,20 @@ _RAFT_DEVICE __noinline__ auto compute_distance_vpq( return norm; } +template +_RAFT_DEVICE __noinline__ auto compute_distance_vpq( + const typename DescriptorT::args_t args, const typename DescriptorT::INDEX_T dataset_index) -> + typename DescriptorT::DISTANCE_T +{ + return compute_distance_vpq_worker( + DescriptorT::encoded_dataset_ptr(args) + + (static_cast(DescriptorT::encoded_dataset_dim(args)) * dataset_index), + DescriptorT::vq_code_book_ptr(args), + args.dim, + args.smem_ws_ptr, + DescriptorT::n_subspace(args)); +} + template -__launch_bounds__(1, 1) __global__ - void vpq_dataset_descriptor_init_kernel(dataset_descriptor_base_t* out, - const std::uint8_t* encoded_dataset_ptr, - uint32_t encoded_dataset_dim, - uint32_t n_subspace, - const CodebookT* vq_code_book_ptr, - const CodebookT* pq_code_book_ptr, - IndexT size, - uint32_t dim) +RAFT_KERNEL __launch_bounds__(1, 1) + vpq_dataset_descriptor_init_kernel(dataset_descriptor_base_t* out, + const std::uint8_t* encoded_dataset_ptr, + uint32_t encoded_dataset_dim, + uint32_t n_subspace, + const CodebookT* vq_code_book_ptr, + const CodebookT* pq_code_book_ptr, + IndexT size, + uint32_t dim) { using desc_type = 
cagra_q_dataset_descriptor_t(x)) : "r"(addr)); } +RAFT_DEVICE_INLINE_FUNCTION void lds(half (&x)[1], uint32_t addr) +{ + asm volatile("ld.shared.u16 {%0}, [%1];" : "=h"(*reinterpret_cast(x)) : "r"(addr)); +} +RAFT_DEVICE_INLINE_FUNCTION void lds(half (&x)[2], uint32_t addr) +{ + asm volatile("ld.shared.v2.u16 {%0, %1}, [%2];" + : "=h"(*reinterpret_cast(x)), "=h"(*reinterpret_cast(x + 1)) + : "r"(addr)); +} +RAFT_DEVICE_INLINE_FUNCTION void lds(half (&x)[4], uint32_t addr) +{ + asm volatile("ld.shared.v4.u16 {%0, %1, %2, %3}, [%4];" + : "=h"(*reinterpret_cast(x)), + "=h"(*reinterpret_cast(x + 1)), + "=h"(*reinterpret_cast(x + 2)), + "=h"(*reinterpret_cast(x + 3)) + : "r"(addr)); +} RAFT_DEVICE_INLINE_FUNCTION void lds(uint4& x, uint32_t addr) { @@ -260,5 +279,80 @@ RAFT_DEVICE_INLINE_FUNCTION void sts(uint32_t addr, const half2& x) "h"(reinterpret_cast(x.y))); } +RAFT_DEVICE_INLINE_FUNCTION void ldg_cg(uint4& x, const uint4* addr) +{ + asm volatile("ld.global.cg.v4.u32 {%0, %1, %2, %3}, [%4];" + : "=r"(x.x), "=r"(x.y), "=r"(x.z), "=r"(x.w) + : "l"(addr)); +} + +RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(uint4& x, const uint4* addr) +{ + asm volatile("ld.global.ca.v4.u32 {%0, %1, %2, %3}, [%4];" + : "=r"(x.x), "=r"(x.y), "=r"(x.z), "=r"(x.w) + : "l"(addr)); +} + +RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(uint32_t& x, const uint32_t* addr) +{ + asm volatile("ld.global.ca.u32 %0, [%1];" : "=r"(x) : "l"(addr)); +} + +RAFT_DEVICE_INLINE_FUNCTION void ldg_cg(uint32_t& x, const uint32_t* addr) +{ + asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(x) : "l"(addr)); +} + +RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half& x, const half* addr) +{ + asm volatile("ld.global.ca.u16 {%0}, [%1];" + : "=h"(reinterpret_cast(x)) + : "l"(reinterpret_cast(addr))); +} +RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half (&x)[1], const half* addr) +{ + asm volatile("ld.global.ca.u16 {%0}, [%1];" + : "=h"(*reinterpret_cast(x)) + : "l"(reinterpret_cast(addr))); +} +RAFT_DEVICE_INLINE_FUNCTION void 
ldg_ca(half (&x)[2], const half* addr) +{ + asm volatile("ld.global.ca.v2.u16 {%0, %1}, [%2];" + : "=h"(*reinterpret_cast(x)), "=h"(*reinterpret_cast(x + 1)) + : "l"(reinterpret_cast(addr))); +} +RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half (&x)[4], const half* addr) +{ + asm volatile("ld.global.ca.v4.u16 {%0, %1, %2, %3}, [%4];" + : "=h"(*reinterpret_cast(x)), + "=h"(*reinterpret_cast(x + 1)), + "=h"(*reinterpret_cast(x + 2)), + "=h"(*reinterpret_cast(x + 3)) + : "l"(reinterpret_cast(addr))); +} + +RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half2& x, const half* addr) +{ + asm volatile("ld.global.ca.v2.u16 {%0, %1}, [%2];" + : "=h"(reinterpret_cast(x.x)), "=h"(reinterpret_cast(x.y)) + : "l"(reinterpret_cast(addr))); +} +RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half2 (&x)[1], const half* addr) +{ + asm volatile("ld.global.ca.v2.u16 {%0, %1}, [%2];" + : "=h"(reinterpret_cast(x[0].x)), + "=h"(reinterpret_cast(x[0].y)) + : "l"(reinterpret_cast(addr))); +} +RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half2 (&x)[2], const half* addr) +{ + asm volatile("ld.global.ca.v4.u16 {%0, %1, %2, %3}, [%4];" + : "=h"(reinterpret_cast(x[0].x)), + "=h"(reinterpret_cast(x[0].y)), + "=h"(reinterpret_cast(x[1].x)), + "=h"(reinterpret_cast(x[1].y)) + : "l"(reinterpret_cast(addr))); +} + } // namespace device } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh index b04ef4dc2..b5d8296be 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh @@ -133,7 +133,7 @@ __device__ inline void topk_by_bitonic_sort(float* distances, // [num_elements] // multiple CTAs per single query // template -RAFT_KERNEL search_kernel( +RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel( typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr, // [num_queries, num_cta_per_query, 
itopk_size] typename DATASET_DESCRIPTOR_T::DISTANCE_T* const diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index cdb2578b8..116843ce1 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -463,7 +463,7 @@ template -RAFT_KERNEL search_kernel( +RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel( typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr, // [num_queries, top_k] typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr, // [num_queries, top_k] const std::uint32_t top_k, From 5984596229c636dcd97ac4c0a59a72539df2c31b Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 4 Sep 2024 17:03:55 +0200 Subject: [PATCH 26/41] Simplify vpq indexing arithmetics a bit --- .../cagra/compute_distance_vpq-impl.cuh | 19 ++++++++-------- .../neighbors/detail/cagra/device_common.hpp | 22 ++++++++----------- 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 7d6f2f838..1343adcae 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -278,7 +278,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( // Loading VQ code-book half2 vq_vals[PQ_LEN][vlen / 2]; #pragma unroll - for (std::uint32_t m = 0; m < PQ_LEN; m += 1) { + for (std::uint32_t m = 0; m < PQ_LEN; m++) { const uint32_t d = (vlen * m) + (PQ_LEN * k); if (d >= dim) break; device::ldg_ca(vq_vals[m], vq_code_book_ptr + d); @@ -289,20 +289,20 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( for (std::uint32_t v = 0; v < vlen; v++) { if (PQ_LEN * (v + k) >= dim) break; #pragma unroll - for (std::uint32_t m = 0; m < PQ_LEN; m += 2) 
{ - const std::uint32_t d1 = m + (PQ_LEN * v); - const std::uint32_t d = d1 + (PQ_LEN * k); + for (std::uint32_t m = 0; m < PQ_LEN / 2; m++) { + const std::uint32_t d1 = m + (PQ_LEN / 2) * v; + const std::uint32_t d = d1 + (PQ_LEN / 2) * k; half2 q2, c2; // Loading query vector from smem device::lds(q2, query_ptr + sizeof(uint32_t) * - device::swizzling(d / 2)); + device::swizzling(d)); // Loading PQ code book from smem device::lds(c2, - pq_codebook_ptr + sizeof(CODE_BOOK_T) * ((1 << PQ_BITS) * 2 * (m / 2) + - (2 * (pq_codes[e] & 0xff)))); + pq_codebook_ptr + + sizeof(CODE_BOOK_T) * ((1 << PQ_BITS) * 2 * m + (2 * (pq_code & 0xff)))); // L2 distance - auto dist = q2 - c2 - vq_vals[d1 / vlen][(d1 % vlen) / 2]; + auto dist = q2 - c2 - reinterpret_cast(vq_vals)[d1]; dist = dist * dist; norm += static_cast(dist.x + dist.y); } @@ -339,7 +339,8 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( DISTANCE_T diff; device::lds(diff, query_ptr + sizeof(QUERY_T) * d); diff -= static_cast(pq_vals[m]); - diff -= static_cast(vq_vals[d1 / vlen][d1 % vlen]); + diff -= + static_cast(reinterpret_cast(vq_vals)[d1]); norm += diff * diff; } pq_code >>= 8; diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp index 8def1cdec..026f01e24 100644 --- a/cpp/src/neighbors/detail/cagra/device_common.hpp +++ b/cpp/src/neighbors/detail/cagra/device_common.hpp @@ -333,25 +333,21 @@ RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half (&x)[4], const half* addr) RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half2& x, const half* addr) { - asm volatile("ld.global.ca.v2.u16 {%0, %1}, [%2];" - : "=h"(reinterpret_cast(x.x)), "=h"(reinterpret_cast(x.y)) - : "l"(reinterpret_cast(addr))); + asm volatile("ld.global.ca.u32 %0, [%1];" + : "=r"(reinterpret_cast(x)) + : "l"(reinterpret_cast(addr))); } RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half2 (&x)[1], const half* addr) { - asm volatile("ld.global.ca.v2.u16 {%0, %1}, [%2];" - : 
"=h"(reinterpret_cast(x[0].x)), - "=h"(reinterpret_cast(x[0].y)) - : "l"(reinterpret_cast(addr))); + asm volatile("ld.global.ca.u32 %0, [%1];" + : "=r"(*reinterpret_cast(x)) + : "l"(reinterpret_cast(addr))); } RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half2 (&x)[2], const half* addr) { - asm volatile("ld.global.ca.v4.u16 {%0, %1, %2, %3}, [%4];" - : "=h"(reinterpret_cast(x[0].x)), - "=h"(reinterpret_cast(x[0].y)), - "=h"(reinterpret_cast(x[1].x)), - "=h"(reinterpret_cast(x[1].y)) - : "l"(reinterpret_cast(addr))); + asm volatile("ld.global.ca.v2.u32 {%0, %1}, [%2];" + : "=r"(*reinterpret_cast(x)), "=r"(*reinterpret_cast(x + 1)) + : "l"(reinterpret_cast(addr))); } } // namespace device From af0cc122c014bd738aa8324098ee3698779ab9bd Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 5 Sep 2024 14:25:25 +0200 Subject: [PATCH 27/41] Bring back the fatbin.ld link option --- cpp/CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ad0303486..6d902dedc 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -554,8 +554,7 @@ target_compile_options( ) # ensure CUDA symbols aren't relocated to the middle of the debug build binaries # -# TODO(achirkin): disabled during experiments with CUDA_SEPARABLE_COMPILATION (otherwise did't link) -# target_link_options(cuvs PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") +target_link_options(cuvs PRIVATE $) # ################################################################################################## # * cuvs_c ------------------------------------------------------------------------------- From 9023e68df0a42d2fa57905055f18357db5dffec3 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 5 Sep 2024 14:27:13 +0200 Subject: [PATCH 28/41] relax the config for checking the raft_cutlass symbol exclusion (see https://github.com/rapidsai/raft/pull/2425) --- .github/workflows/pr.yaml | 2 +- .github/workflows/test.yaml | 2 +- 2 files changed, 2 insertions(+), 2 
deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 8ea2fa503..bcd9e9ef5 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -50,7 +50,7 @@ jobs: with: build_type: pull-request enable_check_symbols: true - symbol_exclusions: (void (thrust::|cub::)|_ZN\d+raft_cutlass) + symbol_exclusions: (void (thrust::|cub::)|raft_cutlass) # https://github.com/rapidsai/raft/pull/2425 conda-python-build: needs: conda-cpp-build secrets: inherit diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index c0d07297b..4f732653c 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -23,7 +23,7 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} enable_check_symbols: true - symbol_exclusions: (void (thrust::|cub::)|_ZN\d+raft_cutlass) + symbol_exclusions: (void (thrust::|cub::)|raft_cutlass) # https://github.com/rapidsai/raft/pull/2425 conda-cpp-tests: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 From c0f5715f3c936c363b311df4098dbbd8c342b982 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 18 Sep 2024 14:34:29 +0200 Subject: [PATCH 29/41] Add pointer hints and reduce the instruction count a bit --- .../cagra/search_single_cta_kernel-inl.cuh | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 116843ce1..d10313c5b 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -520,24 +520,24 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel( // | | | upto 32 | // +----------------------+------------------------------+---------+ // |<--- result_buffer_size --->| - std::uint32_t result_buffer_size = internal_topk + (search_width * graph_degree); - std::uint32_t 
result_buffer_size_32 = result_buffer_size; - if (result_buffer_size % 32) { result_buffer_size_32 += 32 - (result_buffer_size % 32); } - const auto small_hash_size = hashmap::get_size(small_hash_bitlen); + const auto result_buffer_size = internal_topk + (search_width * graph_degree); + const auto result_buffer_size_32 = raft::round_up_safe(result_buffer_size, 32); + const auto small_hash_size = hashmap::get_size(small_hash_bitlen); // Set smem working buffer for the distance calculation dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id); - auto result_indices_buffer = + auto* __restrict__ result_indices_buffer = reinterpret_cast(smem + dataset_desc->smem_ws_size_in_bytes()); - auto result_distances_buffer = + auto* __restrict__ result_distances_buffer = reinterpret_cast(result_indices_buffer + result_buffer_size_32); - auto visited_hash_buffer = + auto* __restrict__ visited_hash_buffer = reinterpret_cast(result_distances_buffer + result_buffer_size_32); - auto parent_list_buffer = reinterpret_cast(visited_hash_buffer + small_hash_size); - auto topk_ws = reinterpret_cast(parent_list_buffer + search_width); - auto terminate_flag = reinterpret_cast(topk_ws + 3); - auto smem_work_ptr = reinterpret_cast(terminate_flag + 1); + auto* __restrict__ parent_list_buffer = + reinterpret_cast(visited_hash_buffer + small_hash_size); + auto* __restrict__ topk_ws = reinterpret_cast(parent_list_buffer + search_width); + auto* terminate_flag = reinterpret_cast(topk_ws + 3); + auto* __restrict__ smem_work_ptr = reinterpret_cast(terminate_flag + 1); // A flag for filtering. 
auto filter_flag = terminate_flag; @@ -655,7 +655,7 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel( nullptr, topk_ws, true, - reinterpret_cast(smem_work_ptr)); + smem_work_ptr); _CLK_REC(clk_topk); // reset small-hash table From f65cfd721044490ffe55a0baa7dde2192d4b82cf Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 18 Sep 2024 14:36:05 +0200 Subject: [PATCH 30/41] Reorganize the compute-similarity code to allow the compiler optimize the register usage --- .../neighbors/detail/cagra/device_common.hpp | 37 +++++++------------ 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp index 026f01e24..7bfd7006d 100644 --- a/cpp/src/neighbors/detail/cagra/device_common.hpp +++ b/cpp/src/neighbors/detail/cagra/device_common.hpp @@ -114,7 +114,6 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes( const auto team_size_bits = dataset_desc.team_size_bitshift_from_smem(); const auto max_i = raft::round_up_safe(num_pickup, warp_size >> team_size_bits); const auto compute_distance = dataset_desc.compute_distance_impl; - const auto args = dataset_desc.args.load(); for (uint32_t i = threadIdx.x >> team_size_bits; i < max_i; i += (blockDim.x >> team_size_bits)) { const bool valid_i = (i < num_pickup); @@ -134,11 +133,7 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes( } } - // This is the `dataset_desc.compute_distance` manually inlined to move the fetching of - // dataset_desc from smem out of the loop. - // const auto norm2 = dataset_desc.compute_distance(seed_index, valid_i); - const auto norm2 = - device::team_sum(valid_i ? 
compute_distance(args, seed_index) : 0, team_size_bits); + const auto norm2 = dataset_desc.compute_distance(seed_index, valid_i); if (valid_i && (norm2 < best_norm2_team_local)) { best_norm2_team_local = norm2; @@ -197,32 +192,26 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes( __syncthreads(); // Compute the distance to child nodes - const auto team_size_bits = dataset_desc.team_size_bitshift_from_smem(); - const auto max_i = raft::round_up_safe(knn_k * search_width, warp_size >> team_size_bits) - << team_size_bits; + const auto team_size_bits = dataset_desc.team_size_bitshift_from_smem(); + const auto num_k = knn_k * search_width; + const auto max_i = raft::round_up_safe(num_k, warp_size >> team_size_bits); const auto compute_distance = dataset_desc.compute_distance_impl; const auto args = dataset_desc.args.load(); - for (uint32_t tid = threadIdx.x; tid < max_i; tid += blockDim.x) { - const auto i = tid >> team_size_bits; - const bool valid_i = (i < (knn_k * search_width)); - IndexT child_id = invalid_index; - if (valid_i) { child_id = result_child_indices_ptr[i]; } + const bool lead_lane = (threadIdx.x & ((1u << team_size_bits) - 1u)) == 0; + for (uint32_t i = threadIdx.x >> team_size_bits; i < max_i; i += blockDim.x >> team_size_bits) { + const bool valid_i = i < num_k; + const auto child_id = valid_i ? result_child_indices_ptr[i] : invalid_index; // This is the `dataset_desc.compute_distance` manually inlined to move the fetching of // dataset_desc from smem out of the loop. // const auto norm2 = dataset_desc.compute_distance(child_id, child_id != invalid_index); - const auto norm2 = device::team_sum( - (child_id != invalid_index) ? compute_distance(args, child_id) : 0, team_size_bits); + const DistanceT child_dist = device::team_sum( + (child_id != invalid_index) ? compute_distance(args, child_id) + : (lead_lane ? 
raft::upper_bound() : 0), + team_size_bits); // Store the distance - const unsigned lane_id = threadIdx.x & ((1u << team_size_bits) - 1u); - if (valid_i && lane_id == 0) { - if (child_id != invalid_index) { - result_child_distances_ptr[i] = norm2; - } else { - result_child_distances_ptr[i] = raft::upper_bound(); - } - } + if (valid_i && lead_lane) { result_child_distances_ptr[i] = child_dist; } } } From 05041297e89a6a1ed5bda174d7ada2e4af1b5d82 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 18 Sep 2024 14:37:10 +0200 Subject: [PATCH 31/41] Disable swizzling and reduce the instruction count in VPQ distance --- .../cagra/compute_distance_vpq-impl.cuh | 51 ++++++++----------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 1343adcae..b35f374cb 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -209,21 +209,12 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, uint32_t dim = r->args.dim; queries_ptr += dim * query_id; - constexpr cuvs::spatial::knn::detail::utils::mapping mapping{}; + constexpr cuvs::spatial::knn::detail::utils::mapping mapping{}; auto smem_query_ptr = - reinterpret_cast(reinterpret_cast(smem_ptr) + sizeof(DescriptorT) + - DescriptorT::kSMemCodeBookSizeInBytes); - for (unsigned i = threadIdx.x * 2; i < dim; i += blockDim.x * 2) { - half2 buf2{0, 0}; - if (i < dim) { buf2.x = mapping(queries_ptr[i]); } - if (i + 1 < dim) { buf2.y = mapping(queries_ptr[i + 1]); } - if constexpr ((PQ_BITS == 8) && (PQ_LEN % 2 == 0)) { - // Use swizzling in the condition to reduce bank conflicts in shared - // memory, which are likely to occur when pq_code_book_dim is large. 
- ((half2*)smem_query_ptr)[device::swizzling(i / 2)] = buf2; - } else { - (reinterpret_cast(smem_query_ptr + i))[0] = buf2; - } + reinterpret_cast(reinterpret_cast(smem_ptr) + sizeof(DescriptorT) + + DescriptorT::kSMemCodeBookSizeInBytes); + for (unsigned i = threadIdx.x; i < dim; i += blockDim.x) { + smem_query_ptr[i] = mapping(queries_ptr[i]); } return const_cast(r); @@ -247,23 +238,23 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( constexpr auto PQ_LEN = DescriptorT::kPqLen; const uint32_t query_ptr = pq_codebook_ptr + DescriptorT::kSMemCodeBookSizeInBytes; - { - uint32_t vq_code; - device::ldg_cg(vq_code, reinterpret_cast(dataset_ptr)); - vq_code_book_ptr += dim * vq_code; - } static_assert(PQ_BITS == 8, "Only pq_bits == 8 is supported at the moment."); constexpr uint32_t vlen = 4; // **** DO NOT CHANGE **** constexpr uint32_t nelem = raft::div_rounding_up_unsafe(DatasetBlockDim / PQ_LEN, TeamSize * vlen); + + constexpr auto kTeamMask = DescriptorT::kTeamSize - 1; + constexpr auto kTeamStride = vlen * PQ_LEN; + constexpr auto kTeamVLen = TeamSize * vlen; + DISTANCE_T norm = 0; - for (uint32_t elem_offset = (threadIdx.x % TeamSize) * (vlen * PQ_LEN); elem_offset < dim; + for (uint32_t elem_offset = (threadIdx.x & kTeamMask) * kTeamStride; elem_offset < dim; elem_offset += DatasetBlockDim) { // Loading PQ codes uint32_t pq_codes[nelem]; #pragma unroll for (std::uint32_t e = 0; e < nelem; e++) { - const std::uint32_t k = e * (TeamSize * vlen) + elem_offset / PQ_LEN; + const std::uint32_t k = e * kTeamVLen + elem_offset / PQ_LEN; if (k >= n_subspace) break; // Loading 4 x 8-bit PQ-codes using 32-bit load ops (from device memory) device::ldg_cg(pq_codes[e], reinterpret_cast(dataset_ptr + 4 + k)); @@ -273,7 +264,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( // **** Use half2 for distance computation **** #pragma unroll for (std::uint32_t e = 0; e < nelem; e++) { - const std::uint32_t k = e * (TeamSize * 
vlen) + elem_offset / PQ_LEN; + const std::uint32_t k = e * kTeamVLen + elem_offset / PQ_LEN; if (k >= n_subspace) break; // Loading VQ code-book half2 vq_vals[PQ_LEN][vlen / 2]; @@ -294,9 +285,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( const std::uint32_t d = d1 + (PQ_LEN / 2) * k; half2 q2, c2; // Loading query vector from smem - device::lds(q2, - query_ptr + sizeof(uint32_t) * - device::swizzling(d)); + device::lds(q2, query_ptr + sizeof(half2) * d); // Loading PQ code book from smem device::lds(c2, pq_codebook_ptr + @@ -313,7 +302,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( // **** Use float for distance computation **** #pragma unroll for (std::uint32_t e = 0; e < nelem; e++) { - const std::uint32_t k = e * (TeamSize * vlen) + elem_offset / PQ_LEN; + const std::uint32_t k = e * kTeamVLen + elem_offset / PQ_LEN; if (k >= n_subspace) break; // Loading VQ code-book CODE_BOOK_T vq_vals[PQ_LEN][vlen]; @@ -356,10 +345,14 @@ _RAFT_DEVICE __noinline__ auto compute_distance_vpq( const typename DescriptorT::args_t args, const typename DescriptorT::INDEX_T dataset_index) -> typename DescriptorT::DISTANCE_T { - return compute_distance_vpq_worker( + const auto* dataset_ptr = DescriptorT::encoded_dataset_ptr(args) + - (static_cast(DescriptorT::encoded_dataset_dim(args)) * dataset_index), - DescriptorT::vq_code_book_ptr(args), + (static_cast(DescriptorT::encoded_dataset_dim(args)) * dataset_index); + uint32_t vq_code; + device::ldg_cg(vq_code, reinterpret_cast(dataset_ptr)); + return compute_distance_vpq_worker( + dataset_ptr, + DescriptorT::vq_code_book_ptr(args) + args.dim * vq_code, args.dim, args.smem_ws_ptr, DescriptorT::n_subspace(args)); From b6050613de6e5fb3f52df1beefa7f615cef6b091 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 19 Sep 2024 10:08:18 +0200 Subject: [PATCH 32/41] Don't apply swizzling when the bank conflicts are not possible (small team_size*vlen, and avoid bank conflicts in 
setup_workspace --- .../cagra/compute_distance_standard-impl.cuh | 33 +++++++++++-------- .../cagra/compute_distance_vpq-impl.cuh | 32 ++++++++++++++---- .../neighbors/detail/cagra/device_common.hpp | 22 ++++++++++--- 3 files changed, 62 insertions(+), 25 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh index e17562e48..50ea88a3e 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh @@ -133,15 +133,18 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_standard( const typename DescriptorT::DATA_T* queries_ptr, uint32_t query_id) -> const DescriptorT* { - using base_type = typename DescriptorT::base_type; - using QUERY_T = typename DescriptorT::QUERY_T; - using LOAD_T = typename DescriptorT::LOAD_T; - constexpr auto DatasetBlockDim = DescriptorT::kDatasetBlockDim; - auto* r = reinterpret_cast(smem_ptr); - auto* buf = reinterpret_cast(r + 1); + using DATA_T = typename DescriptorT::DATA_T; + using LOAD_T = typename DescriptorT::LOAD_T; + using base_type = typename DescriptorT::base_type; + using QUERY_T = typename DescriptorT::QUERY_T; + using word_type = uint32_t; + constexpr auto kTeamSize = DescriptorT::kTeamSize; + constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim; + auto* r = reinterpret_cast(smem_ptr); + auto* buf = reinterpret_cast(r + 1); if (r != that) { - constexpr uint32_t kCount = sizeof(DescriptorT) / sizeof(LOAD_T); - using blob_type = LOAD_T[kCount]; + constexpr uint32_t kCount = sizeof(DescriptorT) / sizeof(word_type); + using blob_type = word_type[kCount]; auto& src = reinterpret_cast(*that); auto& dst = reinterpret_cast(*r); for (uint32_t i = threadIdx.x; i < kCount; i += blockDim.x) { @@ -149,17 +152,18 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_standard( } const auto smem_ptr_offset = reinterpret_cast(&(r->args.smem_ws_ptr)) 
- reinterpret_cast(r); - if (threadIdx.x == uint32_t(smem_ptr_offset / sizeof(LOAD_T))) { + if (threadIdx.x == uint32_t(smem_ptr_offset / sizeof(word_type))) { r->args.smem_ws_ptr = uint32_t(__cvta_generic_to_shared(buf)); } __syncthreads(); } - uint32_t dim = r->args.dim; - auto buf_len = raft::round_up_safe(dim, DatasetBlockDim); + uint32_t dim = r->args.dim; + auto buf_len = raft::round_up_safe(dim, kDatasetBlockDim); + constexpr auto vlen = device::get_vlen(); queries_ptr += dim * query_id; for (unsigned i = threadIdx.x; i < buf_len; i += blockDim.x) { - unsigned j = device::swizzling(i); + unsigned j = device::swizzling(i); if (i < dim) { buf[j] = cuvs::spatial::knn::detail::utils::mapping{}(queries_ptr[i]); } else { @@ -208,7 +212,10 @@ RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_standard_worker( // - Above the last element (dataset_dim-1), the query array is filled with zeros. // - The data buffer has to be also padded with zeros. DISTANCE_T d; - device::lds(d, query_smem_ptr + sizeof(QUERY_T) * device::swizzling(k + v)); + device::lds( + d, + query_smem_ptr + + sizeof(QUERY_T) * device::swizzling(k + v)); r += dist_op( d, cuvs::spatial::knn::detail::utils::mapping{}(data[e][v])); } diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index b35f374cb..d6b0bc96d 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -161,9 +161,9 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, using DATA_T = typename DescriptorT::DATA_T; using DISTANCE_T = typename DescriptorT::DISTANCE_T; using INDEX_T = typename DescriptorT::INDEX_T; - using LOAD_T = typename DescriptorT::LOAD_T; using QUERY_T = typename DescriptorT::QUERY_T; using CODE_BOOK_T = typename DescriptorT::CODE_BOOK_T; + using word_type = uint32_t; constexpr auto TeamSize = DescriptorT::kTeamSize; constexpr 
auto DatasetBlockDim = DescriptorT::kDatasetBlockDim; constexpr auto PQ_BITS = DescriptorT::kPqBits; @@ -172,8 +172,8 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, auto* r = reinterpret_cast(smem_ptr); if (r != that) { - constexpr uint32_t kCount = sizeof(DescriptorT) / sizeof(LOAD_T); - using blob_type = LOAD_T[kCount]; + constexpr uint32_t kCount = sizeof(DescriptorT) / sizeof(uint32_t); + using blob_type = uint32_t[kCount]; auto& src = reinterpret_cast(*that); auto& dst = reinterpret_cast(*r); for (uint32_t i = threadIdx.x; i < kCount; i += blockDim.x) { @@ -183,7 +183,7 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, auto codebook_buf = uint32_t(__cvta_generic_to_shared(r + 1)); const auto smem_ptr_offset = reinterpret_cast(&(r->args.smem_ws_ptr)) - reinterpret_cast(r); - if (threadIdx.x == uint32_t(smem_ptr_offset / sizeof(LOAD_T))) { + if (threadIdx.x == uint32_t(smem_ptr_offset / sizeof(uint32_t))) { r->args.smem_ws_ptr = codebook_buf; } __syncthreads(); @@ -213,8 +213,25 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, auto smem_query_ptr = reinterpret_cast(reinterpret_cast(smem_ptr) + sizeof(DescriptorT) + DescriptorT::kSMemCodeBookSizeInBytes); - for (unsigned i = threadIdx.x; i < dim; i += blockDim.x) { - smem_query_ptr[i] = mapping(queries_ptr[i]); + // for (unsigned i = threadIdx.x; i < dim; i += blockDim.x) { + // smem_query_ptr[i] = mapping(queries_ptr[i]); + // } + for (unsigned i = threadIdx.x * 2; i < dim; i += blockDim.x * 2) { + half2 buf2{0, 0}; + if (i < dim) { buf2.x = mapping(queries_ptr[i]); } + if (i + 1 < dim) { buf2.y = mapping(queries_ptr[i + 1]); } + if constexpr ((PQ_BITS == 8) && (PQ_LEN % 2 == 0)) { + // Use swizzling in the condition to reduce bank conflicts in shared + // memory, which are likely to occur when pq_code_book_dim is large. 
+ constexpr uint32_t vlen = 4; // **** DO NOT CHANGE **** + // The actual stride should be as commented out below, but it seems the performance is better + // this way (with swizzling disabled for a larger range of inputs) + // constexpr auto kStride = TeamSize * vlen * PQ_LEN / 2; + constexpr auto kStride = TeamSize * vlen; + ((half2*)smem_query_ptr)[device::swizzling(i / 2)] = buf2; + } else { + (reinterpret_cast(smem_query_ptr + i))[0] = buf2; + } } return const_cast(r); @@ -282,7 +299,8 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( #pragma unroll for (std::uint32_t m = 0; m < PQ_LEN / 2; m++) { const std::uint32_t d1 = m + (PQ_LEN / 2) * v; - const std::uint32_t d = d1 + (PQ_LEN / 2) * k; + const std::uint32_t d = + device::swizzling(d1 + (PQ_LEN / 2) * k); half2 q2, c2; // Loading query vector from smem device::lds(q2, query_ptr + sizeof(half2) * d); diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp index 7bfd7006d..3956c860e 100644 --- a/cpp/src/neighbors/detail/cagra/device_common.hpp +++ b/cpp/src/neighbors/detail/cagra/device_common.hpp @@ -58,16 +58,18 @@ _RAFT_HOST_DEVICE inline uint64_t xorshift64(uint64_t u) return u * 0x2545F4914F6CDD1DULL; } -template -RAFT_DEVICE_INLINE_FUNCTION constexpr T swizzling(T x) +template +RAFT_DEVICE_INLINE_FUNCTION constexpr auto swizzling(T x) -> T { // Address swizzling reduces bank conflicts in shared memory, but increases // the amount of operation instead. 
// return x; - if constexpr (X_MAX <= 1024) { - return (x) ^ ((x) >> 5); + if constexpr (Stride <= 32) { + return x; + } else if constexpr (Dim <= 1024) { + return x ^ (x >> 5); } else { - return (x) ^ (((x) >> 5) & 0x1f); + return x ^ ((x >> 5) & 0x1f); } } @@ -247,6 +249,16 @@ RAFT_DEVICE_INLINE_FUNCTION void lds(half (&x)[4], uint32_t addr) : "r"(addr)); } +RAFT_DEVICE_INLINE_FUNCTION void lds(uint32_t& x, uint32_t addr) +{ + asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x) : "r"(addr)); +} + +RAFT_DEVICE_INLINE_FUNCTION void lds(uint32_t& x, const uint32_t* addr) +{ + lds(x, uint32_t(__cvta_generic_to_shared(addr))); +} + RAFT_DEVICE_INLINE_FUNCTION void lds(uint4& x, uint32_t addr) { asm volatile("ld.shared.v4.u32 {%0, %1, %2, %3}, [%4];" From 478a8240cd334efa2d26eb62fbc43f7127ad640a Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 20 Sep 2024 09:36:53 +0200 Subject: [PATCH 33/41] Minor improvements to multi-cta kernel --- .../cagra/search_multi_cta_kernel-inl.cuh | 34 ++++++++++--------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh index b5d8296be..dd74ba44b 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh @@ -55,11 +55,12 @@ namespace multi_cta_search { // #define _CLK_BREAKDOWN template -__device__ void pickup_next_parents(INDEX_T* const next_parent_indices, // [search_width] - const uint32_t search_width, - INDEX_T* const itopk_indices, // [num_itopk] - const size_t num_itopk, - uint32_t* const terminate_flag) +RAFT_DEVICE_INLINE_FUNCTION void pickup_next_parents( + INDEX_T* const next_parent_indices, // [search_width] + const uint32_t search_width, + INDEX_T* const itopk_indices, // [num_itopk] + const size_t num_itopk, + uint32_t* const terminate_flag) { constexpr INDEX_T index_msb_1_mask = 
utils::gen_index_msb_1_mask::value; const unsigned warp_id = threadIdx.x / 32; @@ -95,10 +96,11 @@ __device__ void pickup_next_parents(INDEX_T* const next_parent_indices, // [sea } template -__device__ inline void topk_by_bitonic_sort(float* distances, // [num_elements] - INDEX_T* indices, // [num_elements] - const uint32_t num_elements, - const uint32_t num_itopk // num_itopk <= num_elements +RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort( + float* distances, // [num_elements] + INDEX_T* indices, // [num_elements] + const uint32_t num_elements, + const uint32_t num_itopk // num_itopk <= num_elements ) { const unsigned warp_id = threadIdx.x / 32; @@ -188,21 +190,21 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel( // | | | upto 32 | // +----------------+------------------------------+---------+ // |<--- result_buffer_size --->| - uint32_t result_buffer_size = itopk_size + (search_width * graph_degree); - uint32_t result_buffer_size_32 = result_buffer_size; - if (result_buffer_size % 32) { result_buffer_size_32 += 32 - (result_buffer_size % 32); } + const auto result_buffer_size = itopk_size + (search_width * graph_degree); + const auto result_buffer_size_32 = raft::round_up_safe(result_buffer_size, 32); assert(result_buffer_size_32 <= MAX_ELEMENTS); // Set smem working buffer for the distance calculation dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id); - auto result_indices_buffer = + auto* __restrict__ result_indices_buffer = reinterpret_cast(smem + dataset_desc->smem_ws_size_in_bytes()); - auto result_distances_buffer = + auto* __restrict__ result_distances_buffer = reinterpret_cast(result_indices_buffer + result_buffer_size_32); - auto parent_indices_buffer = + auto* __restrict__ parent_indices_buffer = reinterpret_cast(result_distances_buffer + result_buffer_size_32); - auto terminate_flag = reinterpret_cast(parent_indices_buffer + search_width); + auto* __restrict__ terminate_flag = + 
reinterpret_cast(parent_indices_buffer + search_width); #if 0 /* debug */ From 5090ebb8676f7eb53d07897cc14e65f9dce995ad Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 20 Sep 2024 09:37:59 +0200 Subject: [PATCH 34/41] Transpose query buffer instead of swizzling in VPQ distance to reduce instruction count --- .../cagra/compute_distance_vpq-impl.cuh | 70 +++++++++---------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index d6b0bc96d..0da36216b 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -151,29 +151,34 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t +RAFT_DEVICE_INLINE_FUNCTION constexpr auto transpose(T x) -> T +{ + auto i = x % Block; + auto j = x / Block; + auto k = i % Stride; + auto l = i / Stride; + return j * Block + k * (Block / Stride) + l; +} + template _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, void* smem_ptr, const typename DescriptorT::DATA_T* queries_ptr, uint32_t query_id) -> const DescriptorT* { - using base_type = typename DescriptorT::base_type; - using DATA_T = typename DescriptorT::DATA_T; - using DISTANCE_T = typename DescriptorT::DISTANCE_T; - using INDEX_T = typename DescriptorT::INDEX_T; - using QUERY_T = typename DescriptorT::QUERY_T; - using CODE_BOOK_T = typename DescriptorT::CODE_BOOK_T; - using word_type = uint32_t; - constexpr auto TeamSize = DescriptorT::kTeamSize; - constexpr auto DatasetBlockDim = DescriptorT::kDatasetBlockDim; - constexpr auto PQ_BITS = DescriptorT::kPqBits; - constexpr auto PQ_LEN = DescriptorT::kPqLen; + using QUERY_T = typename DescriptorT::QUERY_T; + using CODE_BOOK_T = typename DescriptorT::CODE_BOOK_T; + using word_type = uint32_t; + constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim; + constexpr 
auto PQ_BITS = DescriptorT::kPqBits; + constexpr auto PQ_LEN = DescriptorT::kPqLen; auto* r = reinterpret_cast(smem_ptr); if (r != that) { - constexpr uint32_t kCount = sizeof(DescriptorT) / sizeof(uint32_t); - using blob_type = uint32_t[kCount]; + constexpr uint32_t kCount = sizeof(DescriptorT) / sizeof(word_type); + using blob_type = word_type[kCount]; auto& src = reinterpret_cast(*that); auto& dst = reinterpret_cast(*r); for (uint32_t i = threadIdx.x; i < kCount; i += blockDim.x) { @@ -183,7 +188,7 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, auto codebook_buf = uint32_t(__cvta_generic_to_shared(r + 1)); const auto smem_ptr_offset = reinterpret_cast(&(r->args.smem_ws_ptr)) - reinterpret_cast(r); - if (threadIdx.x == uint32_t(smem_ptr_offset / sizeof(uint32_t))) { + if (threadIdx.x == uint32_t(smem_ptr_offset / sizeof(word_type))) { r->args.smem_ws_ptr = codebook_buf; } __syncthreads(); @@ -213,22 +218,16 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, auto smem_query_ptr = reinterpret_cast(reinterpret_cast(smem_ptr) + sizeof(DescriptorT) + DescriptorT::kSMemCodeBookSizeInBytes); - // for (unsigned i = threadIdx.x; i < dim; i += blockDim.x) { - // smem_query_ptr[i] = mapping(queries_ptr[i]); - // } for (unsigned i = threadIdx.x * 2; i < dim; i += blockDim.x * 2) { half2 buf2{0, 0}; if (i < dim) { buf2.x = mapping(queries_ptr[i]); } if (i + 1 < dim) { buf2.y = mapping(queries_ptr[i + 1]); } if constexpr ((PQ_BITS == 8) && (PQ_LEN % 2 == 0)) { - // Use swizzling in the condition to reduce bank conflicts in shared - // memory, which are likely to occur when pq_code_book_dim is large. + // Transpose the queries buffer to avoid bank conflicts in compute_distance. 
constexpr uint32_t vlen = 4; // **** DO NOT CHANGE **** - // The actual stride should be as commented out below, but it seems the performance is better - // this way (with swizzling disabled for a larger range of inputs) - // constexpr auto kStride = TeamSize * vlen * PQ_LEN / 2; - constexpr auto kStride = TeamSize * vlen; - ((half2*)smem_query_ptr)[device::swizzling(i / 2)] = buf2; + constexpr auto kStride = vlen * PQ_LEN / 2; + reinterpret_cast(smem_query_ptr)[transpose(i / 2)] = + buf2; } else { (reinterpret_cast(smem_query_ptr + i))[0] = buf2; } @@ -260,18 +259,18 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( constexpr uint32_t nelem = raft::div_rounding_up_unsafe(DatasetBlockDim / PQ_LEN, TeamSize * vlen); - constexpr auto kTeamMask = DescriptorT::kTeamSize - 1; - constexpr auto kTeamStride = vlen * PQ_LEN; - constexpr auto kTeamVLen = TeamSize * vlen; + constexpr auto kTeamMask = DescriptorT::kTeamSize - 1; + constexpr auto kTeamVLen = TeamSize * vlen; - DISTANCE_T norm = 0; - for (uint32_t elem_offset = (threadIdx.x & kTeamMask) * kTeamStride; elem_offset < dim; - elem_offset += DatasetBlockDim) { + const auto laneId = threadIdx.x & kTeamMask; + DISTANCE_T norm = 0; + for (uint32_t elem_offset = 0; elem_offset * PQ_LEN < dim; + elem_offset += DatasetBlockDim / PQ_LEN) { // Loading PQ codes uint32_t pq_codes[nelem]; #pragma unroll for (std::uint32_t e = 0; e < nelem; e++) { - const std::uint32_t k = e * kTeamVLen + elem_offset / PQ_LEN; + const std::uint32_t k = e * kTeamVLen + elem_offset + laneId * vlen; if (k >= n_subspace) break; // Loading 4 x 8-bit PQ-codes using 32-bit load ops (from device memory) device::ldg_cg(pq_codes[e], reinterpret_cast(dataset_ptr + 4 + k)); @@ -281,7 +280,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( // **** Use half2 for distance computation **** #pragma unroll for (std::uint32_t e = 0; e < nelem; e++) { - const std::uint32_t k = e * kTeamVLen + elem_offset / 
PQ_LEN; + const std::uint32_t k = e * kTeamVLen + elem_offset + laneId * vlen; if (k >= n_subspace) break; // Loading VQ code-book half2 vq_vals[PQ_LEN][vlen / 2]; @@ -298,9 +297,10 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( if (PQ_LEN * (v + k) >= dim) break; #pragma unroll for (std::uint32_t m = 0; m < PQ_LEN / 2; m++) { - const std::uint32_t d1 = m + (PQ_LEN / 2) * v; + constexpr auto kQueryBlock = DatasetBlockDim / (vlen * PQ_LEN); + const std::uint32_t d1 = m + (PQ_LEN / 2) * v; const std::uint32_t d = - device::swizzling(d1 + (PQ_LEN / 2) * k); + d1 * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId; half2 q2, c2; // Loading query vector from smem device::lds(q2, query_ptr + sizeof(half2) * d); @@ -320,7 +320,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( // **** Use float for distance computation **** #pragma unroll for (std::uint32_t e = 0; e < nelem; e++) { - const std::uint32_t k = e * kTeamVLen + elem_offset / PQ_LEN; + const std::uint32_t k = e * kTeamVLen + elem_offset + laneId * vlen; if (k >= n_subspace) break; // Loading VQ code-book CODE_BOOK_T vq_vals[PQ_LEN][vlen]; From d0eb9b35f1897758b427bd1b4783f11ffc20320d Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 23 Sep 2024 13:25:34 +0200 Subject: [PATCH 35/41] VPQ distance: don't pass n_subspace as parameter, because it can be cheaply computed from dim and PQ_LEN --- .../cagra/compute_distance_vpq-impl.cuh | 30 ++++--------------- .../detail/cagra/compute_distance_vpq.hpp | 2 -- 2 files changed, 6 insertions(+), 26 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 0da36216b..310fb6100 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -60,7 +60,6 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t 
const uint8_t*& @@ -80,10 +79,6 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t uint32_t& - { - return args.extra_word2; - } RAFT_INLINE_FUNCTION static constexpr auto encoded_dataset_ptr(const args_t& args) noexcept -> const uint8_t* const& @@ -104,11 +99,6 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t const uint32_t& - { - return args.extra_word2; - } static constexpr std::uint32_t kSMemCodeBookSizeInBytes = (1 << PQ_BITS) * PQ_LEN * utils::size_of(); @@ -117,7 +107,6 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_tpq_code_book_ptr() = pq_code_book_ptr; cagra_q_dataset_descriptor_t::encoded_dataset_dim(args) = encoded_dataset_dim; - cagra_q_dataset_descriptor_t::n_subspace(args) = n_subspace; static_assert(sizeof(*this) == sizeof(base_type)); static_assert(alignof(cagra_q_dataset_descriptor_t) == alignof(base_type)); } @@ -241,8 +229,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( const uint8_t* __restrict__ dataset_ptr, const typename DescriptorT::CODE_BOOK_T* __restrict__ vq_code_book_ptr, uint32_t dim, - uint32_t pq_codebook_ptr, - uint32_t n_subspace) -> typename DescriptorT::DISTANCE_T + uint32_t pq_codebook_ptr) -> typename DescriptorT::DISTANCE_T { using DISTANCE_T = typename DescriptorT::DISTANCE_T; using LOAD_T = typename DescriptorT::LOAD_T; @@ -262,8 +249,9 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( constexpr auto kTeamMask = DescriptorT::kTeamSize - 1; constexpr auto kTeamVLen = TeamSize * vlen; - const auto laneId = threadIdx.x & kTeamMask; - DISTANCE_T norm = 0; + const auto n_subspace = raft::div_rounding_up_unsafe(dim, PQ_LEN); + const auto laneId = threadIdx.x & kTeamMask; + DISTANCE_T norm = 0; for (uint32_t elem_offset = 0; elem_offset * PQ_LEN < dim; elem_offset += DatasetBlockDim / PQ_LEN) { // Loading PQ codes @@ -369,11 +357,10 @@ _RAFT_DEVICE __noinline__ auto compute_distance_vpq( uint32_t 
vq_code; device::ldg_cg(vq_code, reinterpret_cast(dataset_ptr)); return compute_distance_vpq_worker( - dataset_ptr, + dataset_ptr /* advance dataset pointer by the size of vq_code */, DescriptorT::vq_code_book_ptr(args) + args.dim * vq_code, args.dim, - args.smem_ws_ptr, - DescriptorT::n_subspace(args)); + args.smem_ws_ptr); } template * out, const std::uint8_t* encoded_dataset_ptr, uint32_t encoded_dataset_dim, - uint32_t n_subspace, const CodebookT* vq_code_book_ptr, const CodebookT* pq_code_book_ptr, IndexT size, @@ -410,7 +396,6 @@ RAFT_KERNEL __launch_bounds__(1, 1) reinterpret_cast(&compute_distance_vpq), encoded_dataset_ptr, encoded_dataset_dim, - n_subspace, vq_code_book_ptr, pq_code_book_ptr, size, @@ -438,7 +423,6 @@ vpq_descriptor_spec::init_(const cagra::search_params& params, const std::uint8_t* encoded_dataset_ptr, uint32_t encoded_dataset_dim, - uint32_t n_subspace, const CodebookT* vq_code_book_ptr, const CodebookT* pq_code_book_ptr, IndexT size, @@ -460,7 +444,6 @@ vpq_descriptor_spec<<<1, 1, 0, stream>>>(result.dev_ptr, encoded_dataset_ptr, encoded_dataset_dim, - n_subspace, vq_code_book_ptr, pq_code_book_ptr, size, diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp index 9d5b0b6c0..378d2943e 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp @@ -63,7 +63,6 @@ struct vpq_descriptor_spec : public instance_spec { return init_(params, dataset.data.data_handle(), dataset.encoded_row_length(), - dataset.pq_dim(), dataset.vq_code_book.data_handle(), dataset.pq_code_book.data_handle(), IndexT(dataset.n_rows()), @@ -91,7 +90,6 @@ struct vpq_descriptor_spec : public instance_spec { const cagra::search_params& params, const std::uint8_t* encoded_dataset_ptr, uint32_t encoded_dataset_dim, - uint32_t n_subspace, const CodebookT* vq_code_book_ptr, const CodebookT* pq_code_book_ptr, IndexT size, From 
7bce6da23e2eb89cbdf6824837fe457c4f37ca99 Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 23 Sep 2024 14:02:13 +0200 Subject: [PATCH 36/41] Docs and readability: device_common.hpp and factory.cuh --- .../neighbors/detail/cagra/device_common.hpp | 7 +- cpp/src/neighbors/detail/cagra/factory.cuh | 72 ++++++++++++------- 2 files changed, 51 insertions(+), 28 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp index 3956c860e..b7cb9c42d 100644 --- a/cpp/src/neighbors/detail/cagra/device_common.hpp +++ b/cpp/src/neighbors/detail/cagra/device_common.hpp @@ -204,9 +204,10 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes( const bool valid_i = i < num_k; const auto child_id = valid_i ? result_child_indices_ptr[i] : invalid_index; - // This is the `dataset_desc.compute_distance` manually inlined to move the fetching of - // dataset_desc from smem out of the loop. - // const auto norm2 = dataset_desc.compute_distance(child_id, child_id != invalid_index); + // We should be calling `dataset_desc.compute_distance(..)` here as follows: + // > const auto child_dist = dataset_desc.compute_distance(child_id, child_id != invalid_index); + // Instead, we manually inline this function for performance reasons. + // This allows us to move the fetching of the arguments from shared memory out of the loop. const DistanceT child_dist = device::team_sum( (child_id != invalid_index) ? compute_distance(args, child_id) : (lead_lane ? raft::upper_bound() : 0), diff --git a/cpp/src/neighbors/detail/cagra/factory.cuh b/cpp/src/neighbors/detail/cagra/factory.cuh index 05118472b..1c99f72f7 100644 --- a/cpp/src/neighbors/detail/cagra/factory.cuh +++ b/cpp/src/neighbors/detail/cagra/factory.cuh @@ -69,11 +69,22 @@ class factory { } }; -struct dataset_descriptor_key { +/* +Caching of dataset/distance descriptor initialization + (see `dataset_descriptor_init_with_cache` below). 
+ */ +namespace descriptor_cache { + +/** + * The key for caching consists of a minimal set of fields that uniquely define the descriptor. + * The key field names are the same as of the descriptor and the contents are not relevant for + * caching. + */ +struct key { uint64_t data_ptr; uint64_t n_rows; uint32_t dim; - uint32_t extra_val; + uint32_t extra_val; // this one has different meanings for different descriptor types uint32_t team_size; uint32_t metric; }; @@ -82,39 +93,38 @@ template auto make_key(const cagra::search_params& params, const DatasetT& dataset, cuvs::distance::DistanceType metric) - -> std::enable_if_t, dataset_descriptor_key> + -> std::enable_if_t, key> { - return dataset_descriptor_key{reinterpret_cast(dataset.view().data_handle()), - uint64_t(dataset.n_rows()), - dataset.dim(), - dataset.stride(), - uint32_t(params.team_size), - uint32_t(metric)}; + return key{reinterpret_cast(dataset.view().data_handle()), + uint64_t(dataset.n_rows()), + dataset.dim(), + dataset.stride(), + uint32_t(params.team_size), + uint32_t(metric)}; } template auto make_key(const cagra::search_params& params, const DatasetT& dataset, cuvs::distance::DistanceType metric) - -> std::enable_if_t, dataset_descriptor_key> + -> std::enable_if_t, key> { - return dataset_descriptor_key{ - reinterpret_cast(dataset.data.data_handle()), - uint64_t(dataset.n_rows()), - dataset.dim(), - uint32_t(reinterpret_cast(dataset.pq_code_book.data_handle()) >> 6), - uint32_t(params.team_size), - uint32_t(metric)}; + return key{reinterpret_cast(dataset.data.data_handle()), + uint64_t(dataset.n_rows()), + dataset.dim(), + uint32_t(reinterpret_cast(dataset.pq_code_book.data_handle()) >> 6), + uint32_t(params.team_size), + uint32_t(metric)}; } -inline auto operator==(const dataset_descriptor_key& a, const dataset_descriptor_key& b) -> bool +inline auto operator==(const key& a, const key& b) -> bool { return a.data_ptr == b.data_ptr && a.n_rows == b.n_rows && a.dim == b.dim && a.extra_val == 
b.extra_val && a.team_size == b.team_size && a.metric == b.metric; } -struct dataset_descriptor_key_hash { - inline auto operator()(const dataset_descriptor_key& x) const noexcept -> std::size_t +struct key_hash { + inline auto operator()(const key& x) const noexcept -> std::size_t { return size_t{x.data_ptr} + size_t{x.n_rows} * size_t{x.dim} * size_t{x.extra_val} + (size_t{x.team_size} ^ size_t{x.metric}); @@ -122,16 +132,28 @@ struct dataset_descriptor_key_hash { }; template -struct dataset_descriptor_cache { +struct store { /** Number of descriptors to cache. */ static constexpr size_t kDefaultSize = 100; - raft::cache::lru, std::shared_ptr>> value{kDefaultSize}; }; +} // namespace descriptor_cache + +/** + * Call `dataset_descriptor_init` with memoization. + * (NB: `dataset_descriptor_init` is a function in a generated header file + * `neighbors/detail/cagra/compute_distance-ext.cuh`). + * + * `dataset_descriptor_init` involves calling a CUDA kernel to resolve device symbols before the + * main search kernel runs. This adds an extra unwanted latency. + * Caching the descriptor helps to hide this latency for repeated searches.
+ * + */ template auto dataset_descriptor_init_with_cache(const raft::resources& res, const cagra::search_params& params, @@ -140,9 +162,9 @@ auto dataset_descriptor_init_with_cache(const raft::resources& res, -> const dataset_descriptor_host& { using desc_t = dataset_descriptor_host; - auto key = make_key(params, dataset, metric); + auto key = descriptor_cache::make_key(params, dataset, metric); auto& cache = - raft::resource::get_custom_resource>(res) + raft::resource::get_custom_resource>(res) ->value; std::shared_ptr desc{nullptr}; if (!cache.get(key, &desc)) { From 5154892019ae6bee466c22cc7fbcd879a272a580 Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 23 Sep 2024 14:14:47 +0200 Subject: [PATCH 37/41] Remove unused distance instances (with uint64_t index type) --- cpp/CMakeLists.txt | 24 -- .../detail/cagra/compute_distance-ext.cuh | 232 +----------------- .../detail/cagra/compute_distance.cu | 26 +- .../cagra/compute_distance_00_generate.py | 4 +- ...ard_InnerProduct_float_uint64_dim128_t8.cu | 38 --- ...rd_InnerProduct_float_uint64_dim256_t16.cu | 38 --- ...rd_InnerProduct_float_uint64_dim512_t32.cu | 38 --- ...dard_InnerProduct_half_uint64_dim128_t8.cu | 33 --- ...ard_InnerProduct_half_uint64_dim256_t16.cu | 38 --- ...ard_InnerProduct_half_uint64_dim512_t32.cu | 38 --- ...ndard_L2Expanded_float_uint64_dim128_t8.cu | 33 --- ...dard_L2Expanded_float_uint64_dim256_t16.cu | 33 --- ...dard_L2Expanded_float_uint64_dim512_t32.cu | 33 --- ...andard_L2Expanded_half_uint64_dim128_t8.cu | 33 --- ...ndard_L2Expanded_half_uint64_dim256_t16.cu | 33 --- ...ndard_L2Expanded_half_uint64_dim512_t32.cu | 33 --- ...d_float_uint64_dim128_t8_8pq_2subd_half.cu | 41 ---- ...d_float_uint64_dim128_t8_8pq_4subd_half.cu | 41 ---- ..._float_uint64_dim256_t16_8pq_2subd_half.cu | 41 ---- ..._float_uint64_dim256_t16_8pq_4subd_half.cu | 41 ---- ..._float_uint64_dim512_t32_8pq_2subd_half.cu | 41 ---- ..._float_uint64_dim512_t32_8pq_4subd_half.cu | 41 ---- 
...ed_half_uint64_dim128_t8_8pq_2subd_half.cu | 41 ---- ...ed_half_uint64_dim128_t8_8pq_4subd_half.cu | 41 ---- ...d_half_uint64_dim256_t16_8pq_2subd_half.cu | 41 ---- ...d_half_uint64_dim256_t16_8pq_4subd_half.cu | 41 ---- ...d_half_uint64_dim512_t32_8pq_2subd_half.cu | 41 ---- ...d_half_uint64_dim512_t32_8pq_4subd_half.cu | 41 ---- 28 files changed, 5 insertions(+), 1194 deletions(-) delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim128_t8.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim256_t16.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim512_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim128_t8.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim256_t16.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim512_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim128_t8.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim256_t16.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim512_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim128_t8.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim256_t16.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim512_t32.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim128_t8_8pq_2subd_half.cu delete mode 100644 
cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim128_t8_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim256_t16_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim256_t16_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim512_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim512_t32_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim128_t8_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim128_t8_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim256_t16_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim256_t16_8pq_4subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim512_t32_8pq_2subd_half.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim512_t32_8pq_4subd_half.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2d9a58c3d..d1ba73adf 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -208,15 +208,9 @@ add_library( src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim128_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim256_t16.cu - 
src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim512_t32.cu src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim128_t8.cu src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim512_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim128_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim256_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim512_t32.cu src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim128_t8.cu src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim256_t16.cu src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim512_t32.cu @@ -226,15 +220,9 @@ add_library( src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim128_t8.cu src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim128_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim256_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim512_t32.cu src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim128_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim256_t16.cu - 
src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim512_t32.cu src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim512_t32.cu @@ -247,24 +235,12 @@ add_library( src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim512_t32_8pq_4subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu - 
src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim512_t32_8pq_4subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu diff --git a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh index 86de55db6..8407ef055 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh @@ -392,186 +392,6 @@ extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct 
vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct vpq_descriptor_spec; -extern template struct vpq_descriptor_spec; extern template struct instance_selector< standard_descriptor_spec, @@ -621,31 +441,7 @@ extern template struct instance_selector< standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec>; + vpq_descriptor_spec>; using descriptor_instances = instance_selector< standard_descriptor_spec, @@ -695,31 +491,7 @@ using descriptor_instances = instance_selector< standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - 
vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec>; + vpq_descriptor_spec>; template auto dataset_descriptor_init(const cagra::search_params& params, diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.cu b/cpp/src/neighbors/detail/cagra/compute_distance.cu index 387b4c71b..45316e59b 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance.cu @@ -77,30 +77,6 @@ template struct instance_selector< standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec>; + vpq_descriptor_spec>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py index 1b0743901..52a15e2a1 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py +++ b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py @@ -66,8 +66,8 @@ half_uint32=("half", "uint32_t", "float"), int8_uint32=("int8_t", "uint32_t", "float"), uint8_uint32=("uint8_t", "uint32_t", "float"), - float_uint64=("float", "uint64_t", "float"), - half_uint64=("half", "uint64_t", "float"), + # float_uint64=("float", "uint64_t", "float"), + # half_uint64=("half", "uint64_t", "float"), ) metric_prefix = 
'DistanceType::' diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim128_t8.cu deleted file mode 100644 index df761ab25..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim128_t8.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct standard_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim256_t16.cu deleted file mode 100644 index b7e5eb2e9..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim256_t16.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct standard_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim512_t32.cu deleted file mode 100644 index fd2a6b276..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint64_dim512_t32.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct standard_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim128_t8.cu deleted file mode 100644 index 291d16ddd..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim128_t8.cu +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct standard_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim256_t16.cu deleted file mode 100644 index 32a18ff3e..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim256_t16.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct standard_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim512_t32.cu deleted file mode 100644 index 27dba0ebb..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint64_dim512_t32.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct standard_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim128_t8.cu deleted file mode 100644 index 7a8c4059c..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim128_t8.cu +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct standard_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim256_t16.cu deleted file mode 100644 index fcc65a48e..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim256_t16.cu +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct standard_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim512_t32.cu deleted file mode 100644 index 833dac9c4..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint64_dim512_t32.cu +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct standard_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim128_t8.cu deleted file mode 100644 index b3a466f46..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim128_t8.cu +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct standard_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim256_t16.cu deleted file mode 100644 index a11701e5a..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim256_t16.cu +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct standard_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim512_t32.cu deleted file mode 100644 index 9ed0a32ee..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint64_dim512_t32.cu +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_standard-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct standard_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index 9a8b945d2..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index da7d99af3..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index cc3a0344c..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index bd24b48f3..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index cc3f1fb59..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index 0559b86e1..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint64_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim128_t8_8pq_2subd_half.cu deleted file mode 100644 index b962f6706..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim128_t8_8pq_2subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim128_t8_8pq_4subd_half.cu deleted file mode 100644 index 5d1f85316..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim128_t8_8pq_4subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim256_t16_8pq_2subd_half.cu deleted file mode 100644 index eb31e7b06..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim256_t16_8pq_2subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim256_t16_8pq_4subd_half.cu deleted file mode 100644 index e3bd42313..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim256_t16_8pq_4subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim512_t32_8pq_2subd_half.cu deleted file mode 100644 index 7387d141b..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim512_t32_8pq_2subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim512_t32_8pq_4subd_half.cu deleted file mode 100644 index fd26fe89b..000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint64_dim512_t32_8pq_4subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail From a0c54e38a484ed248075576bd4a9bb421dfb6764 Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 23 Sep 2024 16:06:55 +0200 Subject: [PATCH 38/41] compute_distance.hpp: document and slightly simplify the dataset descriptor types --- .../detail/cagra/compute_distance.hpp | 109 ++++++++++++------ .../cagra/compute_distance_standard-impl.cuh | 4 +- .../cagra/compute_distance_vpq-impl.cuh | 4 +- .../detail/cagra/search_multi_cta.cuh | 2 +- .../detail/cagra/search_multi_kernel.cuh | 4 +- .../detail/cagra/search_single_cta.cuh | 2 +- 6 files changed, 81 insertions(+), 44 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index 81f002c61..e72ce527c 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -31,10 +31,41 @@ #include #include +#include +#include #include namespace cuvs::neighbors::cagra::detail { +/** + * @brief Dataset and distance description. + * + * This is the base type for the dataset/distance descriptors. + * The actual implementations are hidden in `compute_distance_***-impl.cuh` files, which should be + * included only in `compute_distance_***.cu` files to enforce separable compilation. + * + * [Note: manual dispatch] + * The descriptor type hierarchy declared here resembles the usual C++ inheritance: the search + * kernels take a pointer to the base type as an argument, but the actual implementation types are + * passed by the host. 
The kernels only ever need two functions `setup_workspace` and + * `compute_distance`; the choice of the implementation happens at runtime. + * + * However, for performance reasons, we don't use the C++ virtual dispatch mechanics here. + * The extra pointer-chasing and register usage overheads associated with virtual tables turn out to + * cause a significant slowdown in the performance-critical `compute_distance`. + * Instead, we manually dispatch the two polymorphic functions and store them as fields in the + * descriptor structure. + * + * [Note: initialization/dispatch] + * The host doesn't know the addresses of the device symbols. That means we either need to resolve + * the device functions and store them in the descriptor directly on the device, or use + * `cudaMemcpyFromSymbolAsync` to fetch them (note, classes have the same problem: if an object + * is created on the host, its pointer to the vtable would be invalid on device). + * We take the first approach: there's an `***_init_kernel` for each descriptor instance that is + * called before the search kernel; all it does is call a (placement) new with an appropriate type + * and arguments in a single GPU thread. + * + */ template struct alignas(device::LOAD_128BIT_T) dataset_descriptor_base_t { using base_type = dataset_descriptor_base_t; @@ -43,6 +74,20 @@ struct alignas(device::LOAD_128BIT_T) dataset_descriptor_base_t { using INDEX_T = IndexT; using DISTANCE_T = DistanceT; + /** + * @brief "polymorphic" `compute_distance` arguments. + * + * This is a tightly-packed POD struct of `compute_distance` arguments. + * **Important** this structure is passed by value to `compute_distance`; it's important it + * remains small. + * + * [Note: arguments layout] + * The descriptor implementations require different sets of arguments (with a couple of arguments + * overlapping). At the same time the `compute_distance` is defined such that it accepts the + * `args_t` by value. 
That means the layout of the struct must be identical for all descriptor + * implementations. We work around this requirement by defining generic fields in this struct and + * assigning the meaning to them on the implementation side. + */ struct alignas(LOAD_T) args_t { void* extra_ptr1; void* extra_ptr2; @@ -53,6 +98,13 @@ struct alignas(device::LOAD_128BIT_T) dataset_descriptor_base_t { uint32_t extra_word1; uint32_t extra_word2; + /** + * Load this struct from shared memory. + * + * NB: until `compute_distance` is called, the arguments struct is stored in the shared memory + * as a member of the descriptor struct. This helper function saves a few instructions by + * forcing the compiler to assume it is indeed in the shared memory address space. + */ RAFT_DEVICE_INLINE_FUNCTION auto load() const -> args_t { constexpr int kCount = sizeof(*this) / sizeof(LOAD_T); @@ -68,6 +120,7 @@ struct alignas(device::LOAD_128BIT_T) dataset_descriptor_base_t { } }; + /** Shared memory usage and team_size packed into a single uint32_t to save on memory requests. */ struct smem_and_team_size_t { uint32_t value; RAFT_INLINE_FUNCTION constexpr smem_and_team_size_t(uint32_t smem_size_bytes, @@ -103,6 +156,7 @@ struct alignas(device::LOAD_128BIT_T) dataset_descriptor_base_t { /** Compute the distance from the query vector (stored in the smem_workspace) and a dataset vector * given by the dataset_index. */ compute_distance_type* compute_distance_impl; + /** A placeholder for an implementation-specific pointer. */ void* extra_ptr3; smem_and_team_size_t smem_and_team_size; @@ -161,55 +215,38 @@ struct alignas(device::LOAD_128BIT_T) dataset_descriptor_base_t { } }; +/** + * @brief Hosting a device descriptor. + * + * The dataset descriptor is initialized on the device side and stays there. + * The host struct manages the lifetime of the associated device pointer and a couple parameters + * affecting the search kernel launch config. 
+ * + */ template struct dataset_descriptor_host { using dev_descriptor_t = dataset_descriptor_base_t; - dev_descriptor_t* dev_ptr = nullptr; uint32_t smem_ws_size_in_bytes = 0; uint32_t team_size = 0; - uint32_t dataset_block_dim = 0; template - dataset_descriptor_host(const DescriptorImpl& dd_host, - rmm::cuda_stream_view stream, - uint32_t dataset_block_dim) - : stream_{stream}, + dataset_descriptor_host(const DescriptorImpl& dd_host, rmm::cuda_stream_view stream) + : dev_ptr_{[stream]() { + dev_descriptor_t* p; + RAFT_CUDA_TRY(cudaMallocAsync(&p, sizeof(DescriptorImpl), stream)); + return p; + }(), + [stream](dev_descriptor_t* p) { RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(p, stream)); }}, smem_ws_size_in_bytes{dd_host.smem_ws_size_in_bytes()}, - team_size{dd_host.team_size()}, - dataset_block_dim{dataset_block_dim} - { - RAFT_CUDA_TRY(cudaMallocAsync(&dev_ptr, sizeof(DescriptorImpl), stream_)); - } - - ~dataset_descriptor_host() noexcept + team_size{dd_host.team_size()} { - if (dev_ptr == nullptr) { return; } - RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(dev_ptr, stream_)); } - dataset_descriptor_host(dataset_descriptor_host&& other) - { - std::swap(this->dev_ptr, other.dev_ptr); - std::swap(this->smem_ws_size_in_bytes, other.smem_ws_size_in_bytes); - std::swap(this->stream_, other.stream_); - std::swap(this->team_size, other.team_size); - std::swap(this->dataset_block_dim, other.dataset_block_dim); - } - dataset_descriptor_host& operator=(dataset_descriptor_host&& b) - { - auto& a = *this; - std::swap(a.dev_ptr, b.dev_ptr); - std::swap(a.smem_ws_size_in_bytes, b.smem_ws_size_in_bytes); - std::swap(a.stream_, b.stream_); - std::swap(a.team_size, b.team_size); - std::swap(a.dataset_block_dim, b.dataset_block_dim); - return a; - } - dataset_descriptor_host(const dataset_descriptor_host&) = delete; - dataset_descriptor_host& operator=(const dataset_descriptor_host&) = delete; + [[nodiscard]] auto dev_ptr() const -> const dev_descriptor_t* { return dev_ptr_.get(); } + 
[[nodiscard]] auto dev_ptr() -> dev_descriptor_t* { return dev_ptr_.get(); } private: - rmm::cuda_stream_view stream_; + std::unique_ptr> dev_ptr_; }; template diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh index 50ea88a3e..628bb2db8 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh @@ -280,7 +280,7 @@ standard_descriptor_spec; using base_type = typename desc_type::base_type; desc_type dd_host{nullptr, nullptr, ptr, size, dim, ld}; - host_type result{dd_host, stream, DatasetBlockDim}; + host_type result{dd_host, stream}; standard_dataset_descriptor_init_kernel - <<<1, 1, 0, stream>>>(result.dev_ptr, ptr, size, dim, desc_type::ld(dd_host.args)); + <<<1, 1, 0, stream>>>(result.dev_ptr(), ptr, size, dim, desc_type::ld(dd_host.args)); RAFT_CUDA_TRY(cudaPeekAtLastError()); return result; } diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 310fb6100..7adf5baf0 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -448,7 +448,7 @@ vpq_descriptor_spec<<<1, 1, 0, stream>>>(result.dev_ptr, + DistanceT><<<1, 1, 0, stream>>>(result.dev_ptr(), encoded_dataset_ptr, encoded_dataset_dim, vq_code_book_ptr, diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh index c0234407b..9bcccd9f9 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh @@ -209,7 +209,7 @@ struct search : public search_plan_impl& data num_queries); random_pickup_kernel<<>>( - dataset_desc.dev_ptr, + dataset_desc.dev_ptr(), queries_ptr, num_pickup, num_distilation, @@ -410,7 +410,7 @@ void 
compute_distance_to_child_nodes( parent_distance_ptr, lds, search_width, - dataset_desc.dev_ptr, + dataset_desc.dev_ptr(), neighbor_graph_ptr, graph_degree, query_ptr, diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh index aefadf643..4abed6760 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh @@ -218,7 +218,7 @@ struct search : search_plan_impl { SAMPLE_FILTER_T sample_filter) { cudaStream_t stream = raft::resource::get_cuda_stream(res); - select_and_run(dataset_desc.dev_ptr, + select_and_run(dataset_desc.dev_ptr(), graph, result_indices_ptr, result_distances_ptr, From 9ba3e3f7fb218776131beae9e4ea249160bb2adc Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 23 Sep 2024 17:36:41 +0200 Subject: [PATCH 39/41] Document the dataset/distance descriptor selection logic --- .../detail/cagra/compute_distance.hpp | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index e72ce527c..4bed275ab 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -249,6 +249,13 @@ struct dataset_descriptor_host { std::unique_ptr> dev_ptr_; }; +/** + * @brief The signature for descriptor initialization. + * + * There is an init function associated with every descriptor implementation. It's responsible for + * initializing the device-side descriptor instance (calling the init kernel). + * + */ template using init_desc_type = dataset_descriptor_host (*)(const cagra::search_params&, @@ -256,6 +263,22 @@ using init_desc_type = cuvs::distance::DistanceType, rmm::cuda_stream_view); +/** + * @brief Descriptor instance specification. 
+ * + * This type provides a decentralized way of selecting the descriptor instance best suited to the + * given dataset and distance metric. + * There is a spec for every descriptor (described in the interface files + * `compute_distance_***.hpp`). + * + * The `instance_spec` implementation must have the following static member template functions: + * * constexpr bool accepts_dataset() + * - tells whether the spec is compatible with the dataset type, executed at compile time. + * * double priority(..) + * - tells how to select a single spec out of possibly several compatible specs + * * init_desc_type init + * - (see `init_desc_type` above) the function to initialize the descriptor. + */ template struct instance_spec { using data_type = DataT; @@ -270,6 +293,9 @@ struct instance_spec { } }; +/** Whether the descriptor is compatible with the dataset and arguments at the type level + * (compile-time check). + */ template && InstanceSpec::template accepts_dataset(); +/** + * @brief Get the init function and the priority of the descriptor given by the InstanceSpec.
+ * + * @return (init function, priority) + */ template struct instance_selector { template From f77c1b0a92b95eb1e178a571ff8715da2b188071 Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 23 Sep 2024 17:51:39 +0200 Subject: [PATCH 40/41] Remove commented-out code sections --- .../cagra/compute_distance_standard-impl.cuh | 17 ----------------- .../detail/cagra/compute_distance_vpq-impl.cuh | 5 ----- 2 files changed, 22 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh index 628bb2db8..b0205508a 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh @@ -63,23 +63,6 @@ struct standard_dataset_descriptor_t : public dataset_descriptor_base_t const DATA_T*& - // { - // return (const DATA_T*&)(extra_ptr1); - // } - - // RAFT_INLINE_FUNCTION constexpr auto ptr() const noexcept -> const DATA_T* const& - // { - // return (const DATA_T* const&)(extra_ptr1); - // } - - // RAFT_INLINE_FUNCTION constexpr auto ld() noexcept -> uint32_t& { return extra_word1; } - // RAFT_INLINE_FUNCTION constexpr auto ld() const noexcept -> const uint32_t& { return - // extra_word1; } - static constexpr RAFT_INLINE_FUNCTION auto ptr(const args_t& args) noexcept -> const DATA_T* const& { diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 7adf5baf0..86c592502 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -56,11 +56,6 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t, "Only CODE_BOOK_T = `half` is supported now"); - // alignas(LOAD_T) const std::uint8_t* encoded_dataset_ptr; - // const CODE_BOOK_T* vq_code_book_ptr; - // const CODE_BOOK_T* pq_code_book_ptr; - // std::uint32_t 
encoded_dataset_dim; - RAFT_INLINE_FUNCTION static constexpr auto encoded_dataset_ptr(args_t& args) noexcept -> const uint8_t*& { From f1426cf576799b3e2d27d14dd0848eaf0f982036 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 25 Sep 2024 14:18:38 +0200 Subject: [PATCH 41/41] Remove empty comment --- cpp/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d1ba73adf..d8d554648 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -531,7 +531,6 @@ target_compile_options( "$<$:${CUVS_CUDA_FLAGS}>" ) # ensure CUDA symbols aren't relocated to the middle of the debug build binaries -# target_link_options(cuvs PRIVATE $) # ##################################################################################################