diff --git a/jni/CMakeLists.txt b/jni/CMakeLists.txt
index 29a844ee07..6fced2da3d 100644
--- a/jni/CMakeLists.txt
+++ b/jni/CMakeLists.txt
@@ -136,10 +136,16 @@ if (${CONFIG_FAISS} STREQUAL ON OR ${CONFIG_ALL} STREQUAL ON OR ${CONFIG_TEST} S
     # Check if faiss exists
     find_path(FAISS_REPO_DIR NAMES faiss PATHS ${CMAKE_CURRENT_SOURCE_DIR}/external/faiss)
 
-    # If not, pull the updated submodule
+    # If not, pull the updated submodule and apply patches
     if (NOT EXISTS ${FAISS_REPO_DIR})
         message(STATUS "Could not find faiss. Pulling updated submodule.")
         execute_process(COMMAND git submodule update --init -- external/faiss WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+        message(STATUS "Applying custom patches.")
+        execute_process(COMMAND git apply --directory=external/faiss patches/faiss/multi-vector-support.patch WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} RESULT_VARIABLE FAISS_PATCH_RESULT ERROR_VARIABLE FAISS_PATCH_ERROR)
+        # execute_process does not stop on a non-zero exit status; fail the configure step here instead of building an unpatched faiss
+        if (NOT FAISS_PATCH_RESULT EQUAL 0)
+            message(FATAL_ERROR "Failed to apply patches/faiss/multi-vector-support.patch: ${FAISS_PATCH_ERROR}")
+        endif ()
     endif ()
 
     set(FAISS_ENABLE_GPU OFF)
diff --git a/jni/patches/faiss/multi-vector-support.patch b/jni/patches/faiss/multi-vector-support.patch
new file mode 100644
index 0000000000..9bac678c8c
--- /dev/null
+++ b/jni/patches/faiss/multi-vector-support.patch
@@ -0,0 +1,265 @@
+commit a97473e0c816fa1213ab97ac340d932bfd265b9b
+Author: Heemin Kim
+Date:   Wed Dec 6 16:33:52 2023 -0800
+
+    Introduce result collector for HNSW
+
+diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt
+index 27701586..af682a05 100644
+--- a/faiss/CMakeLists.txt
++++ b/faiss/CMakeLists.txt
+@@ -162,6 +162,8 @@ set(FAISS_HEADERS
+   impl/ProductQuantizer.h
+   impl/Quantizer.h
+   impl/ResidualQuantizer.h
++  impl/ResultCollector.h
++  impl/ResultCollectorFactory.h
+   impl/ResultHandler.h
+   impl/ScalarQuantizer.h
+   impl/ThreadedIndex-inl.h
+diff --git a/faiss/Index.h b/faiss/Index.h
+index 4b4b302b..13eab0c0 100644
+--- a/faiss/Index.h
++++ b/faiss/Index.h
+@@ -38,11 +38,12 @@
+ 
+ namespace faiss {
+ 
+-/// Forward declarations see impl/AuxIndexStructures.h, impl/IDSelector.h and
+-/// impl/DistanceComputer.h
++/// Forward declarations see
impl/AuxIndexStructures.h, impl/IDSelector.h, ++/// impl/DistanceComputer.h, and impl/ResultCollectorFactory.h + struct IDSelector; + struct RangeSearchResult; + struct DistanceComputer; ++struct ResultCollectorFactory; + + /** Parent class for the optional search paramenters. + * +@@ -52,6 +53,7 @@ struct DistanceComputer; + struct SearchParameters { + /// if non-null, only these IDs will be considered during search. + IDSelector* sel = nullptr; ++ ResultCollectorFactory* col = nullptr; // if non-null, factory that supplies the ResultCollector used during search + /// make sure we can dynamic_cast this + virtual ~SearchParameters() {} + }; +diff --git a/faiss/IndexIDMap.cpp b/faiss/IndexIDMap.cpp +index 7972bec9..e387b4c2 100644 +--- a/faiss/IndexIDMap.cpp ++++ b/faiss/IndexIDMap.cpp +@@ -102,6 +102,20 @@ struct ScopedSelChange { + } + }; + ++/// RAII object that sets params->col->id_map via set() and resets it to nullptr on destruction; set() requires params->col to be non-null ++struct ScopedColChange { ++ SearchParameters* params = nullptr; ++ void set(SearchParameters* params, const std::vector* id_map) { ++ this->params = params; ++ params->col->id_map = id_map; ++ } ++ ~ScopedColChange() { ++ if (params) { ++ params->col->id_map = nullptr; ++ } ++ } ++}; ++ + } // namespace + + template +@@ -114,6 +128,7 @@ void IndexIDMapTemplate::search( + const SearchParameters* params) const { + IDSelectorTranslated this_idtrans(this->id_map, nullptr); + ScopedSelChange sel_change; ++ ScopedColChange col_change; + + if (params && params->sel) { + auto idtrans = dynamic_cast(params->sel); +@@ -131,6 +146,10 @@ void IndexIDMapTemplate::search( + sel_change.set(params_non_const, &this_idtrans); + } + } ++ if (params && params->col) { ++ auto params_non_const = const_cast(params); ++ col_change.set(params_non_const, &this->id_map); ++ } + index->search(n, x, k, distances, labels, params); + idx_t* li = labels; + #pragma omp parallel for +diff --git a/faiss/IndexIDMap.h b/faiss/IndexIDMap.h +index 2d164123..c6a1be73 100644 +--- a/faiss/IndexIDMap.h ++++ b/faiss/IndexIDMap.h +@@ -10,6 +10,7 @@ + #include + #include + 
#include ++#include + + #include + #include +diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp +index 9fc201ea..540210a6 100644 +--- a/faiss/impl/HNSW.cpp ++++ b/faiss/impl/HNSW.cpp +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -530,6 +531,15 @@ int search_from_candidates( + int level, + int nres_in = 0, + const SearchParametersHNSW* params = nullptr) { ++ ResultCollectorFactory defaultFactory; ++ ResultCollectorFactory* collectorFactory; ++ if (params == nullptr || params->col == nullptr) { ++ collectorFactory = &defaultFactory; ++ } else { ++ collectorFactory = params->col; ++ } ++ ResultCollector* collector = collectorFactory->newCollector(); ++ + int nres = nres_in; + int ndis = 0; + +@@ -544,11 +554,7 @@ int search_from_candidates( + float d = candidates.dis[i]; + FAISS_ASSERT(v1 >= 0); + if (!sel || sel->is_member(v1)) { +- if (nres < k) { +- faiss::maxheap_push(++nres, D, I, d, v1); +- } else if (d < D[0]) { +- faiss::maxheap_replace_top(nres, D, I, d, v1); +- } ++ collector->collect(k, nres, D, I, d, v1); + } + vt.set(v1); + } +@@ -612,11 +618,7 @@ int search_from_candidates( + + auto add_to_heap = [&](const size_t idx, const float dis) { + if (!sel || sel->is_member(idx)) { +- if (nres < k) { +- faiss::maxheap_push(++nres, D, I, dis, idx); +- } else if (dis < D[0]) { +- faiss::maxheap_replace_top(nres, D, I, dis, idx); +- } ++ collector->collect(k, nres, D, I, dis, idx); + } + candidates.push(idx, dis); + }; +@@ -660,6 +662,9 @@ int search_from_candidates( + } + } + ++ collector->finalize(nres, I); ++ collectorFactory->deleteCollector(collector); ++ + if (level == 0) { + stats.n1++; + if (candidates.size() == 0) { +diff --git a/faiss/impl/ResultCollector.h b/faiss/impl/ResultCollector.h +new file mode 100644 +index 00000000..3e4dac34 +--- /dev/null ++++ b/faiss/impl/ResultCollector.h +@@ -0,0 +1,58 @@ ++/** ++ * Copyright (c) Facebook, Inc. and its affiliates. 
++ * ++ * This source code is licensed under the MIT license found in the ++ * LICENSE file in the root directory of this source tree. ++ */ ++ ++#pragma once ++ ++#include ++#include ++ ++#include ++#include ++ ++/** ResultCollector defines how search results are collected */ ++ ++namespace faiss { ++ ++/** Interface that is handed each search result and decides how to store it. */ ++struct ResultCollector { ++ // Called for each result: (val, ids) is the result to store; DefaultCollector keeps at most k entries in the max-heap (bh_val, bh_ids) and updates nres ++ virtual void collect( ++ int k, ++ int& nres, ++ float* bh_val, ++ idx_t* bh_ids, ++ float val, ++ idx_t ids) = 0; ++ ++ // Called once after all results have been collected ++ virtual void finalize(idx_t nres, idx_t* bh_ids) = 0; ++ virtual ~ResultCollector() {} ++}; ++ ++struct DefaultCollector : ResultCollector { ++ void collect( ++ int k, ++ int& nres, ++ float* bh_val, ++ idx_t* bh_ids, ++ float val, ++ idx_t ids) override { ++ if (nres < k) { ++ faiss::maxheap_push(++nres, bh_val, bh_ids, val, ids); ++ } else if (val < bh_val[0]) { ++ faiss::maxheap_replace_top(nres, bh_val, bh_ids, val, ids); ++ } ++ } ++ ++ void finalize(idx_t nres, idx_t* bh_ids) override { ++ // Do nothing ++ } ++ ++ ~DefaultCollector() override {} ++}; ++ ++} // namespace faiss +diff --git a/faiss/impl/ResultCollectorFactory.h b/faiss/impl/ResultCollectorFactory.h +new file mode 100644 +index 00000000..4d903f8d +--- /dev/null ++++ b/faiss/impl/ResultCollectorFactory.h +@@ -0,0 +1,29 @@ ++/** ++ * Copyright (c) Facebook, Inc. and its affiliates. ++ * ++ * This source code is licensed under the MIT license found in the ++ * LICENSE file in the root directory of this source tree. 
++ */
++
++#pragma once
++#include <faiss/impl/ResultCollector.h>
++namespace faiss {
++
++/** ResultCollectorFactory hands out the ResultCollector that a search uses */
++struct ResultCollectorFactory {
++    DefaultCollector default_collector;
++    const std::vector<idx_t>* id_map = nullptr; // null unless set; IndexIDMap sets it for the duration of a search
++
++    // Returns the collector to use; this default hands out the shared default_collector member
++    virtual ResultCollector* newCollector() {
++        return &default_collector;
++    }
++
++    virtual void deleteCollector(ResultCollector* collector) {
++        // Do nothing: newCollector() returned a member, not a new allocation
++    }
++    // Virtual so that derived factories are destroyed correctly through this type
++    virtual ~ResultCollectorFactory() {}
++};
++
++} // namespace faiss