Skip to content

Commit 2142dc6

Browse files
dyashuni and yurymalkov authored
Stop condition (#490)
* Add stop condition, multivector search, epsilon search * Fix include * Update readme * Update multivector tests * One header file * Add bare_bone_search flag * Fix epsilon search * Refactoring * Address comments * Fix assert * Add ef to multivector search, return vector, refactoring * Refactoring * Address comments * Add bare bone search comment Co-authored-by: Yury Malkov <[email protected]> * Remove has_deletions flag --------- Co-authored-by: Yury Malkov <[email protected]>
1 parent d44bd5d commit 2142dc6

12 files changed

+824
-15
lines changed

Diff for: .github/workflows/build.yml

+4
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,14 @@ jobs:
6767
./example_mt_search
6868
./example_mt_filter
6969
./example_mt_replace_deleted
70+
./example_multivector_search
71+
./example_epsilon_search
7072
./searchKnnCloserFirst_test
7173
./searchKnnWithFilter_test
7274
./multiThreadLoad_test
7375
./multiThread_replace_test
7476
./test_updates
7577
./test_updates update
78+
./multivector_search_test
79+
./epsilon_search_test
7680
shell: bash

Diff for: .gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,4 @@ var/
1010
.vscode/
1111
.vs/
1212
**.DS_Store
13+
*.pyc

Diff for: CMakeLists.txt

+12
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,12 @@ if(HNSWLIB_EXAMPLES)
5757
add_executable(example_search examples/cpp/example_search.cpp)
5858
target_link_libraries(example_search hnswlib)
5959

60+
add_executable(example_epsilon_search examples/cpp/example_epsilon_search.cpp)
61+
target_link_libraries(example_epsilon_search hnswlib)
62+
63+
add_executable(example_multivector_search examples/cpp/example_multivector_search.cpp)
64+
target_link_libraries(example_multivector_search hnswlib)
65+
6066
add_executable(example_filter examples/cpp/example_filter.cpp)
6167
target_link_libraries(example_filter hnswlib)
6268

@@ -73,6 +79,12 @@ if(HNSWLIB_EXAMPLES)
7379
target_link_libraries(example_mt_replace_deleted hnswlib)
7480

7581
# tests
82+
add_executable(multivector_search_test tests/cpp/multivector_search_test.cpp)
83+
target_link_libraries(multivector_search_test hnswlib)
84+
85+
add_executable(epsilon_search_test tests/cpp/epsilon_search_test.cpp)
86+
target_link_libraries(epsilon_search_test hnswlib)
87+
7688
add_executable(test_updates tests/cpp/updates_test.cpp)
7789
target_link_libraries(test_updates hnswlib)
7890

Diff for: README.md

+2
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,8 @@ print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(dat
229229
* filtering during the search with a boolean function
230230
* deleting the elements and reusing the memory of the deleted elements for newly added elements
231231
* multithreaded usage
232+
* multivector search
233+
* epsilon search
232234

233235

234236
### Bindings installation

Diff for: examples/cpp/EXAMPLES.md

+5-1
Original file line numberDiff line numberDiff line change
@@ -182,4 +182,8 @@ int main() {
182182
Multithreaded examples:
183183
* Creating index, inserting elements, searching [example_mt_search.cpp](example_mt_search.cpp)
184184
* Filtering during the search with a boolean function [example_mt_filter.cpp](example_mt_filter.cpp)
185-
* Reusing the memory of the deleted elements when new elements are being added [example_mt_replace_deleted.cpp](example_mt_replace_deleted.cpp)
185+
* Reusing the memory of the deleted elements when new elements are being added [example_mt_replace_deleted.cpp](example_mt_replace_deleted.cpp)
186+
187+
More examples:
188+
* Multivector search [example_multivector_search.cpp](example_multivector_search.cpp)
189+
* Epsilon search [example_epsilon_search.cpp](example_epsilon_search.cpp)

Diff for: examples/cpp/example_epsilon_search.cpp

+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
#include "../../hnswlib/hnswlib.h"
2+
3+
typedef unsigned int docidtype;
4+
typedef float dist_t;
5+
6+
int main() {
7+
int dim = 16; // Dimension of the elements
8+
int max_elements = 10000; // Maximum number of elements, should be known beforehand
9+
int M = 16; // Tightly connected with internal dimensionality of the data
10+
// strongly affects the memory consumption
11+
int ef_construction = 200; // Controls index search speed/build speed tradeoff
12+
int min_num_candidates = 100; // Minimum number of candidates to search in the epsilon region
13+
// this parameter is similar to ef
14+
15+
int num_queries = 5;
16+
float epsilon2 = 2.0; // Squared distance to query
17+
18+
// Initialize the index
19+
hnswlib::L2Space space(dim);
20+
hnswlib::HierarchicalNSW<dist_t>* alg_hnsw = new hnswlib::HierarchicalNSW<dist_t>(&space, max_elements, M, ef_construction);
21+
22+
// Generate random data
23+
std::mt19937 rng;
24+
rng.seed(47);
25+
std::uniform_real_distribution<> distrib_real;
26+
27+
size_t data_point_size = space.get_data_size();
28+
char* data = new char[data_point_size * max_elements];
29+
for (int i = 0; i < max_elements; i++) {
30+
char* point_data = data + i * data_point_size;
31+
for (int j = 0; j < dim; j++) {
32+
char* vec_data = point_data + j * sizeof(float);
33+
float value = distrib_real(rng);
34+
*(float*)vec_data = value;
35+
}
36+
}
37+
38+
// Add data to index
39+
for (int i = 0; i < max_elements; i++) {
40+
hnswlib::labeltype label = i;
41+
char* point_data = data + i * data_point_size;
42+
alg_hnsw->addPoint(point_data, label);
43+
}
44+
45+
// Query random vectors
46+
for (int i = 0; i < num_queries; i++) {
47+
char* query_data = new char[data_point_size];
48+
for (int j = 0; j < dim; j++) {
49+
size_t offset = j * sizeof(float);
50+
char* vec_data = query_data + offset;
51+
float value = distrib_real(rng);
52+
*(float*)vec_data = value;
53+
}
54+
std::cout << "Query #" << i << "\n";
55+
hnswlib::EpsilonSearchStopCondition<dist_t> stop_condition(epsilon2, min_num_candidates, max_elements);
56+
std::vector<std::pair<float, hnswlib::labeltype>> result =
57+
alg_hnsw->searchStopConditionClosest(query_data, stop_condition);
58+
size_t num_vectors = result.size();
59+
std::cout << "Found " << num_vectors << " vectors\n";
60+
delete[] query_data;
61+
}
62+
63+
delete[] data;
64+
delete alg_hnsw;
65+
return 0;
66+
}

Diff for: examples/cpp/example_multivector_search.cpp

+83
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#include "../../hnswlib/hnswlib.h"
2+
3+
typedef unsigned int docidtype;
4+
typedef float dist_t;
5+
6+
int main() {
7+
int dim = 16; // Dimension of the elements
8+
int max_elements = 10000; // Maximum number of elements, should be known beforehand
9+
int M = 16; // Tightly connected with internal dimensionality of the data
10+
// strongly affects the memory consumption
11+
int ef_construction = 200; // Controls index search speed/build speed tradeoff
12+
13+
int num_queries = 5;
14+
int num_docs = 5; // Number of documents to search
15+
int ef_collection = 6; // Number of candidate documents during the search
16+
// Controls the recall: higher ef leads to better accuracy, but slower search
17+
docidtype min_doc_id = 0;
18+
docidtype max_doc_id = 9;
19+
20+
// Initing index
21+
hnswlib::MultiVectorL2Space<docidtype> space(dim);
22+
hnswlib::HierarchicalNSW<dist_t>* alg_hnsw = new hnswlib::HierarchicalNSW<dist_t>(&space, max_elements, M, ef_construction);
23+
24+
// Generate random data
25+
std::mt19937 rng;
26+
rng.seed(47);
27+
std::uniform_real_distribution<> distrib_real;
28+
std::uniform_int_distribution<docidtype> distrib_docid(min_doc_id, max_doc_id);
29+
30+
size_t data_point_size = space.get_data_size();
31+
char* data = new char[data_point_size * max_elements];
32+
for (int i = 0; i < max_elements; i++) {
33+
// set vector value
34+
char* point_data = data + i * data_point_size;
35+
for (int j = 0; j < dim; j++) {
36+
char* vec_data = point_data + j * sizeof(float);
37+
float value = distrib_real(rng);
38+
*(float*)vec_data = value;
39+
}
40+
// set document id
41+
docidtype doc_id = distrib_docid(rng);
42+
space.set_doc_id(point_data, doc_id);
43+
}
44+
45+
// Add data to index
46+
std::unordered_map<hnswlib::labeltype, docidtype> label_docid_lookup;
47+
for (int i = 0; i < max_elements; i++) {
48+
hnswlib::labeltype label = i;
49+
char* point_data = data + i * data_point_size;
50+
alg_hnsw->addPoint(point_data, label);
51+
label_docid_lookup[label] = space.get_doc_id(point_data);
52+
}
53+
54+
// Query random vectors
55+
size_t query_size = dim * sizeof(float);
56+
for (int i = 0; i < num_queries; i++) {
57+
char* query_data = new char[query_size];
58+
for (int j = 0; j < dim; j++) {
59+
size_t offset = j * sizeof(float);
60+
char* vec_data = query_data + offset;
61+
float value = distrib_real(rng);
62+
*(float*)vec_data = value;
63+
}
64+
std::cout << "Query #" << i << "\n";
65+
hnswlib::MultiVectorSearchStopCondition<docidtype, dist_t> stop_condition(space, num_docs, ef_collection);
66+
std::vector<std::pair<float, hnswlib::labeltype>> result =
67+
alg_hnsw->searchStopConditionClosest(query_data, stop_condition);
68+
size_t num_vectors = result.size();
69+
70+
std::unordered_map<docidtype, size_t> doc_counter;
71+
for (auto pair: result) {
72+
hnswlib::labeltype label = pair.second;
73+
docidtype doc_id = label_docid_lookup[label];
74+
doc_counter[doc_id] += 1;
75+
}
76+
std::cout << "Found " << doc_counter.size() << " documents, " << num_vectors << " vectors\n";
77+
delete[] query_data;
78+
}
79+
80+
delete[] data;
81+
delete alg_hnsw;
82+
return 0;
83+
}

0 commit comments

Comments
 (0)