Skip to content

Commit 98cf0a1

Browse files
authored
[MOD-9557] Fix incorrect vector blob size calculation (#665)
* add size_t bg_vector_indexing_count; size_t main_vector_indexing_count; * fix datasize in executeInsertJob * disable neighbours print * replace manual blob size calcaulations format comment out new tests * INT8Test::PopulateRandomVector increases seed every call so we will get different vectors. this required a change in metrics_test because now the score is not exactly 0. VecSimIndexAbstract CastToBruteForce()-> rename to GetFlatBufferIndex add BruteForceIndex CastToBruteForce() add getFlatBufferIndexAsBruteForce to tiered index to get BruteForceIndex functions added to hnsw and BF: Formalize getDataByLabel definition: populate the intput empty vector with the vectors data,not including any additional meta data that might present in the vector storage like the norm. add getStoredVectorDataByLabel returns a vector of char vectors. each vector contains the vector as it is stored in the vectors container, including the meta data if exists. * build test with VERBOSE=1 * verbose cov * verbose cov fix assert in FP32SpacesOptimizationTest::FP32InnerProductTest * fix * add force_copy to maybeCopyToAlignedMem and preprocessQuery use preprocessQuery with force_copy in batch iterator instead on preprocessQueryInPlace * also batch in svs * revert redundancies * tiered batch iterator doesn't copy the blob * add // TODO: handle original_blob_size != processed_bytes_count * revert pull request changes * move getDataByLabel and getStoredVectorDataByLabel to VecSimIndexAbstract * add test_index_test_utils currently testing get_stored_vector_data_single_test * getFlatBufferIndex returns BruteForceIndex * little test name fix * small fix * fix range
1 parent b22e5c2 commit 98cf0a1

19 files changed

+602
-99
lines changed

cmake/svs.cmake

+2-2
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,8 @@ if(USE_SVS)
6161
find_package(svs REQUIRED)
6262
set(SVS_LVQ_HEADER "svs/extensions/vamana/lvq.h")
6363
else()
64-
# This file is included from both CMakeLists.txt and python_bindings/CMakeLists.txt
65-
# Set `root` relative to this file, regardless of where it is included from.
64+
# This file is included from both CMakeLists.txt and python_bindings/CMakeLists.txt
65+
# Set `root` relative to this file, regardless of where it is included from.
6666
get_filename_component(root ${CMAKE_CURRENT_LIST_DIR}/.. ABSOLUTE)
6767
add_subdirectory(
6868
${root}/deps/ScalableVectorSearch

src/VecSim/algorithms/brute_force/brute_force.h

+8-17
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@ class BruteForceIndex : public VecSimIndexAbstract<DataType, DistType> {
4242
size_t indexSize() const override;
4343
size_t indexCapacity() const override;
4444
std::unique_ptr<RawDataContainer::Iterator> getVectorsIterator() const;
45-
DataType *getDataByInternalId(idType id) const {
46-
return (DataType *)this->vectors->getElement(id);
45+
const DataType *getDataByInternalId(idType id) const {
46+
return reinterpret_cast<const DataType *>(this->vectors->getElement(id));
4747
}
4848
VecSimQueryReply *topKQuery(const void *queryBlob, size_t k,
4949
VecSimQueryParams *queryParams) const override;
@@ -76,16 +76,6 @@ class BruteForceIndex : public VecSimIndexAbstract<DataType, DistType> {
7676

7777
virtual ~BruteForceIndex() = default;
7878
#ifdef BUILD_TESTS
79-
/**
80-
* @brief Used for testing - store vector(s) data associated with a given label. This function
81-
* copies the vector(s)' data buffer(s) and place it in the output vector
82-
*
83-
* @param label
84-
* @param vectors_output empty vector to be modified, should store the blob(s) associated with
85-
* the label.
86-
*/
87-
virtual void getDataByLabel(labelType label,
88-
std::vector<std::vector<DataType>> &vectors_output) const = 0;
8979
void fitMemory() override {
9080
if (count == 0) {
9181
return;
@@ -350,12 +340,13 @@ template <typename DataType, typename DistType>
350340
VecSimBatchIterator *
351341
BruteForceIndex<DataType, DistType>::newBatchIterator(const void *queryBlob,
352342
VecSimQueryParams *queryParams) const {
353-
auto *queryBlobCopy =
354-
this->allocator->allocate_aligned(this->dataSize, this->preprocessors->getAlignment());
355-
memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType));
356-
this->preprocessQueryInPlace(queryBlobCopy);
343+
// force_copy == true.
344+
auto queryBlobCopy = this->preprocessQuery(queryBlob, true);
345+
346+
// take ownership of the blob copy and pass it to the batch iterator.
347+
auto *queryBlobCopyPtr = queryBlobCopy.release();
357348
// Ownership of queryBlobCopy moves to BF_BatchIterator that will free it at the end.
358-
return newBatchIterator_Instance(queryBlobCopy, queryParams);
349+
return newBatchIterator_Instance(queryBlobCopyPtr, queryParams);
359350
}
360351

361352
template <typename DataType, typename DistType>

src/VecSim/algorithms/brute_force/brute_force_multi.h

+20
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,30 @@ class BruteForceIndex_Multi : public BruteForceIndex<DataType, DistType> {
4747

4848
for (idType id : ids->second) {
4949
auto vec = std::vector<DataType>(this->dim);
50+
// Only copy the vector data (dim * sizeof(DataType)), not any additional metadata like
51+
// the norm
5052
memcpy(vec.data(), this->getDataByInternalId(id), this->dim * sizeof(DataType));
5153
vectors_output.push_back(vec);
5254
}
5355
}
56+
57+
std::vector<std::vector<char>> getStoredVectorDataByLabel(labelType label) const override {
58+
std::vector<std::vector<char>> vectors_output;
59+
auto ids = labelToIdsLookup.find(label);
60+
61+
for (idType id : ids->second) {
62+
// Get the data pointer - need to cast to char* for memcpy
63+
const char *data = reinterpret_cast<const char *>(this->getDataByInternalId(id));
64+
65+
// Create a vector with the full data (including any metadata like norms)
66+
std::vector<char> vec(this->getDataSize());
67+
memcpy(vec.data(), data, this->getDataSize());
68+
vectors_output.push_back(std::move(vec));
69+
}
70+
71+
return vectors_output;
72+
}
73+
5474
#endif
5575
private:
5676
// inline definitions

src/VecSim/algorithms/brute_force/brute_force_single.h

+17
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,26 @@ class BruteForceIndex_Single : public BruteForceIndex<DataType, DistType> {
4949
auto id = labelToIdLookup.at(label);
5050

5151
auto vec = std::vector<DataType>(this->dim);
52+
// Only copy the vector data (dim * sizeof(DataType)), not any additional metadata like the
53+
// norm
5254
memcpy(vec.data(), this->getDataByInternalId(id), this->dim * sizeof(DataType));
5355
vectors_output.push_back(vec);
5456
}
57+
58+
std::vector<std::vector<char>> getStoredVectorDataByLabel(labelType label) const override {
59+
std::vector<std::vector<char>> vectors_output;
60+
auto id = labelToIdLookup.at(label);
61+
62+
// Get the data pointer - need to cast to char* for memcpy
63+
const char *data = reinterpret_cast<const char *>(this->getDataByInternalId(id));
64+
65+
// Create a vector with the full data (including any metadata like norms)
66+
std::vector<char> vec(this->getDataSize());
67+
memcpy(vec.data(), data, this->getDataSize());
68+
vectors_output.push_back(std::move(vec));
69+
70+
return vectors_output;
71+
}
5572
#endif
5673
protected:
5774
// inline definitions

src/VecSim/algorithms/hnsw/hnsw.h

+1-11
Original file line numberDiff line numberDiff line change
@@ -302,16 +302,6 @@ class HNSWIndex : public VecSimIndexAbstract<DataType, DistType>,
302302
virtual int removeLabel(labelType label) = 0;
303303

304304
#ifdef BUILD_TESTS
305-
/**
306-
* @brief Used for testing - store vector(s) data associated with a given label. This function
307-
* copies the vector(s)' data buffer(s) and place it in the output vector
308-
*
309-
* @param label
310-
* @param vectors_output empty vector to be modified, should store the blob(s) associated with
311-
* the label.
312-
*/
313-
virtual void getDataByLabel(labelType label,
314-
std::vector<std::vector<DataType>> &vectors_output) const = 0;
315305
void fitMemory() override {
316306
if (maxElements > 0) {
317307
idToMetaData.shrink_to_fit();
@@ -1561,7 +1551,7 @@ void HNSWIndex<DataType, DistType>::insertElementToGraph(idType element_id,
15611551
for (auto level = static_cast<int>(max_common_level); level >= 0; level--) {
15621552
candidatesMaxHeap<DistType> top_candidates =
15631553
searchLayer(curr_element, vector_data, level, efConstruction);
1564-
// If the entry point was marked deleted between iterations, we may recieve an empty
1554+
// If the entry point was marked deleted between iterations, we may receive an empty
15651555
// candidates set.
15661556
if (!top_candidates.empty()) {
15671557
curr_element = mutuallyConnectNewElement(element_id, top_candidates, level);

src/VecSim/algorithms/hnsw/hnsw_multi.h

+25-6
Original file line numberDiff line numberDiff line change
@@ -74,10 +74,28 @@ class HNSWIndex_Multi : public HNSWIndex<DataType, DistType> {
7474

7575
for (idType id : ids->second) {
7676
auto vec = std::vector<DataType>(this->dim);
77-
memcpy(vec.data(), this->getDataByInternalId(id), this->dataSize);
77+
// Only copy the vector data (dim * sizeof(DataType)), not any additional metadata like
78+
// the norm
79+
memcpy(vec.data(), this->getDataByInternalId(id), this->dim * sizeof(DataType));
7880
vectors_output.push_back(vec);
7981
}
8082
}
83+
84+
std::vector<std::vector<char>> getStoredVectorDataByLabel(labelType label) const override {
85+
std::vector<std::vector<char>> vectors_output;
86+
auto ids = labelLookup.find(label);
87+
88+
for (idType id : ids->second) {
89+
const char *data = this->getDataByInternalId(id);
90+
91+
// Create a vector with the full data (including any metadata like norms)
92+
std::vector<char> vec(this->dataSize);
93+
memcpy(vec.data(), data, this->dataSize);
94+
vectors_output.push_back(std::move(vec));
95+
}
96+
97+
return vectors_output;
98+
}
8199
#endif
82100
~HNSWIndex_Multi() = default;
83101

@@ -201,13 +219,14 @@ template <typename DataType, typename DistType>
201219
VecSimBatchIterator *
202220
HNSWIndex_Multi<DataType, DistType>::newBatchIterator(const void *queryBlob,
203221
VecSimQueryParams *queryParams) const {
204-
auto queryBlobCopy =
205-
this->allocator->allocate_aligned(this->dataSize, this->preprocessors->getAlignment());
206-
memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType));
207-
this->preprocessQueryInPlace(queryBlobCopy);
222+
// force_copy == true.
223+
auto queryBlobCopy = this->preprocessQuery(queryBlob, true);
224+
225+
// take ownership of the blob copy and pass it to the batch iterator.
226+
auto *queryBlobCopyPtr = queryBlobCopy.release();
208227
// Ownership of queryBlobCopy moves to HNSW_BatchIterator that will free it at the end.
209228
return new (this->allocator) HNSWMulti_BatchIterator<DataType, DistType>(
210-
queryBlobCopy, this, queryParams, this->allocator);
229+
queryBlobCopyPtr, this, queryParams, this->allocator);
211230
}
212231

213232
/**

src/VecSim/algorithms/hnsw/hnsw_single.h

+22-6
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,24 @@ class HNSWIndex_Single : public HNSWIndex<DataType, DistType> {
5050
auto id = labelLookup.at(label);
5151

5252
auto vec = std::vector<DataType>(this->dim);
53-
memcpy(vec.data(), this->getDataByInternalId(id), this->dataSize);
53+
// Only copy the vector data (dim * sizeof(DataType)), not any additional metadata like the
54+
// norm
55+
memcpy(vec.data(), this->getDataByInternalId(id), this->dim * sizeof(DataType));
5456
vectors_output.push_back(vec);
5557
}
58+
59+
std::vector<std::vector<char>> getStoredVectorDataByLabel(labelType label) const override {
60+
std::vector<std::vector<char>> vectors_output;
61+
auto id = labelLookup.at(label);
62+
const char *data = this->getDataByInternalId(id);
63+
64+
// Create a vector with the full data (including any metadata like norms)
65+
std::vector<char> vec(this->dataSize);
66+
memcpy(vec.data(), data, this->dataSize);
67+
vectors_output.push_back(std::move(vec));
68+
69+
return vectors_output;
70+
}
5671
#endif
5772
~HNSWIndex_Single() = default;
5873

@@ -161,13 +176,14 @@ template <typename DataType, typename DistType>
161176
VecSimBatchIterator *
162177
HNSWIndex_Single<DataType, DistType>::newBatchIterator(const void *queryBlob,
163178
VecSimQueryParams *queryParams) const {
164-
auto queryBlobCopy =
165-
this->allocator->allocate_aligned(this->dataSize, this->preprocessors->getAlignment());
166-
memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType));
167-
this->preprocessQueryInPlace(queryBlobCopy);
179+
// force_copy == true.
180+
auto queryBlobCopy = this->preprocessQuery(queryBlob, true);
181+
182+
// take ownership of the blob copy and pass it to the batch iterator.
183+
auto *queryBlobCopyPtr = queryBlobCopy.release();
168184
// Ownership of queryBlobCopy moves to HNSW_BatchIterator that will free it at the end.
169185
return new (this->allocator) HNSWSingle_BatchIterator<DataType, DistType>(
170-
queryBlobCopy, this, queryParams, this->allocator);
186+
queryBlobCopyPtr, this, queryParams, this->allocator);
171187
}
172188

173189
/**

src/VecSim/algorithms/hnsw/hnsw_tiered.h

+18-13
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ class TieredHNSWIndex : public VecSimTieredIndex<DataType, DistType> {
172172
inline void filter_irrelevant_results(VecSimQueryResultContainer &);
173173

174174
public:
175-
TieredHNSW_BatchIterator(void *query_vector,
175+
TieredHNSW_BatchIterator(const void *query_vector,
176176
const TieredHNSWIndex<DataType, DistType> *index,
177177
VecSimQueryParams *queryParams,
178178
std::shared_ptr<VecSimAllocator> allocator);
@@ -206,11 +206,9 @@ class TieredHNSWIndex : public VecSimTieredIndex<DataType, DistType> {
206206
VecSimDebugInfoIterator *debugInfoIterator() const override;
207207
VecSimBatchIterator *newBatchIterator(const void *queryBlob,
208208
VecSimQueryParams *queryParams) const override {
209-
size_t blobSize = this->frontendIndex->getDim() * sizeof(DataType);
210-
void *queryBlobCopy = this->allocator->allocate(blobSize);
211-
memcpy(queryBlobCopy, queryBlob, blobSize);
209+
// The query blob will be processed and copied by the internal indexes's batch iterator.
212210
return new (this->allocator)
213-
TieredHNSW_BatchIterator(queryBlobCopy, this, queryParams, this->allocator);
211+
TieredHNSW_BatchIterator(queryBlob, this, queryParams, this->allocator);
214212
}
215213
inline void setLastSearchMode(VecSearchMode mode) override {
216214
return this->backendIndex->setLastSearchMode(mode);
@@ -545,10 +543,11 @@ void TieredHNSWIndex<DataType, DistType>::executeInsertJob(HNSWInsertJob *job) {
545543
HNSWIndex<DataType, DistType> *hnsw_index = this->getHNSWIndex();
546544
// Copy the vector blob from the flat buffer, so we can release the flat lock while we are
547545
// indexing the vector into HNSW index.
548-
auto blob_copy = this->getAllocator()->allocate_unique(this->frontendIndex->getDataSize());
549-
550-
memcpy(blob_copy.get(), this->frontendIndex->getDataByInternalId(job->id),
551-
this->frontendIndex->getDim() * sizeof(DataType));
546+
size_t data_size = this->frontendIndex->getDataSize();
547+
auto blob_copy = this->getAllocator()->allocate_unique(data_size);
548+
// Assuming the size of the blob stored in the frontend index matches the size of the blob
549+
// stored in the HNSW index.
550+
memcpy(blob_copy.get(), this->frontendIndex->getDataByInternalId(job->id), data_size);
552551

553552
this->insertVectorToHNSW<true>(hnsw_index, job->label, blob_copy.get());
554553

@@ -719,7 +718,7 @@ int TieredHNSWIndex<DataType, DistType>::addVector(const void *blob, labelType l
719718
int ret = 1;
720719
auto hnsw_index = this->getHNSWIndex();
721720
// writeMode is not protected since it is assumed to be called only from the "main thread"
722-
// (that is the thread that is exculusively calling add/delete vector).
721+
// (that is the thread that is exclusively calling add/delete vector).
723722
if (this->getWriteMode() == VecSim_WriteInPlace) {
724723
// First, check if we need to overwrite the vector in-place for single (from both indexes).
725724
if (!this->backendIndex->isMultiValue()) {
@@ -849,7 +848,7 @@ int TieredHNSWIndex<DataType, DistType>::deleteVector(labelType label) {
849848
// Note that we may remove the same vector that has been removed from the flat index, if it was
850849
// being ingested at that time.
851850
// writeMode is not protected since it is assumed to be called only from the "main thread"
852-
// (that is the thread that is exculusively calling add/delete vector).
851+
// (that is the thread that is exclusively calling add/delete vector).
853852
if (this->getWriteMode() == VecSim_WriteAsync) {
854853
num_deleted_vectors += this->deleteLabelFromHNSW(label);
855854
// Apply ready swap jobs if number of deleted vectors reached the threshold
@@ -924,9 +923,14 @@ double TieredHNSWIndex<DataType, DistType>::getDistanceFrom_Unsafe(labelType lab
924923

925924
template <typename DataType, typename DistType>
926925
TieredHNSWIndex<DataType, DistType>::TieredHNSW_BatchIterator::TieredHNSW_BatchIterator(
927-
void *query_vector, const TieredHNSWIndex<DataType, DistType> *index,
926+
const void *query_vector, const TieredHNSWIndex<DataType, DistType> *index,
928927
VecSimQueryParams *queryParams, std::shared_ptr<VecSimAllocator> allocator)
929-
: VecSimBatchIterator(query_vector, queryParams ? queryParams->timeoutCtx : nullptr,
928+
// Tiered batch iterator doesn't hold its own copy of the query vector.
929+
// Instead, each internal batch iterators (flat_iterator and hnsw_iterator) create their own
930+
// copies: flat_iterator copy is created during TieredHNSW_BatchIterator construction When
931+
// TieredHNSW_BatchIterator::getNextResults() is called and hnsw_iterator is not initialized, it
932+
// retrieves the blob from flat_iterator
933+
: VecSimBatchIterator(nullptr, queryParams ? queryParams->timeoutCtx : nullptr,
930934
std::move(allocator)),
931935
index(index), flat_results(this->allocator), hnsw_results(this->allocator),
932936
flat_iterator(this->index->frontendIndex->newBatchIterator(query_vector, queryParams)),
@@ -1192,4 +1196,5 @@ void TieredHNSWIndex<DataType, DistType>::getDataByLabel(
11921196
labelType label, std::vector<std::vector<DataType>> &vectors_output) const {
11931197
this->getHNSWIndex()->getDataByLabel(label, vectors_output);
11941198
}
1199+
11951200
#endif

src/VecSim/algorithms/svs/svs.h

+19-8
Original file line numberDiff line numberDiff line change
@@ -155,11 +155,12 @@ class SVSIndex : public VecSimIndexAbstract<svs_details::vecsim_dt<DataType>, fl
155155
return MemoryUtils::unique_blob{const_cast<void *>(original_data), [](void *) {}};
156156
}
157157

158-
const auto data_size = this->dim * sizeof(DataType) * n;
158+
const auto data_size = this->getDataSize() * n;
159159

160160
auto processed_blob =
161161
MemoryUtils::unique_blob{this->allocator->allocate(data_size),
162162
[this](void *ptr) { this->allocator->free_allocation(ptr); }};
163+
// Assuming original data size equals to processed data size
163164
memcpy(processed_blob.get(), original_data, data_size);
164165
// Preprocess each vector in place
165166
for (size_t i = 0; i < n; i++) {
@@ -435,17 +436,18 @@ class SVSIndex : public VecSimIndexAbstract<svs_details::vecsim_dt<DataType>, fl
435436

436437
VecSimBatchIterator *newBatchIterator(const void *queryBlob,
437438
VecSimQueryParams *queryParams) const override {
438-
auto *queryBlobCopy =
439-
this->allocator->allocate_aligned(this->dataSize, this->preprocessors->getAlignment());
440-
memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType));
441-
this->preprocessQueryInPlace(queryBlobCopy);
439+
// force_copy == true.
440+
auto queryBlobCopy = this->preprocessQuery(queryBlob, true);
441+
442+
// take ownership of the blob copy and pass it to the batch iterator.
443+
auto *queryBlobCopyPtr = queryBlobCopy.release();
442444
// Ownership of queryBlobCopy moves to VecSimBatchIterator that will free it at the end.
443445
if (indexSize() == 0) {
444446
return new (this->getAllocator())
445-
NullSVS_BatchIterator(queryBlobCopy, queryParams, this->getAllocator());
447+
NullSVS_BatchIterator(queryBlobCopyPtr, queryParams, this->getAllocator());
446448
} else {
447449
return new (this->getAllocator()) SVS_BatchIterator<impl_type, data_type>(
448-
queryBlobCopy, impl_.get(), queryParams, this->getAllocator());
450+
queryBlobCopyPtr, impl_.get(), queryParams, this->getAllocator());
449451
}
450452
}
451453

@@ -479,6 +481,15 @@ class SVSIndex : public VecSimIndexAbstract<svs_details::vecsim_dt<DataType>, fl
479481
}
480482

481483
#ifdef BUILD_TESTS
482-
virtual void fitMemory() {};
484+
void fitMemory() override {}
485+
std::vector<std::vector<char>> getStoredVectorDataByLabel(labelType label) const override {
486+
assert(nullptr && "Not implemented");
487+
return {};
488+
}
489+
void getDataByLabel(
490+
labelType label,
491+
std::vector<std::vector<svs_details::vecsim_dt<DataType>>> &vectors_output) const override {
492+
assert(nullptr && "Not implemented");
493+
}
483494
#endif
484495
};

0 commit comments

Comments
 (0)