Skip to content

Commit

Permalink
Add metric (#606)
Browse files Browse the repository at this point in the history
  • Loading branch information
Sanhaoji2 authored Dec 5, 2024
1 parent ce63c03 commit 18455de
Show file tree
Hide file tree
Showing 16 changed files with 131 additions and 2 deletions.
2 changes: 2 additions & 0 deletions include/abstract_data_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ template <typename data_t> class AbstractDataStore
// align the dimension by padding zeros.
virtual size_t get_aligned_dim() const = 0;

virtual size_t get_data_size() const = 0;

// populate the store with vectors (either from a pointer or bin file),
// potentially after pre-processing the vectors if the metric deems so
// e.g., normalizing vectors for cosine distance over floating-point vectors
Expand Down
2 changes: 2 additions & 0 deletions include/abstract_graph_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ class AbstractGraphStore
// set during load
virtual size_t get_max_range_of_graph() = 0;

virtual size_t get_graph_size() = 0;

// Total internal points _max_points + _num_frozen_points
size_t get_total_points()
{
Expand Down
2 changes: 2 additions & 0 deletions include/abstract_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "types.h"
#include "index_config.h"
#include "index_build_params.h"
#include "percentile_stats.h"
#include <any>

namespace diskann
Expand Down Expand Up @@ -108,6 +109,7 @@ class AbstractIndex

virtual bool is_label_valid(const std::string &raw_label) const = 0;
virtual bool is_set_universal_label() const = 0;
virtual TableStats get_table_stats() const = 0;

private:
virtual void _build(const DataType &data, const size_t num_points_to_load, TagVector &tags) = 0;
Expand Down
4 changes: 4 additions & 0 deletions include/in_mem_data_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ template <typename data_t> class InMemDataStore : public AbstractDataStore<data_

virtual size_t get_aligned_dim() const override;

virtual size_t get_data_size() const override;

// Populate internal data from unaligned data while doing alignment and any
// normalization that is required.
virtual void populate_data(const data_t *vectors, const location_t num_pts) override;
Expand Down Expand Up @@ -75,6 +77,8 @@ template <typename data_t> class InMemDataStore : public AbstractDataStore<data_

size_t _aligned_dim;

size_t _data_size = 0;

// It may seem weird to put distance metric along with the data store class,
// but this gives us perf benefits as the datastore can do distance
// computations during search and compute norms of vectors internally without
Expand Down
3 changes: 3 additions & 0 deletions include/in_mem_graph_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ class InMemGraphStore : public AbstractGraphStore
virtual size_t get_max_range_of_graph() override;
virtual uint32_t get_max_observed_degree() override;

virtual size_t get_graph_size() override;

protected:
virtual std::tuple<uint32_t, uint32_t, size_t> load_impl(const std::string &filename, size_t expected_num_points);
#ifdef EXEC_ENV_OLS
Expand All @@ -44,6 +46,7 @@ class InMemGraphStore : public AbstractGraphStore
private:
size_t _max_range_of_graph = 0;
uint32_t _max_observed_degree = 0;
size_t _graph_size = 0;

std::vector<std::vector<uint32_t>> _graph;
};
Expand Down
3 changes: 3 additions & 0 deletions include/in_mem_static_graph_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ class InMemStaticGraphStore : public AbstractGraphStore
virtual size_t get_max_range_of_graph() override;
virtual uint32_t get_max_observed_degree() override;

virtual size_t get_graph_size() override;

protected:
virtual std::tuple<uint32_t, uint32_t, size_t> load_impl(const std::string& filename, size_t expected_num_points);
#ifdef EXEC_ENV_OLS
Expand All @@ -69,6 +71,7 @@ class InMemStaticGraphStore : public AbstractGraphStore
private:
size_t _max_range_of_graph = 0;
uint32_t _max_observed_degree = 0;
size_t _graph_size = 0;

std::vector<size_t> _node_index;
std::vector<std::uint32_t> _graph;
Expand Down
5 changes: 5 additions & 0 deletions include/index.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "in_mem_data_store.h"
#include "in_mem_graph_store.h"
#include "abstract_index.h"
#include "percentile_stats.h"
#include <bitset>

#include "quantized_distance.h"
Expand Down Expand Up @@ -306,6 +307,8 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas

DISKANN_DLLEXPORT void count_nodes_at_bfs_levels();

DISKANN_DLLEXPORT TableStats get_table_stats() const override;

// This variable MUST be updated if the number of entries in the metadata
// change.
DISKANN_DLLEXPORT static const int METADATA_ROWS = 5;
Expand Down Expand Up @@ -567,6 +570,8 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas

simple_bitmask_buf _bitmask_buf;

TableStats _table_stats;

static const float INDEX_GROWTH_FACTOR;
};
} // namespace diskann
17 changes: 17 additions & 0 deletions include/percentile_stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,23 @@ struct QueryStats
unsigned n_hops = 0; // # search hops
};

// Snapshot of memory and cardinality metrics collected by an index.
// Every field starts at zero; the index fills them in while loading and
// while handling streaming inserts and deletes.
struct TableStats
{
    // ---- memory footprint, in bytes ----
    // Overall footprint; presumably the sum of the per-component figures
    // below -- confirm against the code that populates it.
    size_t total_mem_usage{0};
    // Size reported by the data store (vector storage).
    size_t node_mem_usage{0};
    // Size reported by the graph store (adjacency lists).
    size_t graph_mem_usage{0};
    // Size of the label bitmask buffer.
    size_t label_mem_usage{0};

    // ---- cardinalities ----
    // Number of points read from the data file, plus later inserts.
    size_t node_count{0};
    // Number of distinct entries in the label map.
    size_t label_count{0};
    // Number of label tokens seen while parsing the label file.
    size_t label_total_count{0};

    // ---- streaming ----
    // Estimated bytes held by the tag data and tag/location lookups.
    size_t tag_memory_usage{0};
    // Counters bumped on the insert and delete paths.
    size_t insert_count{0};
    size_t delete_count{0};
    // Incremented on insert and decremented on delete.
    size_t active_nodes{0};
};

template <typename T>
inline T get_percentile_stats(QueryStats *stats, uint64_t len, float percentile,
const std::function<T(const QueryStats &)> &member_fn)
Expand Down
4 changes: 4 additions & 0 deletions include/pq_data_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ template <typename data_t> class PQDataStore : public AbstractDataStore<data_t>
// for Quantized data stores.
virtual size_t get_aligned_dim() const override;

virtual size_t get_data_size() const override;

// Populate quantized data from unaligned data using PQ functionality
virtual void populate_data(const data_t *vectors, const location_t num_pts) override;
virtual void populate_data(const std::string &filename, const size_t offset) override;
Expand Down Expand Up @@ -86,6 +88,8 @@ template <typename data_t> class PQDataStore : public AbstractDataStore<data_t>
uint8_t *_quantized_data = nullptr;
size_t _num_chunks = 0;

size_t _data_size = 0;

// REFACTOR TODO: Doing this temporarily before refactoring OPQ into
// its own class. Remove later.
bool _use_opq = false;
Expand Down
4 changes: 4 additions & 0 deletions include/pq_flash_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ template <typename T, typename LabelT = uint32_t> class PQFlashIndex

DISKANN_DLLEXPORT uint64_t get_data_dim();

DISKANN_DLLEXPORT TableStats get_table_stats();

std::shared_ptr<AlignedFileReader> &reader;

DISKANN_DLLEXPORT diskann::Metric get_metric();
Expand Down Expand Up @@ -241,6 +243,8 @@ template <typename T, typename LabelT = uint32_t> class PQFlashIndex
tsl::robin_map<uint32_t, std::vector<uint32_t>> _real_to_dummy_map;
std::unordered_map<std::string, LabelT> _label_map;

TableStats _table_stats;

#ifdef EXEC_ENV_OLS
// Set to a larger value than the actual header to accommodate
// any additions we make to the header. This is an outer limit
Expand Down
9 changes: 9 additions & 0 deletions src/in_mem_data_store.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ InMemDataStore<data_t>::InMemDataStore(const location_t num_points, const size_t
_aligned_dim = ROUND_UP(dim, _distance_fn->get_required_alignment());
alloc_aligned(((void **)&_data), this->_capacity * _aligned_dim * sizeof(data_t), 8 * sizeof(data_t));
std::memset(_data, 0, this->_capacity * _aligned_dim * sizeof(data_t));

_data_size = this->_capacity * _aligned_dim * sizeof(data_t);
}

template <typename data_t> InMemDataStore<data_t>::~InMemDataStore()
Expand All @@ -33,6 +35,11 @@ template <typename data_t> size_t InMemDataStore<data_t>::get_aligned_dim() cons
return _aligned_dim;
}

// Returns the cached byte size of the vector buffer. The value is recorded as
// capacity * aligned_dim * sizeof(data_t) when the store is constructed and
// is refreshed whenever the buffer is expanded or shrunk.
template <typename data_t>
size_t InMemDataStore<data_t>::get_data_size() const
{
    return this->_data_size;
}

template <typename data_t> size_t InMemDataStore<data_t>::get_alignment_factor() const
{
return _distance_fn->get_required_alignment();
Expand Down Expand Up @@ -251,6 +258,7 @@ template <typename data_t> location_t InMemDataStore<data_t>::expand(const locat
#else
realloc_aligned((void **)&_data, new_size * _aligned_dim * sizeof(data_t), 8 * sizeof(data_t));
#endif
this->_data_size = new_size * _aligned_dim * sizeof(data_t);
this->_capacity = new_size;
return this->_capacity;
}
Expand All @@ -277,6 +285,7 @@ template <typename data_t> location_t InMemDataStore<data_t>::shrink(const locat
#else
realloc_aligned((void **)&_data, new_size * _aligned_dim * sizeof(data_t), 8 * sizeof(data_t));
#endif
this->_data_size = new_size * _aligned_dim * sizeof(data_t);
this->_capacity = new_size;
return this->_capacity;
}
Expand Down
6 changes: 6 additions & 0 deletions src/in_mem_graph_store.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ std::tuple<uint32_t, uint32_t, size_t> InMemGraphStore::load_impl(const std::str
_max_range_of_graph = k;
}
}
_graph_size = cc * sizeof(uint32_t);

diskann::cout << "done. Index has " << nodes_read << " nodes and " << cc << " out-edges, _start is set to " << start
<< std::endl;
Expand Down Expand Up @@ -241,4 +242,9 @@ uint32_t InMemGraphStore::get_max_observed_degree()
return _max_observed_degree;
}

size_t InMemGraphStore::get_graph_size()
{
return _graph_size;
}

} // namespace diskann
6 changes: 6 additions & 0 deletions src/in_mem_static_graph_store.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ std::tuple<uint32_t, uint32_t, size_t> InMemStaticGraphStore::load_impl(const st
}
}

_graph_size = cc * sizeof(uint32_t);
diskann::cout << "done. Index has " << nodes_read << " nodes and " << cc << " out-edges, _start is set to " << start
<< std::endl;
return std::make_tuple(nodes_read, start, file_frozen_pts);
Expand All @@ -197,4 +198,9 @@ uint32_t InMemStaticGraphStore::get_max_observed_degree()
return _max_observed_degree;
}

size_t InMemStaticGraphStore::get_graph_size()
{
return _graph_size;
}

} // namespace diskann
37 changes: 36 additions & 1 deletion src/index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,10 @@ size_t Index<T, TagT, LabelT>::load_tags(const std::string tag_filename)
#else
load_bin<TagT>(std::string(tag_filename), tag_data, file_num_points, file_dim);
#endif
this->_table_stats.tag_memory_usage =
file_num_points * file_dim * sizeof(TagT)
+ file_num_points * (sizeof(TagT) + sizeof(uint32_t))
+ file_num_points * (sizeof(TagT) + sizeof(uint32_t));

if (file_dim != 1)
{
Expand Down Expand Up @@ -562,6 +566,9 @@ void Index<T, TagT, LabelT>::load(const char *filename, uint32_t num_threads, ui
std::string delete_set_file = std::string(filename) + ".del";
std::string graph_file = std::string(filename);
data_file_num_pts = load_data(data_file);
this->_table_stats.node_count = data_file_num_pts;
this->_table_stats.node_mem_usage = this->_data_store->get_data_size();

if (file_exists(delete_set_file))
{
load_delete_set(delete_set_file);
Expand All @@ -571,6 +578,7 @@ void Index<T, TagT, LabelT>::load(const char *filename, uint32_t num_threads, ui
tags_file_num_pts = load_tags(tags_file);
}
graph_num_pts = load_graph(graph_file, data_file_num_pts);
this->_table_stats.graph_mem_usage = _graph_store->get_graph_size();
#endif
}
else
Expand All @@ -594,8 +602,12 @@ void Index<T, TagT, LabelT>::load(const char *filename, uint32_t num_threads, ui
if (file_exists(labels_file))
{
_label_map = load_label_map(labels_map_file);
this->_table_stats.label_count = _label_map.size();

parse_label_file_in_bitset(labels_file, label_num_pts, _label_map.size());
assert(label_num_pts == data_file_num_pts - _num_frozen_pts);
this->_table_stats.label_mem_usage = _bitmask_buf._buf.size() * sizeof(std::uint64_t);

if (file_exists(labels_to_medoids))
{
std::ifstream medoid_stream(labels_to_medoids);
Expand Down Expand Up @@ -644,8 +656,14 @@ void Index<T, TagT, LabelT>::load(const char *filename, uint32_t num_threads, ui
{
_empty_slots.insert((uint32_t)i);
}

reposition_frozen_point_to_end();

// Roll the per-component figures up into the overall total. The result must
// go to total_mem_usage: writing it to tag_memory_usage would overwrite the
// tag figure computed while loading tags and leave the total at zero.
_table_stats.total_mem_usage = _table_stats.node_mem_usage
    + _table_stats.graph_mem_usage
    + _table_stats.label_mem_usage
    + _table_stats.tag_memory_usage;

diskann::cout << "Num frozen points:" << _num_frozen_pts << " _nd: " << _nd << " _start: " << _start
<< " size(_location_to_tag): " << _location_to_tag.size()
<< " size(_tag_to_location):" << _tag_to_location.size() << " Max points: " << _max_points
Expand Down Expand Up @@ -2024,6 +2042,7 @@ void Index<T, TagT, LabelT>::parse_label_file_in_bitset(const std::string& label
simple_bitmask bm(_bitmask_buf.get_bitmask(line_cnt), _bitmask_buf._bitmask_size);
bm.set(token_as_num);
_labels.insert(token_as_num);
_table_stats.label_total_count++;

lbl_pos = next_lbl_pos + 1;
}
Expand Down Expand Up @@ -3235,6 +3254,11 @@ int Index<T, TagT, LabelT>::insert_point(const T *point, const TagT tag, const s

inter_insert(location, pruned_list, scratch);

// only support single thread insert
_table_stats.insert_count++;
_table_stats.active_nodes++;
_table_stats.node_count++;

return 0;
}

Expand Down Expand Up @@ -3285,6 +3309,11 @@ template <typename T, typename TagT, typename LabelT> int Index<T, TagT, LabelT>
_delete_set->insert(location);
_location_to_tag.erase(location);
_tag_to_location.erase(tag);

//only support single thread delete
_table_stats.delete_count++;
_table_stats.active_nodes--;

return 0;
}

Expand Down Expand Up @@ -3565,6 +3594,12 @@ size_t Index<T, TagT, LabelT>::search_string_range(const std::string& str, char
return std::string::npos;
}

// Returns a by-value copy of the metrics gathered so far, so the caller's
// snapshot does not change if the index is modified afterwards.
template <typename T, typename TagT, typename LabelT>
TableStats Index<T, TagT, LabelT>::get_table_stats() const
{
    return this->_table_stats;
}

/* Internals of the library */
template <typename T, typename TagT, typename LabelT> const float Index<T, TagT, LabelT>::INDEX_GROWTH_FACTOR = 1.5f;

Expand Down
7 changes: 7 additions & 0 deletions src/pq_data_store.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -234,9 +234,16 @@ template <typename data_t> location_t PQDataStore<data_t>::load_impl(const std::
auto pivots_file = _pq_distance_fn->get_pivot_data_filename(file_prefix);
_pq_distance_fn->load_pivot_data(pivots_file, _num_chunks);

_data_size = num_points * _num_chunks * sizeof(data_t);

return this->_capacity;
}

// Returns the cached byte size of the data store. load_impl records it as
// num_points * _num_chunks * sizeof(data_t); it is 0 until then.
template <typename data_t>
size_t PQDataStore<data_t>::get_data_size() const
{
    return this->_data_size;
}

template <typename data_t> location_t PQDataStore<data_t>::expand(const location_t new_size)
{
throw std::logic_error("Not implemented yet");
Expand Down
Loading

0 comments on commit 18455de

Please sign in to comment.