From 18455de7261287e75e8d72da82bc6144bd5e9716 Mon Sep 17 00:00:00 2001 From: Jerry Gao <109158931+Sanhaoji2@users.noreply.github.com> Date: Thu, 5 Dec 2024 12:01:08 +0800 Subject: [PATCH] Add metric (#606) --- include/abstract_data_store.h | 2 ++ include/abstract_graph_store.h | 2 ++ include/abstract_index.h | 2 ++ include/in_mem_data_store.h | 4 ++++ include/in_mem_graph_store.h | 3 +++ include/in_mem_static_graph_store.h | 3 +++ include/index.h | 5 ++++ include/percentile_stats.h | 17 +++++++++++++ include/pq_data_store.h | 4 ++++ include/pq_flash_index.h | 4 ++++ src/in_mem_data_store.cpp | 9 +++++++ src/in_mem_graph_store.cpp | 6 +++++ src/in_mem_static_graph_store.cpp | 6 +++++ src/index.cpp | 37 ++++++++++++++++++++++++++++- src/pq_data_store.cpp | 7 ++++++ src/pq_flash_index.cpp | 22 ++++++++++++++++- 16 files changed, 131 insertions(+), 2 deletions(-) diff --git a/include/abstract_data_store.h b/include/abstract_data_store.h index 165ada696..96d449475 100644 --- a/include/abstract_data_store.h +++ b/include/abstract_data_store.h @@ -41,6 +41,8 @@ template class AbstractDataStore // align the dimension by padding zeros. 
virtual size_t get_aligned_dim() const = 0; + virtual size_t get_data_size() const = 0; + // populate the store with vectors (either from a pointer or bin file), // potentially after pre-processing the vectors if the metric deems so // e.g., normalizing vectors for cosine distance over floating-point vectors diff --git a/include/abstract_graph_store.h b/include/abstract_graph_store.h index 115d9ed1c..eec9e71d0 100644 --- a/include/abstract_graph_store.h +++ b/include/abstract_graph_store.h @@ -43,6 +43,8 @@ class AbstractGraphStore // set during load virtual size_t get_max_range_of_graph() = 0; + virtual size_t get_graph_size() = 0; + // Total internal points _max_points + _num_frozen_points size_t get_total_points() { diff --git a/include/abstract_index.h b/include/abstract_index.h index 7c84a8ec9..68c8fda7a 100644 --- a/include/abstract_index.h +++ b/include/abstract_index.h @@ -5,6 +5,7 @@ #include "types.h" #include "index_config.h" #include "index_build_params.h" +#include "percentile_stats.h" #include namespace diskann @@ -108,6 +109,7 @@ class AbstractIndex virtual bool is_label_valid(const std::string &raw_label) const = 0; virtual bool is_set_universal_label() const = 0; + virtual TableStats get_table_stats() const = 0; private: virtual void _build(const DataType &data, const size_t num_points_to_load, TagVector &tags) = 0; diff --git a/include/in_mem_data_store.h b/include/in_mem_data_store.h index eaa1562e0..489f1a729 100644 --- a/include/in_mem_data_store.h +++ b/include/in_mem_data_store.h @@ -29,6 +29,8 @@ template class InMemDataStore : public AbstractDataStore class InMemDataStore : public AbstractDataStore load_impl(const std::string &filename, size_t expected_num_points); #ifdef EXEC_ENV_OLS @@ -44,6 +46,7 @@ class InMemGraphStore : public AbstractGraphStore private: size_t _max_range_of_graph = 0; uint32_t _max_observed_degree = 0; + size_t _graph_size = 0; std::vector> _graph; }; diff --git a/include/in_mem_static_graph_store.h 
b/include/in_mem_static_graph_store.h index a24066493..6f74e8ad1 100644 --- a/include/in_mem_static_graph_store.h +++ b/include/in_mem_static_graph_store.h @@ -59,6 +59,8 @@ class InMemStaticGraphStore : public AbstractGraphStore virtual size_t get_max_range_of_graph() override; virtual uint32_t get_max_observed_degree() override; + virtual size_t get_graph_size() override; + protected: virtual std::tuple load_impl(const std::string& filename, size_t expected_num_points); #ifdef EXEC_ENV_OLS @@ -69,6 +71,7 @@ class InMemStaticGraphStore : public AbstractGraphStore private: size_t _max_range_of_graph = 0; uint32_t _max_observed_degree = 0; + size_t _graph_size = 0; std::vector _node_index; std::vector _graph; diff --git a/include/index.h b/include/index.h index 320942013..7b7269efb 100644 --- a/include/index.h +++ b/include/index.h @@ -21,6 +21,7 @@ #include "in_mem_data_store.h" #include "in_mem_graph_store.h" #include "abstract_index.h" +#include "percentile_stats.h" #include #include "quantized_distance.h" @@ -306,6 +307,8 @@ template clas DISKANN_DLLEXPORT void count_nodes_at_bfs_levels(); + DISKANN_DLLEXPORT TableStats get_table_stats() const override; + // This variable MUST be updated if the number of entries in the metadata // change. 
DISKANN_DLLEXPORT static const int METADATA_ROWS = 5; @@ -567,6 +570,8 @@ template clas simple_bitmask_buf _bitmask_buf; + TableStats _table_stats; + static const float INDEX_GROWTH_FACTOR; }; } // namespace diskann diff --git a/include/percentile_stats.h b/include/percentile_stats.h index 793257577..5ff563afa 100644 --- a/include/percentile_stats.h +++ b/include/percentile_stats.h @@ -35,6 +35,23 @@ struct QueryStats unsigned n_hops = 0; // # search hops }; +struct TableStats +{ + size_t total_mem_usage = 0; + size_t node_mem_usage = 0; + size_t graph_mem_usage = 0; + size_t label_mem_usage = 0; + size_t node_count = 0; + size_t label_count = 0; + size_t label_total_count = 0; + + // streaming + size_t tag_memory_usage = 0; + size_t insert_count = 0; + size_t delete_count = 0; + size_t active_nodes = 0; +}; + template inline T get_percentile_stats(QueryStats *stats, uint64_t len, float percentile, const std::function &member_fn) diff --git a/include/pq_data_store.h b/include/pq_data_store.h index 227b8a6af..c324e654a 100644 --- a/include/pq_data_store.h +++ b/include/pq_data_store.h @@ -34,6 +34,8 @@ template class PQDataStore : public AbstractDataStore // for Quantized data stores. virtual size_t get_aligned_dim() const override; + virtual size_t get_data_size() const override; + // Populate quantized data from unaligned data using PQ functionality virtual void populate_data(const data_t *vectors, const location_t num_pts) override; virtual void populate_data(const std::string &filename, const size_t offset) override; @@ -86,6 +88,8 @@ template class PQDataStore : public AbstractDataStore uint8_t *_quantized_data = nullptr; size_t _num_chunks = 0; + size_t _data_size = 0; + // REFACTOR TODO: Doing this temporarily before refactoring OPQ into // its own class. Remove later. 
bool _use_opq = false; diff --git a/include/pq_flash_index.h b/include/pq_flash_index.h index 2b26f1177..964437189 100644 --- a/include/pq_flash_index.h +++ b/include/pq_flash_index.h @@ -95,6 +95,8 @@ template class PQFlashIndex DISKANN_DLLEXPORT uint64_t get_data_dim(); + DISKANN_DLLEXPORT TableStats get_table_stats(); + std::shared_ptr &reader; DISKANN_DLLEXPORT diskann::Metric get_metric(); @@ -241,6 +243,8 @@ template class PQFlashIndex tsl::robin_map> _real_to_dummy_map; std::unordered_map _label_map; + TableStats _table_stats; + #ifdef EXEC_ENV_OLS // Set to a larger value than the actual header to accommodate // any additions we make to the header. This is an outer limit diff --git a/src/in_mem_data_store.cpp b/src/in_mem_data_store.cpp index 28bb7ba4c..0d62b1a45 100644 --- a/src/in_mem_data_store.cpp +++ b/src/in_mem_data_store.cpp @@ -18,6 +18,8 @@ InMemDataStore::InMemDataStore(const location_t num_points, const size_t _aligned_dim = ROUND_UP(dim, _distance_fn->get_required_alignment()); alloc_aligned(((void **)&_data), this->_capacity * _aligned_dim * sizeof(data_t), 8 * sizeof(data_t)); std::memset(_data, 0, this->_capacity * _aligned_dim * sizeof(data_t)); + + _data_size = this->_capacity * _aligned_dim * sizeof(data_t); } template InMemDataStore::~InMemDataStore() @@ -33,6 +35,11 @@ template size_t InMemDataStore::get_aligned_dim() cons return _aligned_dim; } +template size_t InMemDataStore::get_data_size() const +{ + return _data_size; +} + template size_t InMemDataStore::get_alignment_factor() const { return _distance_fn->get_required_alignment(); @@ -251,6 +258,7 @@ template location_t InMemDataStore::expand(const locat #else realloc_aligned((void **)&_data, new_size * _aligned_dim * sizeof(data_t), 8 * sizeof(data_t)); #endif + this->_data_size = new_size * _aligned_dim * sizeof(data_t); this->_capacity = new_size; return this->_capacity; } @@ -277,6 +285,7 @@ template location_t InMemDataStore::shrink(const locat #else realloc_aligned((void 
**)&_data, new_size * _aligned_dim * sizeof(data_t), 8 * sizeof(data_t)); #endif + this->_data_size = new_size * _aligned_dim * sizeof(data_t); this->_capacity = new_size; return this->_capacity; } diff --git a/src/in_mem_graph_store.cpp b/src/in_mem_graph_store.cpp index 6ba41b148..feb8810bc 100644 --- a/src/in_mem_graph_store.cpp +++ b/src/in_mem_graph_store.cpp @@ -193,6 +193,7 @@ std::tuple InMemGraphStore::load_impl(const std::str _max_range_of_graph = k; } } + _graph_size = cc * sizeof(uint32_t); diskann::cout << "done. Index has " << nodes_read << " nodes and " << cc << " out-edges, _start is set to " << start << std::endl; @@ -241,4 +242,9 @@ uint32_t InMemGraphStore::get_max_observed_degree() return _max_observed_degree; } +size_t InMemGraphStore::get_graph_size() +{ + return _graph_size; +} + } // namespace diskann diff --git a/src/in_mem_static_graph_store.cpp b/src/in_mem_static_graph_store.cpp index ed73edd93..46e3ac1fb 100644 --- a/src/in_mem_static_graph_store.cpp +++ b/src/in_mem_static_graph_store.cpp @@ -182,6 +182,7 @@ std::tuple InMemStaticGraphStore::load_impl(const st } } + _graph_size = cc * sizeof(uint32_t); diskann::cout << "done. 
Index has " << nodes_read << " nodes and " << cc << " out-edges, _start is set to " << start << std::endl; return std::make_tuple(nodes_read, start, file_frozen_pts); @@ -197,4 +198,9 @@ uint32_t InMemStaticGraphStore::get_max_observed_degree() return _max_observed_degree; } +size_t InMemStaticGraphStore::get_graph_size() +{ + return _graph_size; +} + } // namespace diskann diff --git a/src/index.cpp b/src/index.cpp index 0b01afa20..67254fff1 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -425,6 +425,10 @@ size_t Index::load_tags(const std::string tag_filename) #else load_bin(std::string(tag_filename), tag_data, file_num_points, file_dim); #endif + this->_table_stats.tag_memory_usage = + file_num_points * file_dim * sizeof(TagT) + + file_num_points * (sizeof(TagT) + sizeof(uint32_t)) + + file_num_points * (sizeof(TagT) + sizeof(uint32_t)); if (file_dim != 1) { @@ -562,6 +566,9 @@ void Index::load(const char *filename, uint32_t num_threads, ui std::string delete_set_file = std::string(filename) + ".del"; std::string graph_file = std::string(filename); data_file_num_pts = load_data(data_file); + this->_table_stats.node_count = data_file_num_pts; + this->_table_stats.node_mem_usage = this->_data_store->get_data_size(); + if (file_exists(delete_set_file)) { load_delete_set(delete_set_file); @@ -571,6 +578,7 @@ void Index::load(const char *filename, uint32_t num_threads, ui tags_file_num_pts = load_tags(tags_file); } graph_num_pts = load_graph(graph_file, data_file_num_pts); + this->_table_stats.graph_mem_usage = _graph_store->get_graph_size(); #endif } else @@ -594,8 +602,12 @@ void Index::load(const char *filename, uint32_t num_threads, ui if (file_exists(labels_file)) { _label_map = load_label_map(labels_map_file); + this->_table_stats.label_count = _label_map.size(); + parse_label_file_in_bitset(labels_file, label_num_pts, _label_map.size()); assert(label_num_pts == data_file_num_pts - _num_frozen_pts); + this->_table_stats.label_mem_usage = 
_bitmask_buf._buf.size() * sizeof(std::uint64_t); + if (file_exists(labels_to_medoids)) { std::ifstream medoid_stream(labels_to_medoids); @@ -644,8 +656,14 @@ void Index::load(const char *filename, uint32_t num_threads, ui { _empty_slots.insert((uint32_t)i); } - + reposition_frozen_point_to_end(); + + _table_stats.total_mem_usage = _table_stats.node_mem_usage + + _table_stats.graph_mem_usage + + _table_stats.label_mem_usage + + _table_stats.tag_memory_usage; /* total = vectors, graph, labels and tags combined; tag_memory_usage keeps the value set in load_tags */ + diskann::cout << "Num frozen points:" << _num_frozen_pts << " _nd: " << _nd << " _start: " << _start << " size(_location_to_tag): " << _location_to_tag.size() << " size(_tag_to_location):" << _tag_to_location.size() << " Max points: " << _max_points @@ -2024,6 +2042,7 @@ void Index::parse_label_file_in_bitset(const std::string& label simple_bitmask bm(_bitmask_buf.get_bitmask(line_cnt), _bitmask_buf._bitmask_size); bm.set(token_as_num); _labels.insert(token_as_num); + _table_stats.label_total_count++; lbl_pos = next_lbl_pos + 1; } @@ -3235,6 +3254,11 @@ int Index::insert_point(const T *point, const TagT tag, const s inter_insert(location, pruned_list, scratch); + // only support single thread insert + _table_stats.insert_count++; + _table_stats.active_nodes++; + _table_stats.node_count++; + return 0; } @@ -3285,6 +3309,11 @@ template int Index _delete_set->insert(location); _location_to_tag.erase(location); _tag_to_location.erase(tag); + + //only support single thread delete + _table_stats.delete_count++; + _table_stats.active_nodes--; + return 0; } @@ -3565,6 +3594,12 @@ size_t Index::search_string_range(const std::string& str, char return std::string::npos; } +template +TableStats Index::get_table_stats() const +{ + return _table_stats; +} + /* Internals of the library */ template const float Index::INDEX_GROWTH_FACTOR = 1.5f; diff --git a/src/pq_data_store.cpp b/src/pq_data_store.cpp index 2136c71e2..ba153ba22 100644 --- a/src/pq_data_store.cpp +++ b/src/pq_data_store.cpp @@ -234,9 +234,16 @@ template 
location_t PQDataStore::load_impl(const std:: auto pivots_file = _pq_distance_fn->get_pivot_data_filename(file_prefix); _pq_distance_fn->load_pivot_data(pivots_file, _num_chunks); + _data_size = num_points * _num_chunks * sizeof(uint8_t); /* _quantized_data is a uint8_t buffer, so size it by uint8_t rather than data_t */ + return this->_capacity; } +template size_t PQDataStore::get_data_size() const +{ + return _data_size; +} + template location_t PQDataStore::expand(const location_t new_size) { throw std::logic_error("Not implemented yet"); diff --git a/src/pq_flash_index.cpp b/src/pq_flash_index.cpp index 8e1b6e985..69386c7db 100644 --- a/src/pq_flash_index.cpp +++ b/src/pq_flash_index.cpp @@ -707,7 +707,11 @@ void PQFlashIndex::parse_label_file(std::basic_istream& infile, uint32_t num_pts_in_label_file; uint32_t num_total_labels; get_label_file_metadata(buffer, num_pts_in_label_file, num_total_labels); - + this->_table_stats.label_total_count = num_total_labels; + this->_table_stats.label_mem_usage = num_pts_in_label_file * sizeof(uint32_t) + + num_pts_in_label_file * sizeof(uint32_t) + + num_total_labels * sizeof(LabelT); + _pts_to_label_offsets = new uint32_t[num_pts_in_label_file]; _pts_to_label_counts = new uint32_t[num_pts_in_label_file]; _pts_to_labels = new LabelT[num_total_labels]; @@ -863,6 +867,10 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons this->_num_points = npts_u64; this->_n_chunks = nchunks_u64; + + this->_table_stats.node_count = npts_u64; + this->_table_stats.node_mem_usage = npts_u64 * nchunks_u64; + #ifdef EXEC_ENV_OLS if (files.fileExists(labels_file)) { @@ -891,6 +899,7 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons std::ifstream map_reader(labels_map_file); #endif _label_map = load_label_map(map_reader); + this->_table_stats.label_count = _label_map.size(); #ifndef EXEC_ENV_OLS map_reader.close(); @@ -1222,6 +1231,12 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons diskann::cout << "Setting re-scaling factor of base vectors to " << 
this->_max_base_norm << std::endl; delete[] norm_val; } + + _table_stats.total_mem_usage = _table_stats.node_mem_usage + + _table_stats.graph_mem_usage + + _table_stats.label_mem_usage + + _table_stats.tag_memory_usage; /* total = sum of the per-component usages; tag_memory_usage itself is left untouched */ + diskann::cout << "done.." << std::endl; return 0; } @@ -1785,6 +1800,11 @@ template uint64_t PQFlashIndex::get_dat return _data_dim; } +template TableStats PQFlashIndex::get_table_stats() +{ + return _table_stats; +} + template diskann::Metric PQFlashIndex::get_metric() { return this->metric;