diff --git a/apps/search_disk_index.cpp b/apps/search_disk_index.cpp index 4b8d91a45..23e40390a 100644 --- a/apps/search_disk_index.cpp +++ b/apps/search_disk_index.cpp @@ -33,8 +33,9 @@ namespace po = boost::program_options; -#ifdef DISKANN_DEBUG_PRINT_RETSET -void dump_retset(uint64_t test_id, uint64_t query_num, diskann::QueryStats *stats, const std::string &result_output_prefix) +#ifdef DISKANN_DEBUG_PRINT_RETSET +void dump_retset(uint64_t test_id, uint64_t query_num, diskann::QueryStats *stats, + const std::string &result_output_prefix) { std::stringstream ss; if (stats != nullptr) @@ -44,12 +45,10 @@ void dump_retset(uint64_t test_id, uint64_t query_num, diskann::QueryStats *stat ss << i << "\t"; for (int j = 0; j < (stats + i)->query_retset.size(); j++) { - ss << "(" << (stats + i)->query_retset[j].id << ", " << (stats + i)->query_retset[j].distance - << "), "; + ss << "(" << (stats + i)->query_retset[j].id << ", " << (stats + i)->query_retset[j].distance << "), "; } ss << std::endl; } - } std::string results_file = result_output_prefix + "_L" + std::to_string(test_id) + "_retset.tsv"; std::ofstream writer(results_file); @@ -148,7 +147,6 @@ void write_gt_to_tsv(const std::string &cur_result_path, uint64_t query_num, uin } #endif - void print_stats(std::string category, std::vector percentiles, std::vector results) { diskann::cout << std::setw(20) << category << ": " << std::flush; @@ -165,10 +163,10 @@ void print_stats(std::string category, std::vector percentiles, std::vect diskann::cout << std::endl; } -template +template void parse_labels_of_query(const std::string &filters_for_query, - std::unique_ptr> &pFlashIndex, - std::vector &label_ids_for_query) + std::unique_ptr> &pFlashIndex, + std::vector &label_ids_for_query) { std::vector label_strs_for_query; diskann::split_string(filters_for_query, FILTER_OR_SEPARATOR, label_strs_for_query); @@ -178,10 +176,11 @@ void parse_labels_of_query(const std::string &filters_for_query, } } -template +template void populate_label_ids(const std::vector &filters_of_queries, std::unique_ptr> &pFlashIndex, - std::vector> &label_ids_of_queries, bool apply_one_to_all, uint32_t query_count) + std::vector> &label_ids_of_queries, bool apply_one_to_all, + uint32_t query_count) { if (apply_one_to_all) { @@ -332,11 +331,9 @@ int search_disk_index(diskann::Metric &metric, const std::string &index_path_pre std::vector> per_query_label_ids; if (filtered_search) { - populate_label_ids(query_filters, _pFlashIndex, per_query_label_ids, (query_filters.size() == 1), query_num ); + populate_label_ids(query_filters, _pFlashIndex, per_query_label_ids, (query_filters.size() == 1), query_num); } - - diskann::cout.setf(std::ios_base::fixed, std::ios_base::floatfield); diskann::cout.precision(2); @@ -402,8 +399,8 @@ int search_disk_index(diskann::Metric &metric, const std::string &index_path_pre { _pFlashIndex->cached_beam_search( query + (i * query_aligned_dim), recall_at, L, query_result_ids_64.data() + (i * recall_at), - query_result_dists[test_id].data() + (i * recall_at), optimized_beamwidth, true, per_query_label_ids[i], - search_io_limit, use_reorder_data, stats + i); + query_result_dists[test_id].data() + (i * recall_at), optimized_beamwidth, true, + per_query_label_ids[i], search_io_limit, use_reorder_data, stats + i); } } auto e = std::chrono::high_resolution_clock::now(); @@ -448,7 +445,7 @@ int search_disk_index(diskann::Metric &metric, const std::string &index_path_pre << std::setw(16) << mean_io_us << std::setw(16) << mean_cpuus; if (calc_recall_flag) { - diskann::cout << std::setw(16) << recall << std::endl ; + diskann::cout << std::setw(16) << recall << std::endl; } else { diff --git a/include/percentile_stats.h b/include/percentile_stats.h index 361b0109e..ad819d4bc 100644 --- a/include/percentile_stats.h +++ b/include/percentile_stats.h @@ -35,7 +35,7 @@ struct QueryStats unsigned n_hops = 0; // # search hops #ifdef DISKANN_DEBUG_PRINT_RETSET - std::vector query_retset; //copy of the retset to debug PQ distances. + std::vector query_retset; // copy of the retset to debug PQ distances. #endif }; diff --git a/include/pq_flash_index.h b/include/pq_flash_index.h index 5eaf85a06..c6926886c 100644 --- a/include/pq_flash_index.h +++ b/include/pq_flash_index.h @@ -19,9 +19,9 @@ #define FULL_PRECISION_REORDER_MULTIPLIER 3 #define DEFAULT_VISITED_RESERVE_SIZE 4096 -//default max filters per query is set to the same -//as what we expect Bing to provide. If this is overkill, -//it can be set by clients in the load() function +// default max filters per query is set to the same +// as what we expect Bing to provide. If this is overkill, +// it can be set by clients in the load() function #define DEFAULT_MAX_FILTERS_PER_QUERY 4096 namespace diskann @@ -51,8 +51,7 @@ template class PQFlashIndex #ifdef EXEC_ENV_OLS DISKANN_DLLEXPORT int load_from_separate_paths(diskann::MemoryMappedFiles &files, uint32_t num_threads, const char *index_filepath, const char *pivots_filepath, - const char *compressed_filepath, - uint32_t max_filters_per_query); + const char *compressed_filepath, uint32_t max_filters_per_query); #else DISKANN_DLLEXPORT int load_from_separate_paths(uint32_t num_threads, const char *index_filepath, const char *pivots_filepath, const char *compressed_filepath, diff --git a/src/pq_flash_index.cpp b/src/pq_flash_index.cpp index 4dcfe6497..4f3dde9de 100644 --- a/src/pq_flash_index.cpp +++ b/src/pq_flash_index.cpp @@ -121,7 +121,8 @@ template inline T *PQFlashIndex::offset } template -void PQFlashIndex::setup_thread_data(uint64_t nthreads, uint64_t visited_reserve, uint64_t max_filters_per_query) +void PQFlashIndex::setup_thread_data(uint64_t nthreads, uint64_t visited_reserve, + uint64_t max_filters_per_query) { diskann::cout << "Setting up thread-specific contexts for nthreads: " << nthreads << std::endl; // omp parallel for to generate unique thread IDs @@ -561,7 +562,8 @@ void PQFlashIndex::generate_random_labels(std::vector &labels } template -void PQFlashIndex::load_label_map(std::basic_istream &map_reader, std::unordered_map& string_to_int_map) +void PQFlashIndex::load_label_map(std::basic_istream &map_reader, + std::unordered_map &string_to_int_map) { std::string line, token; LabelT token_as_num; @@ -589,8 +591,7 @@ LabelT PQFlashIndex::get_converted_label(const std::string &filter_la return _universal_filter_label; } std::stringstream stream; - stream << "Unable to find label " << filter_label - << " in the Label Map "; + stream << "Unable to find label " << filter_label << " in the Label Map "; diskann::cerr << stream.str() << std::endl; throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); } @@ -678,7 +679,6 @@ bool PQFlashIndex::point_has_any_label(uint32_t point_id, const std:: return ret_val; } - template void PQFlashIndex::parse_label_file(std::basic_istream &infile, size_t &num_points_labels) { @@ -769,7 +769,8 @@ template void PQFlashIndex::set_univers } template -void PQFlashIndex::load_label_medoid_map(const std::string& labels_to_medoids_filepath, std::istream& medoid_stream) +void PQFlashIndex::load_label_medoid_map(const std::string &labels_to_medoids_filepath, + std::istream &medoid_stream) { std::string line, token; @@ -831,7 +832,7 @@ void PQFlashIndex::load_dummy_map(const std::string &dummy_map_filepa } catch (std::system_error &e) { - throw FileException (dummy_map_filepath, e, __FUNCSIG__, __FILE__, __LINE__); + throw FileException(dummy_map_filepath, e, __FUNCSIG__, __FILE__, __LINE__); } } @@ -940,10 +941,12 @@ template void PQFlashIndex::load_labels #ifdef EXEC_ENV_OLS template -int PQFlashIndex::load(MemoryMappedFiles &files, uint32_t num_threads, const char *index_prefix, uint32_t max_filters_per_query) +int PQFlashIndex::load(MemoryMappedFiles &files, uint32_t num_threads, const char *index_prefix, + uint32_t max_filters_per_query) { #else -template int PQFlashIndex::load(uint32_t num_threads, const char *index_prefix, uint32_t max_filters_per_query) +template +int PQFlashIndex::load(uint32_t num_threads, const char *index_prefix, uint32_t max_filters_per_query) { #endif std::string pq_table_bin = std::string(index_prefix) + "_pq_pivots.bin"; @@ -1405,16 +1408,18 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t NeighborPriorityQueue &retset = query_scratch->retset; std::vector &full_retset = query_scratch->full_retset; tsl::robin_set full_retset_ids; - if (use_filters) { + if (use_filters) + { uint64_t size_to_reserve = std::max(l_search, (std::min((uint64_t)filter_label_count, this->_max_degree) + 1)); retset.reserve(size_to_reserve); - full_retset.reserve(4096); + full_retset.reserve(4096); full_retset_ids.reserve(4096); - } else { + } + else + { retset.reserve(l_search + 1); } - uint32_t best_medoid = 0; uint32_t cur_list_size = 0; float best_dist = (std::numeric_limits::max)(); @@ -1437,7 +1442,9 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t #endif visited.insert(best_medoid); cur_list_size = 1; - } else { + } + else + { std::vector filter_specific_medoids; filter_specific_medoids.reserve(filter_label_count); location_t ctr = 0; @@ -1455,12 +1462,12 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t for (ctr = 0; ctr < filter_specific_medoids.size(); ctr++) { retset.insert(Neighbor(filter_specific_medoids[ctr], dist_scratch[ctr])); - //retset[ctr].id = filter_specific_medoids[ctr]; - //retset[ctr].distance = dist_scratch[ctr]; - //retset[ctr].expanded = false; + // retset[ctr].id = filter_specific_medoids[ctr]; + // retset[ctr].distance = dist_scratch[ctr]; + // retset[ctr].expanded = false; visited.insert(filter_specific_medoids[ctr]); } - cur_list_size = (uint32_t) filter_specific_medoids.size(); + cur_list_size = (uint32_t)filter_specific_medoids.size(); } uint32_t cmps = 0; @@ -1477,10 +1484,10 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t std::vector>> cached_nhoods; cached_nhoods.reserve(2 * beam_width); - //if we are doing multi-filter search we don't want to restrict the number of IOs - //at present. Must revisit this decision later. + // if we are doing multi-filter search we don't want to restrict the number of IOs + // at present. Must revisit this decision later. uint32_t max_ios_for_query = use_filters || (io_limit == 0) ? std::numeric_limits::max() : io_limit; - const std::vector& label_ids = filter_labels; //avoid renaming. + const std::vector &label_ids = filter_labels; // avoid renaming. std::vector lbl_vec; while (retset.has_unexpanded_node() && num_ios < max_ios_for_query) @@ -1494,9 +1501,8 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t // find new beam uint32_t num_seen = 0; - for (const auto &lbl : label_ids) - { + { uint32_t lbl_marker = 0; while (lbl_marker < cur_list_size) { @@ -1522,7 +1528,8 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t retset[lbl_marker].expanded = true; if (this->_count_visited_nodes) { - reinterpret_cast &>(this->_node_visit_counter[retset[lbl_marker].id].second) + reinterpret_cast &>( + this->_node_visit_counter[retset[lbl_marker].id].second) .fetch_add(1); } break; @@ -1645,7 +1652,6 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t #ifdef DISKANN_DEBUG_PRINT_RETSET stats->query_retset.push_back(nn); #endif - } } } @@ -1687,7 +1693,7 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t full_retset.push_back(Neighbor(real_id, cur_expanded_dist)); full_retset_ids.insert(real_id); } - + uint32_t *node_nbrs = (node_buf + 1); // compute node_nbrs <-> query dist in PQ space cpu_timer.reset(); @@ -1723,7 +1729,6 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t #ifdef DISKANN_DEBUG_PRINT_RETSET stats->query_retset.push_back(nn); #endif - } } diff --git a/src/scratch.cpp b/src/scratch.cpp index 99900454e..2a9c1f96e 100644 --- a/src/scratch.cpp +++ b/src/scratch.cpp @@ -93,7 +93,8 @@ template void SSDQueryScratch::reset() full_retset.clear(); } -template SSDQueryScratch::SSDQueryScratch(size_t aligned_dim, size_t visited_reserve, size_t max_filters_per_query) +template +SSDQueryScratch::SSDQueryScratch(size_t aligned_dim, size_t visited_reserve, size_t max_filters_per_query) { size_t coord_alloc_size = ROUND_UP(sizeof(T) * aligned_dim, 256); @@ -124,7 +125,8 @@ template SSDQueryScratch::~SSDQueryScratch() } template -SSDThreadData::SSDThreadData(size_t aligned_dim, size_t visited_reserve, size_t max_filters_per_query) : scratch(aligned_dim, visited_reserve, max_filters_per_query) +SSDThreadData::SSDThreadData(size_t aligned_dim, size_t visited_reserve, size_t max_filters_per_query) + : scratch(aligned_dim, visited_reserve, max_filters_per_query) { } diff --git a/src/utils.cpp b/src/utils.cpp index 481a63f64..d6a01a952 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -252,7 +252,7 @@ double calculate_range_search_recall(uint32_t num_queries, std::vector &pieces) +void split_string(const std::string &string_to_split, const std::string &delimiter, std::vector &pieces) { size_t start = 0; size_t end;