From 3f58b99777e6f462c4f487fe5c24b6288eb55436 Mon Sep 17 00:00:00 2001 From: Jon McLean <4429525+jonmclean@users.noreply.github.com> Date: Fri, 4 Aug 2023 09:21:57 -0700 Subject: [PATCH 01/23] Added PDoc workflow to publish github pages documentation (#412) * Added PDoc workflow * Added documentation to the push-test workflow * Added diskannpy to the env for pdoc to use * Initial commit of doc publish workflow * Tried heredoc to get python version * Tried another way of getting the version * Tried another way of getting the version * Moved to docs/python path * Removing the test harness * Add dependencies per wheel * Moved dependency tree to the 'push' file so it runs on push * Added label name to the dependency file * Trying maxtrix.os to get the os and version * Moved doc generation from push-test to python-release. Will add 'dev' doc generation to push-test * Publish latest/version docs only on release. Publish docs for every dev build on main. * Install the local-file version of the library * Disable branch check so I can test the install * Use python build to build a wheel for use in documentation * Tried changing to python instead of python3 * Added checkout depth in order to get boost * Use the python build action to create wheel for documentation * Revert "Use the python build action to create wheel for documentation" This reverts commit d900c1d42c0f4bc8295955e0d6da7a868a073661. 
* Added linux environment setup * Made only publish dev when on main and added comments --------- Co-authored-by: Jonathan McLean --- .github/workflows/build-python-pdoc.yml | 78 +++++++++++++++++++++++++ .github/workflows/push-test.yml | 16 +++++ .github/workflows/python-release.yml | 5 ++ 3 files changed, 99 insertions(+) create mode 100644 .github/workflows/build-python-pdoc.yml diff --git a/.github/workflows/build-python-pdoc.yml b/.github/workflows/build-python-pdoc.yml new file mode 100644 index 000000000..c9f4c6494 --- /dev/null +++ b/.github/workflows/build-python-pdoc.yml @@ -0,0 +1,78 @@ +name: DiskANN Build PDoc Documentation +on: [workflow_call] +jobs: + build-reference-documentation: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 1 + - name: Set up Python 3.9 + uses: actions/setup-python@v2 + with: + python-version: 3.9 + - name: Install python build + run: python -m pip install build + shell: bash + # Install required dependencies + - name: Prepare Linux environment + run: | + sudo scripts/dev/install-dev-deps-ubuntu.bash + shell: bash + # We need to build the wheel in order to run pdoc. pdoc does not seem to work if you just point it at + # our source directory. 
+ - name: Building Python Wheel for documentation generation + run: python -m build --wheel --outdir documentation_dist + shell: bash + - name: "Run Reference Documentation Generation" + run: | + pip install pdoc pipdeptree + pip install documentation_dist/*.whl + echo "documentation" > dependencies_documentation.txt + pipdeptree >> dependencies_documentation.txt + pdoc -o docs/python/html diskannpy + - name: Create version environment variable + run: | + echo "DISKANN_VERSION=$(python <> $GITHUB_ENV + - name: Archive documentation version artifact + uses: actions/upload-artifact@v2 + with: + name: dependencies + path: | + dependencies_documentation.txt + - name: Archive documentation artifacts + uses: actions/upload-artifact@v2 + with: + name: documentation-site + path: | + docs/python/html + # Publish to /dev if we are on the "main" branch + - name: Publish reference docs for latest development version (main branch) + uses: peaceiris/actions-gh-pages@v3 + if: github.ref == 'refs/heads/main' + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: docs/python/html + destination_dir: docs/python/dev + # Publish to / if we are on the "main" branch and releasing + - name: Publish reference docs by version (main branch) + uses: peaceiris/actions-gh-pages@v3 + if: github.event_name == 'release' && github.ref == 'refs/heads/main' + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: docs/python/html + destination_dir: docs/python/${{ env.DISKANN_VERSION }} + # Publish to /latest if we are on the "main" branch and releasing + - name: Publish latest reference docs (main branch) + uses: peaceiris/actions-gh-pages@v3 + if: github.event_name == 'release' && github.ref == 'refs/heads/main' + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: docs/python/html + destination_dir: docs/python/latest \ No newline at end of file diff --git a/.github/workflows/push-test.yml b/.github/workflows/push-test.yml index 4de999014..23403c0c2 100644 --- 
a/.github/workflows/push-test.yml +++ b/.github/workflows/push-test.yml @@ -6,6 +6,11 @@ jobs: fail-fast: true name: DiskANN Common Build Checks uses: ./.github/workflows/common.yml + build-documentation: + strategy: + fail-fast: true + name: DiskANN Build Documentation + uses: ./.github/workflows/build-python-pdoc.yml build: strategy: fail-fast: false @@ -28,6 +33,17 @@ jobs: with: fetch-depth: 1 submodules: true + - name: Build dispannpy dependency tree + run: | + pip install diskannpy pipdeptree + echo "dependencies" > dependencies_${{ matrix.os }}.txt + pipdeptree >> dependencies_${{ matrix.os }}.txt + - name: Archive dispannpy dependencies artifact + uses: actions/upload-artifact@v3 + with: + name: dependencies + path: | + dependencies_${{ matrix.os }}.txt - name: DiskANN Build CLI Applications uses: ./.github/actions/build # python: diff --git a/.github/workflows/python-release.yml b/.github/workflows/python-release.yml index a1e72ad90..a0414f13b 100644 --- a/.github/workflows/python-release.yml +++ b/.github/workflows/python-release.yml @@ -6,6 +6,11 @@ jobs: python-release-wheels: name: Python uses: ./.github/workflows/build-python.yml + build-documentation: + strategy: + fail-fast: true + name: DiskANN Build Documentation + uses: ./.github/workflows/build-python-pdoc.yml release: runs-on: ubuntu-latest needs: python-release-wheels From 637ed515aa8d0956b0bd6f057e789897cc12e05e Mon Sep 17 00:00:00 2001 From: Harsha Vardhan Simhadri Date: Tue, 8 Aug 2023 13:08:33 -0700 Subject: [PATCH 02/23] Update README.md (#416) --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 2922c16c1..a381bdedf 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,15 @@ # DiskANN -[![DiskANN Paper](https://img.shields.io/badge/Paper-NeurIPS%3A_DiskANN-blue)](https://papers.nips.cc/paper/9527-rand-nsg-fast-accurate-billion-point-nearest-neighbor-search-on-a-single-node.pdf) -[![DiskANN 
Paper](https://img.shields.io/badge/Paper-Arxiv%3A_Fresh--DiskANN-blue)](https://arxiv.org/abs/2105.09613) -[![DiskANN Paper](https://img.shields.io/badge/Paper-Filtered--DiskANN-blue)](https://harsha-simhadri.org/pubs/Filtered-DiskANN23.pdf) [![DiskANN Main](https://github.com/microsoft/DiskANN/actions/workflows/push-test.yml/badge.svg?branch=main)](https://github.com/microsoft/DiskANN/actions/workflows/push-test.yml) [![PyPI version](https://img.shields.io/pypi/v/diskannpy.svg)](https://pypi.org/project/diskannpy/) [![Downloads shield](https://pepy.tech/badge/diskannpy)](https://pepy.tech/project/diskannpy) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) +[![DiskANN Paper](https://img.shields.io/badge/Paper-NeurIPS%3A_DiskANN-blue)](https://papers.nips.cc/paper/9527-rand-nsg-fast-accurate-billion-point-nearest-neighbor-search-on-a-single-node.pdf) +[![DiskANN Paper](https://img.shields.io/badge/Paper-Arxiv%3A_Fresh--DiskANN-blue)](https://arxiv.org/abs/2105.09613) +[![DiskANN Paper](https://img.shields.io/badge/Paper-Filtered--DiskANN-blue)](https://harsha-simhadri.org/pubs/Filtered-DiskANN23.pdf) + + DiskANN is a suite of scalable, accurate and cost-effective approximate nearest neighbor search algorithms for large-scale vector search that support real-time changes and simple filters. This code is based on ideas from the [DiskANN](https://papers.nips.cc/paper/9527-rand-nsg-fast-accurate-billion-point-nearest-neighbor-search-on-a-single-node.pdf), [Fresh-DiskANN](https://arxiv.org/abs/2105.09613) and the [Filtered-DiskANN](https://harsha-simhadri.org/pubs/Filtered-DiskANN23.pdf) papers with further improvements. This code forked off from [code for NSG](https://github.com/ZJULearning/nsg) algorithm. 
From b5725718889529e60915acdde4a9d5fbe95f367f Mon Sep 17 00:00:00 2001 From: Harsha Vardhan Simhadri Date: Tue, 8 Aug 2023 15:00:51 -0700 Subject: [PATCH 03/23] moved ssd index defaults to defaults.h (#415) * moved ssd index constants to defaults.h --- include/defaults.h | 9 +++++++++ include/index.h | 2 +- include/scratch.h | 13 +++---------- src/disk_utils.cpp | 20 ++++++++++---------- src/index.cpp | 10 +++++----- src/pq_flash_index.cpp | 38 ++++++++++++++++++++------------------ src/scratch.cpp | 11 ++++++----- 7 files changed, 54 insertions(+), 49 deletions(-) diff --git a/include/defaults.h b/include/defaults.h index 2f157cb25..834234e57 100644 --- a/include/defaults.h +++ b/include/defaults.h @@ -14,6 +14,15 @@ const uint32_t MAX_OCCLUSION_SIZE = 750; const uint32_t FILTER_LIST_SIZE = 0; const uint32_t NUM_FROZEN_POINTS_STATIC = 0; const uint32_t NUM_FROZEN_POINTS_DYNAMIC = 1; + +// In-mem index related limits +const float GRAPH_SLACK_FACTOR = 1.3; + +// SSD Index related limits +const uint64_t MAX_GRAPH_DEGREE = 512; +const uint64_t SECTOR_LEN = 4096; +const uint64_t MAX_N_SECTOR_READS = 128; + // following constants should always be specified, but are useful as a // sensible default at cli / python boundaries const uint32_t MAX_DEGREE = 64; diff --git a/include/index.h b/include/index.h index 3ea80bc63..0d9b6edb9 100644 --- a/include/index.h +++ b/include/index.h @@ -31,7 +31,7 @@ namespace diskann inline double estimate_ram_usage(size_t size, uint32_t dim, uint32_t datasize, uint32_t degree) { double size_of_data = ((double)size) * ROUND_UP(dim, 8) * datasize; - double size_of_graph = ((double)size) * degree * sizeof(uint32_t) * GRAPH_SLACK_FACTOR; + double size_of_graph = ((double)size) * degree * sizeof(uint32_t) * defaults::GRAPH_SLACK_FACTOR; double size_of_locks = ((double)size) * sizeof(non_recursive_mutex); double size_of_outer_vector = ((double)size) * sizeof(ptrdiff_t); diff --git a/include/scratch.h b/include/scratch.h index 3b44f8f80..f685b36d9 
100644 --- a/include/scratch.h +++ b/include/scratch.h @@ -11,18 +11,11 @@ #include "tsl/robin_map.h" #include "tsl/sparse_map.h" -#include "neighbor.h" +#include "aligned_file_reader.h" #include "concurrent_queue.h" +#include "defaults.h" +#include "neighbor.h" #include "pq.h" -#include "aligned_file_reader.h" - -// In-mem index related limits -#define GRAPH_SLACK_FACTOR 1.3 - -// SSD Index related limits -#define MAX_GRAPH_DEGREE 512 -#define SECTOR_LEN (size_t)4096 -#define MAX_N_SECTOR_READS 128 namespace diskann { diff --git a/src/disk_utils.cpp b/src/disk_utils.cpp index aadeb6dd1..a0e4c25ed 100644 --- a/src/disk_utils.cpp +++ b/src/disk_utils.cpp @@ -895,14 +895,14 @@ void create_disk_layout(const std::string base_file, const std::string mem_index if (vamana_frozen_num == 1) vamana_frozen_loc = medoid; max_node_len = (((uint64_t)width_u32 + 1) * sizeof(uint32_t)) + (ndims_64 * sizeof(T)); - nnodes_per_sector = SECTOR_LEN / max_node_len; + nnodes_per_sector = defaults::SECTOR_LEN / max_node_len; diskann::cout << "medoid: " << medoid << "B" << std::endl; diskann::cout << "max_node_len: " << max_node_len << "B" << std::endl; diskann::cout << "nnodes_per_sector: " << nnodes_per_sector << "B" << std::endl; - // SECTOR_LEN buffer for each sector - std::unique_ptr sector_buf = std::make_unique(SECTOR_LEN); + // defaults::SECTOR_LEN buffer for each sector + std::unique_ptr sector_buf = std::make_unique(defaults::SECTOR_LEN); std::unique_ptr node_buf = std::make_unique(max_node_len); uint32_t &nnbrs = *(uint32_t *)(node_buf.get() + ndims_64 * sizeof(T)); uint32_t *nhood_buf = (uint32_t *)(node_buf.get() + (ndims_64 * sizeof(T)) + sizeof(uint32_t)); @@ -914,10 +914,10 @@ void create_disk_layout(const std::string base_file, const std::string mem_index if (append_reorder_data) { - n_data_nodes_per_sector = SECTOR_LEN / (ndims_reorder_file * sizeof(float)); + n_data_nodes_per_sector = defaults::SECTOR_LEN / (ndims_reorder_file * sizeof(float)); n_reorder_sectors = 
ROUND_UP(npts_64, n_data_nodes_per_sector) / n_data_nodes_per_sector; } - uint64_t disk_index_file_size = (n_sectors + n_reorder_sectors + 1) * SECTOR_LEN; + uint64_t disk_index_file_size = (n_sectors + n_reorder_sectors + 1) * defaults::SECTOR_LEN; std::vector output_file_meta; output_file_meta.push_back(npts_64); @@ -936,7 +936,7 @@ void create_disk_layout(const std::string base_file, const std::string mem_index } output_file_meta.push_back(disk_index_file_size); - diskann_writer.write(sector_buf.get(), SECTOR_LEN); + diskann_writer.write(sector_buf.get(), defaults::SECTOR_LEN); std::unique_ptr cur_node_coords = std::make_unique(ndims_64); diskann::cout << "# sectors: " << n_sectors << std::endl; @@ -947,7 +947,7 @@ void create_disk_layout(const std::string base_file, const std::string mem_index { diskann::cout << "Sector #" << sector << "written" << std::endl; } - memset(sector_buf.get(), 0, SECTOR_LEN); + memset(sector_buf.get(), 0, defaults::SECTOR_LEN); for (uint64_t sector_node_id = 0; sector_node_id < nnodes_per_sector && cur_node_id < npts_64; sector_node_id++) { memset(node_buf.get(), 0, max_node_len); @@ -985,7 +985,7 @@ void create_disk_layout(const std::string base_file, const std::string mem_index cur_node_id++; } // flush sector to disk - diskann_writer.write(sector_buf.get(), SECTOR_LEN); + diskann_writer.write(sector_buf.get(), defaults::SECTOR_LEN); } if (append_reorder_data) { @@ -1001,7 +1001,7 @@ void create_disk_layout(const std::string base_file, const std::string mem_index diskann::cout << "Reorder data Sector #" << sector << "written" << std::endl; } - memset(sector_buf.get(), 0, SECTOR_LEN); + memset(sector_buf.get(), 0, defaults::SECTOR_LEN); for (uint64_t sector_node_id = 0; sector_node_id < n_data_nodes_per_sector && sector_node_id < npts_64; sector_node_id++) @@ -1013,7 +1013,7 @@ void create_disk_layout(const std::string base_file, const std::string mem_index memcpy(sector_buf.get() + (sector_node_id * vec_len), vec_buf.get(), 
vec_len); } // flush sector to disk - diskann_writer.write(sector_buf.get(), SECTOR_LEN); + diskann_writer.write(sector_buf.get(), defaults::SECTOR_LEN); } } diskann_writer.close(); diff --git a/src/index.cpp b/src/index.cpp index 55ba60ac9..eb7592a4e 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1359,7 +1359,7 @@ void Index::inter_insert(uint32_t n, std::vector &pru auto &des_pool = _final_graph[des]; if (std::find(des_pool.begin(), des_pool.end(), n) == des_pool.end()) { - if (des_pool.size() < (uint64_t)(GRAPH_SLACK_FACTOR * range)) + if (des_pool.size() < (uint64_t)(defaults::GRAPH_SLACK_FACTOR * range)) { des_pool.emplace_back(n); prune_needed = false; @@ -1379,7 +1379,7 @@ void Index::inter_insert(uint32_t n, std::vector &pru tsl::robin_set dummy_visited(0); std::vector dummy_pool(0); - size_t reserveSize = (size_t)(std::ceil(1.05 * GRAPH_SLACK_FACTOR * range)); + size_t reserveSize = (size_t)(std::ceil(1.05 * defaults::GRAPH_SLACK_FACTOR * range)); dummy_visited.reserve(reserveSize); dummy_pool.reserve(reserveSize); @@ -1448,7 +1448,7 @@ void Index::link(const IndexWriteParameters ¶meters) for (size_t p = 0; p < _nd; p++) { - _final_graph[p].reserve((size_t)(std::ceil(_indexingRange * GRAPH_SLACK_FACTOR * 1.05))); + _final_graph[p].reserve((size_t)(std::ceil(_indexingRange * defaults::GRAPH_SLACK_FACTOR * 1.05))); } diskann::Timer link_timer; @@ -1473,7 +1473,7 @@ void Index::link(const IndexWriteParameters ¶meters) } { LockGuard guard(_locks[node]); - _final_graph[node].reserve((size_t)(_indexingRange * GRAPH_SLACK_FACTOR * 1.05)); + _final_graph[node].reserve((size_t)(_indexingRange * defaults::GRAPH_SLACK_FACTOR * 1.05)); _final_graph[node] = pruned_list; assert(_final_graph[node].size() <= _indexingRange); } @@ -3031,7 +3031,7 @@ int Index::insert_point(const T *point, const TagT tag) LockGuard guard(_locks[location]); _final_graph[location].clear(); - _final_graph[location].reserve((size_t)(_indexingRange * GRAPH_SLACK_FACTOR * 1.05)); + 
_final_graph[location].reserve((size_t)(_indexingRange * defaults::GRAPH_SLACK_FACTOR * 1.05)); for (auto link : pruned_list) { diff --git a/src/pq_flash_index.cpp b/src/pq_flash_index.cpp index 5bd23ecb0..a96d6e69f 100644 --- a/src/pq_flash_index.cpp +++ b/src/pq_flash_index.cpp @@ -150,11 +150,11 @@ template void PQFlashIndex::load_cache_ { AlignedRead read; char *buf = nullptr; - alloc_aligned((void **)&buf, SECTOR_LEN, SECTOR_LEN); + alloc_aligned((void **)&buf, defaults::SECTOR_LEN, defaults::SECTOR_LEN); nhoods.push_back(std::make_pair(node_list[node_idx], buf)); - read.len = SECTOR_LEN; + read.len = defaults::SECTOR_LEN; read.buf = buf; - read.offset = NODE_SECTOR_NO(node_list[node_idx]) * SECTOR_LEN; + read.offset = NODE_SECTOR_NO(node_list[node_idx]) * defaults::SECTOR_LEN; read_reqs.push_back(read); } @@ -377,12 +377,12 @@ void PQFlashIndex::cache_bfs_levels(uint64_t num_nodes_to_cache, std: for (size_t cur_pt = start; cur_pt < end; cur_pt++) { char *buf = nullptr; - alloc_aligned((void **)&buf, SECTOR_LEN, SECTOR_LEN); + alloc_aligned((void **)&buf, defaults::SECTOR_LEN, defaults::SECTOR_LEN); nhoods.emplace_back(nodes_to_expand[cur_pt], buf); AlignedRead read; - read.len = SECTOR_LEN; + read.len = defaults::SECTOR_LEN; read.buf = buf; - read.offset = NODE_SECTOR_NO(nodes_to_expand[cur_pt]) * SECTOR_LEN; + read.offset = NODE_SECTOR_NO(nodes_to_expand[cur_pt]) * defaults::SECTOR_LEN; read_reqs.push_back(read); } @@ -460,11 +460,11 @@ template void PQFlashIndex::use_medoids auto medoid = medoids[cur_m]; // read medoid nhood char *medoid_buf = nullptr; - alloc_aligned((void **)&medoid_buf, SECTOR_LEN, SECTOR_LEN); + alloc_aligned((void **)&medoid_buf, defaults::SECTOR_LEN, defaults::SECTOR_LEN); std::vector medoid_read(1); - medoid_read[0].len = SECTOR_LEN; + medoid_read[0].len = defaults::SECTOR_LEN; medoid_read[0].buf = medoid_buf; - medoid_read[0].offset = NODE_SECTOR_NO(medoid) * SECTOR_LEN; + medoid_read[0].offset = NODE_SECTOR_NO(medoid) * 
defaults::SECTOR_LEN; reader->read(medoid_read, ctx); // all data about medoid @@ -929,12 +929,12 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons READ_U64(index_metadata, nnodes_per_sector); max_degree = ((max_node_len - disk_bytes_per_point) / sizeof(uint32_t)) - 1; - if (max_degree > MAX_GRAPH_DEGREE) + if (max_degree > defaults::MAX_GRAPH_DEGREE) { std::stringstream stream; stream << "Error loading index. Ensure that max graph degree (R) does " "not exceed " - << MAX_GRAPH_DEGREE << std::endl; + << defaults::MAX_GRAPH_DEGREE << std::endl; throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); } @@ -1150,8 +1150,9 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t } } - if (beam_width > MAX_N_SECTOR_READS) - throw ANNException("Beamwidth can not be higher than MAX_N_SECTOR_READS", -1, __FUNCSIG__, __FILE__, __LINE__); + if (beam_width > defaults::MAX_N_SECTOR_READS) + throw ANNException("Beamwidth can not be higher than defaults::MAX_N_SECTOR_READS", -1, __FUNCSIG__, __FILE__, + __LINE__); ScratchStoreManager> manager(this->thread_data); auto data = manager.scratch_space(); @@ -1328,10 +1329,11 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t auto id = frontier[i]; std::pair fnhood; fnhood.first = id; - fnhood.second = sector_scratch + sector_scratch_idx * SECTOR_LEN; + fnhood.second = sector_scratch + sector_scratch_idx * defaults::SECTOR_LEN; sector_scratch_idx++; frontier_nhoods.push_back(fnhood); - frontier_read_reqs.emplace_back(NODE_SECTOR_NO(((size_t)id)) * SECTOR_LEN, SECTOR_LEN, fnhood.second); + frontier_read_reqs.emplace_back(NODE_SECTOR_NO(((size_t)id)) * defaults::SECTOR_LEN, + defaults::SECTOR_LEN, fnhood.second); if (stats != nullptr) { stats->n_4k++; @@ -1498,8 +1500,8 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t for (size_t i = 0; i < full_retset.size(); ++i) { - 
vec_read_reqs.emplace_back(VECTOR_SECTOR_NO(((size_t)full_retset[i].id)) * SECTOR_LEN, SECTOR_LEN, - sector_scratch + i * SECTOR_LEN); + vec_read_reqs.emplace_back(VECTOR_SECTOR_NO(((size_t)full_retset[i].id)) * defaults::SECTOR_LEN, + defaults::SECTOR_LEN, sector_scratch + i * defaults::SECTOR_LEN); if (stats != nullptr) { @@ -1522,7 +1524,7 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t for (size_t i = 0; i < full_retset.size(); ++i) { auto id = full_retset[i].id; - auto location = (sector_scratch + i * SECTOR_LEN) + VECTOR_SECTOR_OFFSET(id); + auto location = (sector_scratch + i * defaults::SECTOR_LEN) + VECTOR_SECTOR_OFFSET(id); full_retset[i].distance = dist_cmp->compare(aligned_query_T, (T *)location, (uint32_t)this->data_dim); } diff --git a/src/scratch.cpp b/src/scratch.cpp index e6305cd29..112c65d28 100644 --- a/src/scratch.cpp +++ b/src/scratch.cpp @@ -28,14 +28,14 @@ InMemQueryScratch::InMemQueryScratch(uint32_t search_l, uint32_t indexing_l, memset(_aligned_query, 0, aligned_dim * sizeof(T)); if (init_pq_scratch) - _pq_scratch = new PQScratch(MAX_GRAPH_DEGREE, aligned_dim); + _pq_scratch = new PQScratch(defaults::MAX_GRAPH_DEGREE, aligned_dim); else _pq_scratch = nullptr; _occlude_factor.reserve(maxc); _inserted_into_pool_bs = new boost::dynamic_bitset<>(); - _id_scratch.reserve((size_t)std::ceil(1.5 * GRAPH_SLACK_FACTOR * _R)); - _dist_scratch.reserve((size_t)std::ceil(1.5 * GRAPH_SLACK_FACTOR * _R)); + _id_scratch.reserve((size_t)std::ceil(1.5 * defaults::GRAPH_SLACK_FACTOR * _R)); + _dist_scratch.reserve((size_t)std::ceil(1.5 * defaults::GRAPH_SLACK_FACTOR * _R)); resize_for_new_L(std::max(search_l, indexing_l)); } @@ -96,10 +96,11 @@ template SSDQueryScratch::SSDQueryScratch(size_t aligned_dim, si size_t coord_alloc_size = ROUND_UP(sizeof(T) * aligned_dim, 256); diskann::alloc_aligned((void **)&coord_scratch, coord_alloc_size, 256); - diskann::alloc_aligned((void **)§or_scratch, (size_t)MAX_N_SECTOR_READS * 
(size_t)SECTOR_LEN, SECTOR_LEN); + diskann::alloc_aligned((void **)§or_scratch, defaults::MAX_N_SECTOR_READS * defaults::SECTOR_LEN, + defaults::SECTOR_LEN); diskann::alloc_aligned((void **)&aligned_query_T, aligned_dim * sizeof(T), 8 * sizeof(T)); - _pq_scratch = new PQScratch(MAX_GRAPH_DEGREE, aligned_dim); + _pq_scratch = new PQScratch(defaults::MAX_GRAPH_DEGREE, aligned_dim); memset(coord_scratch, 0, coord_alloc_size); memset(aligned_query_T, 0, aligned_dim * sizeof(T)); From c729e5c6b7b1cfa439d6116088a43f2bb3d740a5 Mon Sep 17 00:00:00 2001 From: Dax Pryce Date: Fri, 11 Aug 2023 11:54:05 -0700 Subject: [PATCH 04/23] Add Performance Tests (#421) * Have a working dockerfile to run perf tests and report the times they take. We can also capture stdout/stderr with it for further information, especially for tools that report internal latencies. * Slight changes to the perf test script, a perf.yml for the github action --- .github/workflows/perf.yml | 26 +++++++++++++++ scripts/dev/install-dev-deps-ubuntu.bash | 2 +- scripts/perf/Dockerfile | 31 ++++++++++++++++++ scripts/perf/README.md | 20 ++++++++++++ scripts/perf/perf_test.sh | 40 ++++++++++++++++++++++++ 5 files changed, 118 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/perf.yml create mode 100644 scripts/perf/Dockerfile create mode 100644 scripts/perf/README.md create mode 100644 scripts/perf/perf_test.sh diff --git a/.github/workflows/perf.yml b/.github/workflows/perf.yml new file mode 100644 index 000000000..1595a4221 --- /dev/null +++ b/.github/workflows/perf.yml @@ -0,0 +1,26 @@ +name: DiskANN Nightly Performance Metrics +on: + schedule: + - cron: "41 14 * * *" # 14:41 UTC, 7:41 PDT, 8:41 PST, 08:11 IST +jobs: + perf-test: + name: Run Perf Test from main + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 1 + - name: Build Perf Container + run: | + docker build --build-arg GIT_COMMIT_ISH="$GITHUB_SHA" -t perf -f 
scripts/perf/Dockerfile scripts + - name: Performance Tests + run: | + mkdir metrics + docker run -v ./metrics:/app/logs perf &> ./metrics/combined_stdouterr.log + - name: Upload Metrics Logs + uses: actions/upload-artifact@v3 + with: + name: metrics + path: | + ./metrics/** diff --git a/scripts/dev/install-dev-deps-ubuntu.bash b/scripts/dev/install-dev-deps-ubuntu.bash index 84f558ed6..09ad6ebb9 100755 --- a/scripts/dev/install-dev-deps-ubuntu.bash +++ b/scripts/dev/install-dev-deps-ubuntu.bash @@ -1,6 +1,6 @@ #!/bin/bash -apt install cmake \ +DEBIAN_FRONTEND=noninteractive apt install -y cmake \ g++ \ libaio-dev \ libgoogle-perftools-dev \ diff --git a/scripts/perf/Dockerfile b/scripts/perf/Dockerfile new file mode 100644 index 000000000..f900627ab --- /dev/null +++ b/scripts/perf/Dockerfile @@ -0,0 +1,31 @@ +#Copyright(c) Microsoft Corporation.All rights reserved. +#Licensed under the MIT license. + +FROM ubuntu:jammy + +# Can be provided at build to point to a specific commit-ish, by default builds from HEAD +ARG GIT_COMMIT_ISH=HEAD + +RUN apt update +RUN apt install -y software-properties-common +RUN add-apt-repository -y ppa:git-core/ppa +RUN apt update +RUN DEBIAN_FRONTEND=noninteractive apt install -y git time + +COPY dev/install-dev-deps-ubuntu.bash /app/fallback/install-dev-deps-ubuntu.bash +WORKDIR /app +RUN git clone https://github.com/microsoft/DiskANN.git +WORKDIR /app/DiskANN +RUN git checkout $GIT_COMMIT_ISH + +# we would prefer to use the deps requested at the same commit. if the script doesn't exist we'll use the current one. +RUN bash scripts/dev/install-dev-deps-ubuntu.bash || bash /app/fallback/install-dev-deps-ubuntu.bash + +RUN mkdir build +RUN cmake -S . 
-B build -DCMAKE_BUILD_TYPE=Release -DUNIT_TEST=True +RUN cmake --build build -- -j + +RUN mkdir /app/logs +COPY perf/perf_test.sh /app/DiskANN/perf_test.sh + +ENTRYPOINT bash perf_test.sh diff --git a/scripts/perf/README.md b/scripts/perf/README.md new file mode 100644 index 000000000..692eedca7 --- /dev/null +++ b/scripts/perf/README.md @@ -0,0 +1,20 @@ +#Performance Tests + +The bash scripts in this folder are responsible for running a suite of performance +tests. + +The timing and recall metrics reported by these tests when run periodically can then +be used to identify performance improvements or regressions as +development continues. + +## Usage + +`docker build` must be run with the context directory set to `scripts`, but the Dockerfile set to `scripts/perf/Dockerfile` as in: +```bash +docker build [--build-arg GIT_COMMIT_ISH=] -f scripts/perf/Dockerfile scripts +``` + +We prefer to install the dependencies from the commit-ish that we're building against, but as the deps were not stored +in a known file in all commits, we will fall back to the one currently in HEAD if one is not found already. + +The `--build-arg GIT_COMMIT_ISH=` is optional, with a default value of HEAD if not otherwise specified. diff --git a/scripts/perf/perf_test.sh b/scripts/perf/perf_test.sh new file mode 100644 index 000000000..a8d537f01 --- /dev/null +++ b/scripts/perf/perf_test.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +function json_time { + command="$@" + echo "Executing $command" + /usr/bin/time --quiet -o /app/logs/time.log -a --format '{"command":"%C", "wallclock": %e, "user": %U, "sys": %S}' $command + ret=$? 
+ if [ $ret -ne 0 ]; then + echo "{\"command\": \""$command"\", \"status_code\": $ret}" >> /app/logs/time.log + fi +} + +mkdir data +rm /app/logs/time.log +touch /app/logs/time.log +chmod 666 /app/logs/time.log + +if [ -d "build/apps" ]; then + export BASE_PATH="build/apps" +else + export BASE_PATH="build/tests" +fi + +json_time $BASE_PATH/utils/rand_data_gen --data_type float --output_file data/rand_float_768D_1M_norm1.0.bin -D 768 -N 1000000 --norm 1.0 +json_time $BASE_PATH/utils/rand_data_gen --data_type float --output_file data/rand_float_768D_10K_norm1.0.bin -D 768 -N 10000 --norm 1.0 + +json_time $BASE_PATH/utils/compute_groundtruth --data_type float --dist_fn l2 --base_file data/rand_float_768D_1M_norm1.0.bin --query_file data/rand_float_768D_10K_norm1.0.bin --gt_file data/l2_rand_float_768D_1M_norm1.0_768D_10K_norm1.0_gt100 --K 100 +json_time $BASE_PATH/utils/compute_groundtruth --data_type float --dist_fn mips --base_file data/rand_float_768D_1M_norm1.0.bin --query_file data/rand_float_768D_10K_norm1.0.bin --gt_file data/mips_rand_float_768D_1M_norm1.0_768D_10K_norm1.0_gt100 --K 100 +json_time $BASE_PATH/utils/compute_groundtruth --data_type float --dist_fn cosine --base_file data/rand_float_768D_1M_norm1.0.bin --query_file data/rand_float_768D_10K_norm1.0.bin --gt_file data/cosine_rand_float_768D_1M_norm1.0_768D_10K_norm1.0_gt100 --K 100 + +json_time $BASE_PATH/build_memory_index --data_type float --dist_fn l2 --data_path data/rand_float_768D_1M_norm1.0.bin --index_path_prefix data/index_l2_rand_float_768D_1M_norm1.0 +json_time $BASE_PATH/search_memory_index --data_type float --dist_fn l2 --index_path_prefix data/index_l2_rand_float_768D_1M_norm1.0 --query_file data/rand_float_768D_10K_norm1.0.bin --recall_at 10 --result_path temp --gt_file data/l2_rand_float_768D_1M_norm1.0_768D_10K_norm1.0_gt100 -L 100 32 +json_time $BASE_PATH/search_memory_index --data_type float --dist_fn fast_l2 --index_path_prefix data/index_l2_rand_float_768D_1M_norm1.0 
--query_file data/rand_float_768D_10K_norm1.0.bin --recall_at 10 --result_path temp --gt_file data/l2_rand_float_768D_1M_norm1.0_768D_10K_norm1.0_gt100 -L 100 32 + +json_time $BASE_PATH/build_memory_index --data_type float --dist_fn mips --data_path data/rand_float_768D_1M_norm1.0.bin --index_path_prefix data/index_mips_rand_float_768D_1M_norm1.0 +json_time $BASE_PATH/search_memory_index --data_type float --dist_fn mips --index_path_prefix data/index_l2_rand_float_768D_1M_norm1.0 --query_file data/rand_float_768D_10K_norm1.0.bin --recall_at 10 --result_path temp --gt_file data/mips_rand_float_768D_1M_norm1.0_768D_10K_norm1.0_gt100 -L 100 32 + +json_time $BASE_PATH/build_memory_index --data_type float --dist_fn cosine --data_path data/rand_float_768D_1M_norm1.0.bin --index_path_prefix data/index_cosine_rand_float_768D_1M_norm1.0 +json_time $BASE_PATH/search_memory_index --data_type float --dist_fn cosine --index_path_prefix data/index_l2_rand_float_768D_1M_norm1.0 --query_file data/rand_float_768D_10K_norm1.0.bin --recall_at 10 --result_path temp --gt_file data/cosine_rand_float_768D_1M_norm1.0_768D_10K_norm1.0_gt100 -L 100 32 + From 977dd3cd20a2cb94c181e4fc7fa162a9339248d8 Mon Sep 17 00:00:00 2001 From: Harsha Vardhan Simhadri Date: Mon, 14 Aug 2023 16:30:42 -0700 Subject: [PATCH 05/23] allow multi-sector layout for large vectors (#417) * make sector node an inline function * convert offset_node macro to inline method * rename member vars to start with underscore in pq_flash_index.h * added support in create_disk_index * add read sector util * load_cache_list now uses read_blocks util * allow nullptr for read_nodes * BFS cache generation uses util * add num_sectors info to cache_beam_Search * add CI test for 1020,1024,1536D float and 4096D int8 rand vector on disk --- .../generate-high-dim-random/action.yml | 28 + .github/actions/generate-random/action.yml | 2 +- .github/workflows/multi-sector-disk-pq.yml | 60 ++ .github/workflows/pr-test.yml | 3 + 
apps/search_disk_index.cpp | 12 +- include/pq_flash_index.h | 118 ++-- src/disk_utils.cpp | 88 ++- src/pq_flash_index.cpp | 584 ++++++++++-------- 8 files changed, 559 insertions(+), 336 deletions(-) create mode 100644 .github/actions/generate-high-dim-random/action.yml create mode 100644 .github/workflows/multi-sector-disk-pq.yml diff --git a/.github/actions/generate-high-dim-random/action.yml b/.github/actions/generate-high-dim-random/action.yml new file mode 100644 index 000000000..0c7eeb8fd --- /dev/null +++ b/.github/actions/generate-high-dim-random/action.yml @@ -0,0 +1,28 @@ +name: 'Generating Random Data (Basic)' +description: 'Generates the random data files used in acceptance tests' +runs: + using: "composite" + steps: + - name: Generate Random Data (Basic) + run: | + mkdir data + + echo "Generating random 1020,1024,1536D float and 4096 int8 vectors for index" + dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1020D_10K_norm1.0.bin -D 1020 -N 10000 --norm 1.0 + dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1024D_10K_norm1.0.bin -D 1024 -N 10000 --norm 1.0 + dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1536D_10K_norm1.0.bin -D 1536 -N 10000 --norm 1.0 + dist/bin/rand_data_gen --data_type int8 --output_file data/rand_int8_4096D_10K_norm1.0.bin -D 4096 -N 10000 --norm 1.0 + + echo "Generating random 1020,1024,1536D float and 4096D int8 avectors for query" + dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1020D_1K_norm1.0.bin -D 1020 -N 1000 --norm 1.0 + dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1024D_1K_norm1.0.bin -D 1024 -N 1000 --norm 1.0 + dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1536D_1K_norm1.0.bin -D 1536 -N 1000 --norm 1.0 + dist/bin/rand_data_gen --data_type int8 --output_file data/rand_int8_4096D_1K_norm1.0.bin -D 4096 -N 1000 --norm 1.0 + + echo "Computing ground truth for 1020,1024,1536D float 
and 4096D int8 avectors for query" + dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/rand_float_1020D_10K_norm1.0.bin --query_file data/rand_float_1020D_1K_norm1.0.bin --gt_file data/l2_rand_float_1020D_10K_norm1.0_1020D_1K_norm1.0_gt100 --K 100 + dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/rand_float_1024D_10K_norm1.0.bin --query_file data/rand_float_1024D_1K_norm1.0.bin --gt_file data/l2_rand_float_1024D_10K_norm1.0_1024D_1K_norm1.0_gt100 --K 100 + dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/rand_float_1536D_10K_norm1.0.bin --query_file data/rand_float_1536D_1K_norm1.0.bin --gt_file data/l2_rand_float_1536D_10K_norm1.0_1536D_1K_norm1.0_gt100 --K 100 + dist/bin/compute_groundtruth --data_type int8 --dist_fn l2 --base_file data/rand_int8_4096D_10K_norm1.0.bin --query_file data/rand_int8_4096D_1K_norm1.0.bin --gt_file data/l2_rand_int8_4096D_10K_norm1.0_4096D_1K_norm1.0_gt100 --K 100 + + shell: bash diff --git a/.github/actions/generate-random/action.yml b/.github/actions/generate-random/action.yml index 75554773e..297209d7b 100644 --- a/.github/actions/generate-random/action.yml +++ b/.github/actions/generate-random/action.yml @@ -16,7 +16,7 @@ runs: dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_1K_norm1.0.bin -D 10 -N 1000 --norm 1.0 dist/bin/rand_data_gen --data_type int8 --output_file data/rand_int8_10D_1K_norm50.0.bin -D 10 -N 1000 --norm 50.0 dist/bin/rand_data_gen --data_type uint8 --output_file data/rand_uint8_10D_1K_norm50.0.bin -D 10 -N 1000 --norm 50.0 - + echo "Computing ground truth for floats across l2, mips, and cosine distance functions" dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/rand_float_10D_10K_norm1.0.bin --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --K 100 dist/bin/compute_groundtruth --data_type float --dist_fn mips 
--base_file data/rand_float_10D_10K_norm1.0.bin --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/mips_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --K 100 diff --git a/.github/workflows/multi-sector-disk-pq.yml b/.github/workflows/multi-sector-disk-pq.yml new file mode 100644 index 000000000..1f010b124 --- /dev/null +++ b/.github/workflows/multi-sector-disk-pq.yml @@ -0,0 +1,60 @@ +name: Disk With PQ +on: [workflow_call] +jobs: + acceptance-tests-disk-pq: + name: Disk, PQ + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-2019, windows-latest] + runs-on: ${{matrix.os}} + defaults: + run: + shell: bash + steps: + - name: Checkout repository + if: ${{ runner.os == 'Linux' }} + uses: actions/checkout@v3 + with: + fetch-depth: 1 + - name: Checkout repository + if: ${{ runner.os == 'Windows' }} + uses: actions/checkout@v3 + with: + fetch-depth: 1 + submodules: true + - name: DiskANN Build CLI Applications + uses: ./.github/actions/build + + - name: Generate Data + uses: ./.github/actions/generate-high-dim-random + + - name: build and search disk index (1020D, one shot graph build, L2, no diskPQ) (float) + if: success() || failure() + run: | + dist/bin/build_disk_index --data_type float --dist_fn l2 --data_path data/rand_float_1020D_10K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_1020D_10K_norm1.0_diskfull_oneshot -R 32 -L 500 -B 0.003 -M 1 + dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_1020D_10K_norm1.0_diskfull_oneshot --result_path /tmp/res --query_file data/rand_float_1020D_1K_norm1.0.bin --gt_file data/l2_rand_float_1020D_10K_norm1.0_1020D_1K_norm1.0_gt100 --recall_at 5 -L 250 -W 2 --num_nodes_to_cache 100 -T 16 + - name: build and search disk index (1024D, one shot graph build, L2, no diskPQ) (float) + if: success() || failure() + run: | + dist/bin/build_disk_index --data_type float --dist_fn l2 --data_path 
data/rand_float_1024D_10K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_1024D_10K_norm1.0_diskfull_oneshot -R 32 -L 500 -B 0.003 -M 1 + dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_1024D_10K_norm1.0_diskfull_oneshot --result_path /tmp/res --query_file data/rand_float_1024D_1K_norm1.0.bin --gt_file data/l2_rand_float_1024D_10K_norm1.0_1024D_1K_norm1.0_gt100 --recall_at 5 -L 250 -W 2 --num_nodes_to_cache 100 -T 16 + - name: build and search disk index (1536D, one shot graph build, L2, no diskPQ) (float) + if: success() || failure() + run: | + dist/bin/build_disk_index --data_type float --dist_fn l2 --data_path data/rand_float_1536D_10K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_1536D_10K_norm1.0_diskfull_oneshot -R 32 -L 500 -B 0.003 -M 1 + dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_1536D_10K_norm1.0_diskfull_oneshot --result_path /tmp/res --query_file data/rand_float_1536D_1K_norm1.0.bin --gt_file data/l2_rand_float_1536D_10K_norm1.0_1536D_1K_norm1.0_gt100 --recall_at 5 -L 250 -W 2 --num_nodes_to_cache 100 -T 16 + + - name: build and search disk index (4096D, one shot graph build, L2, no diskPQ) (int8) + if: success() || failure() + run: | + dist/bin/build_disk_index --data_type int8 --dist_fn l2 --data_path data/rand_int8_4096D_10K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_int8_4096D_10K_norm1.0_diskfull_oneshot -R 32 -L 500 -B 0.003 -M 1 + dist/bin/search_disk_index --data_type int8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_int8_4096D_10K_norm1.0_diskfull_oneshot --result_path /tmp/res --query_file data/rand_int8_4096D_1K_norm1.0.bin --gt_file data/l2_rand_int8_4096D_10K_norm1.0_4096D_1K_norm1.0_gt100 --recall_at 5 -L 250 -W 2 --num_nodes_to_cache 100 -T 16 + + - name: upload data and bin + uses: 
actions/upload-artifact@v3 + with: + name: multi-sector-disk-pq + path: | + ./dist/** + ./data/** diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 38eefb3ff..f332f185e 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -18,6 +18,9 @@ jobs: disk-pq: name: Disk with PQ uses: ./.github/workflows/disk-pq.yml + multi-sector-disk-pq: + name: Multi-sector Disk with PQ + uses: ./.github/workflows/multi-sector-disk-pq.yml labels: name: Labels uses: ./.github/workflows/labels.yml diff --git a/apps/search_disk_index.cpp b/apps/search_disk_index.cpp index b46b37aef..7e2a7ac6d 100644 --- a/apps/search_disk_index.cpp +++ b/apps/search_disk_index.cpp @@ -118,13 +118,13 @@ int search_disk_index(diskann::Metric &metric, const std::string &index_path_pre { return res; } - // cache bfs levels + std::vector node_list; - diskann::cout << "Caching " << num_nodes_to_cache << " BFS nodes around medoid(s)" << std::endl; - //_pFlashIndex->cache_bfs_levels(num_nodes_to_cache, node_list); - if (num_nodes_to_cache > 0) - _pFlashIndex->generate_cache_list_from_sample_queries(warmup_query_file, 15, 6, num_nodes_to_cache, num_threads, - node_list); + diskann::cout << "Caching " << num_nodes_to_cache << " nodes around medoid(s)" << std::endl; + _pFlashIndex->cache_bfs_levels(num_nodes_to_cache, node_list); + // if (num_nodes_to_cache > 0) + // _pFlashIndex->generate_cache_list_from_sample_queries(warmup_query_file, 15, 6, num_nodes_to_cache, + // num_threads, node_list); _pFlashIndex->load_cache_list(node_list); node_list.clear(); node_list.shrink_to_fit(); diff --git a/include/pq_flash_index.h b/include/pq_flash_index.h index 5872a0ebf..bc7fe312d 100644 --- a/include/pq_flash_index.h +++ b/include/pq_flash_index.h @@ -109,81 +109,113 @@ template class PQFlashIndex DISKANN_DLLEXPORT void generate_random_labels(std::vector &labels, const uint32_t num_labels, const uint32_t nthreads); - // index info + // sector # on disk where node_id 
is present with in the graph part + DISKANN_DLLEXPORT uint64_t get_node_sector(uint64_t node_id); + + // ptr to start of the node + DISKANN_DLLEXPORT char *offset_to_node(char *sector_buf, uint64_t node_id); + + // returns region of `node_buf` containing [NNBRS][NBR_ID(uint32_t)] + DISKANN_DLLEXPORT uint32_t *offset_to_node_nhood(char *node_buf); + + // returns region of `node_buf` containing [COORD(T)] + DISKANN_DLLEXPORT T *offset_to_node_coords(char *node_buf); + + // + // node_ids: input list of node_ids to be read + // coord_buffers: pointers to pre-allocated buffers that coords need to copied to. If null, dont copy. + // nbr_buffers: pre-allocated buffers to copy neighbors into + // + // returns a vector of bool one for each node_id: true if read is success, else false + // + DISKANN_DLLEXPORT std::vector read_nodes(const std::vector &node_ids, + std::vector &coord_buffers, + std::vector> &nbr_buffers); + + // index info for multi-node sectors // nhood of node `i` is in sector: [i / nnodes_per_sector] // offset in sector: [(i % nnodes_per_sector) * max_node_len] - // nnbrs of node `i`: *(unsigned*) (buf) - // nbrs of node `i`: ((unsigned*)buf) + 1 - - uint64_t max_node_len = 0, nnodes_per_sector = 0, max_degree = 0; + // + // index info for multi-sector nodes + // nhood of node `i` is in sector: [i * DIV_ROUND_UP(_max_node_len, SECTOR_LEN)] + // offset in sector: [0] + // + // Common info + // coords start at ofsset + // #nbrs of node `i`: *(unsigned*) (offset + disk_bytes_per_point) + // nbrs of node `i` : (unsigned*) (offset + disk_bytes_per_point + 1) + + uint64_t _max_node_len = 0; + uint64_t _nnodes_per_sector = 0; // 0 for multi-sector nodes, >0 for multi-node sectors + uint64_t _max_degree = 0; // Data used for searching with re-order vectors - uint64_t ndims_reorder_vecs = 0, reorder_data_start_sector = 0, nvecs_per_sector = 0; + uint64_t _ndims_reorder_vecs = 0; + uint64_t _reorder_data_start_sector = 0; + uint64_t _nvecs_per_sector = 0; 
diskann::Metric metric = diskann::Metric::L2; // used only for inner product search to re-scale the result value // (due to the pre-processing of base during index build) - float max_base_norm = 0.0f; + float _max_base_norm = 0.0f; // data info - uint64_t num_points = 0; - uint64_t num_frozen_points = 0; - uint64_t frozen_location = 0; - uint64_t data_dim = 0; - uint64_t disk_data_dim = 0; // will be different from data_dim only if we use - // PQ for disk data (very large dimensionality) - uint64_t aligned_dim = 0; - uint64_t disk_bytes_per_point = 0; - - std::string disk_index_file; - std::vector> node_visit_counter; + uint64_t _num_points = 0; + uint64_t _num_frozen_points = 0; + uint64_t _frozen_location = 0; + uint64_t _data_dim = 0; + uint64_t _aligned_dim = 0; + uint64_t _disk_bytes_per_point = 0; // Number of bytes + + std::string _disk_index_file; + std::vector> _node_visit_counter; // PQ data - // n_chunks = # of chunks ndims is split into - // data: char * n_chunks + // _n_chunks = # of chunks ndims is split into + // data: char * _n_chunks // chunk_size = chunk size of each dimension chunk - // pq_tables = float* [[2^8 * [chunk_size]] * n_chunks] + // pq_tables = float* [[2^8 * [chunk_size]] * _n_chunks] uint8_t *data = nullptr; - uint64_t n_chunks; - FixedChunkPQTable pq_table; + uint64_t _n_chunks; + FixedChunkPQTable _pq_table; // distance comparator - std::shared_ptr> dist_cmp; - std::shared_ptr> dist_cmp_float; + std::shared_ptr> _dist_cmp; + std::shared_ptr> _dist_cmp_float; // for very large datasets: we use PQ even for the disk resident index - bool use_disk_index_pq = false; - uint64_t disk_pq_n_chunks = 0; - FixedChunkPQTable disk_pq_table; + bool _use_disk_index_pq = false; + uint64_t _disk_pq_n_chunks = 0; + FixedChunkPQTable _disk_pq_table; // medoid/start info // graph has one entry point by default, // we can optionally have multiple starting points - uint32_t *medoids = nullptr; + uint32_t *_medoids = nullptr; // defaults to 1 - size_t 
num_medoids; + size_t _num_medoids; // by default, it is empty. If there are multiple // centroids, we pick the medoid corresponding to the // closest centroid as the starting point of search - float *centroid_data = nullptr; + float *_centroid_data = nullptr; - // nhood_cache - unsigned *nhood_cache_buf = nullptr; - tsl::robin_map> nhood_cache; + // nhood_cache; the uint32_t in nhood_Cache are offsets into nhood_cache_buf + unsigned *_nhood_cache_buf = nullptr; + tsl::robin_map> _nhood_cache; - // coord_cache - T *coord_cache_buf = nullptr; - tsl::robin_map coord_cache; + // coord_cache; The T* in coord_cache are offsets into coord_cache_buf + T *_coord_cache_buf = nullptr; + tsl::robin_map _coord_cache; // thread-specific scratch - ConcurrentQueue *> thread_data; - uint64_t max_nthreads; - bool load_flag = false; - bool count_visited_nodes = false; - bool reorder_data_exists = false; - uint64_t reoreder_data_offset = 0; + ConcurrentQueue *> _thread_data; + uint64_t _max_nthreads; + bool _load_flag = false; + bool _count_visited_nodes = false; + bool _reorder_data_exists = false; + uint64_t _reoreder_data_offset = 0; // filter support uint32_t *_pts_to_label_offsets = nullptr; diff --git a/src/disk_utils.cpp b/src/disk_utils.cpp index a0e4c25ed..4ece797d1 100644 --- a/src/disk_utils.cpp +++ b/src/disk_utils.cpp @@ -895,7 +895,7 @@ void create_disk_layout(const std::string base_file, const std::string mem_index if (vamana_frozen_num == 1) vamana_frozen_loc = medoid; max_node_len = (((uint64_t)width_u32 + 1) * sizeof(uint32_t)) + (ndims_64 * sizeof(T)); - nnodes_per_sector = defaults::SECTOR_LEN / max_node_len; + nnodes_per_sector = defaults::SECTOR_LEN / max_node_len; // 0 if max_node_len > SECTOR_LEN diskann::cout << "medoid: " << medoid << "B" << std::endl; diskann::cout << "max_node_len: " << max_node_len << "B" << std::endl; @@ -903,12 +903,14 @@ void create_disk_layout(const std::string base_file, const std::string mem_index // defaults::SECTOR_LEN buffer for 
each sector std::unique_ptr sector_buf = std::make_unique(defaults::SECTOR_LEN); + std::unique_ptr multisector_buf = std::make_unique(ROUND_UP(max_node_len, defaults::SECTOR_LEN)); std::unique_ptr node_buf = std::make_unique(max_node_len); uint32_t &nnbrs = *(uint32_t *)(node_buf.get() + ndims_64 * sizeof(T)); uint32_t *nhood_buf = (uint32_t *)(node_buf.get() + (ndims_64 * sizeof(T)) + sizeof(uint32_t)); // number of sectors (1 for meta data) - uint64_t n_sectors = ROUND_UP(npts_64, nnodes_per_sector) / nnodes_per_sector; + uint64_t n_sectors = nnodes_per_sector > 0 ? ROUND_UP(npts_64, nnodes_per_sector) / nnodes_per_sector + : npts_64 * DIV_ROUND_UP(max_node_len, defaults::SECTOR_LEN); uint64_t n_reorder_sectors = 0; uint64_t n_data_nodes_per_sector = 0; @@ -941,15 +943,68 @@ void create_disk_layout(const std::string base_file, const std::string mem_index std::unique_ptr cur_node_coords = std::make_unique(ndims_64); diskann::cout << "# sectors: " << n_sectors << std::endl; uint64_t cur_node_id = 0; - for (uint64_t sector = 0; sector < n_sectors; sector++) - { - if (sector % 100000 == 0) + + if (nnodes_per_sector > 0) + { // Write multiple nodes per sector + for (uint64_t sector = 0; sector < n_sectors; sector++) { - diskann::cout << "Sector #" << sector << "written" << std::endl; + if (sector % 100000 == 0) + { + diskann::cout << "Sector #" << sector << "written" << std::endl; + } + memset(sector_buf.get(), 0, defaults::SECTOR_LEN); + for (uint64_t sector_node_id = 0; sector_node_id < nnodes_per_sector && cur_node_id < npts_64; + sector_node_id++) + { + memset(node_buf.get(), 0, max_node_len); + // read cur node's nnbrs + vamana_reader.read((char *)&nnbrs, sizeof(uint32_t)); + + // sanity checks on nnbrs + assert(nnbrs > 0); + assert(nnbrs <= width_u32); + + // read node's nhood + vamana_reader.read((char *)nhood_buf, (std::min)(nnbrs, width_u32) * sizeof(uint32_t)); + if (nnbrs > width_u32) + { + vamana_reader.seekg((nnbrs - width_u32) * sizeof(uint32_t), 
vamana_reader.cur); + } + + // write coords of node first + // T *node_coords = data + ((uint64_t) ndims_64 * cur_node_id); + base_reader.read((char *)cur_node_coords.get(), sizeof(T) * ndims_64); + memcpy(node_buf.get(), cur_node_coords.get(), ndims_64 * sizeof(T)); + + // write nnbrs + *(uint32_t *)(node_buf.get() + ndims_64 * sizeof(T)) = (std::min)(nnbrs, width_u32); + + // write nhood next + memcpy(node_buf.get() + ndims_64 * sizeof(T) + sizeof(uint32_t), nhood_buf, + (std::min)(nnbrs, width_u32) * sizeof(uint32_t)); + + // get offset into sector_buf + char *sector_node_buf = sector_buf.get() + (sector_node_id * max_node_len); + + // copy node buf into sector_node_buf + memcpy(sector_node_buf, node_buf.get(), max_node_len); + cur_node_id++; + } + // flush sector to disk + diskann_writer.write(sector_buf.get(), defaults::SECTOR_LEN); } - memset(sector_buf.get(), 0, defaults::SECTOR_LEN); - for (uint64_t sector_node_id = 0; sector_node_id < nnodes_per_sector && cur_node_id < npts_64; sector_node_id++) + } + else + { // Write multi-sector nodes + uint64_t nsectors_per_node = DIV_ROUND_UP(max_node_len, defaults::SECTOR_LEN); + for (uint64_t i = 0; i < npts_64; i++) { + if ((i * nsectors_per_node) % 100000 == 0) + { + diskann::cout << "Sector #" << i * nsectors_per_node << "written" << std::endl; + } + memset(multisector_buf.get(), 0, nsectors_per_node * defaults::SECTOR_LEN); + memset(node_buf.get(), 0, max_node_len); // read cur node's nnbrs vamana_reader.read((char *)&nnbrs, sizeof(uint32_t)); @@ -968,25 +1023,20 @@ void create_disk_layout(const std::string base_file, const std::string mem_index // write coords of node first // T *node_coords = data + ((uint64_t) ndims_64 * cur_node_id); base_reader.read((char *)cur_node_coords.get(), sizeof(T) * ndims_64); - memcpy(node_buf.get(), cur_node_coords.get(), ndims_64 * sizeof(T)); + memcpy(multisector_buf.get(), cur_node_coords.get(), ndims_64 * sizeof(T)); // write nnbrs - *(uint32_t *)(node_buf.get() + ndims_64 * 
sizeof(T)) = (std::min)(nnbrs, width_u32); + *(uint32_t *)(multisector_buf.get() + ndims_64 * sizeof(T)) = (std::min)(nnbrs, width_u32); // write nhood next - memcpy(node_buf.get() + ndims_64 * sizeof(T) + sizeof(uint32_t), nhood_buf, + memcpy(multisector_buf.get() + ndims_64 * sizeof(T) + sizeof(uint32_t), nhood_buf, (std::min)(nnbrs, width_u32) * sizeof(uint32_t)); - // get offset into sector_buf - char *sector_node_buf = sector_buf.get() + (sector_node_id * max_node_len); - - // copy node buf into sector_node_buf - memcpy(sector_node_buf, node_buf.get(), max_node_len); - cur_node_id++; + // flush sector to disk + diskann_writer.write(multisector_buf.get(), nsectors_per_node * defaults::SECTOR_LEN); } - // flush sector to disk - diskann_writer.write(sector_buf.get(), defaults::SECTOR_LEN); } + if (append_reorder_data) { diskann::cout << "Index written. Appending reorder data..." << std::endl; diff --git a/src/pq_flash_index.cpp b/src/pq_flash_index.cpp index a96d6e69f..4e5dab7b8 100644 --- a/src/pq_flash_index.cpp +++ b/src/pq_flash_index.cpp @@ -17,31 +17,18 @@ #define READ_U32(stream, val) stream.read((char *)&val, sizeof(uint32_t)) #define READ_UNSIGNED(stream, val) stream.read((char *)&val, sizeof(unsigned)) -// sector # on disk where node_id is present with in the graph part -#define NODE_SECTOR_NO(node_id) (((uint64_t)(node_id)) / nnodes_per_sector + 1) - -// obtains region of sector containing node -#define OFFSET_TO_NODE(sector_buf, node_id) \ - ((char *)sector_buf + (((uint64_t)node_id) % nnodes_per_sector) * max_node_len) - -// returns region of `node_buf` containing [NNBRS][NBR_ID(uint32_t)] -#define OFFSET_TO_NODE_NHOOD(node_buf) (unsigned *)((char *)node_buf + disk_bytes_per_point) - -// returns region of `node_buf` containing [COORD(T)] -#define OFFSET_TO_NODE_COORDS(node_buf) (T *)(node_buf) - // sector # beyond the end of graph where data for id is present for reordering -#define VECTOR_SECTOR_NO(id) (((uint64_t)(id)) / nvecs_per_sector + 
reorder_data_start_sector) +#define VECTOR_SECTOR_NO(id) (((uint64_t)(id)) / _nvecs_per_sector + _reorder_data_start_sector) // sector # beyond the end of graph where data for id is present for reordering -#define VECTOR_SECTOR_OFFSET(id) ((((uint64_t)(id)) % nvecs_per_sector) * data_dim * sizeof(float)) +#define VECTOR_SECTOR_OFFSET(id) ((((uint64_t)(id)) % _nvecs_per_sector) * _data_dim * sizeof(float)) namespace diskann { template PQFlashIndex::PQFlashIndex(std::shared_ptr &fileReader, diskann::Metric m) - : reader(fileReader), metric(m), thread_data(nullptr) + : reader(fileReader), metric(m), _thread_data(nullptr) { if (m == diskann::Metric::COSINE || m == diskann::Metric::INNER_PRODUCT) { @@ -60,8 +47,8 @@ PQFlashIndex::PQFlashIndex(std::shared_ptr &fileRe } } - this->dist_cmp.reset(diskann::get_distance_function(metric)); - this->dist_cmp_float.reset(diskann::get_distance_function(metric)); + this->_dist_cmp.reset(diskann::get_distance_function(metric)); + this->_dist_cmp_float.reset(diskann::get_distance_function(metric)); } template PQFlashIndex::~PQFlashIndex() @@ -73,19 +60,19 @@ template PQFlashIndex::~PQFlashIndex() } #endif - if (centroid_data != nullptr) - aligned_free(centroid_data); + if (_centroid_data != nullptr) + aligned_free(_centroid_data); // delete backing bufs for nhood and coord cache - if (nhood_cache_buf != nullptr) + if (_nhood_cache_buf != nullptr) { - delete[] nhood_cache_buf; - diskann::aligned_free(coord_cache_buf); + delete[] _nhood_cache_buf; + diskann::aligned_free(_coord_cache_buf); } - if (load_flag) + if (_load_flag) { diskann::cout << "Clearing scratch" << std::endl; - ScratchStoreManager> manager(this->thread_data); + ScratchStoreManager> manager(this->_thread_data); manager.destroy(); this->reader->deregister_all_threads(); reader->close(); @@ -101,6 +88,28 @@ template PQFlashIndex::~PQFlashIndex() } } +template inline uint64_t PQFlashIndex::get_node_sector(uint64_t node_id) +{ + return 1 + (_nnodes_per_sector > 0 ? 
node_id / _nnodes_per_sector + : node_id * DIV_ROUND_UP(_max_node_len, defaults::SECTOR_LEN)); +} + +template +inline char *PQFlashIndex::offset_to_node(char *sector_buf, uint64_t node_id) +{ + return sector_buf + (_nnodes_per_sector == 0 ? 0 : (node_id % _nnodes_per_sector) * _max_node_len); +} + +template inline uint32_t *PQFlashIndex::offset_to_node_nhood(char *node_buf) +{ + return (unsigned *)(node_buf + _disk_bytes_per_point); +} + +template inline T *PQFlashIndex::offset_to_node_coords(char *node_buf) +{ + return (T *)(node_buf); +} + template void PQFlashIndex::setup_thread_data(uint64_t nthreads, uint64_t visited_reserve) { @@ -111,13 +120,77 @@ void PQFlashIndex::setup_thread_data(uint64_t nthreads, uint64_t visi { #pragma omp critical { - SSDThreadData *data = new SSDThreadData(this->aligned_dim, visited_reserve); + SSDThreadData *data = new SSDThreadData(this->_aligned_dim, visited_reserve); this->reader->register_thread(); data->ctx = this->reader->get_ctx(); - this->thread_data.push(data); + this->_thread_data.push(data); } } - load_flag = true; + _load_flag = true; +} + +template +std::vector PQFlashIndex::read_nodes(const std::vector &node_ids, + std::vector &coord_buffers, + std::vector> &nbr_buffers) +{ + std::vector read_reqs; + std::vector retval(node_ids.size(), true); + + char *buf = nullptr; + auto num_sectors = _nnodes_per_sector > 0 ? 
1 : DIV_ROUND_UP(_max_node_len, defaults::SECTOR_LEN); + alloc_aligned((void **)&buf, node_ids.size() * num_sectors * defaults::SECTOR_LEN, defaults::SECTOR_LEN); + + // create read requests + for (size_t i = 0; i < node_ids.size(); ++i) + { + auto node_id = node_ids[i]; + + AlignedRead read; + read.len = num_sectors * defaults::SECTOR_LEN; + read.buf = buf + i * num_sectors * defaults::SECTOR_LEN; + read.offset = get_node_sector(node_id) * defaults::SECTOR_LEN; + read_reqs.push_back(read); + } + + // borrow thread data and issue reads + ScratchStoreManager> manager(this->_thread_data); + auto this_thread_data = manager.scratch_space(); + IOContext &ctx = this_thread_data->ctx; + reader->read(read_reqs, ctx); + + // copy reads into buffers + for (uint32_t i = 0; i < read_reqs.size(); i++) + { +#if defined(_WINDOWS) && defined(USE_BING_INFRA) // this block is to handle failed reads in + // production settings + if ((*ctx.m_pRequestsStatus)[i] != IOContext::READ_SUCCESS) + { + retval[i] = false; + continue; + } +#endif + + char *node_buf = offset_to_node((char *)read_reqs[i].buf, node_ids[i]); + + if (coord_buffers[i] != nullptr) + { + T *node_coords = offset_to_node_coords(node_buf); + memcpy(coord_buffers[i], node_coords, _disk_bytes_per_point); + } + + if (nbr_buffers[i].second != nullptr) + { + uint32_t *node_nhood = offset_to_node_nhood(node_buf); + auto num_nbrs = *node_nhood; + nbr_buffers[i].first = num_nbrs; + memcpy(nbr_buffers[i].second, node_nhood + 1, num_nbrs * sizeof(uint32_t)); + } + } + + aligned_free(buf); + + return retval; } template void PQFlashIndex::load_cache_list(std::vector &node_list) @@ -126,69 +199,48 @@ template void PQFlashIndex::load_cache_ size_t num_cached_nodes = node_list.size(); // borrow thread data - ScratchStoreManager> manager(this->thread_data); + ScratchStoreManager> manager(this->_thread_data); auto this_thread_data = manager.scratch_space(); IOContext &ctx = this_thread_data->ctx; - nhood_cache_buf = new 
uint32_t[num_cached_nodes * (max_degree + 1)]; - memset(nhood_cache_buf, 0, num_cached_nodes * (max_degree + 1)); + // Allocate space for neighborhood cache + _nhood_cache_buf = new uint32_t[num_cached_nodes * (_max_degree + 1)]; + memset(_nhood_cache_buf, 0, num_cached_nodes * (_max_degree + 1)); - size_t coord_cache_buf_len = num_cached_nodes * aligned_dim; - diskann::alloc_aligned((void **)&coord_cache_buf, coord_cache_buf_len * sizeof(T), 8 * sizeof(T)); - memset(coord_cache_buf, 0, coord_cache_buf_len * sizeof(T)); + // Allocate space for coordinate cache + size_t coord_cache_buf_len = num_cached_nodes * _aligned_dim; + diskann::alloc_aligned((void **)&_coord_cache_buf, coord_cache_buf_len * sizeof(T), 8 * sizeof(T)); + memset(_coord_cache_buf, 0, coord_cache_buf_len * sizeof(T)); size_t BLOCK_SIZE = 8; size_t num_blocks = DIV_ROUND_UP(num_cached_nodes, BLOCK_SIZE); - for (size_t block = 0; block < num_blocks; block++) { size_t start_idx = block * BLOCK_SIZE; size_t end_idx = (std::min)(num_cached_nodes, (block + 1) * BLOCK_SIZE); - std::vector read_reqs; - std::vector> nhoods; + + // Copy offset into buffers to read into + std::vector nodes_to_read; + std::vector coord_buffers; + std::vector> nbr_buffers; for (size_t node_idx = start_idx; node_idx < end_idx; node_idx++) { - AlignedRead read; - char *buf = nullptr; - alloc_aligned((void **)&buf, defaults::SECTOR_LEN, defaults::SECTOR_LEN); - nhoods.push_back(std::make_pair(node_list[node_idx], buf)); - read.len = defaults::SECTOR_LEN; - read.buf = buf; - read.offset = NODE_SECTOR_NO(node_list[node_idx]) * defaults::SECTOR_LEN; - read_reqs.push_back(read); + nodes_to_read.push_back(node_list[node_idx]); + coord_buffers.push_back(_coord_cache_buf + node_idx * _aligned_dim); + nbr_buffers.emplace_back(0, _nhood_cache_buf + node_idx * (_max_degree + 1)); } - reader->read(read_reqs, ctx); + // issue the reads + auto read_status = read_nodes(nodes_to_read, coord_buffers, nbr_buffers); - size_t node_idx = start_idx; 
- for (uint32_t i = 0; i < read_reqs.size(); i++) + // check for success and insert into the cache. + for (size_t i = 0; i < read_status.size(); i++) { -#if defined(_WINDOWS) && defined(USE_BING_INFRA) // this block is to handle failed reads in - // production settings - if ((*ctx.m_pRequestsStatus)[i] != IOContext::READ_SUCCESS) + if (read_status[i] == true) { - continue; + _coord_cache.insert(std::make_pair(nodes_to_read[i], coord_buffers[i])); + _nhood_cache.insert(std::make_pair(nodes_to_read[i], nbr_buffers[i])); } -#endif - auto &nhood = nhoods[i]; - char *node_buf = OFFSET_TO_NODE(nhood.second, nhood.first); - T *node_coords = OFFSET_TO_NODE_COORDS(node_buf); - T *cached_coords = coord_cache_buf + node_idx * aligned_dim; - memcpy(cached_coords, node_coords, disk_bytes_per_point); - coord_cache.insert(std::make_pair(nhood.first, cached_coords)); - - // insert node nhood into nhood_cache - uint32_t *node_nhood = OFFSET_TO_NODE_NHOOD(node_buf); - - auto nnbrs = *node_nhood; - uint32_t *nbrs = node_nhood + 1; - std::pair cnhood; - cnhood.first = nnbrs; - cnhood.second = nhood_cache_buf + node_idx * (max_degree + 1); - memcpy(cnhood.second, nbrs, nnbrs * sizeof(uint32_t)); - nhood_cache.insert(std::make_pair(nhood.first, cnhood)); - aligned_free(nhood.second); - node_idx++; } } diskann::cout << "..done." 
<< std::endl; @@ -209,24 +261,24 @@ void PQFlashIndex::generate_cache_list_from_sample_queries(std::strin std::vector &node_list) { #endif - if (num_nodes_to_cache >= this->num_points) + if (num_nodes_to_cache >= this->_num_points) { // for small num_points and big num_nodes_to_cache, use below way to get the node_list quickly - node_list.resize(this->num_points); - for (uint32_t i = 0; i < this->num_points; ++i) + node_list.resize(this->_num_points); + for (uint32_t i = 0; i < this->_num_points; ++i) { node_list[i] = i; } return; } - this->count_visited_nodes = true; - this->node_visit_counter.clear(); - this->node_visit_counter.resize(this->num_points); - for (uint32_t i = 0; i < node_visit_counter.size(); i++) + this->_count_visited_nodes = true; + this->_node_visit_counter.clear(); + this->_node_visit_counter.resize(this->_num_points); + for (uint32_t i = 0; i < _node_visit_counter.size(); i++) { - this->node_visit_counter[i].first = i; - this->node_visit_counter[i].second = 0; + this->_node_visit_counter[i].first = i; + this->_node_visit_counter[i].second = 0; } uint64_t sample_num, sample_dim, sample_aligned_dim; @@ -271,19 +323,19 @@ void PQFlashIndex::generate_cache_list_from_sample_queries(std::strin tmp_result_dists.data() + i, beamwidth, filtered_search, label_for_search, false); } - std::sort(this->node_visit_counter.begin(), node_visit_counter.end(), + std::sort(this->_node_visit_counter.begin(), _node_visit_counter.end(), [](std::pair &left, std::pair &right) { return left.second > right.second; }); node_list.clear(); node_list.shrink_to_fit(); - num_nodes_to_cache = std::min(num_nodes_to_cache, this->node_visit_counter.size()); + num_nodes_to_cache = std::min(num_nodes_to_cache, this->_node_visit_counter.size()); node_list.reserve(num_nodes_to_cache); for (uint64_t i = 0; i < num_nodes_to_cache; i++) { - node_list.push_back(this->node_visit_counter[i].first); + node_list.push_back(this->_node_visit_counter[i].first); } - this->count_visited_nodes = 
false; + this->_count_visited_nodes = false; diskann::aligned_free(samples); } @@ -298,17 +350,17 @@ void PQFlashIndex::cache_bfs_levels(uint64_t num_nodes_to_cache, std: tsl::robin_set node_set; // Do not cache more than 10% of the nodes in the index - uint64_t tenp_nodes = (uint64_t)(std::round(this->num_points * 0.1)); + uint64_t tenp_nodes = (uint64_t)(std::round(this->_num_points * 0.1)); if (num_nodes_to_cache > tenp_nodes) { diskann::cout << "Reducing nodes to cache from: " << num_nodes_to_cache << " to: " << tenp_nodes - << "(10 percent of total nodes:" << this->num_points << ")" << std::endl; + << "(10 percent of total nodes:" << this->_num_points << ")" << std::endl; num_nodes_to_cache = tenp_nodes == 0 ? 1 : tenp_nodes; } diskann::cout << "Caching " << num_nodes_to_cache << "..." << std::endl; // borrow thread data - ScratchStoreManager> manager(this->thread_data); + ScratchStoreManager> manager(this->_thread_data); auto this_thread_data = manager.scratch_space(); IOContext &ctx = this_thread_data->ctx; @@ -316,9 +368,9 @@ void PQFlashIndex::cache_bfs_levels(uint64_t num_nodes_to_cache, std: cur_level = std::make_unique>(); prev_level = std::make_unique>(); - for (uint64_t miter = 0; miter < num_medoids && cur_level->size() < num_nodes_to_cache; miter++) + for (uint64_t miter = 0; miter < _num_medoids && cur_level->size() < num_nodes_to_cache; miter++) { - cur_level->insert(medoids[miter]); + cur_level->insert(_medoids[miter]); } if ((_filter_to_medoid_ids.size() > 0) && (cur_level->size() < num_nodes_to_cache)) @@ -372,53 +424,46 @@ void PQFlashIndex::cache_bfs_levels(uint64_t num_nodes_to_cache, std: diskann::cout << "." 
<< std::flush; size_t start = block * BLOCK_SIZE; size_t end = (std::min)((block + 1) * BLOCK_SIZE, nodes_to_expand.size()); - std::vector read_reqs; - std::vector> nhoods; + + std::vector nodes_to_read; + std::vector coord_buffers(end - start, nullptr); + std::vector> nbr_buffers; + for (size_t cur_pt = start; cur_pt < end; cur_pt++) { - char *buf = nullptr; - alloc_aligned((void **)&buf, defaults::SECTOR_LEN, defaults::SECTOR_LEN); - nhoods.emplace_back(nodes_to_expand[cur_pt], buf); - AlignedRead read; - read.len = defaults::SECTOR_LEN; - read.buf = buf; - read.offset = NODE_SECTOR_NO(nodes_to_expand[cur_pt]) * defaults::SECTOR_LEN; - read_reqs.push_back(read); + nodes_to_read.push_back(nodes_to_expand[cur_pt]); + nbr_buffers.emplace_back(0, new uint32_t[_max_degree + 1]); } // issue read requests - reader->read(read_reqs, ctx); + auto read_status = read_nodes(nodes_to_read, coord_buffers, nbr_buffers); // process each nhood buf - for (uint32_t i = 0; i < read_reqs.size(); i++) + for (uint32_t i = 0; i < read_status.size(); i++) { -#if defined(_WINDOWS) && defined(USE_BING_INFRA) // this block is to handle read failures in - // production settings - if ((*ctx.m_pRequestsStatus)[i] != IOContext::READ_SUCCESS) + if (read_status[i] == false) { continue; } -#endif - auto &nhood = nhoods[i]; - - // insert node coord into coord_cache - char *node_buf = OFFSET_TO_NODE(nhood.second, nhood.first); - uint32_t *node_nhood = OFFSET_TO_NODE_NHOOD(node_buf); - uint64_t nnbrs = (uint64_t)*node_nhood; - uint32_t *nbrs = node_nhood + 1; - // explore next level - for (uint64_t j = 0; j < nnbrs && !finish_flag; j++) + else { - if (node_set.find(nbrs[j]) == node_set.end()) - { - cur_level->insert(nbrs[j]); - } - if (cur_level->size() + node_set.size() >= num_nodes_to_cache) + uint32_t nnbrs = nbr_buffers[i].first; + uint32_t *nbrs = nbr_buffers[i].second; + + // explore next level + for (uint32_t j = 0; j < nnbrs && !finish_flag; j++) { - finish_flag = true; + if 
(node_set.find(nbrs[j]) == node_set.end()) + { + cur_level->insert(nbrs[j]); + } + if (cur_level->size() + node_set.size() >= num_nodes_to_cache) + { + finish_flag = true; + } } } - aligned_free(nhood.second); + delete[] nbr_buffers[i].second; } } @@ -445,48 +490,49 @@ void PQFlashIndex::cache_bfs_levels(uint64_t num_nodes_to_cache, std: template void PQFlashIndex::use_medoids_data_as_centroids() { - if (centroid_data != nullptr) - aligned_free(centroid_data); - alloc_aligned(((void **)¢roid_data), num_medoids * aligned_dim * sizeof(float), 32); - std::memset(centroid_data, 0, num_medoids * aligned_dim * sizeof(float)); + if (_centroid_data != nullptr) + aligned_free(_centroid_data); + alloc_aligned(((void **)&_centroid_data), _num_medoids * _aligned_dim * sizeof(float), 32); + std::memset(_centroid_data, 0, _num_medoids * _aligned_dim * sizeof(float)); // borrow ctx - ScratchStoreManager> manager(this->thread_data); + ScratchStoreManager> manager(this->_thread_data); auto data = manager.scratch_space(); IOContext &ctx = data->ctx; - diskann::cout << "Loading centroid data from medoids vector data of " << num_medoids << " medoid(s)" << std::endl; - for (uint64_t cur_m = 0; cur_m < num_medoids; cur_m++) + diskann::cout << "Loading centroid data from medoids vector data of " << _num_medoids << " medoid(s)" << std::endl; + + std::vector nodes_to_read; + std::vector medoid_bufs; + std::vector> nbr_bufs; + + for (uint64_t cur_m = 0; cur_m < _num_medoids; cur_m++) + { + nodes_to_read.push_back(_medoids[cur_m]); + medoid_bufs.push_back(new T[_data_dim]); + nbr_bufs.emplace_back(0, nullptr); + } + + auto read_status = read_nodes(nodes_to_read, medoid_bufs, nbr_bufs); + + for (uint64_t cur_m = 0; cur_m < _num_medoids; cur_m++) { - auto medoid = medoids[cur_m]; - // read medoid nhood - char *medoid_buf = nullptr; - alloc_aligned((void **)&medoid_buf, defaults::SECTOR_LEN, defaults::SECTOR_LEN); - std::vector medoid_read(1); - medoid_read[0].len = defaults::SECTOR_LEN; - 
medoid_read[0].buf = medoid_buf; - medoid_read[0].offset = NODE_SECTOR_NO(medoid) * defaults::SECTOR_LEN; - reader->read(medoid_read, ctx); - - // all data about medoid - char *medoid_node_buf = OFFSET_TO_NODE(medoid_buf, medoid); - - // add medoid coords to `coord_cache` - T *medoid_coords = new T[data_dim]; - T *medoid_disk_coords = OFFSET_TO_NODE_COORDS(medoid_node_buf); - memcpy(medoid_coords, medoid_disk_coords, disk_bytes_per_point); - - if (!use_disk_index_pq) + if (read_status[cur_m] == true) { - for (uint32_t i = 0; i < data_dim; i++) - centroid_data[cur_m * aligned_dim + i] = medoid_coords[i]; + if (!_use_disk_index_pq) + { + for (uint32_t i = 0; i < _data_dim; i++) + _centroid_data[cur_m * _aligned_dim + i] = medoid_bufs[cur_m][i]; + } + else + { + _disk_pq_table.inflate_vector((uint8_t *)medoid_bufs[cur_m], (_centroid_data + cur_m * _aligned_dim)); + } } else { - disk_pq_table.inflate_vector((uint8_t *)medoid_coords, (centroid_data + cur_m * aligned_dim)); + throw ANNException("Unable to read a medoid", -1, __FUNCSIG__, __FILE__, __LINE__); } - - aligned_free(medoid_buf); - delete[] medoid_coords; + delete[] medoid_bufs[cur_m]; } } @@ -514,12 +560,12 @@ void PQFlashIndex::generate_random_labels(std::vector &labels labels.resize(num_labels); uint64_t num_total_labels = - _pts_to_label_offsets[num_points - 1] + _pts_to_labels[_pts_to_label_offsets[num_points - 1]]; + _pts_to_label_offsets[_num_points - 1] + _pts_to_labels[_pts_to_label_offsets[_num_points - 1]]; std::mt19937 gen(rd()); std::uniform_int_distribution dis(0, num_total_labels); tsl::robin_set skip_locs; - for (uint32_t i = 0; i < num_points; i++) + for (uint32_t i = 0; i < _num_points; i++) { skip_locs.insert(_pts_to_label_offsets[i]); } @@ -702,12 +748,12 @@ template int PQFlashIndex::load(uint32_ #endif std::string pq_table_bin = std::string(index_prefix) + "_pq_pivots.bin"; std::string pq_compressed_vectors = std::string(index_prefix) + "_pq_compressed.bin"; - std::string disk_index_file = 
std::string(index_prefix) + "_disk.index"; + std::string _disk_index_file = std::string(index_prefix) + "_disk.index"; #ifdef EXEC_ENV_OLS return load_from_separate_paths(files, num_threads, disk_index_file.c_str(), pq_table_bin.c_str(), pq_compressed_vectors.c_str()); #else - return load_from_separate_paths(num_threads, disk_index_file.c_str(), pq_table_bin.c_str(), + return load_from_separate_paths(num_threads, _disk_index_file.c_str(), pq_table_bin.c_str(), pq_compressed_vectors.c_str()); #endif } @@ -726,14 +772,14 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons #endif std::string pq_table_bin = pivots_filepath; std::string pq_compressed_vectors = compressed_filepath; - std::string disk_index_file = index_filepath; - std::string medoids_file = std::string(disk_index_file) + "_medoids.bin"; - std::string centroids_file = std::string(disk_index_file) + "_centroids.bin"; - - std::string labels_file = std ::string(disk_index_file) + "_labels.txt"; - std::string labels_to_medoids = std ::string(disk_index_file) + "_labels_to_medoids.txt"; - std::string dummy_map_file = std ::string(disk_index_file) + "_dummy_map.txt"; - std::string labels_map_file = std ::string(disk_index_file) + "_labels_map.txt"; + std::string _disk_index_file = index_filepath; + std::string medoids_file = std::string(_disk_index_file) + "_medoids.bin"; + std::string centroids_file = std::string(_disk_index_file) + "_centroids.bin"; + + std::string labels_file = std ::string(_disk_index_file) + "_labels.txt"; + std::string labels_to_medoids = std ::string(_disk_index_file) + "_labels_to_medoids.txt"; + std::string dummy_map_file = std ::string(_disk_index_file) + "_dummy_map.txt"; + std::string labels_map_file = std ::string(_disk_index_file) + "_labels_map.txt"; size_t num_pts_in_label_file = 0; size_t pq_file_dim, pq_file_num_centroids; @@ -743,7 +789,7 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons get_bin_metadata(pq_table_bin, 
pq_file_num_centroids, pq_file_dim, METADATA_SIZE); #endif - this->disk_index_file = disk_index_file; + this->_disk_index_file = _disk_index_file; if (pq_file_num_centroids != 256) { @@ -751,13 +797,11 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons return -1; } - this->data_dim = pq_file_dim; - // will reset later if we use PQ on disk - this->disk_data_dim = this->data_dim; + this->_data_dim = pq_file_dim; // will change later if we use PQ on disk or if we are using // inner product without PQ - this->disk_bytes_per_point = this->data_dim * sizeof(T); - this->aligned_dim = ROUND_UP(pq_file_dim, 8); + this->_disk_bytes_per_point = this->_data_dim * sizeof(T); + this->_aligned_dim = ROUND_UP(pq_file_dim, 8); size_t npts_u64, nchunks_u64; #ifdef EXEC_ENV_OLS @@ -766,12 +810,12 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons diskann::load_bin(pq_compressed_vectors, this->data, npts_u64, nchunks_u64); #endif - this->num_points = npts_u64; - this->n_chunks = nchunks_u64; + this->_num_points = npts_u64; + this->_n_chunks = nchunks_u64; if (file_exists(labels_file)) { parse_label_file(labels_file, num_pts_in_label_file); - assert(num_pts_in_label_file == this->num_points); + assert(num_pts_in_label_file == this->_num_points); _label_map = load_label_map(labels_map_file); if (file_exists(labels_to_medoids)) { @@ -804,7 +848,7 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons throw FileException(labels_to_medoids, e, __FUNCSIG__, __FILE__, __LINE__); } } - std::string univ_label_file = std ::string(disk_index_file) + "_universal_label.txt"; + std::string univ_label_file = std ::string(_disk_index_file) + "_universal_label.txt"; if (file_exists(univ_label_file)) { std::ifstream universal_label_reader(univ_label_file); @@ -850,15 +894,16 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons } #ifdef EXEC_ENV_OLS - pq_table.load_pq_centroid_bin(files, pq_table_bin.c_str(), nchunks_u64); 
+ _pq_table.load_pq_centroid_bin(files, pq_table_bin.c_str(), nchunks_u64); #else - pq_table.load_pq_centroid_bin(pq_table_bin.c_str(), nchunks_u64); + _pq_table.load_pq_centroid_bin(pq_table_bin.c_str(), nchunks_u64); #endif - diskann::cout << "Loaded PQ centroids and in-memory compressed vectors. #points: " << num_points - << " #dim: " << data_dim << " #aligned_dim: " << aligned_dim << " #chunks: " << n_chunks << std::endl; + diskann::cout << "Loaded PQ centroids and in-memory compressed vectors. #points: " << _num_points + << " #dim: " << _data_dim << " #aligned_dim: " << _aligned_dim << " #chunks: " << _n_chunks + << std::endl; - if (n_chunks > MAX_PQ_CHUNKS) + if (_n_chunks > MAX_PQ_CHUNKS) { std::stringstream stream; stream << "Error loading index. Ensure that max PQ bytes for in-memory " @@ -867,23 +912,23 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); } - std::string disk_pq_pivots_path = this->disk_index_file + "_pq_pivots.bin"; + std::string disk_pq_pivots_path = this->_disk_index_file + "_pq_pivots.bin"; if (file_exists(disk_pq_pivots_path)) { - use_disk_index_pq = true; + _use_disk_index_pq = true; #ifdef EXEC_ENV_OLS - // giving 0 chunks to make the pq_table infer from the + // giving 0 chunks to make the _pq_table infer from the // chunk_offsets file the correct value disk_pq_table.load_pq_centroid_bin(files, disk_pq_pivots_path.c_str(), 0); #else - // giving 0 chunks to make the pq_table infer from the + // giving 0 chunks to make the _pq_table infer from the // chunk_offsets file the correct value - disk_pq_table.load_pq_centroid_bin(disk_pq_pivots_path.c_str(), 0); + _disk_pq_table.load_pq_centroid_bin(disk_pq_pivots_path.c_str(), 0); #endif - disk_pq_n_chunks = disk_pq_table.get_num_chunks(); - disk_bytes_per_point = - disk_pq_n_chunks * sizeof(uint8_t); // revising disk_bytes_per_point since DISK PQ is used. 
- diskann::cout << "Disk index uses PQ data compressed down to " << disk_pq_n_chunks << " bytes per point." + _disk_pq_n_chunks = _disk_pq_table.get_num_chunks(); + _disk_bytes_per_point = + _disk_pq_n_chunks * sizeof(uint8_t); // revising disk_bytes_per_point since DISK PQ is used. + diskann::cout << "Disk index uses PQ data compressed down to " << _disk_pq_n_chunks << " bytes per point." << std::endl; } @@ -896,13 +941,13 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons // 'standard' aligned file reader approach. reader->open(disk_index_file); this->setup_thread_data(num_threads); - this->max_nthreads = num_threads; + this->_max_nthreads = num_threads; char *bytes = getHeaderBytes(); ContentBuf buf(bytes, HEADER_SIZE); std::basic_istream index_metadata(&buf); #else - std::ifstream index_metadata(disk_index_file, std::ios::binary); + std::ifstream index_metadata(_disk_index_file, std::ios::binary); #endif uint32_t nr, nc; // metadata itself is stored as bin format (nr is number of @@ -915,21 +960,21 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons READ_U64(index_metadata, disk_nnodes); READ_U64(index_metadata, disk_ndims); - if (disk_nnodes != num_points) + if (disk_nnodes != _num_points) { diskann::cout << "Mismatch in #points for compressed data file and disk " "index file: " - << disk_nnodes << " vs " << num_points << std::endl; + << disk_nnodes << " vs " << _num_points << std::endl; return -1; } size_t medoid_id_on_file; READ_U64(index_metadata, medoid_id_on_file); - READ_U64(index_metadata, max_node_len); - READ_U64(index_metadata, nnodes_per_sector); - max_degree = ((max_node_len - disk_bytes_per_point) / sizeof(uint32_t)) - 1; + READ_U64(index_metadata, _max_node_len); + READ_U64(index_metadata, _nnodes_per_sector); + _max_degree = ((_max_node_len - _disk_bytes_per_point) / sizeof(uint32_t)) - 1; - if (max_degree > defaults::MAX_GRAPH_DEGREE) + if (_max_degree > defaults::MAX_GRAPH_DEGREE) { std::stringstream 
stream; stream << "Error loading index. Ensure that max graph degree (R) does " @@ -939,35 +984,35 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons } // setting up concept of frozen points in disk index for streaming-DiskANN - READ_U64(index_metadata, this->num_frozen_points); + READ_U64(index_metadata, this->_num_frozen_points); uint64_t file_frozen_id; READ_U64(index_metadata, file_frozen_id); - if (this->num_frozen_points == 1) - this->frozen_location = file_frozen_id; - if (this->num_frozen_points == 1) + if (this->_num_frozen_points == 1) + this->_frozen_location = file_frozen_id; + if (this->_num_frozen_points == 1) { - diskann::cout << " Detected frozen point in index at location " << this->frozen_location + diskann::cout << " Detected frozen point in index at location " << this->_frozen_location << ". Will not output it at search time." << std::endl; } - READ_U64(index_metadata, this->reorder_data_exists); - if (this->reorder_data_exists) + READ_U64(index_metadata, this->_reorder_data_exists); + if (this->_reorder_data_exists) { - if (this->use_disk_index_pq == false) + if (this->_use_disk_index_pq == false) { throw ANNException("Reordering is designed for used with disk PQ " "compression option", -1, __FUNCSIG__, __FILE__, __LINE__); } - READ_U64(index_metadata, this->reorder_data_start_sector); - READ_U64(index_metadata, this->ndims_reorder_vecs); - READ_U64(index_metadata, this->nvecs_per_sector); + READ_U64(index_metadata, this->_reorder_data_start_sector); + READ_U64(index_metadata, this->_ndims_reorder_vecs); + READ_U64(index_metadata, this->_nvecs_per_sector); } diskann::cout << "Disk-Index File Meta-data: "; - diskann::cout << "# nodes per sector: " << nnodes_per_sector; - diskann::cout << ", max node len (bytes): " << max_node_len; - diskann::cout << ", max node degree: " << max_degree << std::endl; + diskann::cout << "# nodes per sector: " << _nnodes_per_sector; + diskann::cout << ", max node len (bytes): " << 
_max_node_len; + diskann::cout << ", max node degree: " << _max_degree << std::endl; #ifdef EXEC_ENV_OLS delete[] bytes; @@ -977,10 +1022,10 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons #ifndef EXEC_ENV_OLS // open AlignedFileReader handle to index_file - std::string index_fname(disk_index_file); + std::string index_fname(_disk_index_file); reader->open(index_fname); this->setup_thread_data(num_threads); - this->max_nthreads = num_threads; + this->_max_nthreads = num_threads; #endif @@ -988,12 +1033,12 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons if (files.fileExists(medoids_file)) { size_t tmp_dim; - diskann::load_bin(files, medoids_file, medoids, num_medoids, tmp_dim); + diskann::load_bin(files, medoids_file, medoids, _num_medoids, tmp_dim); #else if (file_exists(medoids_file)) { size_t tmp_dim; - diskann::load_bin(medoids_file, medoids, num_medoids, tmp_dim); + diskann::load_bin(medoids_file, _medoids, _num_medoids, tmp_dim); #endif if (tmp_dim != 1) @@ -1020,12 +1065,12 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons { size_t num_centroids, aligned_tmp_dim; #ifdef EXEC_ENV_OLS - diskann::load_aligned_bin(files, centroids_file, centroid_data, num_centroids, tmp_dim, + diskann::load_aligned_bin(files, centroids_file, _centroid_data, num_centroids, tmp_dim, aligned_tmp_dim); #else - diskann::load_aligned_bin(centroids_file, centroid_data, num_centroids, tmp_dim, aligned_tmp_dim); + diskann::load_aligned_bin(centroids_file, _centroid_data, num_centroids, tmp_dim, aligned_tmp_dim); #endif - if (aligned_tmp_dim != aligned_dim || num_centroids != num_medoids) + if (aligned_tmp_dim != _aligned_dim || num_centroids != _num_medoids) { std::stringstream stream; stream << "Error loading centroids data file. 
Expected bin format " @@ -1040,21 +1085,21 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons } else { - num_medoids = 1; - medoids = new uint32_t[1]; - medoids[0] = (uint32_t)(medoid_id_on_file); + _num_medoids = 1; + _medoids = new uint32_t[1]; + _medoids[0] = (uint32_t)(medoid_id_on_file); use_medoids_data_as_centroids(); } - std::string norm_file = std::string(disk_index_file) + "_max_base_norm.bin"; + std::string norm_file = std::string(_disk_index_file) + "_max_base_norm.bin"; if (file_exists(norm_file) && metric == diskann::Metric::INNER_PRODUCT) { uint64_t dumr, dumc; float *norm_val; diskann::load_bin(norm_file, norm_val, dumr, dumc); - this->max_base_norm = norm_val[0]; - diskann::cout << "Setting re-scaling factor of base vectors to " << this->max_base_norm << std::endl; + this->_max_base_norm = norm_val[0]; + diskann::cout << "Setting re-scaling factor of base vectors to " << this->_max_base_norm << std::endl; delete[] norm_val; } diskann::cout << "done.." 
<< std::endl; @@ -1150,11 +1195,12 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t } } - if (beam_width > defaults::MAX_N_SECTOR_READS) + uint64_t num_sector_per_nodes = DIV_ROUND_UP(_max_node_len, defaults::SECTOR_LEN); + if (beam_width > num_sector_per_nodes * defaults::MAX_N_SECTOR_READS) throw ANNException("Beamwidth can not be higher than defaults::MAX_N_SECTOR_READS", -1, __FUNCSIG__, __FILE__, __LINE__); - ScratchStoreManager> manager(this->thread_data); + ScratchStoreManager> manager(this->_thread_data); auto data = manager.scratch_space(); IOContext &ctx = data->ctx; auto query_scratch = &(data->scratch); @@ -1174,28 +1220,28 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t // to 0 (this is the extra coordindate used to convert MIPS to L2 search) if (metric == diskann::Metric::INNER_PRODUCT) { - for (size_t i = 0; i < this->data_dim - 1; i++) + for (size_t i = 0; i < this->_data_dim - 1; i++) { aligned_query_T[i] = query1[i]; query_norm += query1[i] * query1[i]; } - aligned_query_T[this->data_dim - 1] = 0; + aligned_query_T[this->_data_dim - 1] = 0; query_norm = std::sqrt(query_norm); - for (size_t i = 0; i < this->data_dim - 1; i++) + for (size_t i = 0; i < this->_data_dim - 1; i++) { aligned_query_T[i] = (T)(aligned_query_T[i] / query_norm); } - pq_query_scratch->set(this->data_dim, aligned_query_T); + pq_query_scratch->set(this->_data_dim, aligned_query_T); } else { - for (size_t i = 0; i < this->data_dim; i++) + for (size_t i = 0; i < this->_data_dim; i++) { aligned_query_T[i] = query1[i]; } - pq_query_scratch->set(this->data_dim, aligned_query_T); + pq_query_scratch->set(this->_data_dim, aligned_query_T); } // pointers to buffers for data @@ -1205,12 +1251,14 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t // sector scratch char *sector_scratch = query_scratch->sector_scratch; uint64_t §or_scratch_idx = query_scratch->sector_idx; + const uint64_t num_sectors_per_node = + 
_nnodes_per_sector > 0 ? 1 : DIV_ROUND_UP(_max_node_len, defaults::SECTOR_LEN); // query <-> PQ chunk centers distances - pq_table.preprocess_query(query_rotated); // center the query and rotate if - // we have a rotation matrix + _pq_table.preprocess_query(query_rotated); // center the query and rotate if + // we have a rotation matrix float *pq_dists = pq_query_scratch->aligned_pqtable_dist_scratch; - pq_table.populate_chunk_distances(query_rotated, pq_dists); + _pq_table.populate_chunk_distances(query_rotated, pq_dists); // query <-> neighbor list float *dist_scratch = pq_query_scratch->aligned_dist_scratch; @@ -1219,8 +1267,8 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t // lambda to batch compute query<-> node distances in PQ space auto compute_dists = [this, pq_coord_scratch, pq_dists](const uint32_t *ids, const uint64_t n_ids, float *dists_out) { - diskann::aggregate_coords(ids, n_ids, this->data, this->n_chunks, pq_coord_scratch); - diskann::pq_dist_lookup(pq_coord_scratch, n_ids, this->n_chunks, pq_dists, dists_out); + diskann::aggregate_coords(ids, n_ids, this->data, this->_n_chunks, pq_coord_scratch); + diskann::pq_dist_lookup(pq_coord_scratch, n_ids, this->_n_chunks, pq_dists, dists_out); }; Timer query_timer, io_timer, cpu_timer; @@ -1233,13 +1281,13 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t float best_dist = (std::numeric_limits::max)(); if (!use_filter) { - for (uint64_t cur_m = 0; cur_m < num_medoids; cur_m++) + for (uint64_t cur_m = 0; cur_m < _num_medoids; cur_m++) { float cur_expanded_dist = - dist_cmp_float->compare(query_float, centroid_data + aligned_dim * cur_m, (uint32_t)aligned_dim); + _dist_cmp_float->compare(query_float, _centroid_data + _aligned_dim * cur_m, (uint32_t)_aligned_dim); if (cur_expanded_dist < best_dist) { - best_medoid = medoids[cur_m]; + best_medoid = _medoids[cur_m]; best_dist = cur_expanded_dist; } } @@ -1300,8 +1348,8 @@ void 
PQFlashIndex::cached_beam_search(const T *query1, const uint64_t { auto nbr = retset.closest_unexpanded(); num_seen++; - auto iter = nhood_cache.find(nbr.id); - if (iter != nhood_cache.end()) + auto iter = _nhood_cache.find(nbr.id); + if (iter != _nhood_cache.end()) { cached_nhoods.push_back(std::make_pair(nbr.id, iter->second)); if (stats != nullptr) @@ -1313,9 +1361,9 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t { frontier.push_back(nbr.id); } - if (this->count_visited_nodes) + if (this->_count_visited_nodes) { - reinterpret_cast &>(this->node_visit_counter[nbr.id].second).fetch_add(1); + reinterpret_cast &>(this->_node_visit_counter[nbr.id].second).fetch_add(1); } } @@ -1329,11 +1377,11 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t auto id = frontier[i]; std::pair fnhood; fnhood.first = id; - fnhood.second = sector_scratch + sector_scratch_idx * defaults::SECTOR_LEN; + fnhood.second = sector_scratch + num_sectors_per_node * sector_scratch_idx * defaults::SECTOR_LEN; sector_scratch_idx++; frontier_nhoods.push_back(fnhood); - frontier_read_reqs.emplace_back(NODE_SECTOR_NO(((size_t)id)) * defaults::SECTOR_LEN, - defaults::SECTOR_LEN, fnhood.second); + frontier_read_reqs.emplace_back(get_node_sector((size_t)id) * defaults::SECTOR_LEN, + num_sectors_per_node * defaults::SECTOR_LEN, fnhood.second); if (stats != nullptr) { stats->n_4k++; @@ -1344,7 +1392,7 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t io_timer.reset(); #ifdef USE_BING_INFRA reader->read(frontier_read_reqs, ctx, - true); // async reader windows. + false); // synhronous reader for Bing. 
#else reader->read(frontier_read_reqs, ctx); // synchronous IO linux #endif @@ -1357,19 +1405,19 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t // process cached nhoods for (auto &cached_nhood : cached_nhoods) { - auto global_cache_iter = coord_cache.find(cached_nhood.first); + auto global_cache_iter = _coord_cache.find(cached_nhood.first); T *node_fp_coords_copy = global_cache_iter->second; float cur_expanded_dist; - if (!use_disk_index_pq) + if (!_use_disk_index_pq) { - cur_expanded_dist = dist_cmp->compare(aligned_query_T, node_fp_coords_copy, (uint32_t)aligned_dim); + cur_expanded_dist = _dist_cmp->compare(aligned_query_T, node_fp_coords_copy, (uint32_t)_aligned_dim); } else { if (metric == diskann::Metric::INNER_PRODUCT) - cur_expanded_dist = disk_pq_table.inner_product(query_float, (uint8_t *)node_fp_coords_copy); + cur_expanded_dist = _disk_pq_table.inner_product(query_float, (uint8_t *)node_fp_coords_copy); else - cur_expanded_dist = disk_pq_table.l2_distance( // disk_pq does not support OPQ yet + cur_expanded_dist = _disk_pq_table.l2_distance( // disk_pq does not support OPQ yet query_float, (uint8_t *)node_fp_coords_copy); } full_retset.push_back(Neighbor((uint32_t)cached_nhood.first, cur_expanded_dist)); @@ -1419,22 +1467,22 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t for (auto &frontier_nhood : frontier_nhoods) { #endif - char *node_disk_buf = OFFSET_TO_NODE(frontier_nhood.second, frontier_nhood.first); - uint32_t *node_buf = OFFSET_TO_NODE_NHOOD(node_disk_buf); + char *node_disk_buf = offset_to_node(frontier_nhood.second, frontier_nhood.first); + uint32_t *node_buf = offset_to_node_nhood(node_disk_buf); uint64_t nnbrs = (uint64_t)(*node_buf); - T *node_fp_coords = OFFSET_TO_NODE_COORDS(node_disk_buf); - memcpy(data_buf, node_fp_coords, disk_bytes_per_point); + T *node_fp_coords = offset_to_node_coords(node_disk_buf); + memcpy(data_buf, node_fp_coords, _disk_bytes_per_point); float 
cur_expanded_dist; - if (!use_disk_index_pq) + if (!_use_disk_index_pq) { - cur_expanded_dist = dist_cmp->compare(aligned_query_T, data_buf, (uint32_t)aligned_dim); + cur_expanded_dist = _dist_cmp->compare(aligned_query_T, data_buf, (uint32_t)_aligned_dim); } else { if (metric == diskann::Metric::INNER_PRODUCT) - cur_expanded_dist = disk_pq_table.inner_product(query_float, (uint8_t *)data_buf); + cur_expanded_dist = _disk_pq_table.inner_product(query_float, (uint8_t *)data_buf); else - cur_expanded_dist = disk_pq_table.l2_distance(query_float, (uint8_t *)data_buf); + cur_expanded_dist = _disk_pq_table.l2_distance(query_float, (uint8_t *)data_buf); } full_retset.push_back(Neighbor(frontier_nhood.first, cur_expanded_dist)); uint32_t *node_nbrs = (node_buf + 1); @@ -1485,7 +1533,7 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t if (use_reorder_data) { - if (!(this->reorder_data_exists)) + if (!(this->_reorder_data_exists)) { throw ANNException("Requested use of reordering data which does " "not exist in index " @@ -1500,6 +1548,7 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t for (size_t i = 0; i < full_retset.size(); ++i) { + // MULTISECTORFIX vec_read_reqs.emplace_back(VECTOR_SECTOR_NO(((size_t)full_retset[i].id)) * defaults::SECTOR_LEN, defaults::SECTOR_LEN, sector_scratch + i * defaults::SECTOR_LEN); @@ -1524,8 +1573,9 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t for (size_t i = 0; i < full_retset.size(); ++i) { auto id = full_retset[i].id; + // MULTISECTORFIX auto location = (sector_scratch + i * defaults::SECTOR_LEN) + VECTOR_SECTOR_OFFSET(id); - full_retset[i].distance = dist_cmp->compare(aligned_query_T, (T *)location, (uint32_t)this->data_dim); + full_retset[i].distance = _dist_cmp->compare(aligned_query_T, (T *)location, (uint32_t)this->_data_dim); } std::sort(full_retset.begin(), full_retset.end()); @@ -1550,8 +1600,8 @@ void PQFlashIndex::cached_beam_search(const T *query1, 
const uint64_t distances[i] = (-distances[i]); // rescale to revert back to original norms (cancelling the // effect of base and query pre-processing) - if (max_base_norm != 0) - distances[i] *= (max_base_norm * query_norm); + if (_max_base_norm != 0) + distances[i] *= (_max_base_norm * query_norm); } } } @@ -1612,7 +1662,7 @@ uint32_t PQFlashIndex::range_search(const T *query1, const double ran template uint64_t PQFlashIndex::get_data_dim() { - return data_dim; + return _data_dim; } template diskann::Metric PQFlashIndex::get_metric() From 6d4e2bfa72c8b325ce6876c4dff04065a62bd39d Mon Sep 17 00:00:00 2001 From: Yash Patel <47032340+yashpatel007@users.noreply.github.com> Date: Tue, 15 Aug 2023 15:58:31 -0400 Subject: [PATCH 06/23] Consolidate Index Constructors (#418) * initial commit * updating python bindings to use new ctor * python binding error fix * error fix * reverting some changes -> experiment * removing redundnt code from native index * python build error fix * tyring to resolve python build error * attempt at python build fix * adding IndexSearchParams * setting search threads to non zero * minor check removed * eperiment 3-> making distance fully owned by data_store * exp 3 clang fix * exp 4 * making distance as unique_ptr * trying to fix build * finally fixing problem * some minor fix * adding dll export to index_factory static function * adding dll export for static fn in index_factory * code cleanup * resolving gopal's comments * resolving build failures --- apps/build_memory_index.cpp | 44 ------- apps/build_stitched_index.cpp | 2 +- apps/test_insert_deletes_consolidate.cpp | 4 +- apps/test_streaming_scenario.cpp | 5 +- apps/utils/count_bfs_levels.cpp | 2 +- include/in_mem_data_store.h | 4 +- include/index.h | 20 ++- include/index_config.h | 58 +++++---- include/index_factory.h | 10 +- include/parameters.h | 11 ++ python/include/static_disk_index.h | 19 +-- python/src/builder.cpp | 9 +- python/src/dynamic_memory_index.cpp | 13 +- 
python/src/static_memory_index.cpp | 18 +-- src/disk_utils.cpp | 11 +- src/filter_utils.cpp | 3 +- src/in_mem_data_store.cpp | 4 +- src/index.cpp | 152 ++++++++++------------- src/index_factory.cpp | 25 ++-- src/restapi/search_wrapper.cpp | 3 +- 20 files changed, 194 insertions(+), 223 deletions(-) diff --git a/apps/build_memory_index.cpp b/apps/build_memory_index.cpp index 92b269f4f..1d6f0e7c6 100644 --- a/apps/build_memory_index.cpp +++ b/apps/build_memory_index.cpp @@ -22,50 +22,6 @@ namespace po = boost::program_options; -template -int build_in_memory_index(const diskann::Metric &metric, const std::string &data_path, const uint32_t R, - const uint32_t L, const float alpha, const std::string &save_path, const uint32_t num_threads, - const bool use_pq_build, const size_t num_pq_bytes, const bool use_opq, - const std::string &label_file, const std::string &universal_label, const uint32_t Lf) -{ - diskann::IndexWriteParameters paras = diskann::IndexWriteParametersBuilder(L, R) - .with_filter_list_size(Lf) - .with_alpha(alpha) - .with_saturate_graph(false) - .with_num_threads(num_threads) - .build(); - std::string labels_file_to_use = save_path + "_label_formatted.txt"; - std::string mem_labels_int_map_file = save_path + "_labels_map.txt"; - - size_t data_num, data_dim; - diskann::get_bin_metadata(data_path, data_num, data_dim); - - diskann::Index index(metric, data_dim, data_num, false, false, false, use_pq_build, num_pq_bytes, - use_opq); - auto s = std::chrono::high_resolution_clock::now(); - if (label_file == "") - { - index.build(data_path.c_str(), data_num, paras); - } - else - { - convert_labels_string_to_int(label_file, labels_file_to_use, mem_labels_int_map_file, universal_label); - if (universal_label != "") - { - LabelT unv_label_as_num = 0; - index.set_universal_label(unv_label_as_num); - } - index.build_filtered_index(data_path.c_str(), labels_file_to_use, data_num, paras); - } - std::chrono::duration diff = std::chrono::high_resolution_clock::now() - 
s; - - std::cout << "Indexing time: " << diff.count() << "\n"; - index.save(save_path.c_str()); - if (label_file != "") - std::remove(labels_file_to_use.c_str()); - return 0; -} - int main(int argc, char **argv) { std::string data_type, dist_fn, data_path, index_path_prefix, label_file, universal_label, label_type; diff --git a/apps/build_stitched_index.cpp b/apps/build_stitched_index.cpp index 80481f8b0..069651781 100644 --- a/apps/build_stitched_index.cpp +++ b/apps/build_stitched_index.cpp @@ -285,7 +285,7 @@ void prune_and_save(path final_index_path_prefix, path full_index_path_prefix, p auto pruning_index_timer = std::chrono::high_resolution_clock::now(); diskann::get_bin_metadata(input_data_path, number_of_label_points, dimension); - diskann::Index index(diskann::Metric::L2, dimension, number_of_label_points, false, false); + diskann::Index index(diskann::Metric::L2, dimension, number_of_label_points, nullptr, nullptr, 0, false, false); // not searching this index, set search_l to 0 index.load(full_index_path_prefix.c_str(), num_threads, 1); diff --git a/apps/test_insert_deletes_consolidate.cpp b/apps/test_insert_deletes_consolidate.cpp index 700f4d7b6..8999688ea 100644 --- a/apps/test_insert_deletes_consolidate.cpp +++ b/apps/test_insert_deletes_consolidate.cpp @@ -152,14 +152,14 @@ void build_incremental_index(const std::string &data_path, diskann::IndexWritePa using TagT = uint32_t; auto data_type = diskann_type_to_name(); auto tag_type = diskann_type_to_name(); + auto index_search_params = diskann::IndexSearchParams(params.search_list_size, params.num_threads); diskann::IndexConfig index_config = diskann::IndexConfigBuilder() .with_metric(diskann::L2) .with_dimension(dim) .with_max_points(max_points_to_insert) .is_dynamic_index(true) .with_index_write_params(params) - .with_search_threads(params.num_threads) - .with_initial_search_list_size(params.search_list_size) + .with_index_search_params(index_search_params) .with_data_type(data_type) 
.with_tag_type(tag_type) .with_data_load_store_strategy(diskann::MEMORY) diff --git a/apps/test_streaming_scenario.cpp b/apps/test_streaming_scenario.cpp index 55e4e61cf..c40ee251e 100644 --- a/apps/test_streaming_scenario.cpp +++ b/apps/test_streaming_scenario.cpp @@ -186,6 +186,7 @@ void build_incremental_index(const std::string &data_path, const uint32_t L, con .with_num_frozen_points(num_start_pts) .build(); + auto index_search_params = diskann::IndexSearchParams(L, insert_threads); diskann::IndexWriteParameters delete_params = diskann::IndexWriteParametersBuilder(L, R) .with_max_occlusion_size(C) .with_alpha(alpha) @@ -200,7 +201,6 @@ void build_incremental_index(const std::string &data_path, const uint32_t L, con diskann::cout << "metadata: file " << data_path << " has " << num_points << " points in " << dim << " dims" << std::endl; aligned_dim = ROUND_UP(dim, 8); - auto index_config = diskann::IndexConfigBuilder() .with_metric(diskann::L2) .with_dimension(dim) @@ -210,12 +210,11 @@ void build_incremental_index(const std::string &data_path, const uint32_t L, con .is_use_opq(false) .with_num_pq_chunks(0) .is_pq_dist_build(false) - .with_search_threads(insert_threads) - .with_initial_search_list_size(L) .with_tag_type(diskann_type_to_name()) .with_label_type(diskann_type_to_name()) .with_data_type(diskann_type_to_name()) .with_index_write_params(params) + .with_index_search_params(index_search_params) .with_data_load_store_strategy(diskann::MEMORY) .build(); diff --git a/apps/utils/count_bfs_levels.cpp b/apps/utils/count_bfs_levels.cpp index ddc4eaf0b..1ec8225db 100644 --- a/apps/utils/count_bfs_levels.cpp +++ b/apps/utils/count_bfs_levels.cpp @@ -27,7 +27,7 @@ template void bfs_count(const std::string &index_path, uint32_t dat { using TagT = uint32_t; using LabelT = uint32_t; - diskann::Index index(diskann::Metric::L2, data_dims, 0, false, false); + diskann::Index index(diskann::Metric::L2, data_dims, 0, nullptr, nullptr, 0, false, false); std::cout << "Index 
class instantiated" << std::endl; index.load(index_path.c_str(), 1, 100); std::cout << "Index loaded" << std::endl; diff --git a/include/in_mem_data_store.h b/include/in_mem_data_store.h index 0509b3b82..9b6968b03 100644 --- a/include/in_mem_data_store.h +++ b/include/in_mem_data_store.h @@ -21,7 +21,7 @@ namespace diskann template class InMemDataStore : public AbstractDataStore { public: - InMemDataStore(const location_t capacity, const size_t dim, std::shared_ptr> distance_fn); + InMemDataStore(const location_t capacity, const size_t dim, std::unique_ptr> distance_fn); virtual ~InMemDataStore(); virtual location_t load(const std::string &filename) override; @@ -73,7 +73,7 @@ template class InMemDataStore : public AbstractDataStore> _distance_fn; + std::unique_ptr> _distance_fn; // in case we need to save vector norms for optimization std::shared_ptr _pre_computed_norms; diff --git a/include/index.h b/include/index.h index 0d9b6edb9..095d1599a 100644 --- a/include/index.h +++ b/include/index.h @@ -49,21 +49,16 @@ template clas **************************************************************************/ public: - // Constructor for Bulk operations and for creating the index object solely - // for loading a prexisting index. - DISKANN_DLLEXPORT Index(Metric m, const size_t dim, const size_t max_points = 1, const bool dynamic_index = false, + // Call this when creating and passing Index Config is inconvenient. 
+ DISKANN_DLLEXPORT Index(Metric m, const size_t dim, const size_t max_points, + const std::shared_ptr index_parameters, + const std::shared_ptr index_search_params, + const size_t num_frozen_pts = 0, const bool dynamic_index = false, const bool enable_tags = false, const bool concurrent_consolidate = false, const bool pq_dist_build = false, const size_t num_pq_chunks = 0, - const bool use_opq = false, const size_t num_frozen_pts = 0, - const bool init_data_store = true); - - // Constructor for incremental index - DISKANN_DLLEXPORT Index(Metric m, const size_t dim, const size_t max_points, const bool dynamic_index, - const IndexWriteParameters &indexParameters, const uint32_t initial_search_list_size, - const uint32_t search_threads, const bool enable_tags = false, - const bool concurrent_consolidate = false, const bool pq_dist_build = false, - const size_t num_pq_chunks = 0, const bool use_opq = false); + const bool use_opq = false); + // This is called by IndexFactory which returns AbstractIndex's simplified API DISKANN_DLLEXPORT Index(const IndexConfig &index_config, std::unique_ptr> data_store /* std::unique_ptr graph_store*/); @@ -329,7 +324,6 @@ template clas private: // Distance functions Metric _dist_metric = diskann::L2; - std::shared_ptr> _distance; // Data std::unique_ptr> _data_store; diff --git a/include/index_config.h b/include/index_config.h index b291c744d..2a8e0e8ba 100644 --- a/include/index_config.h +++ b/include/index_config.h @@ -33,24 +33,23 @@ struct IndexConfig std::string tag_type; std::string data_type; + // Params for building index std::shared_ptr index_write_params; - - uint32_t search_threads; - uint32_t initial_search_list_size; + // Params for searching index + std::shared_ptr index_search_params; private: IndexConfig(DataStoreStrategy data_strategy, GraphStoreStrategy graph_strategy, Metric metric, size_t dimension, size_t max_points, size_t num_pq_chunks, size_t num_frozen_points, bool dynamic_index, bool enable_tags, bool 
pq_dist_build, bool concurrent_consolidate, bool use_opq, const std::string &data_type, const std::string &tag_type, const std::string &label_type, - std::shared_ptr index_write_params, uint32_t search_threads, - uint32_t initial_search_list_size) + std::shared_ptr index_write_params, + std::shared_ptr index_search_params) : data_strategy(data_strategy), graph_strategy(graph_strategy), metric(metric), dimension(dimension), max_points(max_points), dynamic_index(dynamic_index), enable_tags(enable_tags), pq_dist_build(pq_dist_build), concurrent_consolidate(concurrent_consolidate), use_opq(use_opq), num_pq_chunks(num_pq_chunks), num_frozen_pts(num_frozen_points), label_type(label_type), tag_type(tag_type), data_type(data_type), - index_write_params(index_write_params), search_threads(search_threads), - initial_search_list_size(initial_search_list_size) + index_write_params(index_write_params), index_search_params(index_search_params) { } @@ -60,9 +59,7 @@ struct IndexConfig class IndexConfigBuilder { public: - IndexConfigBuilder() - { - } + IndexConfigBuilder() = default; IndexConfigBuilder &with_metric(Metric m) { @@ -160,15 +157,31 @@ class IndexConfigBuilder return *this; } - IndexConfigBuilder &with_search_threads(uint32_t search_threads) + IndexConfigBuilder &with_index_write_params(std::shared_ptr index_write_params_ptr) + { + if (index_write_params_ptr == nullptr) + { + diskann::cout << "Passed, empty build_params while creating index config" << std::endl; + return *this; + } + this->_index_write_params = index_write_params_ptr; + return *this; + } + + IndexConfigBuilder &with_index_search_params(IndexSearchParams &search_params) { - this->_search_threads = search_threads; + this->_index_search_params = std::make_shared(search_params); return *this; } - IndexConfigBuilder &with_initial_search_list_size(uint32_t search_list_size) + IndexConfigBuilder &with_index_search_params(std::shared_ptr search_params_ptr) { - this->_initial_search_list_size = 
search_list_size; + if (search_params_ptr == nullptr) + { + diskann::cout << "Passed, empty search_params while creating index config" << std::endl; + return *this; + } + this->_index_search_params = search_params_ptr; return *this; } @@ -177,19 +190,20 @@ class IndexConfigBuilder if (_data_type == "" || _data_type.empty()) throw ANNException("Error: data_type can not be empty", -1); - if (_dynamic_index && _index_write_params != nullptr) + if (_dynamic_index && _num_frozen_pts == 0) { - if (_search_threads == 0) - throw ANNException("Error: please pass search_threads for building dynamic index.", -1); + _num_frozen_pts = 1; + } - if (_initial_search_list_size == 0) + if (_dynamic_index) + { + if (_index_search_params != nullptr && _index_search_params->initial_search_list_size == 0) throw ANNException("Error: please pass initial_search_list_size for building dynamic index.", -1); } return IndexConfig(_data_strategy, _graph_strategy, _metric, _dimension, _max_points, _num_pq_chunks, _num_frozen_pts, _dynamic_index, _enable_tags, _pq_dist_build, _concurrent_consolidate, - _use_opq, _data_type, _tag_type, _label_type, _index_write_params, _search_threads, - _initial_search_list_size); + _use_opq, _data_type, _tag_type, _label_type, _index_write_params, _index_search_params); } IndexConfigBuilder(const IndexConfigBuilder &) = delete; @@ -217,8 +231,6 @@ class IndexConfigBuilder std::string _data_type; std::shared_ptr _index_write_params; - - uint32_t _search_threads; - uint32_t _initial_search_list_size; + std::shared_ptr _index_search_params; }; } // namespace diskann diff --git a/include/index_factory.h b/include/index_factory.h index 3d1eb7992..7ad0893cc 100644 --- a/include/index_factory.h +++ b/include/index_factory.h @@ -10,13 +10,15 @@ class IndexFactory DISKANN_DLLEXPORT explicit IndexFactory(const IndexConfig &config); DISKANN_DLLEXPORT std::unique_ptr create_instance(); + // Consruct a data store with distance function emplaced within + template + 
DISKANN_DLLEXPORT static std::unique_ptr> construct_datastore(DataStoreStrategy stratagy, + size_t num_points, + size_t dimension, Metric m); + private: void check_config(); - template - std::unique_ptr> construct_datastore(DataStoreStrategy stratagy, size_t num_points, - size_t dimension); - std::unique_ptr construct_graphstore(GraphStoreStrategy stratagy, size_t size); template diff --git a/include/parameters.h b/include/parameters.h index 81a336da7..209b9128c 100644 --- a/include/parameters.h +++ b/include/parameters.h @@ -38,6 +38,17 @@ class IndexWriteParameters friend class IndexWriteParametersBuilder; }; +class IndexSearchParams +{ + public: + IndexSearchParams(const uint32_t initial_search_list_size, const uint32_t num_search_threads) + : initial_search_list_size(initial_search_list_size), num_search_threads(num_search_threads) + { + } + const uint32_t initial_search_list_size; // search L + const uint32_t num_search_threads; // search threads +}; + class IndexWriteParametersBuilder { /** diff --git a/python/include/static_disk_index.h b/python/include/static_disk_index.h index 71a1b5aff..4a399ab3e 100644 --- a/python/include/static_disk_index.h +++ b/python/include/static_disk_index.h @@ -6,7 +6,6 @@ #include #include - #include #include @@ -21,7 +20,8 @@ namespace py = pybind11; -namespace diskannpy { +namespace diskannpy +{ #ifdef _WINDOWS typedef WindowsAlignedFileReader PlatformSpecificAlignedFileReader; @@ -29,8 +29,7 @@ typedef WindowsAlignedFileReader PlatformSpecificAlignedFileReader; typedef LinuxAlignedFileReader PlatformSpecificAlignedFileReader; #endif -template -class StaticDiskIndex +template class StaticDiskIndex { public: StaticDiskIndex(diskann::Metric metric, const std::string &index_path_prefix, uint32_t num_threads, @@ -40,13 +39,15 @@ class StaticDiskIndex void cache_sample_paths(size_t num_nodes_to_cache, const std::string &warmup_query_file, uint32_t num_threads); - NeighborsAndDistances search(py::array_t &query, uint64_t knn, - 
uint64_t complexity, uint64_t beam_width); + NeighborsAndDistances search(py::array_t &query, + uint64_t knn, uint64_t complexity, uint64_t beam_width); + + NeighborsAndDistances batch_search( + py::array_t &queries, uint64_t num_queries, uint64_t knn, + uint64_t complexity, uint64_t beam_width, uint32_t num_threads); - NeighborsAndDistances batch_search(py::array_t &queries, uint64_t num_queries, - uint64_t knn, uint64_t complexity, uint64_t beam_width, uint32_t num_threads); private: std::shared_ptr _reader; diskann::PQFlashIndex
_index; }; -} +} // namespace diskannpy diff --git a/python/src/builder.cpp b/python/src/builder.cpp index 4485d66e6..2e593e72b 100644 --- a/python/src/builder.cpp +++ b/python/src/builder.cpp @@ -44,10 +44,15 @@ void build_memory_index(const diskann::Metric metric, const std::string &vector_ .with_saturate_graph(false) .with_num_threads(num_threads) .build(); + diskann::IndexSearchParams index_search_params = + diskann::IndexSearchParams(index_build_params.search_list_size, num_threads); size_t data_num, data_dim; diskann::get_bin_metadata(vector_bin_path, data_num, data_dim); - diskann::Index index(metric, data_dim, data_num, use_tags, use_tags, false, use_pq_build, - num_pq_bytes, use_opq); + + diskann::Index index(metric, data_dim, data_num, + std::make_shared(index_build_params), + std::make_shared(index_search_params), 0, + use_tags, use_tags, false, use_pq_build, num_pq_bytes, use_opq); if (use_tags) { diff --git a/python/src/dynamic_memory_index.cpp b/python/src/dynamic_memory_index.cpp index af276b85f..f92f4157e 100644 --- a/python/src/dynamic_memory_index.cpp +++ b/python/src/dynamic_memory_index.cpp @@ -36,14 +36,15 @@ diskann::Index dynamic_index_builder(const diskann:: { const uint32_t _initial_search_threads = initial_search_threads != 0 ? initial_search_threads : omp_get_num_threads(); + + auto index_search_params = diskann::IndexSearchParams(initial_search_complexity, _initial_search_threads); return diskann::Index( m, dimensions, max_vectors, - true, // dynamic_index - write_params, // used for insert - initial_search_complexity, // used to prepare the scratch space for searching. can / may - // be expanded if the search asks for a larger L. 
- _initial_search_threads, // also used for the scratch space - true, // enable_tags + std::make_shared(write_params), // index write params + std::make_shared(index_search_params), // index_search_params + write_params.num_frozen_points, // frozen_points + true, // dynamic_index + true, // enable_tags concurrent_consolidation, false, // pq_dist_build 0, // num_pq_chunks diff --git a/python/src/static_memory_index.cpp b/python/src/static_memory_index.cpp index 3bd927174..0dbb24dc3 100644 --- a/python/src/static_memory_index.cpp +++ b/python/src/static_memory_index.cpp @@ -17,15 +17,17 @@ diskann::Index static_index_builder(const diskann::Me { throw std::runtime_error("initial_search_complexity must be a positive uint32_t"); } - + auto index_search_params = diskann::IndexSearchParams(initial_search_complexity, omp_get_num_threads()); return diskann::Index
(m, dimensions, num_points, - false, // not a dynamic_index - false, // no enable_tags/ids - false, // no concurrent_consolidate, - false, // pq_dist_build - 0, // num_pq_chunks - false, // use_opq = false - 0); // num_frozen_points + nullptr, // index write params + std::make_shared(index_search_params), // index search params + 0, // num frozen points + false, // not a dynamic_index + false, // no enable_tags/ids + false, // no concurrent_consolidate, + false, // pq_dist_build + 0, // num_pq_chunks + false); // use_opq = false } template diff --git a/src/disk_utils.cpp b/src/disk_utils.cpp index 4ece797d1..6544df33a 100644 --- a/src/disk_utils.cpp +++ b/src/disk_utils.cpp @@ -635,8 +635,9 @@ int build_merged_vamana_index(std::string base_file, diskann::Metric compareMetr .with_num_threads(num_threads) .build(); using TagT = uint32_t; - diskann::Index _index(compareMetric, base_dim, base_num, false, false, false, - build_pq_bytes > 0, build_pq_bytes, use_opq); + diskann::Index _index( + compareMetric, base_dim, base_num, std::make_shared(paras), nullptr, + paras.num_frozen_points, false, false, false, build_pq_bytes > 0, build_pq_bytes, use_opq); if (!use_filters) _index.build(base_file.c_str(), base_num, paras); else @@ -696,8 +697,10 @@ int build_merged_vamana_index(std::string base_file, diskann::Metric compareMetr uint64_t shard_base_dim, shard_base_pts; get_bin_metadata(shard_base_file, shard_base_pts, shard_base_dim); - diskann::Index _index(compareMetric, shard_base_dim, shard_base_pts, false, false, false, build_pq_bytes > 0, - build_pq_bytes, use_opq); + + diskann::Index _index( + compareMetric, shard_base_dim, shard_base_pts, std::make_shared(paras), + nullptr, paras.num_frozen_points, false, false, false, build_pq_bytes > 0, build_pq_bytes, use_opq); if (!use_filters) { _index.build(shard_base_file.c_str(), shard_base_pts, paras); diff --git a/src/filter_utils.cpp b/src/filter_utils.cpp index 965762d1f..618666488 100644 --- a/src/filter_utils.cpp +++ 
b/src/filter_utils.cpp @@ -45,7 +45,8 @@ void generate_label_indices(path input_data_path, path final_index_path_prefix, size_t number_of_label_points, dimension; diskann::get_bin_metadata(curr_label_input_data_path, number_of_label_points, dimension); - diskann::Index index(diskann::Metric::L2, dimension, number_of_label_points, false, false); + diskann::Index index(diskann::Metric::L2, dimension, number_of_label_points, nullptr, nullptr, 0, false, + false); auto index_build_timer = std::chrono::high_resolution_clock::now(); index.build(curr_label_input_data_path.c_str(), number_of_label_points, label_index_build_parameters); diff --git a/src/in_mem_data_store.cpp b/src/in_mem_data_store.cpp index f5f973917..7d02bba17 100644 --- a/src/in_mem_data_store.cpp +++ b/src/in_mem_data_store.cpp @@ -11,8 +11,8 @@ namespace diskann template InMemDataStore::InMemDataStore(const location_t num_points, const size_t dim, - std::shared_ptr> distance_fn) - : AbstractDataStore(num_points, dim), _distance_fn(distance_fn) + std::unique_ptr> distance_fn) + : AbstractDataStore(num_points, dim), _distance_fn(std::move(distance_fn)) { _aligned_dim = ROUND_UP(dim, _distance_fn->get_required_alignment()); alloc_aligned(((void **)&_data), this->_capacity * _aligned_dim * sizeof(data_t), 8 * sizeof(data_t)); diff --git a/src/index.cpp b/src/index.cpp index eb7592a4e..eeb7169e1 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT license. 
+#include "index_factory.h" #include #include @@ -27,59 +28,38 @@ namespace diskann // Initialize an index with metric m, load the data of type T with filename // (bin), and initialize max_points template -Index::Index(Metric m, const size_t dim, const size_t max_points, const bool dynamic_index, - const IndexWriteParameters &indexParams, const uint32_t initial_search_list_size, - const uint32_t search_threads, const bool enable_tags, const bool concurrent_consolidate, - const bool pq_dist_build, const size_t num_pq_chunks, const bool use_opq) - : Index(m, dim, max_points, dynamic_index, enable_tags, concurrent_consolidate, pq_dist_build, num_pq_chunks, - use_opq, indexParams.num_frozen_points) +Index::Index(const IndexConfig &index_config, std::unique_ptr> data_store) + : _dist_metric(index_config.metric), _dim(index_config.dimension), _max_points(index_config.max_points), + _num_frozen_pts(index_config.num_frozen_pts), _dynamic_index(index_config.dynamic_index), + _enable_tags(index_config.enable_tags), _indexingMaxC(DEFAULT_MAXC), _query_scratch(nullptr), + _pq_dist(index_config.pq_dist_build), _use_opq(index_config.use_opq), _num_pq_chunks(index_config.num_pq_chunks), + _delete_set(new tsl::robin_set), _conc_consolidate(index_config.concurrent_consolidate) { - if (dynamic_index) - { - this->enable_delete(); - } - _indexingQueueSize = indexParams.search_list_size; - _indexingRange = indexParams.max_degree; - _indexingMaxC = indexParams.max_occlusion_size; - _indexingAlpha = indexParams.alpha; - _filterIndexingQueueSize = indexParams.filter_list_size; - - uint32_t num_threads_indx = indexParams.num_threads; - uint32_t num_scratch_spaces = search_threads + num_threads_indx; - initialize_query_scratch(num_scratch_spaces, initial_search_list_size, _indexingQueueSize, _indexingRange, - _indexingMaxC, dim); -} - -template -Index::Index(Metric m, const size_t dim, const size_t max_points, const bool dynamic_index, - const bool enable_tags, const bool 
concurrent_consolidate, const bool pq_dist_build, - const size_t num_pq_chunks, const bool use_opq, const size_t num_frozen_pts, - const bool init_data_store) - : _dist_metric(m), _dim(dim), _max_points(max_points), _num_frozen_pts(num_frozen_pts), - _dynamic_index(dynamic_index), _enable_tags(enable_tags), _indexingMaxC(DEFAULT_MAXC), _query_scratch(nullptr), - _pq_dist(pq_dist_build), _use_opq(use_opq), _num_pq_chunks(num_pq_chunks), - _delete_set(new tsl::robin_set), _conc_consolidate(concurrent_consolidate) -{ - if (dynamic_index && !enable_tags) + if (_dynamic_index && !_enable_tags) { throw ANNException("ERROR: Dynamic Indexing must have tags enabled.", -1, __FUNCSIG__, __FILE__, __LINE__); } if (_pq_dist) { - if (dynamic_index) + if (_dynamic_index) throw ANNException("ERROR: Dynamic Indexing not supported with PQ distance based " "index construction", -1, __FUNCSIG__, __FILE__, __LINE__); - if (m == diskann::Metric::INNER_PRODUCT) + if (_dist_metric == diskann::Metric::INNER_PRODUCT) throw ANNException("ERROR: Inner product metrics not yet supported " "with PQ distance " "base index", -1, __FUNCSIG__, __FILE__, __LINE__); } - if (dynamic_index && _num_frozen_pts == 0) + if (_dist_metric == diskann::Metric::COSINE && std::is_floating_point::value) + { + this->_normalize_vecs = true; + } + + if (_dynamic_index && _num_frozen_pts == 0) { _num_frozen_pts = 1; } @@ -90,7 +70,6 @@ Index::Index(Metric m, const size_t dim, const size_t max_point _max_points = 1; } const size_t total_internal_points = _max_points + _num_frozen_pts; - if (_pq_dist) { if (_num_pq_chunks > _dim) @@ -103,68 +82,63 @@ Index::Index(Metric m, const size_t dim, const size_t max_point _final_graph.resize(total_internal_points); - if (init_data_store) - { - // Issue #374: data_store is injected from index factory. Keeping this for backward compatibility. 
- // distance is owned by data_store - if (m == diskann::Metric::COSINE && std::is_floating_point::value) - { - // This is safe because T is float inside the if block. - this->_distance.reset((Distance *)new AVXNormalizedCosineDistanceFloat()); - this->_normalize_vecs = true; - diskann::cout << "Normalizing vectors and using L2 for cosine " - "AVXNormalizedCosineDistanceFloat()." - << std::endl; - } - else - { - this->_distance.reset((Distance *)get_distance_function(m)); - } - // Note: moved this to factory, keeping this for backward compatibility. - _data_store = - std::make_unique>((location_t)total_internal_points, _dim, this->_distance); - } + _data_store = std::move(data_store); _locks = std::vector(total_internal_points); - - if (enable_tags) + if (_enable_tags) { _location_to_tag.reserve(total_internal_points); _tag_to_location.reserve(total_internal_points); } -} - -template -Index::Index(const IndexConfig &index_config, std::unique_ptr> data_store) - : Index(index_config.metric, index_config.dimension, index_config.max_points, index_config.dynamic_index, - index_config.enable_tags, index_config.concurrent_consolidate, index_config.pq_dist_build, - index_config.num_pq_chunks, index_config.use_opq, index_config.num_frozen_pts, false) -{ - - _data_store = std::move(data_store); - _distance.reset(_data_store->get_dist_fn()); - // enable delete by default for dynamic index if (_dynamic_index) { - this->enable_delete(); - } - if (_dynamic_index && index_config.index_write_params != nullptr) - { - _indexingQueueSize = index_config.index_write_params->search_list_size; - _indexingRange = index_config.index_write_params->max_degree; - _indexingMaxC = index_config.index_write_params->max_occlusion_size; - _indexingAlpha = index_config.index_write_params->alpha; - _filterIndexingQueueSize = index_config.index_write_params->filter_list_size; + this->enable_delete(); // enable delete by default for dynamic index + // if write params are not passed, it is inffered that 
ctor is called by search + if (index_config.index_write_params != nullptr && index_config.index_search_params != nullptr) + { + _indexingQueueSize = index_config.index_write_params->search_list_size; + _indexingRange = index_config.index_write_params->max_degree; + _indexingMaxC = index_config.index_write_params->max_occlusion_size; + _indexingAlpha = index_config.index_write_params->alpha; + _filterIndexingQueueSize = index_config.index_write_params->filter_list_size; - uint32_t num_threads_indx = index_config.index_write_params->num_threads; - uint32_t num_scratch_spaces = index_config.search_threads + num_threads_indx; + uint32_t num_threads_indx = index_config.index_write_params->num_threads; + uint32_t num_scratch_spaces = index_config.index_search_params->num_search_threads + num_threads_indx; - initialize_query_scratch(num_scratch_spaces, index_config.initial_search_list_size, _indexingQueueSize, - _indexingRange, _indexingMaxC, _data_store->get_dims()); + initialize_query_scratch(num_scratch_spaces, index_config.index_search_params->initial_search_list_size, + _indexingQueueSize, _indexingRange, _indexingMaxC, _data_store->get_dims()); + } } } +template +Index::Index(Metric m, const size_t dim, const size_t max_points, + const std::shared_ptr index_parameters, + const std::shared_ptr index_search_params, const size_t num_frozen_pts, + const bool dynamic_index, const bool enable_tags, const bool concurrent_consolidate, + const bool pq_dist_build, const size_t num_pq_chunks, const bool use_opq) + : Index(IndexConfigBuilder() + .with_metric(m) + .with_dimension(dim) + .with_max_points(max_points) + .with_index_write_params(index_parameters) + .with_index_search_params(index_search_params) + .with_num_frozen_pts(num_frozen_pts) + .is_dynamic_index(dynamic_index) + .is_enable_tags(enable_tags) + .is_concurrent_consolidate(concurrent_consolidate) + .is_pq_dist_build(pq_dist_build) + .with_num_pq_chunks(num_pq_chunks) + .is_use_opq(use_opq) + 
.with_data_type(diskann_type_to_name()) + .build(), + std::move(IndexFactory::construct_datastore( + diskann::MEMORY, max_points + (dynamic_index && num_frozen_pts == 0 ? (size_t)1 : num_frozen_pts), dim, + m))) +{ +} + template Index::~Index() { // Ensure that no other activity is happening before dtor() @@ -2164,7 +2138,8 @@ std::pair Index::search(const T *query, con std::shared_lock lock(_update_lock); - _distance->preprocess_query(query, _data_store->get_dims(), scratch->aligned_query()); + _data_store->get_dist_fn()->preprocess_query(query, _data_store->get_dims(), scratch->aligned_query()); + auto retval = iterate_to_fixed_point(scratch->aligned_query(), L, init_ids, scratch, false, unused_filter_label, true); @@ -2266,7 +2241,7 @@ std::pair Index::search_with_filters(const // REFACTOR // T *aligned_query = scratch->aligned_query(); // memcpy(aligned_query, query, _dim * sizeof(T)); - _distance->preprocess_query(query, _data_store->get_dims(), scratch->aligned_query()); + _data_store->get_dist_fn()->preprocess_query(query, _data_store->get_dims(), scratch->aligned_query()); auto retval = iterate_to_fixed_point(scratch->aligned_query(), L, init_ids, scratch, true, filter_vec, true); auto best_L_nodes = scratch->best_l_nodes(); @@ -2345,7 +2320,8 @@ size_t Index::search_with_tags(const T *query, const uint64_t K const std::vector init_ids = get_init_ids(); const std::vector unused_filter_label; - _distance->preprocess_query(query, _data_store->get_dims(), scratch->aligned_query()); + //_distance->preprocess_query(query, _data_store->get_dims(), scratch->aligned_query()); + _data_store->get_dist_fn()->preprocess_query(query, _data_store->get_dims(), scratch->aligned_query()); iterate_to_fixed_point(scratch->aligned_query(), L, init_ids, scratch, false, unused_filter_label, true); NeighborPriorityQueue &best_L_nodes = scratch->best_l_nodes(); diff --git a/src/index_factory.cpp b/src/index_factory.cpp index c5607f4a0..88ac44a16 100644 --- a/src/index_factory.cpp 
+++ b/src/index_factory.cpp @@ -51,22 +51,21 @@ void IndexFactory::check_config() template std::unique_ptr> IndexFactory::construct_datastore(DataStoreStrategy strategy, size_t num_points, - size_t dimension) + size_t dimension, Metric m) { - const size_t total_internal_points = num_points + _config->num_frozen_pts; - std::shared_ptr> distance; + std::unique_ptr> distance; switch (strategy) { case MEMORY: - if (_config->metric == diskann::Metric::COSINE && std::is_same::value) + if (m == diskann::Metric::COSINE && std::is_same::value) { distance.reset((Distance *)new AVXNormalizedCosineDistanceFloat()); - return std::make_unique>((location_t)total_internal_points, dimension, distance); + return std::make_unique>((location_t)num_points, dimension, std::move(distance)); } else { - distance.reset((Distance *)get_distance_function(_config->metric)); - return std::make_unique>((location_t)total_internal_points, dimension, distance); + distance.reset((Distance *)get_distance_function(m)); + return std::make_unique>((location_t)num_points, dimension, std::move(distance)); } break; default: @@ -83,10 +82,11 @@ std::unique_ptr IndexFactory::construct_graphstore(GraphStor template std::unique_ptr IndexFactory::create_instance() { - size_t num_points = _config->max_points; + size_t num_points = _config->max_points + _config->num_frozen_pts; size_t dim = _config->dimension; // auto graph_store = construct_graphstore(_config->graph_strategy, num_points); - auto data_store = construct_datastore(_config->data_strategy, num_points, dim); + auto data_store = + IndexFactory::construct_datastore(_config->data_strategy, num_points, dim, _config->metric); return std::make_unique>(*_config, std::move(data_store)); } @@ -147,4 +147,11 @@ std::unique_ptr IndexFactory::create_instance(const std::string & throw ANNException("Error: unsupported label_type please choose from [uint/ushort]", -1); } +template DISKANN_DLLEXPORT std::unique_ptr> IndexFactory::construct_datastore( + 
DataStoreStrategy stratagy, size_t num_points, size_t dimension, Metric m); +template DISKANN_DLLEXPORT std::unique_ptr> IndexFactory::construct_datastore( + DataStoreStrategy stratagy, size_t num_points, size_t dimension, Metric m); +template DISKANN_DLLEXPORT std::unique_ptr> IndexFactory::construct_datastore( + DataStoreStrategy stratagy, size_t num_points, size_t dimension, Metric m); + } // namespace diskann diff --git a/src/restapi/search_wrapper.cpp b/src/restapi/search_wrapper.cpp index dc9f5734e..2cbefef3f 100644 --- a/src/restapi/search_wrapper.cpp +++ b/src/restapi/search_wrapper.cpp @@ -100,7 +100,8 @@ InMemorySearch::InMemorySearch(const std::string &baseFile, const std::string { size_t dimensions, total_points = 0; diskann::get_bin_metadata(baseFile, total_points, dimensions); - _index = std::unique_ptr>(new diskann::Index(m, dimensions, total_points, false)); + _index = std::unique_ptr>( + new diskann::Index(m, dimensions, total_points, nullptr, search_l, 0, false)); _index->load(indexFile.c_str(), num_threads, search_l); } From 39b33304964d543dd95e20c77a3e25576647bcfb Mon Sep 17 00:00:00 2001 From: Philip Adams <35666630+PhilipBAdams@users.noreply.github.com> Date: Tue, 15 Aug 2023 15:36:57 -0700 Subject: [PATCH 07/23] Add convenience functions for parsing the PQ index (#349) * move read_nodes to public, add get_pq_vector and get_num_points * clang-format * Match new private var naming convention * more private (_) fixes * VID->vid * VID->vid cpp --- include/pq_flash_index.h | 25 ++++++++++++++----------- src/pq_flash_index.cpp | 12 ++++++++++++ 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/include/pq_flash_index.h b/include/pq_flash_index.h index bc7fe312d..7214bd6a1 100644 --- a/include/pq_flash_index.h +++ b/include/pq_flash_index.h @@ -94,6 +94,20 @@ template class PQFlashIndex DISKANN_DLLEXPORT diskann::Metric get_metric(); + // + // node_ids: input list of node_ids to be read + // coord_buffers: pointers to pre-allocated 
buffers that coords need to copied to. If null, dont copy. + // nbr_buffers: pre-allocated buffers to copy neighbors into + // + // returns a vector of bool one for each node_id: true if read is success, else false + // + DISKANN_DLLEXPORT std::vector read_nodes(const std::vector &node_ids, + std::vector &coord_buffers, + std::vector> &nbr_buffers); + + DISKANN_DLLEXPORT std::vector get_pq_vector(std::uint64_t vid); + DISKANN_DLLEXPORT uint64_t get_num_points(); + protected: DISKANN_DLLEXPORT void use_medoids_data_as_centroids(); DISKANN_DLLEXPORT void setup_thread_data(uint64_t nthreads, uint64_t visited_reserve = 4096); @@ -121,17 +135,6 @@ template class PQFlashIndex // returns region of `node_buf` containing [COORD(T)] DISKANN_DLLEXPORT T *offset_to_node_coords(char *node_buf); - // - // node_ids: input list of node_ids to be read - // coord_buffers: pointers to pre-allocated buffers that coords need to copied to. If null, dont copy. - // nbr_buffers: pre-allocated buffers to copy neighbors into - // - // returns a vector of bool one for each node_id: true if read is success, else false - // - DISKANN_DLLEXPORT std::vector read_nodes(const std::vector &node_ids, - std::vector &coord_buffers, - std::vector> &nbr_buffers); - // index info for multi-node sectors // nhood of node `i` is in sector: [i / nnodes_per_sector] // offset in sector: [(i % nnodes_per_sector) * max_node_len] diff --git a/src/pq_flash_index.cpp b/src/pq_flash_index.cpp index 4e5dab7b8..721009830 100644 --- a/src/pq_flash_index.cpp +++ b/src/pq_flash_index.cpp @@ -1688,6 +1688,18 @@ template char *PQFlashIndex::getHeaderB } #endif +template +std::vector PQFlashIndex::get_pq_vector(std::uint64_t vid) +{ + std::uint8_t *pqVec = &this->data[vid * this->_n_chunks]; + return std::vector(pqVec, pqVec + this->_n_chunks); +} + +template std::uint64_t PQFlashIndex::get_num_points() +{ + return _num_points; +} + // instantiations template class PQFlashIndex; template class PQFlashIndex; From 
df7c5303d60bac731a9c3a4d764987056e489914 Mon Sep 17 00:00:00 2001 From: Philip Adams <35666630+PhilipBAdams@users.noreply.github.com> Date: Thu, 17 Aug 2023 13:45:14 -0700 Subject: [PATCH 08/23] fix OLS build (#428) * fix OLS build * Add a build to CI with feature flags enabled --- .github/actions/build/action.yml | 13 ++++++++++++- include/pq_flash_index.h | 2 +- src/pq_flash_index.cpp | 8 ++++---- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/.github/actions/build/action.yml b/.github/actions/build/action.yml index 2b470d9dc..219d9d630 100644 --- a/.github/actions/build/action.yml +++ b/.github/actions/build/action.yml @@ -25,4 +25,15 @@ runs: mkdir dist mklink /j .\dist\bin .\x64\Release\ shell: cmd - # ------------ End Windows Build --------------- \ No newline at end of file + # ------------ End Windows Build --------------- + # ------------ Windows Build With EXEC_ENV_OLS and USE_BING_INFRA --------------- + - name: Add VisualStudio command line tools into path + if: runner.os == 'Windows' + uses: ilammy/msvc-dev-cmd@v1 + - name: Run configure and build for Windows with Bing feature flags + if: runner.os == 'Windows' + run: | + mkdir build_bing && cd build_bing && cmake .. -DEXEC_ENV_OLS=1 -DUSE_BING_INFRA=1 -DUNIT_TEST=True && msbuild diskann.sln /m /nologo /t:Build /p:Configuration="Release" /property:Platform="x64" -consoleloggerparameters:"ErrorsOnly;Summary" + cd .. + shell: cmd + # ------------ End Windows Build --------------- diff --git a/include/pq_flash_index.h b/include/pq_flash_index.h index 7214bd6a1..83668e0ea 100644 --- a/include/pq_flash_index.h +++ b/include/pq_flash_index.h @@ -238,7 +238,7 @@ template class PQFlashIndex // Set to a larger value than the actual header to accommodate // any additions we make to the header. This is an outer limit // on how big the header can be. 
- static const int HEADER_SIZE = SECTOR_LEN; + static const int HEADER_SIZE = defaults::SECTOR_LEN; char *getHeaderBytes(); #endif }; diff --git a/src/pq_flash_index.cpp b/src/pq_flash_index.cpp index 721009830..b0121493e 100644 --- a/src/pq_flash_index.cpp +++ b/src/pq_flash_index.cpp @@ -750,7 +750,7 @@ template int PQFlashIndex::load(uint32_ std::string pq_compressed_vectors = std::string(index_prefix) + "_pq_compressed.bin"; std::string _disk_index_file = std::string(index_prefix) + "_disk.index"; #ifdef EXEC_ENV_OLS - return load_from_separate_paths(files, num_threads, disk_index_file.c_str(), pq_table_bin.c_str(), + return load_from_separate_paths(files, num_threads, _disk_index_file.c_str(), pq_table_bin.c_str(), pq_compressed_vectors.c_str()); #else return load_from_separate_paths(num_threads, _disk_index_file.c_str(), pq_table_bin.c_str(), @@ -919,7 +919,7 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons #ifdef EXEC_ENV_OLS // giving 0 chunks to make the _pq_table infer from the // chunk_offsets file the correct value - disk_pq_table.load_pq_centroid_bin(files, disk_pq_pivots_path.c_str(), 0); + _disk_pq_table.load_pq_centroid_bin(files, disk_pq_pivots_path.c_str(), 0); #else // giving 0 chunks to make the _pq_table infer from the // chunk_offsets file the correct value @@ -939,7 +939,7 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons // DiskPriorityIO class. So, we need to estimate how many // bytes are needed to store the header and read in that many using our // 'standard' aligned file reader approach. 
- reader->open(disk_index_file); + reader->open(_disk_index_file); this->setup_thread_data(num_threads); this->_max_nthreads = num_threads; @@ -1033,7 +1033,7 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons if (files.fileExists(medoids_file)) { size_t tmp_dim; - diskann::load_bin(files, medoids_file, medoids, _num_medoids, tmp_dim); + diskann::load_bin(files, medoids_file, _medoids, _num_medoids, tmp_dim); #else if (file_exists(medoids_file)) { From 4162c2118908cf55e719c0cdfdcd50b9069bef8f Mon Sep 17 00:00:00 2001 From: Yash Patel <47032340+yashpatel007@users.noreply.github.com> Date: Thu, 17 Aug 2023 17:15:53 -0400 Subject: [PATCH 09/23] In Memory Graph Store (#395) * inmem_graph_store initial impl * barebones of in mem graph store * refactoring index to use index factory * clang format fix * making enum to enum class (c++ 11 style) for scope resolution with same enum values * cleaning up API for GraphSore * moving _nd back to index class * resolving PR comments * error fix * error fix for dynamic * resolving PR comments * removing _num_frozen_point from graph store * minor fix * moving _start back to main + minor update in graph store api to support that * adding requested changes from Gopal * removing reservations * resolving namespace resolution for defaults after build failure * minor update * minor update * speeding up location update logic while repositioning * updated with reserving mem for graph neighbours upfront * build error fix * minor update in assert * initial commit * updating python bindings to use new ctor * python binding error fix * error fix * reverting some changes -> experiment * removing redundnt code from native index * python build error fix * tyring to resolve python build error * attempt at python build fix * adding IndexSearchParams * setting search threads to non zero * minor check removed * eperiment 3-> making distance fully owned by data_store * exp 3 clang fix * exp 4 * making distance as unique_ptr * 
trying to fix build * finally fixing problem * some minor fix * adding dll export to index_factory static function * adding dll export for static fn in index_factory * code cleanup * resolving errors after merge * resolving build errors * fixing build error for stitched index * resolving build errors * removing max_observed_degree set() * removing comments + typo fix * replacing add_neighbour with set_neighbours where we can * error fix --- apps/build_memory_index.cpp | 28 +- apps/build_stitched_index.cpp | 1 + apps/search_memory_index.cpp | 3 +- apps/test_insert_deletes_consolidate.cpp | 3 +- apps/test_streaming_scenario.cpp | 3 +- include/abstract_graph_store.h | 49 ++- include/in_mem_graph_store.h | 38 ++- include/index.h | 19 +- include/index_config.h | 13 +- include/index_factory.h | 12 +- src/filter_utils.cpp | 6 +- src/in_mem_graph_store.cpp | 225 +++++++++++++- src/index.cpp | 368 +++++++---------------- src/index_factory.cpp | 31 +- 14 files changed, 473 insertions(+), 326 deletions(-) diff --git a/apps/build_memory_index.cpp b/apps/build_memory_index.cpp index 1d6f0e7c6..6e9eb6677 100644 --- a/apps/build_memory_index.cpp +++ b/apps/build_memory_index.cpp @@ -120,32 +120,34 @@ int main(int argc, char **argv) size_t data_num, data_dim; diskann::get_bin_metadata(data_path, data_num, data_dim); + auto index_build_params = diskann::IndexWriteParametersBuilder(L, R) + .with_filter_list_size(Lf) + .with_alpha(alpha) + .with_saturate_graph(false) + .with_num_threads(num_threads) + .build(); + + auto build_params = diskann::IndexBuildParamsBuilder(index_build_params) + .with_universal_label(universal_label) + .with_label_file(label_file) + .with_save_path_prefix(index_path_prefix) + .build(); auto config = diskann::IndexConfigBuilder() .with_metric(metric) .with_dimension(data_dim) .with_max_points(data_num) - .with_data_load_store_strategy(diskann::MEMORY) + .with_data_load_store_strategy(diskann::DataStoreStrategy::MEMORY) + 
.with_graph_load_store_strategy(diskann::GraphStoreStrategy::MEMORY) .with_data_type(data_type) .with_label_type(label_type) .is_dynamic_index(false) + .with_index_write_params(index_build_params) .is_enable_tags(false) .is_use_opq(use_opq) .is_pq_dist_build(use_pq_build) .with_num_pq_chunks(build_PQ_bytes) .build(); - auto index_build_params = diskann::IndexWriteParametersBuilder(L, R) - .with_filter_list_size(Lf) - .with_alpha(alpha) - .with_saturate_graph(false) - .with_num_threads(num_threads) - .build(); - - auto build_params = diskann::IndexBuildParamsBuilder(index_build_params) - .with_universal_label(universal_label) - .with_label_file(label_file) - .with_save_path_prefix(index_path_prefix) - .build(); auto index_factory = diskann::IndexFactory(config); auto index = index_factory.create_instance(); index->build(data_path, data_num, build_params); diff --git a/apps/build_stitched_index.cpp b/apps/build_stitched_index.cpp index 069651781..7767a4bc6 100644 --- a/apps/build_stitched_index.cpp +++ b/apps/build_stitched_index.cpp @@ -285,6 +285,7 @@ void prune_and_save(path final_index_path_prefix, path full_index_path_prefix, p auto pruning_index_timer = std::chrono::high_resolution_clock::now(); diskann::get_bin_metadata(input_data_path, number_of_label_points, dimension); + diskann::Index index(diskann::Metric::L2, dimension, number_of_label_points, nullptr, nullptr, 0, false, false); // not searching this index, set search_l to 0 diff --git a/apps/search_memory_index.cpp b/apps/search_memory_index.cpp index 44817242c..d309fa804 100644 --- a/apps/search_memory_index.cpp +++ b/apps/search_memory_index.cpp @@ -74,7 +74,8 @@ int search_memory_index(diskann::Metric &metric, const std::string &index_path, .with_metric(metric) .with_dimension(query_dim) .with_max_points(0) - .with_data_load_store_strategy(diskann::MEMORY) + .with_data_load_store_strategy(diskann::DataStoreStrategy::MEMORY) + .with_graph_load_store_strategy(diskann::GraphStoreStrategy::MEMORY) 
.with_data_type(diskann_type_to_name()) .with_label_type(diskann_type_to_name()) .with_tag_type(diskann_type_to_name()) diff --git a/apps/test_insert_deletes_consolidate.cpp b/apps/test_insert_deletes_consolidate.cpp index 8999688ea..4b7d230ef 100644 --- a/apps/test_insert_deletes_consolidate.cpp +++ b/apps/test_insert_deletes_consolidate.cpp @@ -162,7 +162,8 @@ void build_incremental_index(const std::string &data_path, diskann::IndexWritePa .with_index_search_params(index_search_params) .with_data_type(data_type) .with_tag_type(tag_type) - .with_data_load_store_strategy(diskann::MEMORY) + .with_data_load_store_strategy(diskann::DataStoreStrategy::MEMORY) + .with_graph_load_store_strategy(diskann::GraphStoreStrategy::MEMORY) .is_enable_tags(enable_tags) .is_concurrent_consolidate(concurrent) .build(); diff --git a/apps/test_streaming_scenario.cpp b/apps/test_streaming_scenario.cpp index c40ee251e..d8878cced 100644 --- a/apps/test_streaming_scenario.cpp +++ b/apps/test_streaming_scenario.cpp @@ -215,7 +215,8 @@ void build_incremental_index(const std::string &data_path, const uint32_t L, con .with_data_type(diskann_type_to_name()) .with_index_write_params(params) .with_index_search_params(index_search_params) - .with_data_load_store_strategy(diskann::MEMORY) + .with_data_load_store_strategy(diskann::DataStoreStrategy::MEMORY) + .with_graph_load_store_strategy(diskann::GraphStoreStrategy::MEMORY) .build(); diskann::IndexFactory index_factory = diskann::IndexFactory(index_config); diff --git a/include/abstract_graph_store.h b/include/abstract_graph_store.h index f7735b79a..c0deade17 100644 --- a/include/abstract_graph_store.h +++ b/include/abstract_graph_store.h @@ -5,7 +5,6 @@ #include #include - #include "types.h" namespace diskann @@ -14,18 +13,54 @@ namespace diskann class AbstractGraphStore { public: - AbstractGraphStore(const size_t max_pts) : _capacity(max_pts) + AbstractGraphStore(const size_t total_pts, const size_t reserve_graph_degree) + : 
_capacity(total_pts), _reserve_graph_degree(reserve_graph_degree) + { + } + + // returns tuple of + virtual std::tuple load(const std::string &index_path_prefix, + const size_t num_points) = 0; + virtual int store(const std::string &index_path_prefix, const size_t num_points, const size_t num_fz_points, + const uint32_t start) = 0; + + // not synchronised, user should use lock when necvessary. + virtual const std::vector &get_neighbours(const location_t i) const = 0; + virtual void add_neighbour(const location_t i, location_t neighbour_id) = 0; + virtual void clear_neighbours(const location_t i) = 0; + virtual void swap_neighbours(const location_t a, location_t b) = 0; + + virtual void set_neighbours(const location_t i, std::vector &neighbours) = 0; + + virtual size_t resize_graph(const size_t new_size) = 0; + virtual void clear_graph() = 0; + + virtual uint32_t get_max_observed_degree() = 0; + + // set during load + virtual size_t get_max_range_of_graph() = 0; + + // Total internal points _max_points + _num_frozen_points + size_t get_total_points() { + return _capacity; } - virtual int load(const std::string &index_path_prefix) = 0; - virtual int store(const std::string &index_path_prefix) = 0; + protected: + // Internal function, changes total points when resize_graph is called. 
+ void set_total_points(size_t new_capacity) + { + _capacity = new_capacity; + } - virtual void get_adj_list(const location_t i, std::vector &neighbors) = 0; - virtual void set_adj_list(const location_t i, std::vector &neighbors) = 0; + size_t get_reserve_graph_degree() + { + return _reserve_graph_degree; + } private: size_t _capacity; + size_t _reserve_graph_degree; }; -} // namespace diskann +} // namespace diskann \ No newline at end of file diff --git a/include/in_mem_graph_store.h b/include/in_mem_graph_store.h index 98a9e4dc5..d0206a7d6 100644 --- a/include/in_mem_graph_store.h +++ b/include/in_mem_graph_store.h @@ -11,13 +11,41 @@ namespace diskann class InMemGraphStore : public AbstractGraphStore { public: - InMemGraphStore(const size_t max_pts); + InMemGraphStore(const size_t total_pts, const size_t reserve_graph_degree); - int load(const std::string &index_path_prefix); - int store(const std::string &index_path_prefix); + // returns tuple of + virtual std::tuple load(const std::string &index_path_prefix, + const size_t num_points) override; + virtual int store(const std::string &index_path_prefix, const size_t num_points, const size_t num_frozen_points, + const uint32_t start) override; - void get_adj_list(const location_t i, std::vector &neighbors); - void set_adj_list(const location_t i, std::vector &neighbors); + virtual const std::vector &get_neighbours(const location_t i) const override; + virtual void add_neighbour(const location_t i, location_t neighbour_id) override; + virtual void clear_neighbours(const location_t i) override; + virtual void swap_neighbours(const location_t a, location_t b) override; + + virtual void set_neighbours(const location_t i, std::vector &neighbors) override; + + virtual size_t resize_graph(const size_t new_size) override; + virtual void clear_graph() override; + + virtual size_t get_max_range_of_graph() override; + virtual uint32_t get_max_observed_degree() override; + + protected: + virtual std::tuple load_impl(const 
std::string &filename, size_t expected_num_points); +#ifdef EXEC_ENV_OLS + virtual std::tuple load_impl(AlignedFileReader &reader, size_t expected_num_points); +#endif + + int save_graph(const std::string &index_path_prefix, const size_t active_points, const size_t num_frozen_points, + const uint32_t start); + + private: + size_t _max_range_of_graph = 0; + uint32_t _max_observed_degree = 0; + + std::vector> _graph; }; } // namespace diskann diff --git a/include/index.h b/include/index.h index 095d1599a..b22dcdce4 100644 --- a/include/index.h +++ b/include/index.h @@ -19,6 +19,7 @@ #include "windows_customizations.h" #include "scratch.h" #include "in_mem_data_store.h" +#include "in_mem_graph_store.h" #include "abstract_index.h" #define OVERHEAD_FACTOR 1.1 @@ -58,9 +59,8 @@ template clas const bool pq_dist_build = false, const size_t num_pq_chunks = 0, const bool use_opq = false); - // This is called by IndexFactory which returns AbstractIndex's simplified API - DISKANN_DLLEXPORT Index(const IndexConfig &index_config, std::unique_ptr> data_store - /* std::unique_ptr graph_store*/); + DISKANN_DLLEXPORT Index(const IndexConfig &index_config, std::unique_ptr> data_store, + std::unique_ptr graph_store); DISKANN_DLLEXPORT ~Index(); @@ -327,10 +327,11 @@ template clas // Data std::unique_ptr> _data_store; - char *_opt_graph = nullptr; // Graph related data structures - std::vector> _final_graph; + std::unique_ptr _graph_store; + + char *_opt_graph = nullptr; T *_data = nullptr; // coordinates of all base points // Dimensions @@ -344,15 +345,13 @@ template clas // needed for a dynamic index. The frozen points have consecutive locations. // See also _start below. size_t _num_frozen_pts = 0; - size_t _max_range_of_loaded_graph = 0; size_t _node_size; size_t _data_len; size_t _neighbor_len; - uint32_t _max_observed_degree = 0; - // Start point of the search. When _num_frozen_pts is greater than zero, - // this is the location of the first frozen point. 
Otherwise, this is a - // location of one of the points in index. + // Start point of the search. When _num_frozen_pts is greater than zero, + // this is the location of the first frozen point. Otherwise, this is a + // location of one of the points in index. uint32_t _start = 0; bool _has_built = false; diff --git a/include/index_config.h b/include/index_config.h index 2a8e0e8ba..8b873fb6c 100644 --- a/include/index_config.h +++ b/include/index_config.h @@ -3,14 +3,16 @@ namespace diskann { -enum DataStoreStrategy +enum class DataStoreStrategy { MEMORY }; -enum GraphStoreStrategy +enum class GraphStoreStrategy { + MEMORY }; + struct IndexConfig { DataStoreStrategy data_strategy; @@ -201,6 +203,13 @@ class IndexConfigBuilder throw ANNException("Error: please pass initial_search_list_size for building dynamic index.", -1); } + // sanity check + if (_dynamic_index && _num_frozen_pts == 0) + { + diskann::cout << "_num_frozen_pts passed as 0 for dynamic_index. Setting it to 1 for safety." << std::endl; + _num_frozen_pts = 1; + } + return IndexConfig(_data_strategy, _graph_strategy, _metric, _dimension, _max_points, _num_pq_chunks, _num_frozen_pts, _dynamic_index, _enable_tags, _pq_dist_build, _concurrent_consolidate, _use_opq, _data_type, _tag_type, _label_type, _index_write_params, _index_search_params); diff --git a/include/index_factory.h b/include/index_factory.h index 7ad0893cc..0f79b006e 100644 --- a/include/index_factory.h +++ b/include/index_factory.h @@ -12,15 +12,17 @@ class IndexFactory // Consruct a data store with distance function emplaced within template - DISKANN_DLLEXPORT static std::unique_ptr> construct_datastore(DataStoreStrategy stratagy, - size_t num_points, - size_t dimension, Metric m); + DISKANN_DLLEXPORT static std::unique_ptr> construct_datastore(const DataStoreStrategy stratagy, + const size_t num_points, + const size_t dimension, + const Metric m); + + DISKANN_DLLEXPORT static std::unique_ptr construct_graphstore( + const GraphStoreStrategy 
stratagy, const size_t size, const size_t reserve_graph_degree); private: void check_config(); - std::unique_ptr construct_graphstore(GraphStoreStrategy stratagy, size_t size); - template std::unique_ptr create_instance(); diff --git a/src/filter_utils.cpp b/src/filter_utils.cpp index 618666488..f077a14a3 100644 --- a/src/filter_utils.cpp +++ b/src/filter_utils.cpp @@ -45,8 +45,10 @@ void generate_label_indices(path input_data_path, path final_index_path_prefix, size_t number_of_label_points, dimension; diskann::get_bin_metadata(curr_label_input_data_path, number_of_label_points, dimension); - diskann::Index index(diskann::Metric::L2, dimension, number_of_label_points, nullptr, nullptr, 0, false, - false); + + diskann::Index index(diskann::Metric::L2, dimension, number_of_label_points, + std::make_shared(label_index_build_parameters), nullptr, + 0, false, false); auto index_build_timer = std::chrono::high_resolution_clock::now(); index.build(curr_label_input_data_path.c_str(), number_of_label_points, label_index_build_parameters); diff --git a/src/in_mem_graph_store.cpp b/src/in_mem_graph_store.cpp index e9bfd4e9e..c12b2514e 100644 --- a/src/in_mem_graph_store.cpp +++ b/src/in_mem_graph_store.cpp @@ -6,26 +6,237 @@ namespace diskann { +InMemGraphStore::InMemGraphStore(const size_t total_pts, const size_t reserve_graph_degree) + : AbstractGraphStore(total_pts, reserve_graph_degree) +{ + this->resize_graph(total_pts); + for (size_t i = 0; i < total_pts; i++) + { + _graph[i].reserve(reserve_graph_degree); + } +} + +std::tuple InMemGraphStore::load(const std::string &index_path_prefix, + const size_t num_points) +{ + return load_impl(index_path_prefix, num_points); +} +int InMemGraphStore::store(const std::string &index_path_prefix, const size_t num_points, + const size_t num_frozen_points, const uint32_t start) +{ + return save_graph(index_path_prefix, num_points, num_frozen_points, start); +} +const std::vector &InMemGraphStore::get_neighbours(const location_t i) 
const +{ + return _graph.at(i); +} + +void InMemGraphStore::add_neighbour(const location_t i, location_t neighbour_id) +{ + _graph[i].emplace_back(neighbour_id); + if (_max_observed_degree < _graph[i].size()) + { + _max_observed_degree = (uint32_t)(_graph[i].size()); + } +} + +void InMemGraphStore::clear_neighbours(const location_t i) +{ + _graph[i].clear(); +}; +void InMemGraphStore::swap_neighbours(const location_t a, location_t b) +{ + _graph[a].swap(_graph[b]); +}; + +void InMemGraphStore::set_neighbours(const location_t i, std::vector &neighbours) +{ + _graph[i].assign(neighbours.begin(), neighbours.end()); + if (_max_observed_degree < neighbours.size()) + { + _max_observed_degree = (uint32_t)(neighbours.size()); + } +} + +size_t InMemGraphStore::resize_graph(const size_t new_size) +{ + _graph.resize(new_size); + set_total_points(new_size); + return _graph.size(); +} -InMemGraphStore::InMemGraphStore(const size_t max_pts) : AbstractGraphStore(max_pts) +void InMemGraphStore::clear_graph() { + _graph.clear(); } -int InMemGraphStore::load(const std::string &index_path_prefix) +#ifdef EXEC_ENV_OLS +std::tuple InMemGraphStore::load_impl(AlignedFileReader &reader, size_t expected_num_points) { - return 0; + size_t expected_file_size; + size_t file_frozen_pts; + uint32_t start; + + auto max_points = get_max_points(); + int header_size = 2 * sizeof(size_t) + 2 * sizeof(uint32_t); + std::unique_ptr header = std::make_unique(header_size); + read_array(reader, header.get(), header_size); + + expected_file_size = *((size_t *)header.get()); + _max_observed_degree = *((uint32_t *)(header.get() + sizeof(size_t))); + start = *((uint32_t *)(header.get() + sizeof(size_t) + sizeof(uint32_t))); + file_frozen_pts = *((size_t *)(header.get() + sizeof(size_t) + sizeof(uint32_t) + sizeof(uint32_t))); + + diskann::cout << "From graph header, expected_file_size: " << expected_file_size + << ", _max_observed_degree: " << _max_observed_degree << ", _start: " << start + << ", 
file_frozen_pts: " << file_frozen_pts << std::endl; + + diskann::cout << "Loading vamana graph from reader..." << std::flush; + + // If user provides more points than max_points + // resize the _graph to the larger size. + if (get_total_points() < expected_num_points) + { + diskann::cout << "resizing graph to " << expected_num_points << std::endl; + this->resize_graph(expected_num_points); + } + + uint32_t nodes_read = 0; + size_t cc = 0; + size_t graph_offset = header_size; + while (nodes_read < expected_num_points) + { + uint32_t k; + read_value(reader, k, graph_offset); + graph_offset += sizeof(uint32_t); + std::vector tmp(k); + tmp.reserve(k); + read_array(reader, tmp.data(), k, graph_offset); + graph_offset += k * sizeof(uint32_t); + cc += k; + _graph[nodes_read].swap(tmp); + nodes_read++; + if (nodes_read % 1000000 == 0) + { + diskann::cout << "." << std::flush; + } + if (k > _max_range_of_graph) + { + _max_range_of_graph = k; + } + } + + diskann::cout << "done. Index has " << nodes_read << " nodes and " << cc << " out-edges, _start is set to " << start + << std::endl; + return std::make_tuple(nodes_read, start, file_frozen_pts); } -int InMemGraphStore::store(const std::string &index_path_prefix) +#endif + +std::tuple InMemGraphStore::load_impl(const std::string &filename, + size_t expected_num_points) { - return 0; + size_t expected_file_size; + size_t file_frozen_pts; + uint32_t start; + size_t file_offset = 0; // will need this for single file format support + + std::ifstream in; + in.exceptions(std::ios::badbit | std::ios::failbit); + in.open(filename, std::ios::binary); + in.seekg(file_offset, in.beg); + in.read((char *)&expected_file_size, sizeof(size_t)); + in.read((char *)&_max_observed_degree, sizeof(uint32_t)); + in.read((char *)&start, sizeof(uint32_t)); + in.read((char *)&file_frozen_pts, sizeof(size_t)); + size_t vamana_metadata_size = sizeof(size_t) + sizeof(uint32_t) + sizeof(uint32_t) + sizeof(size_t); + + diskann::cout << "From graph header, 
expected_file_size: " << expected_file_size + << ", _max_observed_degree: " << _max_observed_degree << ", _start: " << start + << ", file_frozen_pts: " << file_frozen_pts << std::endl; + + diskann::cout << "Loading vamana graph " << filename << "..." << std::flush; + + // If user provides more points than max_points + // resize the _graph to the larger size. + if (get_total_points() < expected_num_points) + { + diskann::cout << "resizing graph to " << expected_num_points << std::endl; + this->resize_graph(expected_num_points); + } + + size_t bytes_read = vamana_metadata_size; + size_t cc = 0; + uint32_t nodes_read = 0; + while (bytes_read != expected_file_size) + { + uint32_t k; + in.read((char *)&k, sizeof(uint32_t)); + + if (k == 0) + { + diskann::cerr << "ERROR: Point found with no out-neighbours, point#" << nodes_read << std::endl; + } + + cc += k; + ++nodes_read; + std::vector tmp(k); + tmp.reserve(k); + in.read((char *)tmp.data(), k * sizeof(uint32_t)); + _graph[nodes_read - 1].swap(tmp); + bytes_read += sizeof(uint32_t) * ((size_t)k + 1); + if (nodes_read % 10000000 == 0) + diskann::cout << "." << std::flush; + if (k > _max_range_of_graph) + { + _max_range_of_graph = k; + } + } + + diskann::cout << "done. 
Index has " << nodes_read << " nodes and " << cc << " out-edges, _start is set to " << start + << std::endl; + return std::make_tuple(nodes_read, start, file_frozen_pts); +} + +int InMemGraphStore::save_graph(const std::string &index_path_prefix, const size_t num_points, + const size_t num_frozen_points, const uint32_t start) +{ + std::ofstream out; + open_file_to_write(out, index_path_prefix); + + size_t file_offset = 0; + out.seekp(file_offset, out.beg); + size_t index_size = 24; + uint32_t max_degree = 0; + out.write((char *)&index_size, sizeof(uint64_t)); + out.write((char *)&_max_observed_degree, sizeof(uint32_t)); + uint32_t ep_u32 = start; + out.write((char *)&ep_u32, sizeof(uint32_t)); + out.write((char *)&num_frozen_points, sizeof(size_t)); + + // Note: num_points = _nd + _num_frozen_points + for (uint32_t i = 0; i < num_points; i++) + { + uint32_t GK = (uint32_t)_graph[i].size(); + out.write((char *)&GK, sizeof(uint32_t)); + out.write((char *)_graph[i].data(), GK * sizeof(uint32_t)); + max_degree = _graph[i].size() > max_degree ? (uint32_t)_graph[i].size() : max_degree; + index_size += (size_t)(sizeof(uint32_t) * (GK + 1)); + } + out.seekp(file_offset, out.beg); + out.write((char *)&index_size, sizeof(uint64_t)); + out.write((char *)&max_degree, sizeof(uint32_t)); + out.close(); + return (int)index_size; } -void InMemGraphStore::get_adj_list(const location_t i, std::vector &neighbors) +size_t InMemGraphStore::get_max_range_of_graph() { + return _max_range_of_graph; } -void InMemGraphStore::set_adj_list(const location_t i, std::vector &neighbors) +uint32_t InMemGraphStore::get_max_observed_degree() { + return _max_observed_degree; } } // namespace diskann diff --git a/src/index.cpp b/src/index.cpp index eeb7169e1..799b4bb9c 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1,16 +1,16 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT license. 
-#include "index_factory.h" -#include #include -#include "tsl/robin_set.h" -#include "tsl/robin_map.h" -#include "boost/dynamic_bitset.hpp" +#include +#include "boost/dynamic_bitset.hpp" +#include "index_factory.h" #include "memory_mapper.h" #include "timer.h" +#include "tsl/robin_map.h" +#include "tsl/robin_set.h" #include "windows_customizations.h" #if defined(RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD) #include "gperftools/malloc_extension.h" @@ -28,14 +28,14 @@ namespace diskann // Initialize an index with metric m, load the data of type T with filename // (bin), and initialize max_points template -Index::Index(const IndexConfig &index_config, std::unique_ptr> data_store) +Index::Index(const IndexConfig &index_config, std::unique_ptr> data_store, + std::unique_ptr graph_store) : _dist_metric(index_config.metric), _dim(index_config.dimension), _max_points(index_config.max_points), _num_frozen_pts(index_config.num_frozen_pts), _dynamic_index(index_config.dynamic_index), _enable_tags(index_config.enable_tags), _indexingMaxC(DEFAULT_MAXC), _query_scratch(nullptr), _pq_dist(index_config.pq_dist_build), _use_opq(index_config.use_opq), _num_pq_chunks(index_config.num_pq_chunks), _delete_set(new tsl::robin_set), _conc_consolidate(index_config.concurrent_consolidate) { - if (_dynamic_index && !_enable_tags) { throw ANNException("ERROR: Dynamic Indexing must have tags enabled.", -1, __FUNCSIG__, __FILE__, __LINE__); @@ -80,9 +80,8 @@ Index::Index(const IndexConfig &index_config, std::unique_ptr(total_internal_points); if (_enable_tags) @@ -94,7 +93,8 @@ Index::Index(const IndexConfig &index_config, std::unique_ptrenable_delete(); // enable delete by default for dynamic index - // if write params are not passed, it is inffered that ctor is called by search + // if write params are not passed, it is inffered that ctor is called by + // search if (index_config.index_write_params != nullptr && index_config.index_search_params != nullptr) { 
_indexingQueueSize = index_config.index_write_params->search_list_size; @@ -133,9 +133,14 @@ Index::Index(Metric m, const size_t dim, const size_t max_point .is_use_opq(use_opq) .with_data_type(diskann_type_to_name()) .build(), - std::move(IndexFactory::construct_datastore( - diskann::MEMORY, max_points + (dynamic_index && num_frozen_pts == 0 ? (size_t)1 : num_frozen_pts), dim, - m))) + IndexFactory::construct_datastore( + DataStoreStrategy::MEMORY, + max_points + (dynamic_index && num_frozen_pts == 0 ? (size_t)1 : num_frozen_pts), dim, m), + IndexFactory::construct_graphstore( + GraphStoreStrategy::MEMORY, + max_points + (dynamic_index && num_frozen_pts == 0 ? (size_t)1 : num_frozen_pts), + (size_t)((index_parameters == nullptr ? 0 : index_parameters->max_degree) * + defaults::GRAPH_SLACK_FACTOR * 1.05))) { } @@ -152,13 +157,6 @@ template Index::~I LockGuard lg(lock); } - // if (this->_distance != nullptr) - //{ - // delete this->_distance; - // this->_distance = nullptr; - // } - // REFACTOR - if (_opt_graph != nullptr) { delete[] _opt_graph; @@ -190,6 +188,7 @@ template size_t Index size_t Index size_t Index::save_data(std::string data_file) { // Note: at this point, either _nd == _max_points or any frozen points have - // been temporarily moved to _nd, so _nd + _num_frozen_points is the valid + // been temporarily moved to _nd, so _nd + _num_frozen_pts is the valid // location limit. 
return _data_store->save(data_file, (location_t)(_nd + _num_frozen_pts)); } @@ -234,34 +233,7 @@ template size_t Index size_t Index::save_graph(std::string graph_file) { - std::ofstream out; - open_file_to_write(out, graph_file); - - size_t file_offset = 0; // we will use this if we want - out.seekp(file_offset, out.beg); - size_t index_size = 24; - uint32_t max_degree = 0; - out.write((char *)&index_size, sizeof(uint64_t)); - out.write((char *)&_max_observed_degree, sizeof(uint32_t)); - uint32_t ep_u32 = _start; - out.write((char *)&ep_u32, sizeof(uint32_t)); - out.write((char *)&_num_frozen_pts, sizeof(size_t)); - // Note: at this point, either _nd == _max_points or any frozen points have - // been temporarily moved to _nd, so _nd + _num_frozen_points is the valid - // location limit. - for (uint32_t i = 0; i < _nd + _num_frozen_pts; i++) - { - uint32_t GK = (uint32_t)_final_graph[i].size(); - out.write((char *)&GK, sizeof(uint32_t)); - out.write((char *)_final_graph[i].data(), GK * sizeof(uint32_t)); - max_degree = _final_graph[i].size() > max_degree ? (uint32_t)_final_graph[i].size() : max_degree; - index_size += (size_t)(sizeof(uint32_t) * (GK + 1)); - } - out.seekp(file_offset, out.beg); - out.write((char *)&index_size, sizeof(uint64_t)); - out.write((char *)&max_degree, sizeof(uint32_t)); - out.close(); - return index_size; // number of bytes written + return _graph_store->store(graph_file, _nd + _num_frozen_pts, _num_frozen_pts, _start); } template @@ -477,7 +449,8 @@ size_t Index::load_data(std::string filename) } #ifdef EXEC_ENV_OLS - // REFACTOR TODO: Must figure out how to support aligned reader in a clean manner. + // REFACTOR TODO: Must figure out how to support aligned reader in a clean + // manner. copy_aligned_data_from_file(reader, _data, file_num_points, file_dim, _data_store->get_aligned_dim()); #else _data_store->load(filename); // offset == 0. 
@@ -639,8 +612,8 @@ void Index::load(const char *filename, uint32_t num_threads, ui // initialize_q_s(). if (_query_scratch.size() == 0) { - initialize_query_scratch(num_threads, search_l, search_l, (uint32_t)_max_range_of_loaded_graph, _indexingMaxC, - _dim); + initialize_query_scratch(num_threads, search_l, search_l, (uint32_t)_graph_store->get_max_range_of_graph(), + _indexingMaxC, _dim); } } @@ -675,131 +648,10 @@ template size_t Index::load_graph(std::string filename, size_t expected_num_points) { #endif - size_t expected_file_size; - size_t file_frozen_pts; - -#ifdef EXEC_ENV_OLS - int header_size = 2 * sizeof(size_t) + 2 * sizeof(uint32_t); - std::unique_ptr header = std::make_unique(header_size); - read_array(reader, header.get(), header_size); - - expected_file_size = *((size_t *)header.get()); - _max_observed_degree = *((uint32_t *)(header.get() + sizeof(size_t))); - _start = *((uint32_t *)(header.get() + sizeof(size_t) + sizeof(uint32_t))); - file_frozen_pts = *((size_t *)(header.get() + sizeof(size_t) + sizeof(uint32_t) + sizeof(uint32_t))); -#else - - size_t file_offset = 0; // will need this for single file format support - std::ifstream in; - in.exceptions(std::ios::badbit | std::ios::failbit); - in.open(filename, std::ios::binary); - in.seekg(file_offset, in.beg); - in.read((char *)&expected_file_size, sizeof(size_t)); - in.read((char *)&_max_observed_degree, sizeof(uint32_t)); - in.read((char *)&_start, sizeof(uint32_t)); - in.read((char *)&file_frozen_pts, sizeof(size_t)); - size_t vamana_metadata_size = sizeof(size_t) + sizeof(uint32_t) + sizeof(uint32_t) + sizeof(size_t); - -#endif - diskann::cout << "From graph header, expected_file_size: " << expected_file_size - << ", _max_observed_degree: " << _max_observed_degree << ", _start: " << _start - << ", file_frozen_pts: " << file_frozen_pts << std::endl; - - if (file_frozen_pts != _num_frozen_pts) - { - std::stringstream stream; - if (file_frozen_pts == 1) - { - stream << "ERROR: When loading 
index, detected dynamic index, but " - "constructor asks for static index. Exitting." - << std::endl; - } - else - { - stream << "ERROR: When loading index, detected static index, but " - "constructor asks for dynamic index. Exitting." - << std::endl; - } - diskann::cerr << stream.str() << std::endl; - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - -#ifdef EXEC_ENV_OLS - diskann::cout << "Loading vamana graph from reader..." << std::flush; -#else - diskann::cout << "Loading vamana graph " << filename << "..." << std::flush; -#endif - - const size_t expected_max_points = expected_num_points - file_frozen_pts; - - // If user provides more points than max_points - // resize the _final_graph to the larger size. - if (_max_points < expected_max_points) - { - diskann::cout << "Number of points in data: " << expected_max_points - << " is greater than max_points: " << _max_points - << " Setting max points to: " << expected_max_points << std::endl; - _final_graph.resize(expected_max_points + _num_frozen_pts); - _max_points = expected_max_points; - } -#ifdef EXEC_ENV_OLS - uint32_t nodes_read = 0; - size_t cc = 0; - size_t graph_offset = header_size; - while (nodes_read < expected_num_points) - { - uint32_t k; - read_value(reader, k, graph_offset); - graph_offset += sizeof(uint32_t); - std::vector tmp(k); - tmp.reserve(k); - read_array(reader, tmp.data(), k, graph_offset); - graph_offset += k * sizeof(uint32_t); - cc += k; - _final_graph[nodes_read].swap(tmp); - nodes_read++; - if (nodes_read % 1000000 == 0) - { - diskann::cout << "." 
<< std::flush; - } - if (k > _max_range_of_loaded_graph) - { - _max_range_of_loaded_graph = k; - } - } -#else - size_t bytes_read = vamana_metadata_size; - size_t cc = 0; - uint32_t nodes_read = 0; - while (bytes_read != expected_file_size) - { - uint32_t k; - in.read((char *)&k, sizeof(uint32_t)); - - if (k == 0) - { - diskann::cerr << "ERROR: Point found with no out-neighbors, point#" << nodes_read << std::endl; - } - - cc += k; - ++nodes_read; - std::vector tmp(k); - tmp.reserve(k); - in.read((char *)tmp.data(), k * sizeof(uint32_t)); - _final_graph[nodes_read - 1].swap(tmp); - bytes_read += sizeof(uint32_t) * ((size_t)k + 1); - if (nodes_read % 10000000 == 0) - diskann::cout << "." << std::flush; - if (k > _max_range_of_loaded_graph) - { - _max_range_of_loaded_graph = k; - } - } -#endif - - diskann::cout << "done. Index has " << nodes_read << " nodes and " << cc << " out-edges, _start is set to " - << _start << std::endl; - return nodes_read; + auto res = _graph_store->load(filename, expected_num_points); + _start = std::get<1>(res); + _num_frozen_pts = std::get<2>(res); + return std::get<0>(res); } template @@ -868,7 +720,8 @@ template std::vector Inde return init_ids; } -// Find common filter between a node's labels and a given set of labels, while taking into account universal label +// Find common filter between a node's labels and a given set of labels, while +// taking into account universal label template bool Index::detect_common_filters(uint32_t point_id, bool search_invocation, const std::vector &incoming_labels) @@ -879,8 +732,8 @@ bool Index::detect_common_filters(uint32_t point_id, bool searc curr_node_labels.end(), std::back_inserter(common_filters)); if (common_filters.size() > 0) { - // This is to reduce the repetitive calls. If common_filters size is > 0 , we dont need to check further for - // universal label + // This is to reduce the repetitive calls. 
If common_filters size is > 0 , + // we dont need to check further for universal label return true; } if (_use_universal_label) @@ -914,14 +767,6 @@ std::pair Index::iterate_to_fixed_point( std::vector &dist_scratch = scratch->dist_scratch(); assert(id_scratch.size() == 0); - // REFACTOR - // T *aligned_query = scratch->aligned_query(); - // memcpy(aligned_query, query, _dim * sizeof(T)); - // if (_normalize_vecs) - //{ - // normalize((float *)aligned_query, _dim); - // } - T *aligned_query = scratch->aligned_query(); float *query_float = nullptr; @@ -1057,7 +902,7 @@ std::pair Index::iterate_to_fixed_point( { if (_dynamic_index) _locks[n].lock(); - for (auto id : _final_graph[n]) + for (auto id : _graph_store->get_neighbours(n)) { assert(id < _max_points + _num_frozen_pts); @@ -1168,7 +1013,7 @@ void Index::search_for_point_and_prune(int location, uint32_t L prune_neighbors(location, pool, pruned_list, scratch); assert(!pruned_list.empty()); - assert(_final_graph.size() == _max_points + _num_frozen_pts); + assert(_graph_store->get_total_points() == _max_points + _num_frozen_pts); } template @@ -1283,8 +1128,6 @@ void Index::prune_neighbors(const uint32_t location, std::vecto return; } - _max_observed_degree = (std::max)(_max_observed_degree, range); - // If using _pq_build, over-write the PQ distances with actual distances if (_pq_dist) { @@ -1330,12 +1173,13 @@ void Index::inter_insert(uint32_t n, std::vector &pru bool prune_needed = false; { LockGuard guard(_locks[des]); - auto &des_pool = _final_graph[des]; + auto &des_pool = _graph_store->get_neighbours(des); if (std::find(des_pool.begin(), des_pool.end(), n) == des_pool.end()) { if (des_pool.size() < (uint64_t)(defaults::GRAPH_SLACK_FACTOR * range)) { - des_pool.emplace_back(n); + // des_pool.emplace_back(n); + _graph_store->add_neighbour(des, n); prune_needed = false; } else @@ -1371,7 +1215,7 @@ void Index::inter_insert(uint32_t n, std::vector &pru { LockGuard guard(_locks[des]); - _final_graph[des] = 
new_out_neighbors; + _graph_store->set_neighbours(des, new_out_neighbors); } } } @@ -1420,11 +1264,6 @@ void Index::link(const IndexWriteParameters ¶meters) else _start = calculate_entry_point(); - for (size_t p = 0; p < _nd; p++) - { - _final_graph[p].reserve((size_t)(std::ceil(_indexingRange * defaults::GRAPH_SLACK_FACTOR * 1.05))); - } - diskann::Timer link_timer; #pragma omp parallel for schedule(dynamic, 2048) @@ -1447,9 +1286,9 @@ void Index::link(const IndexWriteParameters ¶meters) } { LockGuard guard(_locks[node]); - _final_graph[node].reserve((size_t)(_indexingRange * defaults::GRAPH_SLACK_FACTOR * 1.05)); - _final_graph[node] = pruned_list; - assert(_final_graph[node].size() <= _indexingRange); + + _graph_store->set_neighbours(node, pruned_list); + assert(_graph_store->get_neighbours((location_t)node).size() <= _indexingRange); } inter_insert(node, pruned_list, scratch); @@ -1469,7 +1308,7 @@ void Index::link(const IndexWriteParameters ¶meters) for (int64_t node_ctr = 0; node_ctr < (int64_t)(visit_order.size()); node_ctr++) { auto node = visit_order[node_ctr]; - if (_final_graph[node].size() > _indexingRange) + if (_graph_store->get_neighbours((location_t)node).size() > _indexingRange) { ScratchStoreManager> manager(_query_scratch); auto scratch = manager.scratch_space(); @@ -1478,7 +1317,7 @@ void Index::link(const IndexWriteParameters ¶meters) std::vector dummy_pool(0); std::vector new_out_neighbors; - for (auto cur_nbr : _final_graph[node]) + for (auto cur_nbr : _graph_store->get_neighbours((location_t)node)) { if (dummy_visited.find(cur_nbr) == dummy_visited.end() && cur_nbr != node) { @@ -1489,9 +1328,8 @@ void Index::link(const IndexWriteParameters ¶meters) } prune_neighbors(node, dummy_pool, new_out_neighbors, scratch); - _final_graph[node].clear(); - for (auto id : new_out_neighbors) - _final_graph[node].emplace_back(id); + _graph_store->clear_neighbours((location_t)node); + _graph_store->set_neighbours((location_t)node, new_out_neighbors); } } if 
(_nd > 0) @@ -1515,7 +1353,7 @@ void Index::prune_all_neighbors(const uint32_t max_degree, cons { if ((size_t)node < _nd || (size_t)node >= _max_points) { - if (_final_graph[node].size() > range) + if (_graph_store->get_neighbours((location_t)node).size() > range) { tsl::robin_set dummy_visited(0); std::vector dummy_pool(0); @@ -1524,7 +1362,7 @@ void Index::prune_all_neighbors(const uint32_t max_degree, cons ScratchStoreManager> manager(_query_scratch); auto scratch = manager.scratch_space(); - for (auto cur_nbr : _final_graph[node]) + for (auto cur_nbr : _graph_store->get_neighbours((location_t)node)) { if (dummy_visited.find(cur_nbr) == dummy_visited.end() && cur_nbr != node) { @@ -1535,9 +1373,8 @@ void Index::prune_all_neighbors(const uint32_t max_degree, cons } prune_neighbors((uint32_t)node, dummy_pool, range, maxc, alpha, new_out_neighbors, scratch); - _final_graph[node].clear(); - for (auto id : new_out_neighbors) - _final_graph[node].emplace_back(id); + _graph_store->clear_neighbours((location_t)node); + _graph_store->set_neighbours((location_t)node, new_out_neighbors); } } } @@ -1548,7 +1385,7 @@ void Index::prune_all_neighbors(const uint32_t max_degree, cons { if (i < _nd || i >= _max_points) { - const std::vector &pool = _final_graph[i]; + const std::vector &pool = _graph_store->get_neighbours((location_t)i); max = (std::max)(max, pool.size()); min = (std::min)(min, pool.size()); total += pool.size(); @@ -1678,7 +1515,7 @@ void Index::build_with_data_populated(const IndexWriteParameter size_t max = 0, min = SIZE_MAX, total = 0, cnt = 0; for (size_t i = 0; i < _nd; i++) { - auto &pool = _final_graph[i]; + auto &pool = _graph_store->get_neighbours((location_t)i); max = std::max(max, pool.size()); min = std::min(min, pool.size()); total += pool.size(); @@ -1688,7 +1525,6 @@ void Index::build_with_data_populated(const IndexWriteParameter diskann::cout << "Index built with degree: max:" << max << " avg:" << (float)total / (float)(_nd + _num_frozen_pts) << " 
min:" << min << " count(deg<2):" << cnt << std::endl; - _max_observed_degree = std::max((uint32_t)max, _max_observed_degree); _has_built = true; } template @@ -1730,16 +1566,6 @@ void Index::build(const T *data, const size_t num_points_to_loa _nd = num_points_to_load; _data_store->populate_data(data, (location_t)num_points_to_load); - - // REFACTOR - // memcpy((char *)_data, (char *)data, _aligned_dim * _nd * sizeof(T)); - // if (_normalize_vecs) - //{ - // for (size_t i = 0; i < num_points_to_load; i++) - // { - // normalize(_data + _aligned_dim * i, _aligned_dim); - // } - // } } build_with_data_populated(parameters, tags); @@ -2238,9 +2064,6 @@ std::pair Index::search_with_filters(const } filter_vec.emplace_back(filter_label); - // REFACTOR - // T *aligned_query = scratch->aligned_query(); - // memcpy(aligned_query, query, _dim * sizeof(T)); _data_store->get_dist_fn()->preprocess_query(query, _data_store->get_dims(), scratch->aligned_query()); auto retval = iterate_to_fixed_point(scratch->aligned_query(), L, init_ids, scratch, true, filter_vec, true); @@ -2320,7 +2143,8 @@ size_t Index::search_with_tags(const T *query, const uint64_t K const std::vector init_ids = get_init_ids(); const std::vector unused_filter_label; - //_distance->preprocess_query(query, _data_store->get_dims(), scratch->aligned_query()); + //_distance->preprocess_query(query, _data_store->get_dims(), + // scratch->aligned_query()); _data_store->get_dist_fn()->preprocess_query(query, _data_store->get_dims(), scratch->aligned_query()); iterate_to_fixed_point(scratch->aligned_query(), L, init_ids, scratch, false, unused_filter_label, true); @@ -2451,7 +2275,7 @@ inline void Index::process_delete(const tsl::robin_set adj_list_lock; if (_conc_consolidate) adj_list_lock = std::unique_lock(_locks[loc]); - adj_list = _final_graph[loc]; + adj_list = _graph_store->get_neighbours((location_t)loc); } bool modify = false; @@ -2468,7 +2292,7 @@ inline void Index::process_delete(const tsl::robin_set 
ngh_lock; if (_conc_consolidate) ngh_lock = std::unique_lock(_locks[ngh]); - for (auto j : _final_graph[ngh]) + for (auto j : _graph_store->get_neighbours((location_t)ngh)) if (j != loc && old_delete_set.find(j) == old_delete_set.end()) expanded_nodes_set.insert(j); } @@ -2479,9 +2303,9 @@ inline void Index::process_delete(const tsl::robin_set adj_list_lock(_locks[loc]); - _final_graph[loc].clear(); + _graph_store->clear_neighbours((location_t)loc); for (auto &ngh : expanded_nodes_set) - _final_graph[loc].push_back(ngh); + _graph_store->add_neighbour((location_t)loc, ngh); } else { @@ -2496,7 +2320,7 @@ inline void Index::process_delete(const tsl::robin_set adj_list_lock(_locks[loc]); - _final_graph[loc] = occlude_list_output; + _graph_store->set_neighbours((location_t)loc, occlude_list_output); } } } @@ -2671,8 +2495,8 @@ template void Index= _max_points && old < _max_points + _num_frozen_pts)) { - new_adj_list.reserve(_final_graph[old].size()); - for (auto ngh_iter : _final_graph[old]) + new_adj_list.reserve(_graph_store->get_neighbours((location_t)old).size()); + for (auto ngh_iter : _graph_store->get_neighbours((location_t)old)) { if (empty_locations.find(ngh_iter) != empty_locations.end()) { @@ -2685,20 +2509,21 @@ template void Indexget_neighbours((location_t)old).swap(new_adj_list); + _graph_store->set_neighbours((location_t)old, new_adj_list); // Move the data and adj list to the correct position if (new_location[old] != old) { assert(new_location[old] < old); - _final_graph[new_location[old]].swap(_final_graph[old]); - + //_graph_store->get_neighbours(new_location[old]).swap(_graph_store->get_neighbours((location_t)old)); + _graph_store->swap_neighbours(new_location[old], (location_t)old); _data_store->copy_vectors(old, new_location[old], 1); } } else { - _final_graph[old].clear(); + _graph_store->clear_neighbours((location_t)old); } } diskann::cerr << "#dangling references after data compaction: " << num_dangling << std::endl; @@ -2717,7 +2542,7 @@ 
template void Indexclear_neighbours((location_t)old); } _empty_slots.clear(); for (auto i = _nd; i < _max_points; i++) @@ -2754,7 +2579,6 @@ template int Index location = _empty_slots.pop_any(); _delete_set->erase(location); } - ++_nd; return location; } @@ -2804,10 +2628,18 @@ void Index::reposition_points(uint32_t old_location_start, uint // integer arithmetic rules. const uint32_t location_delta = new_location_start - old_location_start; + std::vector updated_neighbours_location; for (uint32_t i = 0; i < _max_points + _num_frozen_pts; i++) - for (auto &loc : _final_graph[i]) + { + auto &i_neighbours = _graph_store->get_neighbours((location_t)i); + std::vector i_neighbours_copy(i_neighbours.begin(), i_neighbours.end()); + for (auto &loc : i_neighbours_copy) + { if (loc >= old_location_start && loc < old_location_start + num_locations) loc += location_delta; + } + _graph_store->set_neighbours(i, i_neighbours_copy); + } // The [start, end) interval which will contain obsolete points to be // cleared. @@ -2822,8 +2654,11 @@ void Index::reposition_points(uint32_t old_location_start, uint // to avoid modifying locations that are yet to be copied. for (uint32_t loc_offset = 0; loc_offset < num_locations; loc_offset++) { - assert(_final_graph[new_location_start + loc_offset].empty()); - _final_graph[new_location_start + loc_offset].swap(_final_graph[old_location_start + loc_offset]); + assert(_graph_store->get_neighbours(new_location_start + loc_offset).empty()); + /* _graph_store->get_neighbours(new_location_start + loc_offset) + .swap(_graph_store->get_neighbours(old_location_start + + loc_offset));*/ + _graph_store->swap_neighbours(new_location_start + loc_offset, old_location_start + loc_offset); } // If ranges are overlapping, make sure not to clear the newly copied @@ -2840,8 +2675,11 @@ void Index::reposition_points(uint32_t old_location_start, uint // to avoid modifying locations that are yet to be copied. 
for (uint32_t loc_offset = num_locations; loc_offset > 0; loc_offset--) { - assert(_final_graph[new_location_start + loc_offset - 1u].empty()); - _final_graph[new_location_start + loc_offset - 1u].swap(_final_graph[old_location_start + loc_offset - 1u]); + assert(_graph_store->get_neighbours(new_location_start + loc_offset - 1u).empty()); + /*_graph_store->get_neighbours(new_location_start + loc_offset - 1u) + .swap(_graph_store->get_neighbours(old_location_start + loc_offset - + 1u));*/ + _graph_store->swap_neighbours(new_location_start + loc_offset - 1u, old_location_start + loc_offset - 1u); } // If ranges are overlapping, make sure not to clear the newly copied @@ -2877,7 +2715,7 @@ template void Indexresize((location_t)new_internal_points); - _final_graph.resize(new_internal_points); + _graph_store->resize_graph(new_internal_points); _locks = std::vector(new_internal_points); if (_num_frozen_pts != 0) @@ -3006,17 +2844,18 @@ int Index::insert_point(const T *point, const TagT tag) tlock.lock(); LockGuard guard(_locks[location]); - _final_graph[location].clear(); - _final_graph[location].reserve((size_t)(_indexingRange * defaults::GRAPH_SLACK_FACTOR * 1.05)); + _graph_store->clear_neighbours(location); + std::vector neighbor_links; for (auto link : pruned_list) { if (_conc_consolidate) if (!_location_to_tag.contains(link)) continue; - _final_graph[location].emplace_back(link); + neighbor_links.emplace_back(link); } - assert(_final_graph[location].size() <= _indexingRange); + _graph_store->set_neighbours(location, neighbor_links); + assert(_graph_store->get_neighbours(location).size() <= _indexingRange); if (_conc_consolidate) tlock.unlock(); @@ -3148,7 +2987,7 @@ template void Indexget_total_points() << std::endl; diskann::cout << "Location to tag size: " << _location_to_tag.size() << std::endl; diskann::cout << "Tag to location size: " << _tag_to_location.size() << std::endl; diskann::cout << "Number of empty slots: " << _empty_slots.size() << std::endl; @@ 
-3186,7 +3025,7 @@ template void Indexget_neighbours((location_t)node)) { if (!visited.test(nghbr)) { @@ -3218,7 +3057,7 @@ template void Indexget_aligned_dim()]; std::memset(cur_vec, 0, _data_store->get_aligned_dim() * sizeof(float)); _data_len = (_data_store->get_aligned_dim() + 1) * sizeof(float); - _neighbor_len = (_max_observed_degree + 1) * sizeof(uint32_t); + _neighbor_len = (_graph_store->get_max_observed_degree() + 1) * sizeof(uint32_t); _node_size = _data_len + _neighbor_len; _opt_graph = new char[_node_size * _nd]; DistanceFastL2 *dist_fast = (DistanceFastL2 *)_data_store->get_dist_fn(); @@ -3231,13 +3070,14 @@ template void Indexget_neighbours(i).size(); std::memcpy(cur_node_offset, &k, sizeof(uint32_t)); - std::memcpy(cur_node_offset + sizeof(uint32_t), _final_graph[i].data(), k * sizeof(uint32_t)); - std::vector().swap(_final_graph[i]); + std::memcpy(cur_node_offset + sizeof(uint32_t), _graph_store->get_neighbours(i).data(), k * sizeof(uint32_t)); + // std::vector().swap(_graph_store->get_neighbours(i)); + _graph_store->clear_neighbours(i); } - _final_graph.clear(); - _final_graph.shrink_to_fit(); + _graph_store->clear_graph(); + _graph_store->resize_graph(0); delete[] cur_vec; } @@ -3258,8 +3098,10 @@ void Index::_search_with_optimized_layout(const DataType &query } catch (const std::bad_any_cast &e) { - throw ANNException( - "Error: bad any cast while performing _search_with_optimized_layout() " + std::string(e.what()), -1); + throw ANNException("Error: bad any cast while performing " + "_search_with_optimized_layout() " + + std::string(e.what()), + -1); } catch (const std::exception &e) { diff --git a/src/index_factory.cpp b/src/index_factory.cpp index 88ac44a16..aa2042725 100644 --- a/src/index_factory.cpp +++ b/src/index_factory.cpp @@ -50,13 +50,14 @@ void IndexFactory::check_config() } template -std::unique_ptr> IndexFactory::construct_datastore(DataStoreStrategy strategy, size_t num_points, - size_t dimension, Metric m) +std::unique_ptr> 
IndexFactory::construct_datastore(const DataStoreStrategy strategy, + const size_t num_points, const size_t dimension, + const Metric m) { std::unique_ptr> distance; switch (strategy) { - case MEMORY: + case diskann::DataStoreStrategy::MEMORY: if (m == diskann::Metric::COSINE && std::is_same::value) { distance.reset((Distance *)new AVXNormalizedCosineDistanceFloat()); @@ -74,9 +75,17 @@ std::unique_ptr> IndexFactory::construct_datastore(DataStor return nullptr; } -std::unique_ptr IndexFactory::construct_graphstore(GraphStoreStrategy, size_t size) +std::unique_ptr IndexFactory::construct_graphstore(const GraphStoreStrategy strategy, + const size_t size, + const size_t reserve_graph_degree) { - return std::make_unique(size); + switch (strategy) + { + case GraphStoreStrategy::MEMORY: + return std::make_unique(size, reserve_graph_degree); + default: + throw ANNException("Error : Current GraphStoreStratagy is not supported.", -1); + } } template @@ -84,10 +93,14 @@ std::unique_ptr IndexFactory::create_instance() { size_t num_points = _config->max_points + _config->num_frozen_pts; size_t dim = _config->dimension; - // auto graph_store = construct_graphstore(_config->graph_strategy, num_points); - auto data_store = - IndexFactory::construct_datastore(_config->data_strategy, num_points, dim, _config->metric); - return std::make_unique>(*_config, std::move(data_store)); + size_t max_reserve_degree = + (size_t)(defaults::GRAPH_SLACK_FACTOR * 1.05 * + (_config->index_write_params == nullptr ? 
0 : _config->index_write_params->max_degree)); + auto data_store = construct_datastore(_config->data_strategy, num_points, dim, _config->metric); + auto graph_store = + construct_graphstore(_config->graph_strategy, num_points + _config->num_frozen_pts, max_reserve_degree); + return std::make_unique>(*_config, std::move(data_store), + std::move(graph_store)); } std::unique_ptr IndexFactory::create_instance(const std::string &data_type, const std::string &tag_type, From 9d5fde183b942b1f4c9178a44c7e7c5bb99a9618 Mon Sep 17 00:00:00 2001 From: Harsha Vardhan Simhadri Date: Tue, 22 Aug 2023 06:23:46 -0700 Subject: [PATCH 10/23] Undo mistake, let frontier read in PQ flash index be asynchronous (#434) * Undo mistake, let frontier read in PQ flash index be asynchronous * address changes requested --- src/pq_flash_index.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pq_flash_index.cpp b/src/pq_flash_index.cpp index b0121493e..e76debcdf 100644 --- a/src/pq_flash_index.cpp +++ b/src/pq_flash_index.cpp @@ -1392,7 +1392,7 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t io_timer.reset(); #ifdef USE_BING_INFRA reader->read(frontier_read_reqs, ctx, - false); // synhronous reader for Bing. + true); // asynhronous reader for Bing. #else reader->read(frontier_read_reqs, ctx); // synchronous IO linux #endif @@ -1561,7 +1561,7 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t io_timer.reset(); #ifdef USE_BING_INFRA - reader->read(vec_read_reqs, ctx, false); // sync reader windows. + reader->read(vec_read_reqs, ctx, true); // async reader windows. 
#else reader->read(vec_read_reqs, ctx); // synchronous IO linux #endif From fee17e6a34646c164444101983fd4df76b37fe6e Mon Sep 17 00:00:00 2001 From: Harsha Vardhan Simhadri Date: Tue, 22 Aug 2023 15:02:39 -0700 Subject: [PATCH 11/23] =?UTF-8?q?Reduce=20CI=20tests=20for=20multi-sector?= =?UTF-8?q?=20disk=20layout=20from=2010K=20to=205K=20points=20so=E2=80=A6?= =?UTF-8?q?=20(#439)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Reduce CI tests for multi-sector disk layout from 10K to 5K points so they run faster * turn off 1024D --- .../generate-high-dim-random/action.yml | 18 +++++++-------- .github/workflows/multi-sector-disk-pq.yml | 22 +++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/actions/generate-high-dim-random/action.yml b/.github/actions/generate-high-dim-random/action.yml index 0c7eeb8fd..65e9b7e38 100644 --- a/.github/actions/generate-high-dim-random/action.yml +++ b/.github/actions/generate-high-dim-random/action.yml @@ -8,21 +8,21 @@ runs: mkdir data echo "Generating random 1020,1024,1536D float and 4096 int8 vectors for index" - dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1020D_10K_norm1.0.bin -D 1020 -N 10000 --norm 1.0 - dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1024D_10K_norm1.0.bin -D 1024 -N 10000 --norm 1.0 - dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1536D_10K_norm1.0.bin -D 1536 -N 10000 --norm 1.0 - dist/bin/rand_data_gen --data_type int8 --output_file data/rand_int8_4096D_10K_norm1.0.bin -D 4096 -N 10000 --norm 1.0 + dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1020D_5K_norm1.0.bin -D 1020 -N 5000 --norm 1.0 + #dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1024D_5K_norm1.0.bin -D 1024 -N 5000 --norm 1.0 + dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1536D_5K_norm1.0.bin -D 1536 -N 5000 --norm 1.0 
+ dist/bin/rand_data_gen --data_type int8 --output_file data/rand_int8_4096D_5K_norm1.0.bin -D 4096 -N 5000 --norm 1.0 echo "Generating random 1020,1024,1536D float and 4096D int8 avectors for query" dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1020D_1K_norm1.0.bin -D 1020 -N 1000 --norm 1.0 - dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1024D_1K_norm1.0.bin -D 1024 -N 1000 --norm 1.0 + #dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1024D_1K_norm1.0.bin -D 1024 -N 1000 --norm 1.0 dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1536D_1K_norm1.0.bin -D 1536 -N 1000 --norm 1.0 dist/bin/rand_data_gen --data_type int8 --output_file data/rand_int8_4096D_1K_norm1.0.bin -D 4096 -N 1000 --norm 1.0 echo "Computing ground truth for 1020,1024,1536D float and 4096D int8 avectors for query" - dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/rand_float_1020D_10K_norm1.0.bin --query_file data/rand_float_1020D_1K_norm1.0.bin --gt_file data/l2_rand_float_1020D_10K_norm1.0_1020D_1K_norm1.0_gt100 --K 100 - dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/rand_float_1024D_10K_norm1.0.bin --query_file data/rand_float_1024D_1K_norm1.0.bin --gt_file data/l2_rand_float_1024D_10K_norm1.0_1024D_1K_norm1.0_gt100 --K 100 - dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/rand_float_1536D_10K_norm1.0.bin --query_file data/rand_float_1536D_1K_norm1.0.bin --gt_file data/l2_rand_float_1536D_10K_norm1.0_1536D_1K_norm1.0_gt100 --K 100 - dist/bin/compute_groundtruth --data_type int8 --dist_fn l2 --base_file data/rand_int8_4096D_10K_norm1.0.bin --query_file data/rand_int8_4096D_1K_norm1.0.bin --gt_file data/l2_rand_int8_4096D_10K_norm1.0_4096D_1K_norm1.0_gt100 --K 100 + dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/rand_float_1020D_5K_norm1.0.bin --query_file 
data/rand_float_1020D_1K_norm1.0.bin --gt_file data/l2_rand_float_1020D_5K_norm1.0_1020D_1K_norm1.0_gt100 --K 100 + #dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/rand_float_1024D_5K_norm1.0.bin --query_file data/rand_float_1024D_1K_norm1.0.bin --gt_file data/l2_rand_float_1024D_5K_norm1.0_1024D_1K_norm1.0_gt100 --K 100 + dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/rand_float_1536D_5K_norm1.0.bin --query_file data/rand_float_1536D_1K_norm1.0.bin --gt_file data/l2_rand_float_1536D_5K_norm1.0_1536D_1K_norm1.0_gt100 --K 100 + dist/bin/compute_groundtruth --data_type int8 --dist_fn l2 --base_file data/rand_int8_4096D_5K_norm1.0.bin --query_file data/rand_int8_4096D_1K_norm1.0.bin --gt_file data/l2_rand_int8_4096D_5K_norm1.0_4096D_1K_norm1.0_gt100 --K 100 shell: bash diff --git a/.github/workflows/multi-sector-disk-pq.yml b/.github/workflows/multi-sector-disk-pq.yml index 1f010b124..8ea55c88d 100644 --- a/.github/workflows/multi-sector-disk-pq.yml +++ b/.github/workflows/multi-sector-disk-pq.yml @@ -32,24 +32,24 @@ jobs: - name: build and search disk index (1020D, one shot graph build, L2, no diskPQ) (float) if: success() || failure() run: | - dist/bin/build_disk_index --data_type float --dist_fn l2 --data_path data/rand_float_1020D_10K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_1020D_10K_norm1.0_diskfull_oneshot -R 32 -L 500 -B 0.003 -M 1 - dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_1020D_10K_norm1.0_diskfull_oneshot --result_path /tmp/res --query_file data/rand_float_1020D_1K_norm1.0.bin --gt_file data/l2_rand_float_1020D_10K_norm1.0_1020D_1K_norm1.0_gt100 --recall_at 5 -L 250 -W 2 --num_nodes_to_cache 100 -T 16 - - name: build and search disk index (1024D, one shot graph build, L2, no diskPQ) (float) - if: success() || failure() - run: | - dist/bin/build_disk_index --data_type float --dist_fn l2 
--data_path data/rand_float_1024D_10K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_1024D_10K_norm1.0_diskfull_oneshot -R 32 -L 500 -B 0.003 -M 1 - dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_1024D_10K_norm1.0_diskfull_oneshot --result_path /tmp/res --query_file data/rand_float_1024D_1K_norm1.0.bin --gt_file data/l2_rand_float_1024D_10K_norm1.0_1024D_1K_norm1.0_gt100 --recall_at 5 -L 250 -W 2 --num_nodes_to_cache 100 -T 16 + dist/bin/build_disk_index --data_type float --dist_fn l2 --data_path data/rand_float_1020D_5K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_1020D_5K_norm1.0_diskfull_oneshot -R 32 -L 500 -B 0.003 -M 1 + dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_1020D_5K_norm1.0_diskfull_oneshot --result_path /tmp/res --query_file data/rand_float_1020D_1K_norm1.0.bin --gt_file data/l2_rand_float_1020D_5K_norm1.0_1020D_1K_norm1.0_gt100 --recall_at 5 -L 250 -W 2 --num_nodes_to_cache 100 -T 16 + #- name: build and search disk index (1024D, one shot graph build, L2, no diskPQ) (float) + # if: success() || failure() + # run: | + # dist/bin/build_disk_index --data_type float --dist_fn l2 --data_path data/rand_float_1024D_5K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_1024D_5K_norm1.0_diskfull_oneshot -R 32 -L 500 -B 0.003 -M 1 + # dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_1024D_5K_norm1.0_diskfull_oneshot --result_path /tmp/res --query_file data/rand_float_1024D_1K_norm1.0.bin --gt_file data/l2_rand_float_1024D_5K_norm1.0_1024D_1K_norm1.0_gt100 --recall_at 5 -L 250 -W 2 --num_nodes_to_cache 100 -T 16 - name: build and search disk index (1536D, one shot graph build, L2, no diskPQ) (float) if: success() || failure() run: | - dist/bin/build_disk_index 
--data_type float --dist_fn l2 --data_path data/rand_float_1536D_10K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_1536D_10K_norm1.0_diskfull_oneshot -R 32 -L 500 -B 0.003 -M 1 - dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_1536D_10K_norm1.0_diskfull_oneshot --result_path /tmp/res --query_file data/rand_float_1536D_1K_norm1.0.bin --gt_file data/l2_rand_float_1536D_10K_norm1.0_1536D_1K_norm1.0_gt100 --recall_at 5 -L 250 -W 2 --num_nodes_to_cache 100 -T 16 + dist/bin/build_disk_index --data_type float --dist_fn l2 --data_path data/rand_float_1536D_5K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_1536D_5K_norm1.0_diskfull_oneshot -R 32 -L 500 -B 0.003 -M 1 + dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_1536D_5K_norm1.0_diskfull_oneshot --result_path /tmp/res --query_file data/rand_float_1536D_1K_norm1.0.bin --gt_file data/l2_rand_float_1536D_5K_norm1.0_1536D_1K_norm1.0_gt100 --recall_at 5 -L 250 -W 2 --num_nodes_to_cache 100 -T 16 - name: build and search disk index (4096D, one shot graph build, L2, no diskPQ) (int8) if: success() || failure() run: | - dist/bin/build_disk_index --data_type int8 --dist_fn l2 --data_path data/rand_int8_4096D_10K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_int8_4096D_10K_norm1.0_diskfull_oneshot -R 32 -L 500 -B 0.003 -M 1 - dist/bin/search_disk_index --data_type int8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_int8_4096D_10K_norm1.0_diskfull_oneshot --result_path /tmp/res --query_file data/rand_int8_4096D_1K_norm1.0.bin --gt_file data/l2_rand_int8_4096D_10K_norm1.0_4096D_1K_norm1.0_gt100 --recall_at 5 -L 250 -W 2 --num_nodes_to_cache 100 -T 16 + dist/bin/build_disk_index --data_type int8 --dist_fn l2 --data_path data/rand_int8_4096D_5K_norm1.0.bin --index_path_prefix 
data/disk_index_l2_rand_int8_4096D_5K_norm1.0_diskfull_oneshot -R 32 -L 500 -B 0.003 -M 1 + dist/bin/search_disk_index --data_type int8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_int8_4096D_5K_norm1.0_diskfull_oneshot --result_path /tmp/res --query_file data/rand_int8_4096D_1K_norm1.0.bin --gt_file data/l2_rand_int8_4096D_5K_norm1.0_4096D_1K_norm1.0_gt100 --recall_at 5 -L 250 -W 2 --num_nodes_to_cache 100 -T 16 - name: upload data and bin uses: actions/upload-artifact@v3 From 9622d8f6d5f1a858ca8b119c8f4f5b4a77070c04 Mon Sep 17 00:00:00 2001 From: Yash Patel <47032340+yashpatel007@users.noreply.github.com> Date: Wed, 23 Aug 2023 16:23:42 -0400 Subject: [PATCH 12/23] hot fix definate mem_leaks (#440) --- include/abstract_data_store.h | 2 +- include/abstract_graph_store.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/abstract_data_store.h b/include/abstract_data_store.h index 976174378..d858c8eef 100644 --- a/include/abstract_data_store.h +++ b/include/abstract_data_store.h @@ -18,7 +18,7 @@ template class AbstractDataStore public: AbstractDataStore(const location_t capacity, const size_t dim); - // virtual ~AbstractDataStore() = default; + virtual ~AbstractDataStore() = default; // Return number of points returned virtual location_t load(const std::string &filename) = 0; diff --git a/include/abstract_graph_store.h b/include/abstract_graph_store.h index c0deade17..4d6906ca4 100644 --- a/include/abstract_graph_store.h +++ b/include/abstract_graph_store.h @@ -18,6 +18,8 @@ class AbstractGraphStore { } + virtual ~AbstractGraphStore() = default; + // returns tuple of virtual std::tuple load(const std::string &index_path_prefix, const size_t num_points) = 0; From b05c2dcef0a8a3b4bb92c617173efa9fb206e6db Mon Sep 17 00:00:00 2001 From: Harsha Vardhan Simhadri Date: Thu, 24 Aug 2023 09:59:15 -0700 Subject: [PATCH 13/23] add num_Threads to indexwriteparams in sharded build (#438) --- src/disk_utils.cpp | 18 
+++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/disk_utils.cpp b/src/disk_utils.cpp index 6544df33a..7ea29c49a 100644 --- a/src/disk_utils.cpp +++ b/src/disk_utils.cpp @@ -692,18 +692,22 @@ int build_merged_vamana_index(std::string base_file, diskann::Metric compareMetr std::string shard_index_file = merged_index_prefix + "_subshard-" + std::to_string(p) + "_mem.index"; - diskann::IndexWriteParameters paras = - diskann::IndexWriteParametersBuilder(L, (2 * R / 3)).with_filter_list_size(Lf).build(); + diskann::IndexWriteParameters low_degree_params = diskann::IndexWriteParametersBuilder(L, 2 * R / 3) + .with_filter_list_size(Lf) + .with_saturate_graph(false) + .with_num_threads(num_threads) + .build(); uint64_t shard_base_dim, shard_base_pts; get_bin_metadata(shard_base_file, shard_base_pts, shard_base_dim); - diskann::Index _index( - compareMetric, shard_base_dim, shard_base_pts, std::make_shared(paras), - nullptr, paras.num_frozen_points, false, false, false, build_pq_bytes > 0, build_pq_bytes, use_opq); + diskann::Index _index(compareMetric, shard_base_dim, shard_base_pts, + std::make_shared(low_degree_params), nullptr, + low_degree_params.num_frozen_points, false, false, false, build_pq_bytes > 0, + build_pq_bytes, use_opq); if (!use_filters) { - _index.build(shard_base_file.c_str(), shard_base_pts, paras); + _index.build(shard_base_file.c_str(), shard_base_pts, low_degree_params); } else { @@ -713,7 +717,7 @@ int build_merged_vamana_index(std::string base_file, diskann::Metric compareMetr LabelT unv_label_as_num = 0; _index.set_universal_label(unv_label_as_num); } - _index.build_filtered_index(shard_base_file.c_str(), shard_labels_file, shard_base_pts, paras); + _index.build_filtered_index(shard_base_file.c_str(), shard_labels_file, shard_base_pts, low_degree_params); } _index.save(shard_index_file.c_str()); // copy universal label file from first shard to the final destination From 98b119a2486e8d3d5df2bb9211c7eb2abd7425dc 
Mon Sep 17 00:00:00 2001 From: Jon McLean <4429525+jonmclean@users.noreply.github.com> Date: Mon, 28 Aug 2023 09:46:02 -0700 Subject: [PATCH 14/23] Added clarity to the universal label (#442) --- include/program_options_utils.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/program_options_utils.hpp b/include/program_options_utils.hpp index 71077b7b2..2be60595b 100644 --- a/include/program_options_utils.hpp +++ b/include/program_options_utils.hpp @@ -73,7 +73,9 @@ const char *LABEL_FILE = "Input label file in txt format for Filtered Index buil const char *UNIVERSAL_LABEL = "Universal label, Use only in conjunction with label file for filtered index build. If a " "graph node has all the labels against it, we can assign a special universal filter to the " - "point instead of comma separated filters for that point"; + "point instead of comma separated filters for that point. The universal label should be assigned to nodes " + "in the labels file instead of listing all labels for a node. DiskANN will not automatically assign a " + "universal label to a node."; const char *FILTERED_LBUILD = "Build complexity for filtered points, higher value results in better graphs"; } // namespace program_options_utils From 8afb38a1e1762cdeecc1aeb5eb1ddb4ed700507c Mon Sep 17 00:00:00 2001 From: Yash Patel <47032340+yashpatel007@users.noreply.github.com> Date: Mon, 28 Aug 2023 13:22:29 -0400 Subject: [PATCH 15/23] Remove IndexWriteParams from build method. 
(#441) * removing write_params from buidl and taking it upfront in Index Ctor * renaming build_params to filter params --- apps/build_memory_index.cpp | 12 +-- apps/test_insert_deletes_consolidate.cpp | 2 +- include/abstract_index.h | 8 +- include/index.h | 20 ++--- include/index_build_params.h | 35 ++++---- python/include/static_memory_index.h | 22 ++--- python/src/builder.cpp | 4 +- src/abstract_index.cpp | 17 +--- src/disk_utils.cpp | 8 +- src/filter_utils.cpp | 2 +- src/index.cpp | 101 +++++++++-------------- 11 files changed, 97 insertions(+), 134 deletions(-) diff --git a/apps/build_memory_index.cpp b/apps/build_memory_index.cpp index 6e9eb6677..544e42dee 100644 --- a/apps/build_memory_index.cpp +++ b/apps/build_memory_index.cpp @@ -127,11 +127,11 @@ int main(int argc, char **argv) .with_num_threads(num_threads) .build(); - auto build_params = diskann::IndexBuildParamsBuilder(index_build_params) - .with_universal_label(universal_label) - .with_label_file(label_file) - .with_save_path_prefix(index_path_prefix) - .build(); + auto filter_params = diskann::IndexFilterParamsBuilder() + .with_universal_label(universal_label) + .with_label_file(label_file) + .with_save_path_prefix(index_path_prefix) + .build(); auto config = diskann::IndexConfigBuilder() .with_metric(metric) .with_dimension(data_dim) @@ -150,7 +150,7 @@ int main(int argc, char **argv) auto index_factory = diskann::IndexFactory(config); auto index = index_factory.create_instance(); - index->build(data_path, data_num, build_params); + index->build(data_path, data_num, filter_params); index->save(index_path_prefix.c_str()); index.reset(); return 0; diff --git a/apps/test_insert_deletes_consolidate.cpp b/apps/test_insert_deletes_consolidate.cpp index 4b7d230ef..bcc2e178d 100644 --- a/apps/test_insert_deletes_consolidate.cpp +++ b/apps/test_insert_deletes_consolidate.cpp @@ -216,7 +216,7 @@ void build_incremental_index(const std::string &data_path, diskann::IndexWritePa if (beginning_index_size > 0) { - 
index->build(data, beginning_index_size, params, tags); + index->build(data, beginning_index_size, tags); } else { diff --git a/include/abstract_index.h b/include/abstract_index.h index 1a32bf8da..ff77d904e 100644 --- a/include/abstract_index.h +++ b/include/abstract_index.h @@ -42,11 +42,10 @@ class AbstractIndex virtual ~AbstractIndex() = default; virtual void build(const std::string &data_file, const size_t num_points_to_load, - IndexBuildParams &build_params) = 0; + IndexFilterParams &build_params) = 0; template - void build(const data_type *data, const size_t num_points_to_load, const IndexWriteParameters ¶meters, - const std::vector &tags); + void build(const data_type *data, const size_t num_points_to_load, const std::vector &tags); virtual void save(const char *filename, bool compact_before_save = false) = 0; @@ -98,8 +97,7 @@ class AbstractIndex template int get_vector_by_tag(tag_type &tag, data_type *vec); private: - virtual void _build(const DataType &data, const size_t num_points_to_load, const IndexWriteParameters ¶meters, - TagVector &tags) = 0; + virtual void _build(const DataType &data, const size_t num_points_to_load, TagVector &tags) = 0; virtual std::pair _search(const DataType &query, const size_t K, const uint32_t L, std::any &indices, float *distances = nullptr) = 0; virtual std::pair _search_with_filters(const DataType &query, const std::string &filter_label, diff --git a/include/index.h b/include/index.h index b22dcdce4..cb27aeac0 100644 --- a/include/index.h +++ b/include/index.h @@ -86,23 +86,21 @@ template clas // Batch build from a file. Optionally pass tags vector. DISKANN_DLLEXPORT void build(const char *filename, const size_t num_points_to_load, - const IndexWriteParameters ¶meters, const std::vector &tags = std::vector()); // Batch build from a file. Optionally pass tags file. 
- DISKANN_DLLEXPORT void build(const char *filename, const size_t num_points_to_load, - const IndexWriteParameters ¶meters, const char *tag_filename); + DISKANN_DLLEXPORT void build(const char *filename, const size_t num_points_to_load, const char *tag_filename); // Batch build from a data array, which must pad vectors to aligned_dim - DISKANN_DLLEXPORT void build(const T *data, const size_t num_points_to_load, const IndexWriteParameters ¶meters, - const std::vector &tags); + DISKANN_DLLEXPORT void build(const T *data, const size_t num_points_to_load, const std::vector &tags); + // Based on filter params builds a filtered or unfiltered index DISKANN_DLLEXPORT void build(const std::string &data_file, const size_t num_points_to_load, - IndexBuildParams &build_params); + IndexFilterParams &build_params); // Filtered Support DISKANN_DLLEXPORT void build_filtered_index(const char *filename, const std::string &label_file, - const size_t num_points_to_load, IndexWriteParameters ¶meters, + const size_t num_points_to_load, const std::vector &tags = std::vector()); DISKANN_DLLEXPORT void set_universal_label(const LabelT &label); @@ -194,8 +192,7 @@ template clas protected: // overload of abstract index virtual methods - virtual void _build(const DataType &data, const size_t num_points_to_load, const IndexWriteParameters ¶meters, - TagVector &tags) override; + virtual void _build(const DataType &data, const size_t num_points_to_load, TagVector &tags) override; virtual std::pair _search(const DataType &query, const size_t K, const uint32_t L, std::any &indices, float *distances = nullptr) override; @@ -227,7 +224,7 @@ template clas // Use after _data and _nd have been populated // Acquire exclusive _update_lock before calling - void build_with_data_populated(const IndexWriteParameters ¶meters, const std::vector &tags); + void build_with_data_populated(const std::vector &tags); // generates 1 frozen point that will never be deleted from the graph // This is not visible to the 
user @@ -273,7 +270,7 @@ template clas void inter_insert(uint32_t n, std::vector &pruned_list, InMemQueryScratch *scratch); // Acquire exclusive _update_lock before calling - void link(const IndexWriteParameters ¶meters); + void link(); // Acquire exclusive _tag_lock and _delete_lock before calling int reserve_location(); @@ -380,6 +377,7 @@ template clas uint32_t _indexingRange; uint32_t _indexingMaxC; float _indexingAlpha; + uint32_t _indexingThreads; // Query scratch data structures ConcurrentQueue *> _query_scratch; diff --git a/include/index_build_params.h b/include/index_build_params.h index ff68c5001..a3012e99a 100644 --- a/include/index_build_params.h +++ b/include/index_build_params.h @@ -3,31 +3,30 @@ namespace diskann { -struct IndexBuildParams +struct IndexFilterParams { public: - diskann::IndexWriteParameters index_write_params; std::string save_path_prefix; std::string label_file; std::string universal_label; uint32_t filter_threshold = 0; private: - IndexBuildParams(const IndexWriteParameters &index_write_params, const std::string &save_path_prefix, - const std::string &label_file, const std::string &universal_label, uint32_t filter_threshold) - : index_write_params(index_write_params), save_path_prefix(save_path_prefix), label_file(label_file), - universal_label(universal_label), filter_threshold(filter_threshold) + IndexFilterParams(const std::string &save_path_prefix, const std::string &label_file, + const std::string &universal_label, uint32_t filter_threshold) + : save_path_prefix(save_path_prefix), label_file(label_file), universal_label(universal_label), + filter_threshold(filter_threshold) { } - friend class IndexBuildParamsBuilder; + friend class IndexFilterParamsBuilder; }; -class IndexBuildParamsBuilder +class IndexFilterParamsBuilder { public: - IndexBuildParamsBuilder(const diskann::IndexWriteParameters ¶s) : _index_write_params(paras){}; + IndexFilterParamsBuilder() = default; - IndexBuildParamsBuilder &with_save_path_prefix(const 
std::string &save_path_prefix) + IndexFilterParamsBuilder &with_save_path_prefix(const std::string &save_path_prefix) { if (save_path_prefix.empty() || save_path_prefix == "") throw ANNException("Error: save_path_prefix can't be empty", -1); @@ -35,35 +34,33 @@ class IndexBuildParamsBuilder return *this; } - IndexBuildParamsBuilder &with_label_file(const std::string &label_file) + IndexFilterParamsBuilder &with_label_file(const std::string &label_file) { this->_label_file = label_file; return *this; } - IndexBuildParamsBuilder &with_universal_label(const std::string &univeral_label) + IndexFilterParamsBuilder &with_universal_label(const std::string &univeral_label) { this->_universal_label = univeral_label; return *this; } - IndexBuildParamsBuilder &with_filter_threshold(const std::uint32_t &filter_threshold) + IndexFilterParamsBuilder &with_filter_threshold(const std::uint32_t &filter_threshold) { this->_filter_threshold = filter_threshold; return *this; } - IndexBuildParams build() + IndexFilterParams build() { - return IndexBuildParams(_index_write_params, _save_path_prefix, _label_file, _universal_label, - _filter_threshold); + return IndexFilterParams(_save_path_prefix, _label_file, _universal_label, _filter_threshold); } - IndexBuildParamsBuilder(const IndexBuildParamsBuilder &) = delete; - IndexBuildParamsBuilder &operator=(const IndexBuildParamsBuilder &) = delete; + IndexFilterParamsBuilder(const IndexFilterParamsBuilder &) = delete; + IndexFilterParamsBuilder &operator=(const IndexFilterParamsBuilder &) = delete; private: - diskann::IndexWriteParameters _index_write_params; std::string _save_path_prefix; std::string _label_file; std::string _universal_label; diff --git a/python/include/static_memory_index.h b/python/include/static_memory_index.h index 33f3187ae..6a222bedb 100644 --- a/python/include/static_memory_index.h +++ b/python/include/static_memory_index.h @@ -14,21 +14,23 @@ namespace py = pybind11; -namespace diskannpy { +namespace diskannpy +{ 
-template -class StaticMemoryIndex +template class StaticMemoryIndex { public: - StaticMemoryIndex(diskann::Metric m, const std::string &index_prefix, size_t num_points, - size_t dimensions, uint32_t num_threads, uint32_t initial_search_complexity); + StaticMemoryIndex(diskann::Metric m, const std::string &index_prefix, size_t num_points, size_t dimensions, + uint32_t num_threads, uint32_t initial_search_complexity); + + NeighborsAndDistances search(py::array_t &query, + uint64_t knn, uint64_t complexity); - NeighborsAndDistances search(py::array_t &query, uint64_t knn, - uint64_t complexity); + NeighborsAndDistances batch_search( + py::array_t &queries, uint64_t num_queries, uint64_t knn, + uint64_t complexity, uint32_t num_threads); - NeighborsAndDistances batch_search(py::array_t &queries, - uint64_t num_queries, uint64_t knn, uint64_t complexity, uint32_t num_threads); private: diskann::Index _index; }; -} \ No newline at end of file +} // namespace diskannpy \ No newline at end of file diff --git a/python/src/builder.cpp b/python/src/builder.cpp index 2e593e72b..3576cab6d 100644 --- a/python/src/builder.cpp +++ b/python/src/builder.cpp @@ -65,11 +65,11 @@ void build_memory_index(const diskann::Metric metric, const std::string &vector_ size_t tag_dims = 1; diskann::load_bin(tags_file, tags_data, data_num, tag_dims); std::vector tags(tags_data, tags_data + data_num); - index.build(vector_bin_path.c_str(), data_num, index_build_params, tags); + index.build(vector_bin_path.c_str(), data_num, tags); } else { - index.build(vector_bin_path.c_str(), data_num, index_build_params); + index.build(vector_bin_path.c_str(), data_num); } index.save(index_output_path.c_str()); diff --git a/src/abstract_index.cpp b/src/abstract_index.cpp index 518f8b7dd..ee55b0753 100644 --- a/src/abstract_index.cpp +++ b/src/abstract_index.cpp @@ -6,12 +6,11 @@ namespace diskann { template -void AbstractIndex::build(const data_type *data, const size_t num_points_to_load, - const 
IndexWriteParameters ¶meters, const std::vector &tags) +void AbstractIndex::build(const data_type *data, const size_t num_points_to_load, const std::vector &tags) { auto any_data = std::any(data); auto any_tags_vec = TagVector(tags); - this->_build(any_data, num_points_to_load, parameters, any_tags_vec); + this->_build(any_data, num_points_to_load, any_tags_vec); } template @@ -92,50 +91,38 @@ template int AbstractIndex::get_vector_b // exports template DISKANN_DLLEXPORT void AbstractIndex::build(const float *data, const size_t num_points_to_load, - const IndexWriteParameters ¶meters, const std::vector &tags); template DISKANN_DLLEXPORT void AbstractIndex::build(const int8_t *data, const size_t num_points_to_load, - const IndexWriteParameters ¶meters, const std::vector &tags); template DISKANN_DLLEXPORT void AbstractIndex::build(const uint8_t *data, const size_t num_points_to_load, - const IndexWriteParameters ¶meters, const std::vector &tags); template DISKANN_DLLEXPORT void AbstractIndex::build(const float *data, const size_t num_points_to_load, - const IndexWriteParameters ¶meters, const std::vector &tags); template DISKANN_DLLEXPORT void AbstractIndex::build(const int8_t *data, const size_t num_points_to_load, - const IndexWriteParameters ¶meters, const std::vector &tags); template DISKANN_DLLEXPORT void AbstractIndex::build(const uint8_t *data, const size_t num_points_to_load, - const IndexWriteParameters ¶meters, const std::vector &tags); template DISKANN_DLLEXPORT void AbstractIndex::build(const float *data, const size_t num_points_to_load, - const IndexWriteParameters ¶meters, const std::vector &tags); template DISKANN_DLLEXPORT void AbstractIndex::build(const int8_t *data, const size_t num_points_to_load, - const IndexWriteParameters ¶meters, const std::vector &tags); template DISKANN_DLLEXPORT void AbstractIndex::build(const uint8_t *data, const size_t num_points_to_load, - const IndexWriteParameters ¶meters, const std::vector &tags); template 
DISKANN_DLLEXPORT void AbstractIndex::build(const float *data, const size_t num_points_to_load, - const IndexWriteParameters ¶meters, const std::vector &tags); template DISKANN_DLLEXPORT void AbstractIndex::build(const int8_t *data, const size_t num_points_to_load, - const IndexWriteParameters ¶meters, const std::vector &tags); template DISKANN_DLLEXPORT void AbstractIndex::build(const uint8_t *data, const size_t num_points_to_load, - const IndexWriteParameters ¶meters, const std::vector &tags); template DISKANN_DLLEXPORT std::pair AbstractIndex::search( diff --git a/src/disk_utils.cpp b/src/disk_utils.cpp index 7ea29c49a..a67059c8d 100644 --- a/src/disk_utils.cpp +++ b/src/disk_utils.cpp @@ -639,7 +639,7 @@ int build_merged_vamana_index(std::string base_file, diskann::Metric compareMetr compareMetric, base_dim, base_num, std::make_shared(paras), nullptr, paras.num_frozen_points, false, false, false, build_pq_bytes > 0, build_pq_bytes, use_opq); if (!use_filters) - _index.build(base_file.c_str(), base_num, paras); + _index.build(base_file.c_str(), base_num); else { if (universal_label != "") @@ -647,7 +647,7 @@ int build_merged_vamana_index(std::string base_file, diskann::Metric compareMetr LabelT unv_label_as_num = 0; _index.set_universal_label(unv_label_as_num); } - _index.build_filtered_index(base_file.c_str(), label_file, base_num, paras); + _index.build_filtered_index(base_file.c_str(), label_file, base_num); } _index.save(mem_index_path.c_str()); @@ -707,7 +707,7 @@ int build_merged_vamana_index(std::string base_file, diskann::Metric compareMetr build_pq_bytes, use_opq); if (!use_filters) { - _index.build(shard_base_file.c_str(), shard_base_pts, low_degree_params); + _index.build(shard_base_file.c_str(), shard_base_pts); } else { @@ -717,7 +717,7 @@ int build_merged_vamana_index(std::string base_file, diskann::Metric compareMetr LabelT unv_label_as_num = 0; _index.set_universal_label(unv_label_as_num); } - _index.build_filtered_index(shard_base_file.c_str(), 
shard_labels_file, shard_base_pts, low_degree_params); + _index.build_filtered_index(shard_base_file.c_str(), shard_labels_file, shard_base_pts); } _index.save(shard_index_file.c_str()); // copy universal label file from first shard to the final destination diff --git a/src/filter_utils.cpp b/src/filter_utils.cpp index f077a14a3..0cdb9bde7 100644 --- a/src/filter_utils.cpp +++ b/src/filter_utils.cpp @@ -51,7 +51,7 @@ void generate_label_indices(path input_data_path, path final_index_path_prefix, 0, false, false); auto index_build_timer = std::chrono::high_resolution_clock::now(); - index.build(curr_label_input_data_path.c_str(), number_of_label_points, label_index_build_parameters); + index.build(curr_label_input_data_path.c_str(), number_of_label_points); std::chrono::duration current_indexing_time = std::chrono::high_resolution_clock::now() - index_build_timer; diff --git a/src/index.cpp b/src/index.cpp index 799b4bb9c..b4ebe1dda 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -93,18 +93,21 @@ Index::Index(const IndexConfig &index_config, std::unique_ptrenable_delete(); // enable delete by default for dynamic index - // if write params are not passed, it is inffered that ctor is called by - // search - if (index_config.index_write_params != nullptr && index_config.index_search_params != nullptr) - { - _indexingQueueSize = index_config.index_write_params->search_list_size; - _indexingRange = index_config.index_write_params->max_degree; - _indexingMaxC = index_config.index_write_params->max_occlusion_size; - _indexingAlpha = index_config.index_write_params->alpha; - _filterIndexingQueueSize = index_config.index_write_params->filter_list_size; + } - uint32_t num_threads_indx = index_config.index_write_params->num_threads; - uint32_t num_scratch_spaces = index_config.index_search_params->num_search_threads + num_threads_indx; + if (index_config.index_write_params != nullptr) + { + _indexingQueueSize = index_config.index_write_params->search_list_size; + 
_indexingRange = index_config.index_write_params->max_degree; + _indexingMaxC = index_config.index_write_params->max_occlusion_size; + _indexingAlpha = index_config.index_write_params->alpha; + _filterIndexingQueueSize = index_config.index_write_params->filter_list_size; + _indexingThreads = index_config.index_write_params->num_threads; + _saturate_graph = index_config.index_write_params->saturate_graph; + + if (index_config.index_search_params != nullptr) + { + uint32_t num_scratch_spaces = index_config.index_search_params->num_search_threads + _indexingThreads; initialize_query_scratch(num_scratch_spaces, index_config.index_search_params->initial_search_list_size, _indexingQueueSize, _indexingRange, _indexingMaxC, _data_store->get_dims()); @@ -1227,21 +1230,12 @@ void Index::inter_insert(uint32_t n, std::vector &pru inter_insert(n, pruned_list, _indexingRange, scratch); } -template -void Index::link(const IndexWriteParameters &parameters) +template void Index::link() { - uint32_t num_threads = parameters.num_threads; + uint32_t num_threads = _indexingThreads; if (num_threads != 0) omp_set_num_threads(num_threads); - _saturate_graph = parameters.saturate_graph; - - _indexingQueueSize = parameters.search_list_size; - _filterIndexingQueueSize = parameters.filter_list_size; - _indexingRange = parameters.max_degree; - _indexingMaxC = parameters.max_occlusion_size; - _indexingAlpha = parameters.alpha; - /* visit_order is a vector that is initialized to the entire graph */ std::vector visit_order; std::vector pool, tmp; @@ -1473,8 +1467,7 @@ void Index::set_start_points_at_random(T radius, uint32_t rando } template -void Index::build_with_data_populated(const IndexWriteParameters &parameters, - const std::vector &tags) +void Index::build_with_data_populated(const std::vector &tags) { diskann::cout << "Starting index build with " << _nd << " points... 
" << std::endl; @@ -1498,10 +1491,10 @@ void Index::build_with_data_populated(const IndexWriteParameter } } - uint32_t index_R = parameters.max_degree; - uint32_t num_threads_index = parameters.num_threads; - uint32_t index_L = parameters.search_list_size; - uint32_t maxc = parameters.max_occlusion_size; + uint32_t index_R = _indexingRange; + uint32_t num_threads_index = _indexingThreads; + uint32_t index_L = _indexingQueueSize; + uint32_t maxc = _indexingMaxC; if (_query_scratch.size() == 0) { @@ -1510,7 +1503,7 @@ void Index::build_with_data_populated(const IndexWriteParameter } generate_frozen_point(); - link(parameters); + link(); size_t max = 0, min = SIZE_MAX, total = 0, cnt = 0; for (size_t i = 0; i < _nd; i++) @@ -1528,13 +1521,11 @@ void Index::build_with_data_populated(const IndexWriteParameter _has_built = true; } template -void Index::_build(const DataType &data, const size_t num_points_to_load, - const IndexWriteParameters ¶meters, TagVector &tags) +void Index::_build(const DataType &data, const size_t num_points_to_load, TagVector &tags) { try { - this->build(std::any_cast(data), num_points_to_load, parameters, - tags.get>()); + this->build(std::any_cast(data), num_points_to_load, tags.get>()); } catch (const std::bad_any_cast &e) { @@ -1546,8 +1537,7 @@ void Index::_build(const DataType &data, const size_t num_point } } template -void Index::build(const T *data, const size_t num_points_to_load, - const IndexWriteParameters ¶meters, const std::vector &tags) +void Index::build(const T *data, const size_t num_points_to_load, const std::vector &tags) { if (num_points_to_load == 0) { @@ -1568,12 +1558,11 @@ void Index::build(const T *data, const size_t num_points_to_loa _data_store->populate_data(data, (location_t)num_points_to_load); } - build_with_data_populated(parameters, tags); + build_with_data_populated(tags); } template -void Index::build(const char *filename, const size_t num_points_to_load, - const IndexWriteParameters ¶meters, const std::vector 
&tags) +void Index::build(const char *filename, const size_t num_points_to_load, const std::vector &tags) { // idealy this should call build_filtered_index based on params passed @@ -1662,12 +1651,11 @@ void Index::build(const char *filename, const size_t num_points std::unique_lock tl(_tag_lock); _nd = num_points_to_load; } - build_with_data_populated(parameters, tags); + build_with_data_populated(tags); } template -void Index::build(const char *filename, const size_t num_points_to_load, - const IndexWriteParameters &parameters, const char *tag_filename) +void Index::build(const char *filename, const size_t num_points_to_load, const char *tag_filename) { std::vector tags; @@ -1706,43 +1694,37 @@ void Index::build(const char *filename, const size_t num_points } } } - build(filename, num_points_to_load, parameters, tags); + build(filename, num_points_to_load, tags); } template void Index::build(const std::string &data_file, const size_t num_points_to_load, - IndexBuildParams &build_params) + IndexFilterParams &filter_params) { - std::string labels_file_to_use = build_params.save_path_prefix + "_label_formatted.txt"; - std::string mem_labels_int_map_file = build_params.save_path_prefix + "_labels_map.txt"; + std::string labels_file_to_use = filter_params.save_path_prefix + "_label_formatted.txt"; + std::string mem_labels_int_map_file = filter_params.save_path_prefix + "_labels_map.txt"; size_t points_to_load = num_points_to_load == 0 ? 
_max_points : num_points_to_load; auto s = std::chrono::high_resolution_clock::now(); - if (build_params.label_file == "") + if (filter_params.label_file == "") { - this->build(data_file.c_str(), points_to_load, build_params.index_write_params); + this->build(data_file.c_str(), points_to_load); } else { // TODO: this should ideally happen in save() - convert_labels_string_to_int(build_params.label_file, labels_file_to_use, mem_labels_int_map_file, - build_params.universal_label); + convert_labels_string_to_int(filter_params.label_file, labels_file_to_use, mem_labels_int_map_file, + filter_params.universal_label); if (filter_params.universal_label != "") { LabelT unv_label_as_num = 0; this->set_universal_label(unv_label_as_num); } - this->build_filtered_index(data_file.c_str(), labels_file_to_use, points_to_load, - build_params.index_write_params); + this->build_filtered_index(data_file.c_str(), labels_file_to_use, points_to_load); } std::chrono::duration diff = std::chrono::high_resolution_clock::now() - s; std::cout << "Indexing time: " << diff.count() << "\n"; - // cleanup - if (build_params.label_file != "") - { - // clean_up_artifacts({labels_file_to_use, mem_labels_int_map_file}, {}); - } } template @@ -1838,8 +1820,7 @@ void Index::set_universal_label(const LabelT &label) template void Index::build_filtered_index(const char *filename, const std::string &label_file, - const size_t num_points_to_load, IndexWriteParameters &parameters, - const std::vector &tags) + const size_t num_points_to_load, const std::vector &tags) { _labels_file = label_file; // original label file _filtered_index = true; @@ -1903,7 +1884,7 @@ void Index::build_filtered_index(const char *filename, const st _medoid_counts[best_medoid]++; } - this->build(filename, num_points_to_load, parameters, tags); + this->build(filename, num_points_to_load, tags); } template From 353e538f458d4f775565b82ee070cd0a01433839 Mon Sep 17 00:00:00 2001 From: Dax Pryce
Date: Tue, 29 Aug 2023 15:49:30 -0700 Subject: [PATCH 16/23] Type hints and returns actually align this time. (#444) --- python/src/_dynamic_memory_index.py | 6 ++++-- python/src/_static_disk_index.py | 6 ++++-- python/src/_static_memory_index.py | 6 ++++-- python/tests/test_dynamic_memory_index.py | 9 +++++++-- python/tests/test_static_disk_index.py | 9 +++++++-- python/tests/test_static_memory_index.py | 9 +++++++-- 6 files changed, 33 insertions(+), 12 deletions(-) diff --git a/python/src/_dynamic_memory_index.py b/python/src/_dynamic_memory_index.py index 9570b8345..0346a2c76 100644 --- a/python/src/_dynamic_memory_index.py +++ b/python/src/_dynamic_memory_index.py @@ -309,7 +309,8 @@ def search( f"k_neighbors={k_neighbors} asked for, but list_size={complexity} was smaller. Increasing {complexity} to {k_neighbors}" ) complexity = k_neighbors - return self._index.search(query=_query, knn=k_neighbors, complexity=complexity) + neighbors, distances = self._index.search(query=_query, knn=k_neighbors, complexity=complexity) + return QueryResponse(identifiers=neighbors, distances=distances) def batch_search( self, @@ -351,13 +352,14 @@ def batch_search( complexity = k_neighbors num_queries, dim = queries.shape - return self._index.batch_search( + neighbors, distances = self._index.batch_search( queries=_queries, num_queries=num_queries, knn=k_neighbors, complexity=complexity, num_threads=num_threads, ) + return QueryResponseBatch(identifiers=neighbors, distances=distances) def save(self, save_path: str, index_prefix: str = "ann"): """ diff --git a/python/src/_static_disk_index.py b/python/src/_static_disk_index.py index 1ca93c0a4..769099d8f 100644 --- a/python/src/_static_disk_index.py +++ b/python/src/_static_disk_index.py @@ -138,12 +138,13 @@ def search( ) complexity = k_neighbors - return self._index.search( + neighbors, distances = self._index.search( query=_query, knn=k_neighbors, complexity=complexity, beam_width=beam_width, ) + return 
QueryResponse(identifiers=neighbors, distances=distances) def batch_search( self, @@ -187,7 +188,7 @@ def batch_search( complexity = k_neighbors num_queries, dim = _queries.shape - return self._index.batch_search( + neighbors, distances = self._index.batch_search( queries=_queries, num_queries=num_queries, knn=k_neighbors, @@ -195,3 +196,4 @@ def batch_search( beam_width=beam_width, num_threads=num_threads, ) + return QueryResponseBatch(identifiers=neighbors, distances=distances) diff --git a/python/src/_static_memory_index.py b/python/src/_static_memory_index.py index 8b87cd561..b1ffb468d 100644 --- a/python/src/_static_memory_index.py +++ b/python/src/_static_memory_index.py @@ -136,7 +136,8 @@ def search( f"k_neighbors={k_neighbors} asked for, but list_size={complexity} was smaller. Increasing {complexity} to {k_neighbors}" ) complexity = k_neighbors - return self._index.search(query=_query, knn=k_neighbors, complexity=complexity) + neighbors, distances = self._index.search(query=_query, knn=k_neighbors, complexity=complexity) + return QueryResponse(identifiers=neighbors, distances=distances) def batch_search( self, @@ -178,10 +179,11 @@ def batch_search( complexity = k_neighbors num_queries, dim = _queries.shape - return self._index.batch_search( + neighbors, distances = self._index.batch_search( queries=_queries, num_queries=num_queries, knn=k_neighbors, complexity=complexity, num_threads=num_threads, ) + return QueryResponseBatch(identifiers=neighbors, distances=distances) diff --git a/python/tests/test_dynamic_memory_index.py b/python/tests/test_dynamic_memory_index.py index ff9c8981d..48c05443c 100644 --- a/python/tests/test_dynamic_memory_index.py +++ b/python/tests/test_dynamic_memory_index.py @@ -72,12 +72,15 @@ def test_recall_and_batch(self): ) k = 5 - diskann_neighbors, diskann_distances = index.batch_search( + batch_response = index.batch_search( query_vectors, k_neighbors=k, complexity=5, num_threads=16, ) + self.assertIsInstance(batch_response, 
dap.QueryResponseBatch) + + diskann_neighbors, diskann_distances = batch_response if metric == "l2" or metric == "cosine": knn = NearestNeighbors( n_neighbors=100, algorithm="auto", metric=metric @@ -115,7 +118,9 @@ def test_single(self): index.batch_insert(vectors=index_vectors, vector_ids=generated_tags) k = 5 - ids, dists = index.search(query_vectors[0], k_neighbors=k, complexity=5) + response = index.search(query_vectors[0], k_neighbors=k, complexity=5) + self.assertIsInstance(response, dap.QueryResponse) + ids, dists = response self.assertEqual(ids.shape[0], k) self.assertEqual(dists.shape[0], k) diff --git a/python/tests/test_static_disk_index.py b/python/tests/test_static_disk_index.py index 4ba544106..c36c581d2 100644 --- a/python/tests/test_static_disk_index.py +++ b/python/tests/test_static_disk_index.py @@ -62,13 +62,16 @@ def test_recall_and_batch(self): ) k = 5 - diskann_neighbors, diskann_distances = index.batch_search( + batch_response = index.batch_search( query_vectors, k_neighbors=k, complexity=5, beam_width=2, num_threads=16, ) + self.assertIsInstance(batch_response, dap.QueryResponseBatch) + + diskann_neighbors, diskann_distances = batch_response if metric == "l2": knn = NearestNeighbors( n_neighbors=100, algorithm="auto", metric="l2" @@ -93,9 +96,11 @@ def test_single(self): ) k = 5 - ids, dists = index.search( + response = index.search( query_vectors[0], k_neighbors=k, complexity=5, beam_width=2 ) + self.assertIsInstance(response, dap.QueryResponse) + ids, dists = response self.assertEqual(ids.shape[0], k) self.assertEqual(dists.shape[0], k) diff --git a/python/tests/test_static_memory_index.py b/python/tests/test_static_memory_index.py index cb7f0f01d..ce12ed3bf 100644 --- a/python/tests/test_static_memory_index.py +++ b/python/tests/test_static_memory_index.py @@ -50,12 +50,15 @@ def test_recall_and_batch(self): ) k = 5 - diskann_neighbors, diskann_distances = index.batch_search( + batch_response = index.batch_search( query_vectors, 
k_neighbors=k, complexity=5, num_threads=16, ) + self.assertIsInstance(batch_response, dap.QueryResponseBatch) + + diskann_neighbors, diskann_distances = batch_response if metric in ["l2", "cosine"]: knn = NearestNeighbors( n_neighbors=100, algorithm="auto", metric=metric @@ -86,7 +89,9 @@ def test_single(self): ) k = 5 - ids, dists = index.search(query_vectors[0], k_neighbors=k, complexity=5) + response = index.search(query_vectors[0], k_neighbors=k, complexity=5) + self.assertIsInstance(response, dap.QueryResponse) + ids, dists = response self.assertEqual(ids.shape[0], k) self.assertEqual(dists.shape[0], k) From fa6c27970a9f0ae419b560313d793d7907f7ab80 Mon Sep 17 00:00:00 2001 From: rakri <78582691+rakri@users.noreply.github.com> Date: Wed, 30 Aug 2023 15:02:34 +0530 Subject: [PATCH 17/23] working draft PR for cleaning up disk based filter search (#414) * made changes to clean up filter number conversion, and fixed bug with universal filter search * minor typecast fix --------- Co-authored-by: rakri --- include/pq_flash_index.h | 12 ++--- src/index.cpp | 4 ++ src/pq_flash_index.cpp | 113 +++++++++++---------------------------- 3 files changed, 41 insertions(+), 88 deletions(-) diff --git a/include/pq_flash_index.h b/include/pq_flash_index.h index 83668e0ea..c98500815 100644 --- a/include/pq_flash_index.h +++ b/include/pq_flash_index.h @@ -115,11 +115,10 @@ template class PQFlashIndex DISKANN_DLLEXPORT void set_universal_label(const LabelT &label); private: - DISKANN_DLLEXPORT inline bool point_has_label(uint32_t point_id, uint32_t label_id); + DISKANN_DLLEXPORT inline bool point_has_label(uint32_t point_id, LabelT label_id); std::unordered_map load_label_map(const std::string &map_file); DISKANN_DLLEXPORT void parse_label_file(const std::string &map_file, size_t &num_pts_labels); DISKANN_DLLEXPORT void get_label_file_metadata(std::string map_file, uint32_t &num_pts, uint32_t &num_total_labels); - DISKANN_DLLEXPORT inline int32_t get_filter_number(const LabelT 
&filter_label); DISKANN_DLLEXPORT void generate_random_labels(std::vector &labels, const uint32_t num_labels, const uint32_t nthreads); @@ -222,12 +221,11 @@ template class PQFlashIndex // filter support uint32_t *_pts_to_label_offsets = nullptr; - uint32_t *_pts_to_labels = nullptr; - tsl::robin_set _labels; + uint32_t *_pts_to_label_counts = nullptr; + LabelT *_pts_to_labels = nullptr; std::unordered_map> _filter_to_medoid_ids; - bool _use_universal_label; - uint32_t _universal_filter_num; - std::vector _filter_list; + bool _use_universal_label = false; + LabelT _universal_filter_label; tsl::robin_set _dummy_pts; tsl::robin_set _has_dummy_pts; tsl::robin_map _dummy_to_real_map; diff --git a/src/index.cpp b/src/index.cpp index b4ebe1dda..0b10cc9a0 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1754,6 +1754,10 @@ LabelT Index::get_converted_label(const std::string &raw_label) { return _label_map[raw_label]; } + if (_use_universal_label) + { + return _universal_label; + } std::stringstream stream; stream << "Unable to find label in the Label Map"; diskann::cerr << stream.str() << std::endl; diff --git a/src/pq_flash_index.cpp b/src/pq_flash_index.cpp index e76debcdf..e26df08d0 100644 --- a/src/pq_flash_index.cpp +++ b/src/pq_flash_index.cpp @@ -81,7 +81,10 @@ template PQFlashIndex::~PQFlashIndex() { delete[] _pts_to_label_offsets; } - + if (_pts_to_label_counts != nullptr) + { + delete[] _pts_to_label_counts; + } if (_pts_to_labels != nullptr) { delete[] _pts_to_labels; @@ -536,21 +539,6 @@ template void PQFlashIndex::use_medoids } } -template -inline int32_t PQFlashIndex::get_filter_number(const LabelT &filter_label) -{ - int idx = -1; - for (uint32_t i = 0; i < _filter_list.size(); i++) - { - if (_filter_list[i] == filter_label) - { - idx = i; - break; - } - } - return idx; -} - template void PQFlashIndex::generate_random_labels(std::vector &labels, const uint32_t num_labels, const uint32_t nthreads) @@ -559,30 +547,22 @@ void 
PQFlashIndex::generate_random_labels(std::vector &labels labels.clear(); labels.resize(num_labels); - uint64_t num_total_labels = - _pts_to_label_offsets[_num_points - 1] + _pts_to_labels[_pts_to_label_offsets[_num_points - 1]]; + uint64_t num_total_labels = _pts_to_label_offsets[_num_points - 1] + _pts_to_label_counts[_num_points - 1]; std::mt19937 gen(rd()); - std::uniform_int_distribution dis(0, num_total_labels); - - tsl::robin_set skip_locs; - for (uint32_t i = 0; i < _num_points; i++) + if (num_total_labels == 0) { - skip_locs.insert(_pts_to_label_offsets[i]); + std::stringstream stream; + stream << "No labels found in data. Not sampling random labels "; + diskann::cerr << stream.str() << std::endl; + throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); } + std::uniform_int_distribution dis(0, num_total_labels - 1); #pragma omp parallel for schedule(dynamic, 1) num_threads(nthreads) for (int64_t i = 0; i < num_labels; i++) { - bool found_flag = false; - while (!found_flag) - { - uint64_t rnd_loc = dis(gen); - if (skip_locs.find(rnd_loc) == skip_locs.end()) - { - found_flag = true; - labels[i] = _filter_list[_pts_to_labels[rnd_loc]]; - } - } + uint64_t rnd_loc = dis(gen); + labels[i] = (LabelT)_pts_to_labels[rnd_loc]; } } @@ -613,6 +593,10 @@ LabelT PQFlashIndex::get_converted_label(const std::string &filter_la { return _label_map[filter_label]; } + if (_use_universal_label) + { + return _universal_filter_label; + } std::stringstream stream; stream << "Unable to find label in the Label Map"; diskann::cerr << stream.str() << std::endl; @@ -646,14 +630,14 @@ void PQFlashIndex::get_label_file_metadata(std::string map_file, uint } template -inline bool PQFlashIndex::point_has_label(uint32_t point_id, uint32_t label_id) +inline bool PQFlashIndex::point_has_label(uint32_t point_id, LabelT label_id) { uint32_t start_vec = _pts_to_label_offsets[point_id]; - uint32_t num_lbls = _pts_to_labels[start_vec]; + uint32_t num_lbls = 
_pts_to_label_counts[point_id]; bool ret_val = false; for (uint32_t i = 0; i < num_lbls; i++) { - if (_pts_to_labels[start_vec + 1 + i] == label_id) + if (_pts_to_labels[start_vec + i] == label_id) { ret_val = true; break; @@ -679,18 +663,18 @@ void PQFlashIndex::parse_label_file(const std::string &label_file, si get_label_file_metadata(label_file, num_pts_in_label_file, num_total_labels); _pts_to_label_offsets = new uint32_t[num_pts_in_label_file]; - _pts_to_labels = new uint32_t[num_pts_in_label_file + num_total_labels]; - uint32_t counter = 0; + _pts_to_label_counts = new uint32_t[num_pts_in_label_file]; + _pts_to_labels = new LabelT[num_total_labels]; + uint32_t labels_seen_so_far = 0; while (std::getline(infile, line)) { std::istringstream iss(line); std::vector lbls(0); - _pts_to_label_offsets[line_cnt] = counter; - uint32_t &num_lbls_in_cur_pt = _pts_to_labels[counter]; + _pts_to_label_offsets[line_cnt] = labels_seen_so_far; + uint32_t &num_lbls_in_cur_pt = _pts_to_label_counts[line_cnt]; num_lbls_in_cur_pt = 0; - counter++; getline(iss, token, '\t'); std::istringstream new_iss(token); while (getline(new_iss, token, ',')) @@ -698,19 +682,8 @@ void PQFlashIndex::parse_label_file(const std::string &label_file, si token.erase(std::remove(token.begin(), token.end(), '\n'), token.end()); token.erase(std::remove(token.begin(), token.end(), '\r'), token.end()); LabelT token_as_num = (LabelT)std::stoul(token); - if (_labels.find(token_as_num) == _labels.end()) - { - _filter_list.emplace_back(token_as_num); - } - int32_t filter_num = get_filter_number(token_as_num); - if (filter_num == -1) - { - diskann::cout << "Error!! 
" << std::endl; - exit(-1); - } - _pts_to_labels[counter++] = filter_num; + _pts_to_labels[labels_seen_so_far++] = (LabelT)token_as_num; num_lbls_in_cur_pt++; - _labels.insert(token_as_num); } if (num_lbls_in_cur_pt == 0) @@ -726,16 +699,8 @@ void PQFlashIndex::parse_label_file(const std::string &label_file, si template void PQFlashIndex::set_universal_label(const LabelT &label) { - int32_t temp_filter_num = get_filter_number(label); - if (temp_filter_num == -1) - { - diskann::cout << "Error, could not find universal label." << std::endl; - } - else - { - _use_universal_label = true; - _universal_filter_num = (uint32_t)temp_filter_num; - } + _use_universal_label = true; + _universal_filter_label = label; } #ifdef EXEC_ENV_OLS @@ -1178,22 +1143,6 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t const uint32_t io_limit, const bool use_reorder_data, QueryStats *stats) { - int32_t filter_num = 0; - if (use_filter) - { - filter_num = get_filter_number(filter_label); - if (filter_num < 0) - { - if (!_use_universal_label) - { - return; - } - else - { - filter_num = _universal_filter_num; - } - } - } uint64_t num_sector_per_nodes = DIV_ROUND_UP(_max_node_len, defaults::SECTOR_LEN); if (beam_width > num_sector_per_nodes * defaults::MAX_N_SECTOR_READS) @@ -1443,7 +1392,8 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t if (!use_filter && _dummy_pts.find(id) != _dummy_pts.end()) continue; - if (use_filter && !point_has_label(id, filter_num) && !point_has_label(id, _universal_filter_num)) + if (use_filter && !(point_has_label(id, filter_label)) && + (!_use_universal_label || !point_has_label(id, _universal_filter_label))) continue; cmps++; float dist = dist_scratch[m]; @@ -1505,7 +1455,8 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t if (!use_filter && _dummy_pts.find(id) != _dummy_pts.end()) continue; - if (use_filter && !point_has_label(id, filter_num) && !point_has_label(id, _universal_filter_num)) + 
if (use_filter && !(point_has_label(id, filter_label)) && + (!_use_universal_label || !point_has_label(id, _universal_filter_label))) continue; cmps++; float dist = dist_scratch[m]; From a112411efed8487680d1e2e2b4ed4d4298cea523 Mon Sep 17 00:00:00 2001 From: Dax Pryce Date: Wed, 30 Aug 2023 13:51:22 -0700 Subject: [PATCH 18/23] Fixes #432, bug in using openmp with gcc and omp_get_num_threads() (#445) * Fixes #432, bug in using openmp with gcc and omp_get_num_threads() only reporting the number of threads collaborating on the current code region not available overall. I made this error and transitioned us from omp_get_num_procs() about 5 or 6 months ago and only with bug #432 did I really get to see how problematic my naive expectations were. * Removed cosine distance metric from disk index until we can properly fix it in pqflashindex. Documented what distance metrics can be used with what vector dtypes in tables in the documentation. --- include/parameters.h | 2 +- python/src/_builder.py | 28 +++++++++++++++++++++++ python/src/dynamic_memory_index.cpp | 3 +-- python/src/static_disk_index.cpp | 5 ++-- python/src/static_memory_index.cpp | 6 ++--- python/tests/test_dynamic_memory_index.py | 24 +++++++++++++++++++ python/tests/test_static_disk_index.py | 19 ++++++++++++++- python/tests/test_static_memory_index.py | 21 +++++++++++++++++ src/index.cpp | 2 +- 9 files changed, 100 insertions(+), 10 deletions(-) diff --git a/include/parameters.h b/include/parameters.h index 209b9128c..4fec9ae08 100644 --- a/include/parameters.h +++ b/include/parameters.h @@ -83,7 +83,7 @@ class IndexWriteParametersBuilder IndexWriteParametersBuilder &with_num_threads(const uint32_t num_threads) { - _num_threads = num_threads == 0 ? omp_get_num_threads() : num_threads; + _num_threads = num_threads == 0 ? 
omp_get_num_procs() : num_threads; return *this; } diff --git a/python/src/_builder.py b/python/src/_builder.py index 18e9e9fa0..db2b200db 100644 --- a/python/src/_builder.py +++ b/python/src/_builder.py @@ -70,6 +70,15 @@ def build_disk_index( in the format DiskANN's PQ Flash Index builder requires. This temp folder is deleted upon index creation completion or error. + ## Distance Metric and Vector Datatype Restrictions + | Metric \ Datatype | np.float32 | np.uint8 | np.int8 | + |-------------------|------------|----------|---------| + | L2 | ✅ | ✅ | ✅ | + | MIPS | ✅ | ❌ | ❌ | + | Cosine [^bug-in-disk-cosine] | ❌ | ❌ | ❌ | + + [^bug-in-disk-cosine]: For StaticDiskIndex, Cosine distances are not currently supported. + ### Parameters - **data**: Either a `str` representing a path to a DiskANN vector bin file, or a numpy.ndarray, of a supported dtype, in 2 dimensions. Note that `vector_dtype` must be provided if data is a `str` @@ -119,6 +128,12 @@ def build_disk_index( vector_bin_path, vector_dtype_actual = _valid_path_and_dtype( data, vector_dtype, index_directory, index_prefix ) + _assert(dap_metric != _native_dap.COSINE, "Cosine is currently not supported in StaticDiskIndex") + if dap_metric == _native_dap.INNER_PRODUCT: + _assert( + vector_dtype_actual == np.float32, + "Integral vector dtypes (np.uint8, np.int8) are not supported with distance metric mips" + ) num_points, dimensions = vectors_metadata_from_file(vector_bin_path) @@ -176,6 +191,14 @@ def build_memory_index( `diskannpy.DynamicMemoryIndex`, you **must** supply a valid value for the `tags` parameter. **Do not supply tags if the index is intended to be `diskannpy.StaticMemoryIndex`**! 
+ ## Distance Metric and Vector Datatype Restrictions + + | Metric \ Datatype | np.float32 | np.uint8 | np.int8 | + |-------------------|------------|----------|---------| + | L2 | ✅ | ✅ | ✅ | + | MIPS | ✅ | ❌ | ❌ | + | Cosine | ✅ | ✅ | ✅ | + ### Parameters - **data**: Either a `str` representing a path to an existing DiskANN vector bin file, or a numpy.ndarray of a @@ -232,6 +255,11 @@ def build_memory_index( vector_bin_path, vector_dtype_actual = _valid_path_and_dtype( data, vector_dtype, index_directory, index_prefix ) + if dap_metric == _native_dap.INNER_PRODUCT: + _assert( + vector_dtype_actual == np.float32, + "Integral vector dtypes (np.uint8, np.int8) are not supported with distance metric mips" + ) num_points, dimensions = vectors_metadata_from_file(vector_bin_path) diff --git a/python/src/dynamic_memory_index.cpp b/python/src/dynamic_memory_index.cpp index f92f4157e..3add2aa5c 100644 --- a/python/src/dynamic_memory_index.cpp +++ b/python/src/dynamic_memory_index.cpp @@ -34,8 +34,7 @@ diskann::Index dynamic_index_builder(const diskann:: const uint32_t initial_search_threads, const bool concurrent_consolidation) { - const uint32_t _initial_search_threads = - initial_search_threads != 0 ? initial_search_threads : omp_get_num_threads(); + const uint32_t _initial_search_threads = initial_search_threads != 0 ? initial_search_threads : omp_get_num_procs(); auto index_search_params = diskann::IndexSearchParams(initial_search_complexity, _initial_search_threads); return diskann::Index( diff --git a/python/src/static_disk_index.cpp b/python/src/static_disk_index.cpp index 654f8ec30..9e86b0ad5 100644 --- a/python/src/static_disk_index.cpp +++ b/python/src/static_disk_index.cpp @@ -14,7 +14,8 @@ StaticDiskIndex
::StaticDiskIndex(const diskann::Metric metric, const std::st const uint32_t cache_mechanism) : _reader(std::make_shared()), _index(_reader, metric) { - int load_success = _index.load(num_threads, index_path_prefix.c_str()); + const uint32_t _num_threads = num_threads != 0 ? num_threads : omp_get_num_procs(); + int load_success = _index.load(_num_threads, index_path_prefix.c_str()); if (load_success != 0) { throw std::runtime_error("index load failed."); @@ -22,7 +23,7 @@ StaticDiskIndex
::StaticDiskIndex(const diskann::Metric metric, const std::st if (cache_mechanism == 1) { std::string sample_file = index_path_prefix + std::string("_sample_data.bin"); - cache_sample_paths(num_nodes_to_cache, sample_file, num_threads); + cache_sample_paths(num_nodes_to_cache, sample_file, _num_threads); } else if (cache_mechanism == 2) { diff --git a/python/src/static_memory_index.cpp b/python/src/static_memory_index.cpp index 0dbb24dc3..23a349fac 100644 --- a/python/src/static_memory_index.cpp +++ b/python/src/static_memory_index.cpp @@ -17,7 +17,7 @@ diskann::Index static_index_builder(const diskann::Me { throw std::runtime_error("initial_search_complexity must be a positive uint32_t"); } - auto index_search_params = diskann::IndexSearchParams(initial_search_complexity, omp_get_num_threads()); + auto index_search_params = diskann::IndexSearchParams(initial_search_complexity, omp_get_num_procs()); return diskann::Index
(m, dimensions, num_points, nullptr, // index write params std::make_shared(index_search_params), // index search params @@ -36,7 +36,7 @@ StaticMemoryIndex
::StaticMemoryIndex(const diskann::Metric m, const std::str const uint32_t initial_search_complexity) : _index(static_index_builder
(m, num_points, dimensions, initial_search_complexity)) { - const uint32_t _num_threads = num_threads != 0 ? num_threads : omp_get_num_threads(); + const uint32_t _num_threads = num_threads != 0 ? num_threads : omp_get_num_procs(); _index.load(index_prefix.c_str(), _num_threads, initial_search_complexity); } @@ -56,7 +56,7 @@ NeighborsAndDistances StaticMemoryIndex
::batch_search( py::array_t &queries, const uint64_t num_queries, const uint64_t knn, const uint64_t complexity, const uint32_t num_threads) { - const uint32_t _num_threads = num_threads != 0 ? num_threads : omp_get_num_threads(); + const uint32_t _num_threads = num_threads != 0 ? num_threads : omp_get_num_procs(); py::array_t ids({num_queries, knn}); py::array_t dists({num_queries, knn}); std::vector
empty_vector; diff --git a/python/tests/test_dynamic_memory_index.py b/python/tests/test_dynamic_memory_index.py index 48c05443c..13d9b08db 100644 --- a/python/tests/test_dynamic_memory_index.py +++ b/python/tests/test_dynamic_memory_index.py @@ -40,6 +40,7 @@ def setUpClass(cls) -> None: build_random_vectors_and_memory_index(np.float32, "cosine", with_tags=True), build_random_vectors_and_memory_index(np.uint8, "cosine", with_tags=True), build_random_vectors_and_memory_index(np.int8, "cosine", with_tags=True), + build_random_vectors_and_memory_index(np.float32, "mips", with_tags=True), ] cls._example_ann_dir = cls._test_matrix[0][4] @@ -442,4 +443,27 @@ def _tiny_index(): warnings.simplefilter("error") # turns warnings into raised exceptions index.batch_insert(rng.random((2, 10), dtype=np.float32), np.array([15, 25], dtype=np.uint32)) + def test_zero_threads(self): + for ( + metric, + dtype, + query_vectors, + index_vectors, + ann_dir, + vector_bin_file, + generated_tags, + ) in self._test_matrix: + with self.subTest(msg=f"Testing dtype {dtype}"): + index = dap.DynamicMemoryIndex( + distance_metric="l2", + vector_dtype=dtype, + dimensions=10, + max_vectors=11_000, + complexity=64, + graph_degree=32, + num_threads=0, # explicitly asking it to use all available threads. 
+ ) + index.batch_insert(vectors=index_vectors, vector_ids=generated_tags, num_threads=0) + k = 5 + ids, dists = index.batch_search(query_vectors, k_neighbors=k, complexity=5, num_threads=0) diff --git a/python/tests/test_static_disk_index.py b/python/tests/test_static_disk_index.py index c36c581d2..35015276e 100644 --- a/python/tests/test_static_disk_index.py +++ b/python/tests/test_static_disk_index.py @@ -25,7 +25,7 @@ def _build_random_vectors_and_index(dtype, metric): complexity=32, search_memory_maximum=0.00003, build_memory_maximum=1, - num_threads=1, + num_threads=0, pq_disk_bytes=0, ) return metric, dtype, query_vectors, index_vectors, ann_dir @@ -38,6 +38,7 @@ def setUpClass(cls) -> None: _build_random_vectors_and_index(np.float32, "l2"), _build_random_vectors_and_index(np.uint8, "l2"), _build_random_vectors_and_index(np.int8, "l2"), + _build_random_vectors_and_index(np.float32, "mips"), ] cls._example_ann_dir = cls._test_matrix[0][4] @@ -149,3 +150,19 @@ def test_value_ranges_batch_search(self): index.batch_search( queries=np.array([[]], dtype=np.single), **kwargs ) + + def test_zero_threads(self): + for metric, dtype, query_vectors, index_vectors, ann_dir in self._test_matrix: + with self.subTest(msg=f"Testing dtype {dtype}"): + index = dap.StaticDiskIndex( + distance_metric="l2", + vector_dtype=dtype, + index_directory=ann_dir, + num_threads=0, # Issue #432 + num_nodes_to_cache=10, + ) + + k = 5 + ids, dists = index.batch_search( + query_vectors, k_neighbors=k, complexity=5, beam_width=2, num_threads=0 + ) \ No newline at end of file diff --git a/python/tests/test_static_memory_index.py b/python/tests/test_static_memory_index.py index ce12ed3bf..a04f98928 100644 --- a/python/tests/test_static_memory_index.py +++ b/python/tests/test_static_memory_index.py @@ -20,6 +20,7 @@ def setUpClass(cls) -> None: build_random_vectors_and_memory_index(np.float32, "cosine"), build_random_vectors_and_memory_index(np.uint8, "cosine"), 
build_random_vectors_and_memory_index(np.int8, "cosine"), + build_random_vectors_and_memory_index(np.float32, "mips"), ] cls._example_ann_dir = cls._test_matrix[0][4] @@ -165,3 +166,23 @@ def test_value_ranges_batch_search(self): index.batch_search( queries=np.array([[]], dtype=np.single), **kwargs ) + + def test_zero_threads(self): + for ( + metric, + dtype, + query_vectors, + index_vectors, + ann_dir, + vector_bin_file, + _, + ) in self._test_matrix: + with self.subTest(msg=f"Testing dtype {dtype}"): + index = dap.StaticMemoryIndex( + index_directory=ann_dir, + num_threads=0, + initial_search_complexity=32, + ) + + k = 5 + ids, dists = index.batch_search(query_vectors, k_neighbors=k, complexity=5, num_threads=0) \ No newline at end of file diff --git a/src/index.cpp b/src/index.cpp index 0b10cc9a0..478b86273 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -2370,7 +2370,7 @@ consolidation_report Index::consolidate_deletes(const IndexWrit const uint32_t range = params.max_degree; const uint32_t maxc = params.max_occlusion_size; const float alpha = params.alpha; - const uint32_t num_threads = params.num_threads == 0 ? omp_get_num_threads() : params.num_threads; + const uint32_t num_threads = params.num_threads == 0 ? 
omp_get_num_procs() : params.num_threads; uint32_t num_calls_to_process_delete = 0; diskann::Timer timer; From 4c31367b6c3df5395bf8b786299988a5d001ec5a Mon Sep 17 00:00:00 2001 From: Dax Pryce Date: Wed, 30 Aug 2023 15:23:57 -0700 Subject: [PATCH 19/23] Preparing for 0.6.1 release (#447) --- README.md | 2 +- pyproject.toml | 2 +- python/README.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a381bdedf..a20a1d671 100644 --- a/README.md +++ b/README.md @@ -107,7 +107,7 @@ Please cite this software in your work as: author = {Simhadri, Harsha Vardhan and Krishnaswamy, Ravishankar and Srinivasa, Gopal and Subramanya, Suhas Jayaram and Antonijevic, Andrija and Pryce, Dax and Kaczynski, David and Williams, Shane and Gollapudi, Siddarth and Sivashankar, Varun and Karia, Neel and Singh, Aditi and Jaiswal, Shikhar and Mahapatro, Neelam and Adams, Philip and Tower, Bryan and Patel, Yash}}, title = {{DiskANN: Graph-structured Indices for Scalable, Fast, Fresh and Filtered Approximate Nearest Neighbor Search}}, url = {https://github.com/Microsoft/DiskANN}, - version = {0.6.0}, + version = {0.6.1}, year = {2023} } ``` diff --git a/pyproject.toml b/pyproject.toml index fb4349fab..df2a342ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ build-backend = "setuptools.build_meta" [project] name = "diskannpy" -version = "0.6.0" +version = "0.6.1" description = "DiskANN Python extension module" readme = "python/README.md" diff --git a/python/README.md b/python/README.md index 1365fb422..a0c94759e 100644 --- a/python/README.md +++ b/python/README.md @@ -49,7 +49,7 @@ Please cite this software in your work as: author = {Simhadri, Harsha Vardhan and Krishnaswamy, Ravishankar and Srinivasa, Gopal and Subramanya, Suhas Jayaram and Antonijevic, Andrija and Pryce, Dax and Kaczynski, David and Williams, Shane and Gollapudi, Siddarth and Sivashankar, Varun and Karia, Neel and Singh, Aditi and Jaiswal, Shikhar and Mahapatro, 
Neelam and Adams, Philip and Tower, Bryan and Patel, Yash}}, title = {{DiskANN: Graph-structured Indices for Scalable, Fast, Fresh and Filtered Approximate Nearest Neighbor Search}}, url = {https://github.com/Microsoft/DiskANN}, - version = {0.6.0}, + version = {0.6.1}, year = {2023} } ``` From b8b6caf8c2bfd92be9326fd85946c78dc33eea7f Mon Sep 17 00:00:00 2001 From: Jon McLean <4429525+jonmclean@users.noreply.github.com> Date: Thu, 31 Aug 2023 09:01:16 -0700 Subject: [PATCH 20/23] Release documentation from the release tag instead of main (#448) --- .github/workflows/build-python-pdoc.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-python-pdoc.yml b/.github/workflows/build-python-pdoc.yml index c9f4c6494..dc3b77fc0 100644 --- a/.github/workflows/build-python-pdoc.yml +++ b/.github/workflows/build-python-pdoc.yml @@ -60,18 +60,18 @@ jobs: github_token: ${{ secrets.GITHUB_TOKEN }} publish_dir: docs/python/html destination_dir: docs/python/dev - # Publish to / if we are on the "main" branch and releasing + # Publish to / if we are releasing - name: Publish reference docs by version (main branch) uses: peaceiris/actions-gh-pages@v3 - if: github.event_name == 'release' && github.ref == 'refs/heads/main' + if: github.event_name == 'release' with: github_token: ${{ secrets.GITHUB_TOKEN }} publish_dir: docs/python/html destination_dir: docs/python/${{ env.DISKANN_VERSION }} - # Publish to /latest if we are on the "main" branch and releasing + # Publish to /latest if we are releasing - name: Publish latest reference docs (main branch) uses: peaceiris/actions-gh-pages@v3 - if: github.event_name == 'release' && github.ref == 'refs/heads/main' + if: github.event_name == 'release' with: github_token: ${{ secrets.GITHUB_TOKEN }} publish_dir: docs/python/html From ced3b4ff4e396c07efc5ec82eedf4a26a455e3ec Mon Sep 17 00:00:00 2001 From: David Kaczynski Date: Fri, 22 Sep 2023 12:54:12 -0400 Subject: [PATCH 21/23] Build streaming 
index of labeled data (#376) * Add bool param for building a graph of labeled data * Add arguments for building labeled index * Pass arguments for labeled index * Light renaming * Handle labels in insert_point * Fix missing semicolon * Add initial label handling logic * Use unlabeled algo for uniquely labeled point * Ignore frozen points when checking labels * Fix missing newline * Move label-specific logic to threadsafe zone * Check for frozen points when assert num points and num labeled points * Fix file name concatenation for label metadata * inmem_graph_store initial impl * Use Lbuild to append to pruned_list during filter build * Add label counts for deleting from streaming index * Fix typo * Fix conditions for testing * Add medoid search to support deleting label medoids from graph * resolving error with bfs_medoid_search() * trying to create 2 pruned_lists and combine them * Clear pool between calls to search_for_point_and_prune. Fix integer math * Update pruned_list algo for link method * making fz_points to be medoids for labels encountered * repositioning medoids as well because they are fz points when compacting data * removing unrequired method * rebasing from main * adding tests in yml workflow for dynamic index with labels * quick fix * removing combining of unfiltered + filtered list for now * trying to resolve disk search poor performance * increasing L size while searching disk index * minor rollback * updating dynamic-label to not use tag file while computing GT * altering some test search L values * adding unfiltered search for filtered batch build index * adding compute gt for zipf dist labels in labels workflow * searching filtered streaming index with popular label for now * reposition fz points as medoids for filtered dynamic build * minor renaming vars * separate function for insert point with labels and without labels * clang error fix * barebones of in mem graph store * refactoring index to use index factory * clang format fix * window
build fix * making enum to enum class (c++ 11 style) for scope resolution with same enum values * cleaning up API for GraphStore * resolving comments * clang error fix * adding some comments * moving _nd back to index class * removing function reposition medoids its not required, incorporated into reposition_points * altering -L (32->5) and -R (16->32) while building filtered disk index to work well with modified connections in algo * updating docs -> dynamic_index.md to have info on how to build and search filtered dynamic index * updating docs * updating _pts_to_labels when repositioning fz_points * error fix * clang fix * making sure _pts_to_labels are not empty * fixing dynamic-label build error * code improvements * adding logic for test_ins_del_consolidate to support filtered index * resolving PR comments * error fix * error fix for dynamic * now test insert delete consolidate support building filters * lowering recall in case of test insert delete consolidate * resolving PR comments * removing _num_frozen_point from graph store * minor fix * moving _start back to main + minor update in graph store api to support that * adding a lock before detect_common_filter + minor naming improvement * adding requested changes from Gopal * removing reservations * resolving namespace resolution for defaults after build failure * minor update * minor update * speeding up location update logic while repositioning * updated with reserving mem for graph neighbours upfront * build error fix * minor update in assert * initial commit * updating python bindings to use new ctor * python binding error fix * error fix * reverting some changes -> experiment * removing redundant code from native index * python build error fix * trying to resolve python build error * attempt at python build fix * adding IndexSearchParams * setting search threads to non zero * minor check removed * experiment 3-> making distance fully owned by data_store * exp 3 clang fix * exp 4 * making distance as
unique_ptr * trying to fix build * finally fixing problem * some minor fix * adding dll export to index_factory static function * adding dll export for static fn in index_factory * code cleanup * resolving errors after merge * resolving build errors * fixing build error for stitched index * resolving build errors * removing max_observed_degree set() * removing comments + typo fix * replacing add_neighbour with set_neighbours where we can * error fix * minor fix * fixing error introduced while rebasing * fixing error for dynamic filtered index * resolving dynamic build deadlock error * resolving error with test_insert_del_consolidate for dynamic filter build * minor code cleanup * refactoring fz_pts and filter_index to be property of IndexConfig and hence Index * removing write_params from build() * removing write_params from build and taking it upfront in Index Ctor * minor fix * renaming build_params to filter params * fixing errors on auto merge * auto decide universal_label experiment * resolving bug with universal label * resolving dynamic labels error, if there are unused fz points * exposing set_universal_label() through abstract index * minor update: sanity check * minor update to search * including tag file while computing GT * generating compacted label file and using it in generate GT * minor fix * resolving New PR comments (minor typo fixes) * renaming _pts_to_labels to _tag_to_labels + adding a warning for consolidate deletes and quality of index * minor name change + code cleanup * clang format fix * adding locks for filter data_structures * avoiding deadlock * universal label definition update * reverting locks on _location_to_labels as its causing problems with large dataset * adding locks for _label_to_medoid_id * Update dynamic_index.md * Update dynamic-labels.yml * renaming some variables --------- Co-authored-by: David Kaczynski Co-authored-by: yashpatel007 Co-authored-by: Yash Patel <47032340+yashpatel007@users.noreply.github.com> Co-authored-by:
Harsha Vardhan Simhadri --- .github/workflows/dynamic-labels.yml | 102 ++++++ .github/workflows/labels.yml | 21 +- .github/workflows/pr-test.yml | 3 + apps/search_memory_index.cpp | 4 +- apps/test_insert_deletes_consolidate.cpp | 131 ++++++-- apps/test_streaming_scenario.cpp | 184 +++++++++-- .../utils/compute_groundtruth_for_filters.cpp | 5 - include/abstract_index.h | 11 + include/defaults.h | 1 + include/filter_utils.h | 4 + include/index.h | 22 +- include/index_build_params.h | 2 + include/index_config.h | 27 +- include/parameters.h | 16 +- include/types.h | 1 + include/utils.h | 5 +- python/src/dynamic_memory_index.cpp | 20 +- src/abstract_index.cpp | 74 +++++ src/disk_utils.cpp | 9 +- src/filter_utils.cpp | 68 ++++ src/index.cpp | 312 ++++++++++++++---- src/index_factory.cpp | 5 +- src/restapi/search_wrapper.cpp | 3 +- .../index_write_parameters_builder_tests.cpp | 7 +- workflows/dynamic_index.md | 43 ++- workflows/filtered_ssd_index.md | 2 +- 26 files changed, 893 insertions(+), 189 deletions(-) create mode 100644 .github/workflows/dynamic-labels.yml diff --git a/.github/workflows/dynamic-labels.yml b/.github/workflows/dynamic-labels.yml new file mode 100644 index 000000000..0f3b56eb9 --- /dev/null +++ b/.github/workflows/dynamic-labels.yml @@ -0,0 +1,102 @@ +name: Dynamic-Labels +on: [workflow_call] +jobs: + acceptance-tests-dynamic: + name: Dynamic-Labels + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-2019, windows-latest] + runs-on: ${{matrix.os}} + defaults: + run: + shell: bash + steps: + - name: Checkout repository + if: ${{ runner.os == 'Linux' }} + uses: actions/checkout@v3 + with: + fetch-depth: 1 + - name: Checkout repository + if: ${{ runner.os == 'Windows' }} + uses: actions/checkout@v3 + with: + fetch-depth: 1 + submodules: true + - name: DiskANN Build CLI Applications + uses: ./.github/actions/build + + - name: Generate Data + uses: ./.github/actions/generate-random + + - name: Generate Labels + run: | + echo "Generating 
synthetic labels and computing ground truth for filtered search with universal label" + dist/bin/generate_synthetic_labels --num_labels 50 --num_points 10000 --output_file data/rand_labels_50_10K.txt --distribution_type random + + echo "Generating synthetic labels with a zipf distribution and computing ground truth for filtered search with universal label" + dist/bin/generate_synthetic_labels --num_labels 50 --num_points 10000 --output_file data/zipf_labels_50_10K.txt --distribution_type zipf + + - name: Test a streaming index (float) with labels (Zipf distributed) + run: | + dist/bin/test_streaming_scenario --data_type float --dist_fn l2 --data_path data/rand_float_10D_10K_norm1.0.bin --universal_label 0 --label_file data/zipf_labels_50_10K.txt --index_path_prefix data/index_zipf_stream -R 64 --FilteredLbuild 200 -L 50 --alpha 1.2 --insert_threads 8 --consolidate_threads 8 --max_points_to_insert 10000 --active_window 4000 --consolidate_interval 2000 --start_point_norm 3.2 --unique_labels_supported 51 + + echo "Computing groundtruth with filter" + dist/bin/compute_groundtruth_for_filters --data_type float --universal_label 0 --filter_label 1 --dist_fn l2 --base_file data/index_zipf_stream.after-streaming-act4000-cons2000-max10000.data --query_file data/rand_float_10D_1K_norm1.0.bin --K 100 --gt_file data/gt100_zipf_base-act4000-cons2000-max10000_1 --label_file data/index_zipf_stream.after-streaming-act4000-cons2000-max10000_raw_labels.txt --tags_file data/index_zipf_stream.after-streaming-act4000-cons2000-max10000.tags + echo "Searching with filter" + dist/bin/search_memory_index --data_type float --dist_fn l2 --filter_label 1 --fail_if_recall_below 40 --index_path_prefix data/index_zipf_stream.after-streaming-act4000-cons2000-max10000 --result_path data/res_stream --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/gt100_zipf_base-act4000-cons2000-max10000_1 -K 10 -L 20 40 60 80 100 150 -T 64 --dynamic true --tags 1 + + echo "Computing groundtruth w/o 
filter" + dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/index_zipf_stream.after-streaming-act4000-cons2000-max10000.data --query_file data/rand_float_10D_1K_norm1.0.bin --K 100 --gt_file data/gt100_zipf_base-act4000-cons2000-max10000 + echo "Searching without filter" + dist/bin/search_memory_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/index_zipf_stream.after-streaming-act4000-cons2000-max10000 --result_path res_stream --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/gt100_zipf_base-act4000-cons2000-max10000 -K 10 -L 20 40 60 80 100 -T 64 + + - name: Test a streaming index (float) with labels (random distributed) + run: | + dist/bin/test_streaming_scenario --data_type float --dist_fn l2 --data_path data/rand_float_10D_10K_norm1.0.bin --universal_label 0 --label_file data/rand_labels_50_10K.txt --index_path_prefix data/index_rand_stream -R 64 --FilteredLbuild 200 -L 50 --alpha 1.2 --insert_threads 8 --consolidate_threads 8 --max_points_to_insert 10000 --active_window 4000 --consolidate_interval 2000 --start_point_norm 3.2 --unique_labels_supported 51 + + echo "Computing groundtruth with filter" + dist/bin/compute_groundtruth_for_filters --data_type float --universal_label 0 --filter_label 1 --dist_fn l2 --base_file data/index_rand_stream.after-streaming-act4000-cons2000-max10000.data --query_file data/rand_float_10D_1K_norm1.0.bin --K 100 --gt_file data/gt100_rand_base-act4000-cons2000-max10000_1 --label_file data/index_rand_stream.after-streaming-act4000-cons2000-max10000_raw_labels.txt --tags_file data/index_rand_stream.after-streaming-act4000-cons2000-max10000.tags + echo "Searching with filter" + dist/bin/search_memory_index --data_type float --dist_fn l2 --filter_label 1 --fail_if_recall_below 40 --index_path_prefix data/index_rand_stream.after-streaming-act4000-cons2000-max10000 --result_path data/res_stream --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file 
data/gt100_rand_base-act4000-cons2000-max10000_1 -K 10 -L 20 40 60 80 100 150 -T 64 --dynamic true --tags 1 + + echo "Computing groundtruth w/o filter" + dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/index_rand_stream.after-streaming-act4000-cons2000-max10000.data --query_file data/rand_float_10D_1K_norm1.0.bin --K 100 --gt_file data/gt100_rand_base-act4000-cons2000-max10000 + echo "Searching without filter" + dist/bin/search_memory_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/index_rand_stream.after-streaming-act4000-cons2000-max10000 --result_path res_stream --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/gt100_rand_base-act4000-cons2000-max10000 -K 10 -L 20 40 60 80 100 -T 64 + + - name: Test Insert Delete Consolidate (float) with labels (zipf distributed) + run: | + dist/bin/test_insert_deletes_consolidate --data_type float --dist_fn l2 --universal_label 0 --label_file data/zipf_labels_50_10K.txt --FilteredLbuild 70 --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix data/index_zipf_ins_del -R 64 -L 10 --alpha 1.2 --points_to_skip 0 --max_points_to_insert 7500 --beginning_index_size 0 --points_per_checkpoint 1000 --checkpoints_per_snapshot 0 --points_to_delete_from_beginning 2500 --start_deletes_after 5000 --do_concurrent true --start_point_norm 3.2 --unique_labels_supported 51 + + echo "Computing groundtruth with filter" + dist/bin/compute_groundtruth_for_filters --data_type float --filter_label 5 --universal_label 0 --dist_fn l2 --base_file data/index_zipf_ins_del.after-concurrent-delete-del2500-7500.data --query_file data/rand_float_10D_1K_norm1.0.bin --K 100 --gt_file data/gt100_zipf_random10D_1K_wlabel_5 --label_file data/index_zipf_ins_del.after-concurrent-delete-del2500-7500_raw_labels.txt --tags_file data/index_zipf_ins_del.after-concurrent-delete-del2500-7500.tags + echo "Searching with filter" + dist/bin/search_memory_index --data_type float 
--dist_fn l2 --filter_label 5 --fail_if_recall_below 10 --index_path_prefix data/index_zipf_ins_del.after-concurrent-delete-del2500-7500 --result_path data/res_zipf_stream --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/gt100_zipf_random10D_1K_wlabel_5 -K 10 -L 20 40 60 80 100 150 -T 64 --dynamic true --tags 1 + + echo "Computing groundtruth w/o filter" + dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/index_zipf_ins_del.after-concurrent-delete-del2500-7500.data --query_file data/rand_float_10D_1K_norm1.0.bin --K 100 --gt_file data/gt100_zipf_random10D_1K + echo "Searching without filter" + dist/bin/search_memory_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/index_zipf_ins_del.after-concurrent-delete-del2500-7500 --result_path res_stream --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/gt100_zipf_random10D_1K -K 10 -L 20 40 60 80 100 -T 64 + + - name: Test Insert Delete Consolidate (float) with labels (random distributed) + run: | + dist/bin/test_insert_deletes_consolidate --data_type float --dist_fn l2 --universal_label 0 --label_file data/rand_labels_50_10K.txt --FilteredLbuild 70 --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix data/index_rand_ins_del -R 64 -L 10 --alpha 1.2 --points_to_skip 0 --max_points_to_insert 7500 --beginning_index_size 0 --points_per_checkpoint 1000 --checkpoints_per_snapshot 0 --points_to_delete_from_beginning 2500 --start_deletes_after 5000 --do_concurrent true --start_point_norm 3.2 --unique_labels_supported 51 + + echo "Computing groundtruth with filter" + dist/bin/compute_groundtruth_for_filters --data_type float --filter_label 5 --universal_label 0 --dist_fn l2 --base_file data/index_rand_ins_del.after-concurrent-delete-del2500-7500.data --query_file data/rand_float_10D_1K_norm1.0.bin --K 100 --gt_file data/gt100_rand_random10D_1K_wlabel_5 --label_file 
data/index_rand_ins_del.after-concurrent-delete-del2500-7500_raw_labels.txt --tags_file data/index_rand_ins_del.after-concurrent-delete-del2500-7500.tags + echo "Searching with filter" + dist/bin/search_memory_index --data_type float --dist_fn l2 --filter_label 5 --fail_if_recall_below 40 --index_path_prefix data/index_rand_ins_del.after-concurrent-delete-del2500-7500 --result_path data/res_rand_stream --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/gt100_rand_random10D_1K_wlabel_5 -K 10 -L 20 40 60 80 100 150 -T 64 --dynamic true --tags 1 + + echo "Computing groundtruth w/o filter" + dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/index_rand_ins_del.after-concurrent-delete-del2500-7500.data --query_file data/rand_float_10D_1K_norm1.0.bin --K 100 --gt_file data/gt100_rand_random10D_1K + echo "Searching without filter" + dist/bin/search_memory_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/index_rand_ins_del.after-concurrent-delete-del2500-7500 --result_path res_stream --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/gt100_rand_random10D_1K -K 10 -L 20 40 60 80 100 -T 64 + + - name: upload data and bin + uses: actions/upload-artifact@v3 + with: + name: dynamic + path: | + ./dist/** + ./data/** diff --git a/.github/workflows/labels.yml b/.github/workflows/labels.yml index e811c1ff5..dc440951c 100644 --- a/.github/workflows/labels.yml +++ b/.github/workflows/labels.yml @@ -27,7 +27,7 @@ jobs: uses: ./.github/actions/build - name: Generate Data - uses: ./.github/actions/generate-random + uses: ./.github/actions/generate-random - name: Generate Labels run: | @@ -55,10 +55,15 @@ jobs: dist/bin/build_memory_index --data_type uint8 --dist_fn cosine --FilteredLbuild 90 --universal_label 0 --data_path data/rand_uint8_10D_10K_norm50.0.bin --label_file data/rand_labels_50_10K.txt --index_path_prefix data/index_cosine_rand_uint8_10D_10K_norm50_wlabel dist/bin/search_memory_index 
--data_type uint8 --dist_fn l2 --filter_label 10 --fail_if_recall_below 70 --index_path_prefix data/index_l2_rand_uint8_10D_10K_norm50_wlabel --query_file data/rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/l2_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel -L 16 32 dist/bin/search_memory_index --data_type uint8 --dist_fn cosine --filter_label 10 --fail_if_recall_below 70 --index_path_prefix data/index_cosine_rand_uint8_10D_10K_norm50_wlabel --query_file data/rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/cosine_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel -L 16 32 + + echo "Searching without filters" + dist/bin/search_memory_index --data_type uint8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/index_l2_rand_uint8_10D_10K_norm50_wlabel --query_file data/rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/l2_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 -L 32 64 + dist/bin/search_memory_index --data_type uint8 --dist_fn cosine --fail_if_recall_below 70 --index_path_prefix data/index_cosine_rand_uint8_10D_10K_norm50_wlabel --query_file data/rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/cosine_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 -L 32 64 + - name: build and search disk index with labels using L2 and Cosine metrics (random distributed labels) if: success() || failure() run: | - dist/bin/build_disk_index --data_type uint8 --dist_fn l2 --universal_label 0 --FilteredLbuild 90 --data_path data/rand_uint8_10D_10K_norm50.0.bin --label_file data/rand_labels_50_10K.txt --index_path_prefix data/disk_index_l2_rand_uint8_10D_10K_norm50_wlabel -R 16 -L 32 -B 0.00003 -M 1 + dist/bin/build_disk_index --data_type uint8 --dist_fn l2 --universal_label 0 --FilteredLbuild 90 --data_path data/rand_uint8_10D_10K_norm50.0.bin --label_file data/rand_labels_50_10K.txt --index_path_prefix 
data/disk_index_l2_rand_uint8_10D_10K_norm50_wlabel -R 32 -L 5 -B 0.00003 -M 1 dist/bin/search_disk_index --data_type uint8 --dist_fn l2 --filter_label 10 --fail_if_recall_below 50 --index_path_prefix data/disk_index_l2_rand_uint8_10D_10K_norm50_wlabel --result_path /tmp/res --query_file data/rand_uint8_10D_1K_norm50.0.bin --gt_file data/l2_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 - name: build and search in-memory index with labels using L2 and Cosine metrics (zipf distributed labels) if: success() || failure() @@ -67,16 +72,24 @@ jobs: dist/bin/build_memory_index --data_type uint8 --dist_fn cosine --FilteredLbuild 90 --universal_label 0 --data_path data/rand_uint8_10D_10K_norm50.0.bin --label_file data/zipf_labels_50_10K.txt --index_path_prefix data/index_cosine_zipf_uint8_10D_10K_norm50_wlabel dist/bin/search_memory_index --data_type uint8 --dist_fn l2 --filter_label 5 --fail_if_recall_below 70 --index_path_prefix data/index_l2_zipf_uint8_10D_10K_norm50_wlabel --query_file data/rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/l2_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel -L 16 32 dist/bin/search_memory_index --data_type uint8 --dist_fn cosine --filter_label 5 --fail_if_recall_below 70 --index_path_prefix data/index_cosine_zipf_uint8_10D_10K_norm50_wlabel --query_file data/rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/cosine_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel -L 16 32 + + echo "Searching without filters" + dist/bin/compute_groundtruth --data_type uint8 --dist_fn l2 --base_file data/rand_uint8_10D_10K_norm50.0.bin --query_file data/rand_uint8_10D_1K_norm50.0.bin --gt_file data/l2_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --K 100 + dist/bin/compute_groundtruth --data_type uint8 --dist_fn cosine --base_file data/rand_uint8_10D_10K_norm50.0.bin --query_file 
data/rand_uint8_10D_1K_norm50.0.bin --gt_file data/cosine_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --K 100 + dist/bin/search_memory_index --data_type uint8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/index_l2_zipf_uint8_10D_10K_norm50_wlabel --query_file data/rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/l2_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 -L 32 64 + dist/bin/search_memory_index --data_type uint8 --dist_fn cosine --fail_if_recall_below 70 --index_path_prefix data/index_cosine_zipf_uint8_10D_10K_norm50_wlabel --query_file data/rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/cosine_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 -L 32 64 + - name: build and search disk index with labels using L2 and Cosine metrics (zipf distributed labels) if: success() || failure() run: | - dist/bin/build_disk_index --data_type uint8 --dist_fn l2 --universal_label 0 --FilteredLbuild 90 --data_path data/rand_uint8_10D_10K_norm50.0.bin --label_file data/zipf_labels_50_10K.txt --index_path_prefix data/disk_index_l2_zipf_uint8_10D_10K_norm50_wlabel -R 16 -L 32 -B 0.00003 -M 1 + dist/bin/build_disk_index --data_type uint8 --dist_fn l2 --universal_label 0 --FilteredLbuild 90 --data_path data/rand_uint8_10D_10K_norm50.0.bin --label_file data/zipf_labels_50_10K.txt --index_path_prefix data/disk_index_l2_zipf_uint8_10D_10K_norm50_wlabel -R 32 -L 5 -B 0.00003 -M 1 dist/bin/search_disk_index --data_type uint8 --dist_fn l2 --filter_label 5 --fail_if_recall_below 50 --index_path_prefix data/disk_index_l2_zipf_uint8_10D_10K_norm50_wlabel --result_path /tmp/res --query_file data/rand_uint8_10D_1K_norm50.0.bin --gt_file data/l2_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 + - name : build and search in-memory and disk index (without universal label, zipf distributed) if: success() || failure() run: | 
dist/bin/build_memory_index --data_type uint8 --dist_fn l2 --FilteredLbuild 90 --data_path data/rand_uint8_10D_10K_norm50.0.bin --label_file data/zipf_labels_50_10K.txt --index_path_prefix data/index_l2_zipf_uint8_10D_10K_norm50_wlabel_nouniversal - dist/bin/build_disk_index --data_type uint8 --dist_fn l2 --FilteredLbuild 90 --data_path data/rand_uint8_10D_10K_norm50.0.bin --label_file data/zipf_labels_50_10K.txt --index_path_prefix data/disk_index_l2_zipf_uint8_10D_10K_norm50_wlabel_nouniversal -R 16 -L 32 -B 0.00003 -M 1 + dist/bin/build_disk_index --data_type uint8 --dist_fn l2 --FilteredLbuild 90 --data_path data/rand_uint8_10D_10K_norm50.0.bin --label_file data/zipf_labels_50_10K.txt --index_path_prefix data/disk_index_l2_zipf_uint8_10D_10K_norm50_wlabel_nouniversal -R 32 -L 5 -B 0.00003 -M 1 dist/bin/search_memory_index --data_type uint8 --dist_fn l2 --filter_label 5 --fail_if_recall_below 70 --index_path_prefix data/index_l2_zipf_uint8_10D_10K_norm50_wlabel_nouniversal --query_file data/rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/l2_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel_nouniversal -L 16 32 dist/bin/search_disk_index --data_type uint8 --dist_fn l2 --filter_label 5 --index_path_prefix data/disk_index_l2_zipf_uint8_10D_10K_norm50_wlabel_nouniversal --result_path /tmp/res --query_file data/rand_uint8_10D_1K_norm50.0.bin --gt_file data/l2_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel_nouniversal --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 - name: Generate combined GT for each query with a separate label and search diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index f332f185e..f84953b8c 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -27,6 +27,9 @@ jobs: dynamic: name: Dynamic uses: ./.github/workflows/dynamic.yml + dynamic-labels: + name: Dynamic Labels + uses: ./.github/workflows/dynamic-labels.yml python: name: Python 
uses: ./.github/workflows/build-python.yml diff --git a/apps/search_memory_index.cpp b/apps/search_memory_index.cpp index d309fa804..1bb02c9bc 100644 --- a/apps/search_memory_index.cpp +++ b/apps/search_memory_index.cpp @@ -131,7 +131,7 @@ int search_memory_index(diskann::Metric &metric, const std::string &index_path, std::vector> query_result_dists(Lvec.size()); std::vector latency_stats(query_num, 0); std::vector cmp_stats; - if (not tags) + if (not tags || filtered_search) { cmp_stats = std::vector(query_num, 0); } @@ -221,7 +221,7 @@ int search_memory_index(diskann::Metric &metric, const std::string &index_path, float avg_cmps = (float)std::accumulate(cmp_stats.begin(), cmp_stats.end(), 0) / (float)query_num; - if (tags) + if (tags && !filtered_search) { std::cout << std::setw(4) << L << std::setw(12) << displayed_qps << std::setw(20) << (float)mean_latency << std::setw(15) << (float)latency_stats[(uint64_t)(0.999 * query_num)]; diff --git a/apps/test_insert_deletes_consolidate.cpp b/apps/test_insert_deletes_consolidate.cpp index bcc2e178d..659ada14d 100644 --- a/apps/test_insert_deletes_consolidate.cpp +++ b/apps/test_insert_deletes_consolidate.cpp @@ -11,6 +11,7 @@ #include #include "utils.h" +#include "filter_utils.h" #include "program_options_utils.hpp" #include "index_factory.h" @@ -91,16 +92,23 @@ std::string get_save_filename(const std::string &save_path, size_t points_to_ski return final_path; } -template +template void insert_till_next_checkpoint(diskann::AbstractIndex &index, size_t start, size_t end, int32_t thread_count, T *data, - size_t aligned_dim) + size_t aligned_dim, std::vector> &location_to_labels) { diskann::Timer insert_timer; - #pragma omp parallel for num_threads(thread_count) schedule(dynamic) for (int64_t j = start; j < (int64_t)end; j++) { - index.insert_point(&data[(j - start) * aligned_dim], 1 + static_cast(j)); + if (!location_to_labels.empty()) + { + index.insert_point(&data[(j - start) * aligned_dim], 1 + static_cast(j), + 
location_to_labels[j - start]); + } + else + { + index.insert_point(&data[(j - start) * aligned_dim], 1 + static_cast(j)); + } } const double elapsedSeconds = insert_timer.elapsed() / 1000000.0; std::cout << "Insertion time " << elapsedSeconds << " seconds (" << (end - start) / elapsedSeconds @@ -141,17 +149,22 @@ void build_incremental_index(const std::string &data_path, diskann::IndexWritePa size_t max_points_to_insert, size_t beginning_index_size, float start_point_norm, uint32_t num_start_pts, size_t points_per_checkpoint, size_t checkpoints_per_snapshot, const std::string &save_path, size_t points_to_delete_from_beginning, - size_t start_deletes_after, bool concurrent) + size_t start_deletes_after, bool concurrent, const std::string &label_file, + const std::string &universal_label) { size_t dim, aligned_dim; size_t num_points; diskann::get_bin_metadata(data_path, num_points, dim); aligned_dim = ROUND_UP(dim, 8); + bool has_labels = label_file != ""; + using TagT = uint32_t; + using LabelT = uint32_t; + + size_t current_point_offset = points_to_skip; + const size_t last_point_threshold = points_to_skip + max_points_to_insert; bool enable_tags = true; using TagT = uint32_t; - auto data_type = diskann_type_to_name(); - auto tag_type = diskann_type_to_name(); auto index_search_params = diskann::IndexSearchParams(params.search_list_size, params.num_threads); diskann::IndexConfig index_config = diskann::IndexConfigBuilder() .with_metric(diskann::L2) @@ -160,17 +173,26 @@ void build_incremental_index(const std::string &data_path, diskann::IndexWritePa .is_dynamic_index(true) .with_index_write_params(params) .with_index_search_params(index_search_params) - .with_data_type(data_type) - .with_tag_type(tag_type) + .with_data_type(diskann_type_to_name()) + .with_tag_type(diskann_type_to_name()) + .with_label_type(diskann_type_to_name()) .with_data_load_store_strategy(diskann::DataStoreStrategy::MEMORY) .with_graph_load_store_strategy(diskann::GraphStoreStrategy::MEMORY) 
.is_enable_tags(enable_tags) + .is_filtered(has_labels) + .with_num_frozen_pts(num_start_pts) .is_concurrent_consolidate(concurrent) .build(); diskann::IndexFactory index_factory = diskann::IndexFactory(index_config); auto index = index_factory.create_instance(); + if (universal_label != "") + { + LabelT u_label = 0; + index->set_universal_label(u_label); + } + if (points_to_skip > num_points) { throw diskann::ANNException("Asked to skip more points than in data file", -1, __FUNCSIG__, __FILE__, __LINE__); @@ -188,9 +210,6 @@ void build_incremental_index(const std::string &data_path, diskann::IndexWritePa << " points since the data file has only that many" << std::endl; } - size_t current_point_offset = points_to_skip; - const size_t last_point_threshold = points_to_skip + max_points_to_insert; - if (beginning_index_size > max_points_to_insert) { beginning_index_size = max_points_to_insert; @@ -236,8 +255,21 @@ void build_incremental_index(const std::string &data_path, diskann::IndexWritePa << " points since the data file has only that many" << std::endl; } + std::vector> location_to_labels; if (concurrent) { + // handle labels + const auto save_path_inc = get_save_filename(save_path + ".after-concurrent-delete-", points_to_skip, + points_to_delete_from_beginning, last_point_threshold); + std::string labels_file_to_use = save_path_inc + "_label_formatted.txt"; + std::string mem_labels_int_map_file = save_path_inc + "_labels_map.txt"; + if (has_labels) + { + convert_labels_string_to_int(label_file, labels_file_to_use, mem_labels_int_map_file, universal_label); + auto parse_result = diskann::parse_formatted_label_file(labels_file_to_use); + location_to_labels = std::get<0>(parse_result); + } + int32_t sub_threads = (params.num_threads + 1) / 2; bool delete_launched = false; std::future delete_task; @@ -252,7 +284,8 @@ void build_incremental_index(const std::string &data_path, diskann::IndexWritePa auto insert_task = std::async(std::launch::async, [&]() { 
load_aligned_bin_part(data_path, data, start, end - start); - insert_till_next_checkpoint(*index, start, end, sub_threads, data, aligned_dim); + insert_till_next_checkpoint(*index, start, end, sub_threads, data, aligned_dim, + location_to_labels); }); insert_task.wait(); @@ -272,12 +305,21 @@ void build_incremental_index(const std::string &data_path, diskann::IndexWritePa delete_task.wait(); std::cout << "Time Elapsed " << timer.elapsed() / 1000 << "ms\n"; - const auto save_path_inc = get_save_filename(save_path + ".after-concurrent-delete-", points_to_skip, - points_to_delete_from_beginning, last_point_threshold); index->save(save_path_inc.c_str(), true); } else { + const auto save_path_inc = get_save_filename(save_path + ".after-delete-", points_to_skip, + points_to_delete_from_beginning, last_point_threshold); + std::string labels_file_to_use = save_path_inc + "_label_formatted.txt"; + std::string mem_labels_int_map_file = save_path_inc + "_labels_map.txt"; + if (has_labels) + { + convert_labels_string_to_int(label_file, labels_file_to_use, mem_labels_int_map_file, universal_label); + auto parse_result = diskann::parse_formatted_label_file(labels_file_to_use); + location_to_labels = std::get<0>(parse_result); + } + size_t last_snapshot_points_threshold = 0; size_t num_checkpoints_till_snapshot = checkpoints_per_snapshot; @@ -288,7 +330,8 @@ void build_incremental_index(const std::string &data_path, diskann::IndexWritePa std::cout << std::endl << "Inserting from " << start << " to " << end << std::endl; load_aligned_bin_part(data_path, data, start, end - start); - insert_till_next_checkpoint(*index, start, end, (int32_t)params.num_threads, data, aligned_dim); + insert_till_next_checkpoint(*index, start, end, (int32_t)params.num_threads, data, + aligned_dim, location_to_labels); if (checkpoints_per_snapshot > 0 && --num_checkpoints_till_snapshot == 0) { @@ -321,8 +364,7 @@ void build_incremental_index(const std::string &data_path, diskann::IndexWritePa { 
delete_from_beginning(*index, params, points_to_skip, points_to_delete_from_beginning); } - const auto save_path_inc = get_save_filename(save_path + ".after-delete-", points_to_skip, - points_to_delete_from_beginning, last_point_threshold); + index->save(save_path_inc.c_str(), true); } @@ -338,6 +380,10 @@ int main(int argc, char **argv) points_to_delete_from_beginning, start_deletes_after; bool concurrent; + // label options + std::string label_file, label_type, universal_label; + std::uint32_t Lf, unique_labels_supported; + po::options_description desc{program_options_utils::make_program_description("test_insert_deletes_consolidate", "Test insert deletes & consolidate")}; try @@ -386,6 +432,24 @@ int main(int argc, char **argv) po::value(&start_deletes_after)->default_value(0), ""); optional_configs.add_options()("start_point_norm", po::value(&start_point_norm)->default_value(0), "Set the start point to a random point on a sphere of this radius"); + + // optional params for filters + optional_configs.add_options()("label_file", po::value(&label_file)->default_value(""), + "Input label file in txt format for Filtered Index search. 
" + "The file should contain comma separated filters for each node " + "with each line corresponding to a graph node"); + optional_configs.add_options()("universal_label", po::value(&universal_label)->default_value(""), + "Universal label, if using it, only in conjunction with labels_file"); + optional_configs.add_options()("FilteredLbuild,Lf", po::value(&Lf)->default_value(0), + "Build complexity for filtered points, higher value " + "results in better graphs"); + optional_configs.add_options()("label_type", po::value(&label_type)->default_value("uint"), + "Storage type of Labels , default value is uint which " + "will consume memory 4 bytes per filter"); + optional_configs.add_options()("unique_labels_supported", + po::value(&unique_labels_supported)->default_value(0), + "Number of unique labels supported by the dynamic index."); + optional_configs.add_options()( "num_start_points", po::value(&num_start_pts)->default_value(diskann::defaults::NUM_FROZEN_POINTS_DYNAMIC), @@ -419,30 +483,41 @@ int main(int argc, char **argv) return -1; } + bool has_labels = false; + if (!label_file.empty() || label_file != "") + { + has_labels = true; + } + + if (num_start_pts < unique_labels_supported) + { + num_start_pts = unique_labels_supported; + } + try { diskann::IndexWriteParameters params = diskann::IndexWriteParametersBuilder(L, R) .with_max_occlusion_size(500) .with_alpha(alpha) .with_num_threads(num_threads) - .with_num_frozen_points(num_start_pts) + .with_filter_list_size(Lf) .build(); if (data_type == std::string("int8")) - build_incremental_index(data_path, params, points_to_skip, max_points_to_insert, - beginning_index_size, start_point_norm, num_start_pts, - points_per_checkpoint, checkpoints_per_snapshot, index_path_prefix, - points_to_delete_from_beginning, start_deletes_after, concurrent); + build_incremental_index( + data_path, params, points_to_skip, max_points_to_insert, beginning_index_size, start_point_norm, + num_start_pts, points_per_checkpoint, 
checkpoints_per_snapshot, index_path_prefix, + points_to_delete_from_beginning, start_deletes_after, concurrent, label_file, universal_label); else if (data_type == std::string("uint8")) - build_incremental_index(data_path, params, points_to_skip, max_points_to_insert, - beginning_index_size, start_point_norm, num_start_pts, - points_per_checkpoint, checkpoints_per_snapshot, index_path_prefix, - points_to_delete_from_beginning, start_deletes_after, concurrent); + build_incremental_index( + data_path, params, points_to_skip, max_points_to_insert, beginning_index_size, start_point_norm, + num_start_pts, points_per_checkpoint, checkpoints_per_snapshot, index_path_prefix, + points_to_delete_from_beginning, start_deletes_after, concurrent, label_file, universal_label); else if (data_type == std::string("float")) build_incremental_index(data_path, params, points_to_skip, max_points_to_insert, beginning_index_size, start_point_norm, num_start_pts, points_per_checkpoint, checkpoints_per_snapshot, index_path_prefix, points_to_delete_from_beginning, - start_deletes_after, concurrent); + start_deletes_after, concurrent, label_file, universal_label); else std::cout << "Unsupported type. 
Use float/int8/uint8" << std::endl; } diff --git a/apps/test_streaming_scenario.cpp b/apps/test_streaming_scenario.cpp index d8878cced..5a43a69f3 100644 --- a/apps/test_streaming_scenario.cpp +++ b/apps/test_streaming_scenario.cpp @@ -13,6 +13,7 @@ #include #include "utils.h" +#include "filter_utils.h" #include "program_options_utils.hpp" #ifndef _WINDOWS @@ -84,9 +85,9 @@ std::string get_save_filename(const std::string &save_path, size_t active_window return final_path; } -template +template void insert_next_batch(diskann::AbstractIndex &index, size_t start, size_t end, size_t insert_threads, T *data, - size_t aligned_dim) + size_t aligned_dim, std::vector> &pts_to_labels) { try { @@ -97,7 +98,18 @@ void insert_next_batch(diskann::AbstractIndex &index, size_t start, size_t end, #pragma omp parallel for num_threads((int32_t)insert_threads) schedule(dynamic) reduction(+ : num_failed) for (int64_t j = start; j < (int64_t)end; j++) { - if (index.insert_point(&data[(j - start) * aligned_dim], 1 + static_cast(j)) != 0) + int insert_result = -1; + if (pts_to_labels.size() > 0) + { + insert_result = index.insert_point(&data[(j - start) * aligned_dim], 1 + static_cast(j), + pts_to_labels[j - start]); + } + else + { + insert_result = index.insert_point(&data[(j - start) * aligned_dim], 1 + static_cast(j)); + } + + if (insert_result != 0) { std::cerr << "Insert failed " << j << std::endl; num_failed++; @@ -113,6 +125,7 @@ void insert_next_batch(diskann::AbstractIndex &index, size_t start, size_t end, catch (std::system_error &e) { std::cout << "Exiting after catching exception in insertion task: " << e.what() << std::endl; + exit(-1); } } @@ -167,23 +180,23 @@ void delete_and_consolidate(diskann::AbstractIndex &index, diskann::IndexWritePa } } -template +template void build_incremental_index(const std::string &data_path, const uint32_t L, const uint32_t R, const float alpha, const uint32_t insert_threads, const uint32_t consolidate_threads, size_t max_points_to_insert, 
size_t active_window, size_t consolidate_interval, - const float start_point_norm, uint32_t num_start_pts, const std::string &save_path) + const float start_point_norm, uint32_t num_start_pts, const std::string &save_path, + const std::string &label_file, const std::string &universal_label, const uint32_t Lf) { const uint32_t C = 500; const bool saturate_graph = false; - using TagT = uint32_t; - using LabelT = uint32_t; + bool has_labels = label_file != ""; diskann::IndexWriteParameters params = diskann::IndexWriteParametersBuilder(L, R) .with_max_occlusion_size(C) .with_alpha(alpha) .with_saturate_graph(saturate_graph) .with_num_threads(insert_threads) - .with_num_frozen_points(num_start_pts) + .with_filter_list_size(Lf) .build(); auto index_search_params = diskann::IndexSearchParams(L, insert_threads); @@ -192,11 +205,25 @@ void build_incremental_index(const std::string &data_path, const uint32_t L, con .with_alpha(alpha) .with_saturate_graph(saturate_graph) .with_num_threads(consolidate_threads) + .with_filter_list_size(Lf) .build(); size_t dim, aligned_dim; size_t num_points; + std::vector> pts_to_labels; + + const auto save_path_inc = + get_save_filename(save_path + ".after-streaming-", active_window, consolidate_interval, max_points_to_insert); + std::string labels_file_to_use = save_path_inc + "_label_formatted.txt"; + std::string mem_labels_int_map_file = save_path_inc + "_labels_map.txt"; + if (has_labels) + { + convert_labels_string_to_int(label_file, labels_file_to_use, mem_labels_int_map_file, universal_label); + auto parse_result = diskann::parse_formatted_label_file(labels_file_to_use); + pts_to_labels = std::get<0>(parse_result); + } + diskann::get_bin_metadata(data_path, num_points, dim); diskann::cout << "metadata: file " << data_path << " has " << num_points << " points in " << dim << " dims" << std::endl; @@ -208,8 +235,10 @@ void build_incremental_index(const std::string &data_path, const uint32_t L, con .is_dynamic_index(true) 
.is_enable_tags(true) .is_use_opq(false) + .is_filtered(has_labels) .with_num_pq_chunks(0) .is_pq_dist_build(false) + .with_num_frozen_pts(num_start_pts) .with_tag_type(diskann_type_to_name()) .with_label_type(diskann_type_to_name()) .with_data_type(diskann_type_to_name()) @@ -222,6 +251,12 @@ void build_incremental_index(const std::string &data_path, const uint32_t L, con diskann::IndexFactory index_factory = diskann::IndexFactory(index_config); auto index = index_factory.create_instance(); + if (universal_label != "") + { + LabelT u_label = 0; + index->set_universal_label(u_label); + } + if (max_points_to_insert == 0) { max_points_to_insert = num_points; @@ -255,7 +290,8 @@ void build_incremental_index(const std::string &data_path, const uint32_t L, con auto insert_task = std::async(std::launch::async, [&]() { load_aligned_bin_part(data_path, data, 0, active_window); - insert_next_batch(*index, (size_t)0, active_window, params.num_threads, data, aligned_dim); + insert_next_batch(*index, (size_t)0, active_window, params.num_threads, data, aligned_dim, + pts_to_labels); }); insert_task.wait(); @@ -265,7 +301,8 @@ void build_incremental_index(const std::string &data_path, const uint32_t L, con auto end = std::min(start + consolidate_interval, max_points_to_insert); auto insert_task = std::async(std::launch::async, [&]() { load_aligned_bin_part(data_path, data, start, end - start); - insert_next_batch(*index, start, end, params.num_threads, data, aligned_dim); + insert_next_batch(*index, start, end, params.num_threads, data, aligned_dim, + pts_to_labels); }); insert_task.wait(); @@ -285,8 +322,7 @@ void build_incremental_index(const std::string &data_path, const uint32_t L, con delete_tasks[delete_tasks.size() - 1].wait(); std::cout << "Time Elapsed " << timer.elapsed() / 1000 << "ms\n"; - const auto save_path_inc = - get_save_filename(save_path + ".after-streaming-", active_window, consolidate_interval, max_points_to_insert); + index->save(save_path_inc.c_str(), 
true); diskann::aligned_free(data); @@ -294,9 +330,8 @@ void build_incremental_index(const std::string &data_path, const uint32_t L, con int main(int argc, char **argv) { - std::string data_type, dist_fn, data_path, index_path_prefix; - uint32_t insert_threads, consolidate_threads; - uint32_t R, L, num_start_pts; + std::string data_type, dist_fn, data_path, index_path_prefix, label_file, universal_label, label_type; + uint32_t insert_threads, consolidate_threads, R, L, num_start_pts, Lf, unique_labels_supported; float alpha, start_point_norm; size_t max_points_to_insert, active_window, consolidate_interval; @@ -352,6 +387,22 @@ int main(int argc, char **argv) "Set the number of random start (frozen) points to use when " "inserting and searching"); + optional_configs.add_options()("label_file", po::value(&label_file)->default_value(""), + "Input label file in txt format for Filtered Index search. " + "The file should contain comma separated filters for each node " + "with each line corresponding to a graph node"); + optional_configs.add_options()("universal_label", po::value(&universal_label)->default_value(""), + "Universal label, if using it, only in conjunction with labels_file"); + optional_configs.add_options()("FilteredLbuild,Lf", po::value(&Lf)->default_value(0), + "Build complexity for filtered points, higher value " + "results in better graphs"); + optional_configs.add_options()("label_type", po::value(&label_type)->default_value("uint"), + "Storage type of Labels , default value is uint which " + "will consume memory 4 bytes per filter"); + optional_configs.add_options()("unique_labels_supported", + po::value(&unique_labels_supported)->default_value(0), + "Number of unique labels supported by the dynamic index."); + // Merge required and optional parameters desc.add(required_configs).add(optional_configs); @@ -363,13 +414,6 @@ int main(int argc, char **argv) return 0; } po::notify(vm); - if (start_point_norm == 0) - { - std::cout << "When 
beginning_index_size is 0, use a start point with " - "appropriate norm" - << std::endl; - return -1; - } } catch (const std::exception &ex) { @@ -377,22 +421,92 @@ int main(int argc, char **argv) return -1; } + // Validate arguments + if (start_point_norm == 0) + { + std::cout << "When beginning_index_size is 0, use a start point with " + "appropriate norm" + << std::endl; + return -1; + } + + if (label_type != std::string("ushort") && label_type != std::string("uint")) + { + std::cerr << "Invalid label type. Supported types are uint and ushort" << std::endl; + return -1; + } + + if (data_type != std::string("int8") && data_type != std::string("uint8") && data_type != std::string("float")) + { + std::cerr << "Invalid data type. Supported types are int8, uint8 and float" << std::endl; + return -1; + } + + // TODO: Are additional distance functions supported? + if (dist_fn != std::string("l2") && dist_fn != std::string("mips")) + { + std::cerr << "Invalid distance function. Supported functions are l2 and mips" << std::endl; + return -1; + } + + if (num_start_pts < unique_labels_supported) + { + num_start_pts = unique_labels_supported; + } + try { - if (data_type == std::string("int8")) - build_incremental_index(data_path, L, R, alpha, insert_threads, consolidate_threads, - max_points_to_insert, active_window, consolidate_interval, start_point_norm, - num_start_pts, index_path_prefix); - else if (data_type == std::string("uint8")) - build_incremental_index(data_path, L, R, alpha, insert_threads, consolidate_threads, - max_points_to_insert, active_window, consolidate_interval, - start_point_norm, num_start_pts, index_path_prefix); + if (data_type == std::string("uint8")) + { + if (label_type == std::string("ushort")) + { + build_incremental_index( + data_path, L, R, alpha, insert_threads, consolidate_threads, max_points_to_insert, active_window, + consolidate_interval, start_point_norm, num_start_pts, index_path_prefix, label_file, + universal_label, Lf); + } + else 
if (label_type == std::string("uint")) + { + build_incremental_index( + data_path, L, R, alpha, insert_threads, consolidate_threads, max_points_to_insert, active_window, + consolidate_interval, start_point_norm, num_start_pts, index_path_prefix, label_file, + universal_label, Lf); + } + } + else if (data_type == std::string("int8")) + { + if (label_type == std::string("ushort")) + { + build_incremental_index( + data_path, L, R, alpha, insert_threads, consolidate_threads, max_points_to_insert, active_window, + consolidate_interval, start_point_norm, num_start_pts, index_path_prefix, label_file, + universal_label, Lf); + } + else if (label_type == std::string("uint")) + { + build_incremental_index( + data_path, L, R, alpha, insert_threads, consolidate_threads, max_points_to_insert, active_window, + consolidate_interval, start_point_norm, num_start_pts, index_path_prefix, label_file, + universal_label, Lf); + } + } else if (data_type == std::string("float")) - build_incremental_index(data_path, L, R, alpha, insert_threads, consolidate_threads, - max_points_to_insert, active_window, consolidate_interval, start_point_norm, - num_start_pts, index_path_prefix); - else - std::cout << "Unsupported type. 
Use float/int8/uint8" << std::endl; + { + if (label_type == std::string("ushort")) + { + build_incremental_index( + data_path, L, R, alpha, insert_threads, consolidate_threads, max_points_to_insert, active_window, + consolidate_interval, start_point_norm, num_start_pts, index_path_prefix, label_file, + universal_label, Lf); + } + else if (label_type == std::string("uint")) + { + build_incremental_index( + data_path, L, R, alpha, insert_threads, consolidate_threads, max_points_to_insert, active_window, + consolidate_interval, start_point_norm, num_start_pts, index_path_prefix, label_file, + universal_label, Lf); + } + } } catch (const std::exception &e) { diff --git a/apps/utils/compute_groundtruth_for_filters.cpp b/apps/utils/compute_groundtruth_for_filters.cpp index 5be7135e1..52e586475 100644 --- a/apps/utils/compute_groundtruth_for_filters.cpp +++ b/apps/utils/compute_groundtruth_for_filters.cpp @@ -415,11 +415,6 @@ inline void parse_label_file_into_vec(size_t &line_cnt, const std::string &map_f lbls.push_back(token); labels.insert(token); } - if (lbls.size() <= 0) - { - std::cout << "No label found"; - exit(-1); - } std::sort(lbls.begin(), lbls.end()); pts_to_labels.push_back(lbls); } diff --git a/include/abstract_index.h b/include/abstract_index.h index ff77d904e..12feec663 100644 --- a/include/abstract_index.h +++ b/include/abstract_index.h @@ -78,10 +78,17 @@ class AbstractIndex const size_t K, const uint32_t L, IndexType *indices, float *distances); + // insert points with labels, labels should be present for filtered index + template + int insert_point(const data_type *point, const tag_type tag, const std::vector &labels); + + // insert point for unfiltered index build. 
do not use with filtered index template int insert_point(const data_type *point, const tag_type tag); + // delete point with tag, or return -1 if point can not be deleted template int lazy_delete(const tag_type &tag); + // batch delete tags and populates failed tags if unable to delete given tags. template void lazy_delete(const std::vector &tags, std::vector &failed_tags); @@ -96,6 +103,8 @@ class AbstractIndex // memory should be allocated for vec before calling this function template int get_vector_by_tag(tag_type &tag, data_type *vec); + template void set_universal_label(const label_type universal_label); + private: virtual void _build(const DataType &data, const size_t num_points_to_load, TagVector &tags) = 0; virtual std::pair _search(const DataType &query, const size_t K, const uint32_t L, @@ -103,6 +112,7 @@ class AbstractIndex virtual std::pair _search_with_filters(const DataType &query, const std::string &filter_label, const size_t K, const uint32_t L, std::any &indices, float *distances) = 0; + virtual int _insert_point(const DataType &data_point, const TagType tag, Labelvector &labels) = 0; virtual int _insert_point(const DataType &data_point, const TagType tag) = 0; virtual int _lazy_delete(const TagType &tag) = 0; virtual void _lazy_delete(TagVector &tags, TagVector &failed_tags) = 0; @@ -112,5 +122,6 @@ class AbstractIndex virtual size_t _search_with_tags(const DataType &query, const uint64_t K, const uint32_t L, const TagType &tags, float *distances, DataVector &res_vectors) = 0; virtual void _search_with_optimized_layout(const DataType &query, size_t K, size_t L, uint32_t *indices) = 0; + virtual void _set_universal_label(const LabelType universal_label) = 0; }; } // namespace diskann diff --git a/include/defaults.h b/include/defaults.h index 834234e57..5ea5af495 100644 --- a/include/defaults.h +++ b/include/defaults.h @@ -11,6 +11,7 @@ namespace defaults const float ALPHA = 1.2f; const uint32_t NUM_THREADS = 0; const uint32_t MAX_OCCLUSION_SIZE = 
750; +const bool HAS_LABELS = false; const uint32_t FILTER_LIST_SIZE = 0; const uint32_t NUM_FROZEN_POINTS_STATIC = 0; const uint32_t NUM_FROZEN_POINTS_DYNAMIC = 1; diff --git a/include/filter_utils.h b/include/filter_utils.h index df1970be4..55f7aed28 100644 --- a/include/filter_utils.h +++ b/include/filter_utils.h @@ -57,6 +57,10 @@ DISKANN_DLLEXPORT void generate_label_indices(path input_data_path, path final_i DISKANN_DLLEXPORT load_label_index_return_values load_label_index(path label_index_path, uint32_t label_number_of_points); +template +DISKANN_DLLEXPORT std::tuple>, tsl::robin_set> parse_formatted_label_file( + path label_file); + DISKANN_DLLEXPORT parse_label_file_return_values parse_label_file(path label_data_path, std::string universal_label); template diff --git a/include/index.h b/include/index.h index cb27aeac0..e7966461c 100644 --- a/include/index.h +++ b/include/index.h @@ -57,7 +57,7 @@ template clas const size_t num_frozen_pts = 0, const bool dynamic_index = false, const bool enable_tags = false, const bool concurrent_consolidate = false, const bool pq_dist_build = false, const size_t num_pq_chunks = 0, - const bool use_opq = false); + const bool use_opq = false, const bool filtered_index = false); DISKANN_DLLEXPORT Index(const IndexConfig &index_config, std::unique_ptr> data_store, std::unique_ptr graph_store); @@ -96,7 +96,7 @@ template clas // Based on filter params builds a filtered or unfiltered index DISKANN_DLLEXPORT void build(const std::string &data_file, const size_t num_points_to_load, - IndexFilterParams &build_params); + IndexFilterParams &filter_params); // Filtered Support DISKANN_DLLEXPORT void build_filtered_index(const char *filename, const std::string &label_file, @@ -141,6 +141,9 @@ template clas // Will fail if tag already in the index or if tag=0. DISKANN_DLLEXPORT int insert_point(const T *point, const TagT tag); + // Will fail if tag already in the index or if tag=0. 
+ DISKANN_DLLEXPORT int insert_point(const T *point, const TagT tag, const std::vector &label); + // call this before issuing deletions to sets relevant flags DISKANN_DLLEXPORT int enable_delete(); @@ -202,6 +205,7 @@ template clas float *distances) override; virtual int _insert_point(const DataType &data_point, const TagType tag) override; + virtual int _insert_point(const DataType &data_point, const TagType tag, Labelvector &labels) override; virtual int _lazy_delete(const TagType &tag) override; @@ -218,6 +222,8 @@ template clas virtual size_t _search_with_tags(const DataType &query, const uint64_t K, const uint32_t L, const TagType &tags, float *distances, DataVector &res_vectors) override; + virtual void _set_universal_label(const LabelType universal_label) override; + // No copy/assign. Index(const Index &) = delete; Index &operator=(const Index &) = delete; @@ -342,6 +348,7 @@ template clas // needed for a dynamic index. The frozen points have consecutive locations. // See also _start below. 
size_t _num_frozen_pts = 0; + size_t _frozen_pts_used = 0; size_t _node_size; size_t _data_len; size_t _neighbor_len; @@ -362,11 +369,14 @@ template clas // Filter Support bool _filtered_index = false; - std::vector> _pts_to_labels; + // Location to label is only updated during insert_point(), all other reads are protected by + // default as a location can only be released at end of consolidate deletes + std::vector> _location_to_labels; tsl::robin_set _labels; std::string _labels_file; - std::unordered_map _label_to_medoid_id; + std::unordered_map _label_to_start_id; std::unordered_map _medoid_counts; + bool _use_universal_label = false; LabelT _universal_label = 0; uint32_t _filterIndexingQueueSize; @@ -416,11 +426,11 @@ template clas std::shared_timed_mutex // Ensure only one consolidate or compact_data is _consolidate_lock; // ever active std::shared_timed_mutex // RW lock for _tag_to_location, - _tag_lock; // _location_to_tag, _empty_slots, _nd, _max_points + _tag_lock; // _location_to_tag, _empty_slots, _nd, _max_points, _label_to_start_id std::shared_timed_mutex // RW Lock on _delete_set and _data_compacted _delete_lock; // variable - // Per node lock, cardinality=_max_points + // Per node lock, cardinality=_max_points + _num_frozen_points std::vector _locks; static const float INDEX_GROWTH_FACTOR; diff --git a/include/index_build_params.h b/include/index_build_params.h index a3012e99a..0233fcec4 100644 --- a/include/index_build_params.h +++ b/include/index_build_params.h @@ -8,6 +8,7 @@ struct IndexFilterParams public: std::string save_path_prefix; std::string label_file; + std::string tags_file; std::string universal_label; uint32_t filter_threshold = 0; @@ -63,6 +64,7 @@ class IndexFilterParamsBuilder private: std::string _save_path_prefix; std::string _label_file; + std::string _tags_file; std::string _universal_label; uint32_t _filter_threshold = 0; }; diff --git a/include/index_config.h b/include/index_config.h index 8b873fb6c..452498b01 100644 --- 
a/include/index_config.h +++ b/include/index_config.h @@ -27,6 +27,7 @@ struct IndexConfig bool pq_dist_build; bool concurrent_consolidate; bool use_opq; + bool filtered_index; size_t num_pq_chunks; size_t num_frozen_pts; @@ -43,15 +44,15 @@ struct IndexConfig private: IndexConfig(DataStoreStrategy data_strategy, GraphStoreStrategy graph_strategy, Metric metric, size_t dimension, size_t max_points, size_t num_pq_chunks, size_t num_frozen_points, bool dynamic_index, bool enable_tags, - bool pq_dist_build, bool concurrent_consolidate, bool use_opq, const std::string &data_type, - const std::string &tag_type, const std::string &label_type, + bool pq_dist_build, bool concurrent_consolidate, bool use_opq, bool filtered_index, + std::string &data_type, const std::string &tag_type, const std::string &label_type, std::shared_ptr index_write_params, std::shared_ptr index_search_params) : data_strategy(data_strategy), graph_strategy(graph_strategy), metric(metric), dimension(dimension), max_points(max_points), dynamic_index(dynamic_index), enable_tags(enable_tags), pq_dist_build(pq_dist_build), - concurrent_consolidate(concurrent_consolidate), use_opq(use_opq), num_pq_chunks(num_pq_chunks), - num_frozen_pts(num_frozen_points), label_type(label_type), tag_type(tag_type), data_type(data_type), - index_write_params(index_write_params), index_search_params(index_search_params) + concurrent_consolidate(concurrent_consolidate), use_opq(use_opq), filtered_index(filtered_index), + num_pq_chunks(num_pq_chunks), num_frozen_pts(num_frozen_points), label_type(label_type), tag_type(tag_type), + data_type(data_type), index_write_params(index_write_params), index_search_params(index_search_params) { } @@ -123,6 +124,12 @@ class IndexConfigBuilder return *this; } + IndexConfigBuilder &is_filtered(bool is_filtered) + { + this->_filtered_index = is_filtered; + return *this; + } + IndexConfigBuilder &with_num_pq_chunks(size_t num_pq_chunks) { this->_num_pq_chunks = num_pq_chunks; @@ -212,7 
+219,8 @@ class IndexConfigBuilder return IndexConfig(_data_strategy, _graph_strategy, _metric, _dimension, _max_points, _num_pq_chunks, _num_frozen_pts, _dynamic_index, _enable_tags, _pq_dist_build, _concurrent_consolidate, - _use_opq, _data_type, _tag_type, _label_type, _index_write_params, _index_search_params); + _use_opq, _filtered_index, _data_type, _tag_type, _label_type, _index_write_params, + _index_search_params); } IndexConfigBuilder(const IndexConfigBuilder &) = delete; @@ -231,12 +239,13 @@ class IndexConfigBuilder bool _pq_dist_build = false; bool _concurrent_consolidate = false; bool _use_opq = false; + bool _filtered_index{defaults::HAS_LABELS}; size_t _num_pq_chunks = 0; - size_t _num_frozen_pts = 0; + size_t _num_frozen_pts{defaults::NUM_FROZEN_POINTS_STATIC}; - std::string _label_type = "uint32"; - std::string _tag_type = "uint32"; + std::string _label_type{"uint32"}; + std::string _tag_type{"uint32"}; std::string _data_type; std::shared_ptr _index_write_params; diff --git a/include/parameters.h b/include/parameters.h index 4fec9ae08..2bba9aeca 100644 --- a/include/parameters.h +++ b/include/parameters.h @@ -23,15 +23,14 @@ class IndexWriteParameters const float alpha; const uint32_t num_threads; const uint32_t filter_list_size; // Lf - const uint32_t num_frozen_points; private: IndexWriteParameters(const uint32_t search_list_size, const uint32_t max_degree, const bool saturate_graph, const uint32_t max_occlusion_size, const float alpha, const uint32_t num_threads, - const uint32_t filter_list_size, const uint32_t num_frozen_points) + const uint32_t filter_list_size) : search_list_size(search_list_size), max_degree(max_degree), saturate_graph(saturate_graph), max_occlusion_size(max_occlusion_size), alpha(alpha), num_threads(num_threads), - filter_list_size(filter_list_size), num_frozen_points(num_frozen_points) + filter_list_size(filter_list_size) { } @@ -93,22 +92,16 @@ class IndexWriteParametersBuilder return *this; } - 
IndexWriteParametersBuilder &with_num_frozen_points(const uint32_t num_frozen_points) - { - _num_frozen_points = num_frozen_points; - return *this; - } - IndexWriteParameters build() const { return IndexWriteParameters(_search_list_size, _max_degree, _saturate_graph, _max_occlusion_size, _alpha, - _num_threads, _filter_list_size, _num_frozen_points); + _num_threads, _filter_list_size); } IndexWriteParametersBuilder(const IndexWriteParameters &wp) : _search_list_size(wp.search_list_size), _max_degree(wp.max_degree), _max_occlusion_size(wp.max_occlusion_size), _saturate_graph(wp.saturate_graph), _alpha(wp.alpha), - _filter_list_size(wp.filter_list_size), _num_frozen_points(wp.num_frozen_points) + _filter_list_size(wp.filter_list_size) { } IndexWriteParametersBuilder(const IndexWriteParametersBuilder &) = delete; @@ -122,7 +115,6 @@ class IndexWriteParametersBuilder float _alpha{defaults::ALPHA}; uint32_t _num_threads{defaults::NUM_THREADS}; uint32_t _filter_list_size{defaults::FILTER_LIST_SIZE}; - uint32_t _num_frozen_points{defaults::NUM_FROZEN_POINTS_STATIC}; }; } // namespace diskann diff --git a/include/types.h b/include/types.h index b95848869..953d59a5f 100644 --- a/include/types.h +++ b/include/types.h @@ -17,5 +17,6 @@ using TagType = std::any; using LabelType = std::any; using TagVector = AnyWrapper::AnyVector; using DataVector = AnyWrapper::AnyVector; +using Labelvector = AnyWrapper::AnyVector; using TagRobinSet = AnyWrapper::AnyRobinSet; } // namespace diskann diff --git a/include/utils.h b/include/utils.h index 58bb52a3b..195dd95ef 100644 --- a/include/utils.h +++ b/include/utils.h @@ -153,6 +153,7 @@ inline int delete_file(const std::string &fileName) } } +// generates formatted_label and _labels_map file. 
inline void convert_labels_string_to_int(const std::string &inFileName, const std::string &outFileName, const std::string &mapFileName, const std::string &unv_label) { @@ -160,7 +161,7 @@ inline void convert_labels_string_to_int(const std::string &inFileName, const st std::ofstream label_writer(outFileName); std::ifstream label_reader(inFileName); if (unv_label != "") - string_int_map[unv_label] = 0; + string_int_map[unv_label] = 0; // if universal label is provided map it to 0 always std::string line, token; while (std::getline(label_reader, line)) { @@ -173,7 +174,7 @@ inline void convert_labels_string_to_int(const std::string &inFileName, const st if (string_int_map.find(token) == string_int_map.end()) { uint32_t nextId = (uint32_t)string_int_map.size() + 1; - string_int_map[token] = nextId; + string_int_map[token] = nextId; // nextId can never be 0 } lbls.push_back(string_int_map[token]); } diff --git a/python/src/dynamic_memory_index.cpp b/python/src/dynamic_memory_index.cpp index 3add2aa5c..d05e54d96 100644 --- a/python/src/dynamic_memory_index.cpp +++ b/python/src/dynamic_memory_index.cpp @@ -13,8 +13,7 @@ diskann::IndexWriteParameters dynamic_index_write_parameters(const uint32_t comp const bool saturate_graph, const uint32_t max_occlusion_size, const float alpha, const uint32_t num_threads, - const uint32_t filter_complexity, - const uint32_t num_frozen_points) + const uint32_t filter_complexity) { return diskann::IndexWriteParametersBuilder(complexity, graph_degree) .with_saturate_graph(saturate_graph) @@ -22,17 +21,14 @@ diskann::IndexWriteParameters dynamic_index_write_parameters(const uint32_t comp .with_alpha(alpha) .with_num_threads(num_threads) .with_filter_list_size(filter_complexity) - .with_num_frozen_points(num_frozen_points) .build(); } template -diskann::Index dynamic_index_builder(const diskann::Metric m, - const diskann::IndexWriteParameters &write_params, - const size_t dimensions, const size_t max_vectors, - const uint32_t 
initial_search_complexity, - const uint32_t initial_search_threads, - const bool concurrent_consolidation) +diskann::Index dynamic_index_builder( + const diskann::Metric m, const diskann::IndexWriteParameters &write_params, const size_t dimensions, + const size_t max_vectors, const uint32_t initial_search_complexity, const uint32_t initial_search_threads, + const bool concurrent_consolidation, const uint32_t num_frozen_points) { const uint32_t _initial_search_threads = initial_search_threads != 0 ? initial_search_threads : omp_get_num_procs(); @@ -41,7 +37,7 @@ diskann::Index dynamic_index_builder(const diskann:: m, dimensions, max_vectors, std::make_shared(write_params), // index write params std::make_shared(index_search_params), // index_search_params - write_params.num_frozen_points, // frozen_points + num_frozen_points, // frozen_points true, // dynamic_index true, // enable_tags concurrent_consolidation, @@ -60,9 +56,9 @@ DynamicMemoryIndex
::DynamicMemoryIndex(const diskann::Metric m, const size_t const uint32_t initial_search_threads, const bool concurrent_consolidation) : _initial_search_complexity(initial_search_complexity != 0 ? initial_search_complexity : complexity), _write_parameters(dynamic_index_write_parameters(complexity, graph_degree, saturate_graph, max_occlusion_size, - alpha, num_threads, filter_complexity, num_frozen_points)), + alpha, num_threads, filter_complexity)), _index(dynamic_index_builder
(m, _write_parameters, dimensions, max_vectors, _initial_search_complexity, - initial_search_threads, concurrent_consolidation)) + initial_search_threads, concurrent_consolidation, num_frozen_points)) { } diff --git a/src/abstract_index.cpp b/src/abstract_index.cpp index ee55b0753..a7a5986cc 100644 --- a/src/abstract_index.cpp +++ b/src/abstract_index.cpp @@ -56,6 +56,15 @@ int AbstractIndex::insert_point(const data_type *point, const tag_type tag) return this->_insert_point(any_point, any_tag); } +template +int AbstractIndex::insert_point(const data_type *point, const tag_type tag, const std::vector &labels) +{ + auto any_point = std::any(point); + auto any_tag = std::any(tag); + auto any_labels = Labelvector(labels); + return this->_insert_point(any_point, any_tag, any_labels); +} + template int AbstractIndex::lazy_delete(const tag_type &tag) { auto any_tag = std::any(tag); @@ -89,6 +98,12 @@ template int AbstractIndex::get_vector_b return this->_get_vector_by_tag(any_tag, any_data_ptr); } +template void AbstractIndex::set_universal_label(const label_type universal_label) +{ + auto any_label = std::any(universal_label); + this->_set_universal_label(any_label); +} + // exports template DISKANN_DLLEXPORT void AbstractIndex::build(const float *data, const size_t num_points_to_load, const std::vector &tags); @@ -226,6 +241,62 @@ template DISKANN_DLLEXPORT int AbstractIndex::insert_point(cons template DISKANN_DLLEXPORT int AbstractIndex::insert_point(const uint8_t *point, const uint64_t tag); template DISKANN_DLLEXPORT int AbstractIndex::insert_point(const int8_t *point, const uint64_t tag); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const float *point, const int32_t tag, const std::vector &labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const uint8_t *point, const int32_t tag, const std::vector &labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const int8_t *point, const int32_t tag, const std::vector 
&labels); + +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const float *point, const uint32_t tag, const std::vector &labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const uint8_t *point, const uint32_t tag, const std::vector &labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const int8_t *point, const uint32_t tag, const std::vector &labels); + +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const float *point, const int64_t tag, const std::vector &labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const uint8_t *point, const int64_t tag, const std::vector &labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const int8_t *point, const int64_t tag, const std::vector &labels); + +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const float *point, const uint64_t tag, const std::vector &labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const uint8_t *point, const uint64_t tag, const std::vector &labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const int8_t *point, const uint64_t tag, const std::vector &labels); + +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const float *point, const int32_t tag, const std::vector &labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const uint8_t *point, const int32_t tag, const std::vector &labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const int8_t *point, const int32_t tag, const std::vector &labels); + +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const float *point, const uint32_t tag, const std::vector &labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const uint8_t *point, const uint32_t tag, const std::vector &labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const int8_t *point, const uint32_t tag, const std::vector &labels); + 
+template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const float *point, const int64_t tag, const std::vector &labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const uint8_t *point, const int64_t tag, const std::vector &labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const int8_t *point, const int64_t tag, const std::vector &labels); + +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const float *point, const uint64_t tag, const std::vector &labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const uint8_t *point, const uint64_t tag, const std::vector &labels); +template DISKANN_DLLEXPORT int AbstractIndex::insert_point( + const int8_t *point, const uint64_t tag, const std::vector &labels); + template DISKANN_DLLEXPORT int AbstractIndex::lazy_delete(const int32_t &tag); template DISKANN_DLLEXPORT int AbstractIndex::lazy_delete(const uint32_t &tag); template DISKANN_DLLEXPORT int AbstractIndex::lazy_delete(const int64_t &tag); @@ -264,4 +335,7 @@ template DISKANN_DLLEXPORT int AbstractIndex::get_vector_by_tag template DISKANN_DLLEXPORT int AbstractIndex::get_vector_by_tag(uint64_t &tag, uint8_t *vec); template DISKANN_DLLEXPORT int AbstractIndex::get_vector_by_tag(uint64_t &tag, int8_t *vec); +template DISKANN_DLLEXPORT void AbstractIndex::set_universal_label(const uint16_t label); +template DISKANN_DLLEXPORT void AbstractIndex::set_universal_label(const uint32_t label); + } // namespace diskann diff --git a/src/disk_utils.cpp b/src/disk_utils.cpp index a67059c8d..297619b4a 100644 --- a/src/disk_utils.cpp +++ b/src/disk_utils.cpp @@ -635,9 +635,10 @@ int build_merged_vamana_index(std::string base_file, diskann::Metric compareMetr .with_num_threads(num_threads) .build(); using TagT = uint32_t; - diskann::Index _index( - compareMetric, base_dim, base_num, std::make_shared(paras), nullptr, - paras.num_frozen_points, false, false, false, build_pq_bytes > 0, build_pq_bytes, 
use_opq); + diskann::Index _index(compareMetric, base_dim, base_num, + std::make_shared(paras), nullptr, + defaults::NUM_FROZEN_POINTS_STATIC, false, false, false, + build_pq_bytes > 0, build_pq_bytes, use_opq, use_filters); if (!use_filters) _index.build(base_file.c_str(), base_num); else @@ -703,7 +704,7 @@ int build_merged_vamana_index(std::string base_file, diskann::Metric compareMetr diskann::Index _index(compareMetric, shard_base_dim, shard_base_pts, std::make_shared(low_degree_params), nullptr, - low_degree_params.num_frozen_points, false, false, false, build_pq_bytes > 0, + defaults::NUM_FROZEN_POINTS_STATIC, false, false, false, build_pq_bytes > 0, build_pq_bytes, use_opq); if (!use_filters) { diff --git a/src/filter_utils.cpp b/src/filter_utils.cpp index 0cdb9bde7..b55a3230f 100644 --- a/src/filter_utils.cpp +++ b/src/filter_utils.cpp @@ -261,6 +261,74 @@ parse_label_file_return_values parse_label_file(path label_data_path, std::strin return std::make_tuple(point_ids_to_labels, labels_to_number_of_points, all_labels); } +/* + * A templated function to parse a file of labels that are already represented + * as either uint16_t or uint32_t + * + * Returns two objects via std::tuple: + * 1. a vector of vectors of labels, where the outer vector is indexed by point id + * 2. 
a set of all labels + */ +template +std::tuple>, tsl::robin_set> parse_formatted_label_file(std::string label_file) +{ + std::vector> pts_to_labels; + tsl::robin_set labels; + + // Format of Label txt file: filters with comma separators + std::ifstream infile(label_file); + if (infile.fail()) + { + throw diskann::ANNException(std::string("Failed to open file ") + label_file, -1); + } + + std::string line, token; + uint32_t line_cnt = 0; + + while (std::getline(infile, line)) + { + line_cnt++; + } + pts_to_labels.resize(line_cnt, std::vector()); + + infile.clear(); + infile.seekg(0, std::ios::beg); + line_cnt = 0; + + while (std::getline(infile, line)) + { + std::istringstream iss(line); + std::vector lbls(0); + getline(iss, token, '\t'); + std::istringstream new_iss(token); + while (getline(new_iss, token, ',')) + { + token.erase(std::remove(token.begin(), token.end(), '\n'), token.end()); + token.erase(std::remove(token.begin(), token.end(), '\r'), token.end()); + LabelT token_as_num = static_cast(std::stoul(token)); + lbls.push_back(token_as_num); + labels.insert(token_as_num); + } + if (lbls.size() <= 0) + { + diskann::cout << "No label found"; + exit(-1); + } + std::sort(lbls.begin(), lbls.end()); + pts_to_labels[line_cnt] = lbls; + line_cnt++; + } + diskann::cout << "Identified " << labels.size() << " distinct label(s)" << std::endl; + + return std::make_tuple(pts_to_labels, labels); +} + +template DISKANN_DLLEXPORT std::tuple>, tsl::robin_set> +parse_formatted_label_file(path label_file); + +template DISKANN_DLLEXPORT std::tuple>, tsl::robin_set> +parse_formatted_label_file(path label_file); + template DISKANN_DLLEXPORT void generate_label_indices(path input_data_path, path final_index_path_prefix, label_set all_labels, uint32_t R, uint32_t L, float alpha, uint32_t num_threads); diff --git a/src/index.cpp b/src/index.cpp index 478b86273..3de3a3b7f 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -33,7 +33,8 @@ Index::Index(const IndexConfig &index_config, 
std::unique_ptr), _conc_consolidate(index_config.concurrent_consolidate) { if (_dynamic_index && !_enable_tags) @@ -93,6 +94,10 @@ Index::Index(const IndexConfig &index_config, std::unique_ptrenable_delete(); // enable delete by default for dynamic index + if (_filtered_index) + { + _location_to_labels.resize(total_internal_points); + } } if (index_config.index_write_params != nullptr) @@ -108,7 +113,6 @@ Index::Index(const IndexConfig &index_config, std::unique_ptrnum_search_threads + _indexingThreads; - initialize_query_scratch(num_scratch_spaces, index_config.index_search_params->initial_search_list_size, _indexingQueueSize, _indexingRange, _indexingMaxC, _data_store->get_dims()); } @@ -120,7 +124,8 @@ Index::Index(Metric m, const size_t dim, const size_t max_point const std::shared_ptr index_parameters, const std::shared_ptr index_search_params, const size_t num_frozen_pts, const bool dynamic_index, const bool enable_tags, const bool concurrent_consolidate, - const bool pq_dist_build, const size_t num_pq_chunks, const bool use_opq) + const bool pq_dist_build, const size_t num_pq_chunks, const bool use_opq, + const bool filtered_index) : Index(IndexConfigBuilder() .with_metric(m) .with_dimension(dim) @@ -134,6 +139,7 @@ Index::Index(Metric m, const size_t dim, const size_t max_point .is_pq_dist_build(pq_dist_build) .with_num_pq_chunks(num_pq_chunks) .is_use_opq(use_opq) + .is_filtered(filtered_index) .with_data_type(diskann_type_to_name()) .build(), IndexFactory::construct_datastore( @@ -283,14 +289,14 @@ void Index::save(const char *filename, bool compact_before_save { if (_filtered_index) { - if (_label_to_medoid_id.size() > 0) + if (_label_to_start_id.size() > 0) { std::ofstream medoid_writer(std::string(filename) + "_labels_to_medoids.txt"); if (medoid_writer.fail()) { throw diskann::ANNException(std::string("Failed to open file ") + filename, -1); } - for (auto iter : _label_to_medoid_id) + for (auto iter : _label_to_start_id) { medoid_writer << iter.first 
<< ", " << iter.second << std::endl; } @@ -305,21 +311,51 @@ void Index::save(const char *filename, bool compact_before_save universal_label_writer.close(); } - if (_pts_to_labels.size() > 0) + if (_location_to_labels.size() > 0) { std::ofstream label_writer(std::string(filename) + "_labels.txt"); assert(label_writer.is_open()); - for (uint32_t i = 0; i < _pts_to_labels.size(); i++) + for (uint32_t i = 0; i < _nd + _num_frozen_pts; i++) { - for (uint32_t j = 0; j < (_pts_to_labels[i].size() - 1); j++) + for (uint32_t j = 0; j + 1 < _location_to_labels[i].size(); j++) { - label_writer << _pts_to_labels[i][j] << ","; + label_writer << _location_to_labels[i][j] << ","; } - if (_pts_to_labels[i].size() != 0) - label_writer << _pts_to_labels[i][_pts_to_labels[i].size() - 1]; + if (_location_to_labels[i].size() != 0) + label_writer << _location_to_labels[i][_location_to_labels[i].size() - 1]; + label_writer << std::endl; } label_writer.close(); + + // write compacted raw_labels if data hence _location_to_labels was also compacted + if (compact_before_save && _dynamic_index) + { + _label_map = load_label_map(std::string(filename) + "_labels_map.txt"); + std::unordered_map mapped_to_raw_labels; + // invert label map + for (const auto &[key, value] : _label_map) + { + mapped_to_raw_labels.insert({value, key}); + } + + // write updated labels + std::ofstream raw_label_writer(std::string(filename) + "_raw_labels.txt"); + assert(raw_label_writer.is_open()); + for (uint32_t i = 0; i < _nd + _num_frozen_pts; i++) + { + for (uint32_t j = 0; j + 1 < _location_to_labels[i].size(); j++) + { + raw_label_writer << mapped_to_raw_labels[_location_to_labels[i][j]] << ","; + } + if (_location_to_labels[i].size() != 0) + raw_label_writer + << mapped_to_raw_labels[_location_to_labels[i][_location_to_labels[i].size() - 1]]; + + raw_label_writer << std::endl; + } + raw_label_writer.close(); + } } } @@ -553,14 +589,14 @@ void Index::load(const char *filename, uint32_t num_threads, ui { 
_label_map = load_label_map(labels_map_file); parse_label_file(labels_file, label_num_pts); - assert(label_num_pts == data_file_num_pts); + assert(label_num_pts == data_file_num_pts - _num_frozen_pts); if (file_exists(labels_to_medoids)) { std::ifstream medoid_stream(labels_to_medoids); std::string line, token; uint32_t line_cnt = 0; - _label_to_medoid_id.clear(); + _label_to_start_id.clear(); while (std::getline(medoid_stream, line)) { @@ -579,7 +615,7 @@ void Index::load(const char *filename, uint32_t num_threads, ui medoid = token_as_num; cnt++; } - _label_to_medoid_id[label] = medoid; + _label_to_start_id[label] = medoid; line_cnt++; } } @@ -729,7 +765,7 @@ template bool Index::detect_common_filters(uint32_t point_id, bool search_invocation, const std::vector &incoming_labels) { - auto &curr_node_labels = _pts_to_labels[point_id]; + auto &curr_node_labels = _location_to_labels[point_id]; std::vector common_filters; std::set_intersection(incoming_labels.begin(), incoming_labels.end(), curr_node_labels.begin(), curr_node_labels.end(), std::back_inserter(common_filters)); @@ -759,7 +795,7 @@ bool Index::detect_common_filters(uint32_t point_id, bool searc template std::pair Index::iterate_to_fixed_point( const T *query, const uint32_t Lsize, const std::vector &init_ids, InMemQueryScratch *scratch, - bool use_filter, const std::vector &filter_label, bool search_invocation) + bool use_filter, const std::vector &filter_labels, bool search_invocation) { std::vector &expanded_nodes = scratch->pool(); NeighborPriorityQueue &best_L_nodes = scratch->best_l_nodes(); @@ -844,7 +880,7 @@ std::pair Index::iterate_to_fixed_point( if (use_filter) { - if (!detect_common_filters(id, search_invocation, filter_label)) + if (!detect_common_filters(id, search_invocation, filter_labels)) continue; } @@ -912,7 +948,7 @@ std::pair Index::iterate_to_fixed_point( if (use_filter) { // NOTE: NEED TO CHECK IF THIS CORRECT WITH NEW LOCKS. 
- if (!detect_common_filters(id, search_invocation, filter_label)) + if (!detect_common_filters(id, search_invocation, filter_labels)) continue; } @@ -921,7 +957,6 @@ std::pair Index::iterate_to_fixed_point( id_scratch.push_back(id); } } - if (_dynamic_index) _locks[n].unlock(); } @@ -988,13 +1023,44 @@ void Index::search_for_point_and_prune(int location, uint32_t L } else { + std::shared_lock tl(_tag_lock, std::defer_lock); + if (_dynamic_index) + tl.lock(); std::vector filter_specific_start_nodes; - for (auto &x : _pts_to_labels[location]) - filter_specific_start_nodes.emplace_back(_label_to_medoid_id[x]); + for (auto &x : _location_to_labels[location]) + filter_specific_start_nodes.emplace_back(_label_to_start_id[x]); + + if (_dynamic_index) + tl.unlock(); _data_store->get_vector(location, scratch->aligned_query()); iterate_to_fixed_point(scratch->aligned_query(), filteredLindex, filter_specific_start_nodes, scratch, true, - _pts_to_labels[location], false); + _location_to_labels[location], false); + + // combine candidate pools obtained with filter and unfiltered criteria. 
+ std::set best_candidate_pool; + for (auto filtered_neighbor : scratch->pool()) + { + best_candidate_pool.insert(filtered_neighbor); + } + + // clear scratch for finding unfiltered candidates + scratch->clear(); + + _data_store->get_vector(location, scratch->aligned_query()); + iterate_to_fixed_point(scratch->aligned_query(), Lindex, init_ids, scratch, false, unused_filter_label, false); + + for (auto unfiltered_neighbour : scratch->pool()) + { + // insert if this neighbour is not already in best_candidate_pool + if (best_candidate_pool.find(unfiltered_neighbour) == best_candidate_pool.end()) + { + best_candidate_pool.insert(unfiltered_neighbour); + } + } + + scratch->pool().clear(); + std::copy(best_candidate_pool.begin(), best_candidate_pool.end(), std::back_inserter(scratch->pool())); } auto &pool = scratch->pool(); @@ -1077,9 +1143,12 @@ void Index::occlude_list(const uint32_t location, std::vectorid; uint32_t b = iter2->id; - for (auto &x : _pts_to_labels[b]) + if (_location_to_labels.size() < b || _location_to_labels.size() < a) + continue; + for (auto &x : _location_to_labels[b]) { - if (std::find(_pts_to_labels[a].begin(), _pts_to_labels[a].end(), x) == _pts_to_labels[a].end()) + if (std::find(_location_to_labels[a].begin(), _location_to_labels[a].end(), x) == + _location_to_labels[a].end()) { prune_allowed = false; } @@ -1265,19 +1334,20 @@ template void Index> manager(_query_scratch); auto scratch = manager.scratch_space(); - std::vector pruned_list; if (_filtered_index) { - search_for_point_and_prune(node, _indexingQueueSize, pruned_list, scratch, _filtered_index, - _filterIndexingQueueSize); + search_for_point_and_prune(node, _indexingQueueSize, pruned_list, scratch, true, _filterIndexingQueueSize); } else { search_for_point_and_prune(node, _indexingQueueSize, pruned_list, scratch); } + assert(pruned_list.size() > 0); + { LockGuard guard(_locks[node]); @@ -1701,9 +1771,6 @@ template void Index::build(const std::string &data_file, const size_t 
num_points_to_load, IndexFilterParams &filter_params) { - std::string labels_file_to_use = filter_params.save_path_prefix + "_label_formatted.txt"; - std::string mem_labels_int_map_file = filter_params.save_path_prefix + "_labels_map.txt"; - size_t points_to_load = num_points_to_load == 0 ? _max_points : num_points_to_load; auto s = std::chrono::high_resolution_clock::now(); @@ -1714,6 +1781,8 @@ void Index::build(const std::string &data_file, const size_t nu else { // TODO: this should ideally happen in save() + std::string labels_file_to_use = filter_params.save_path_prefix + "_label_formatted.txt"; + std::string mem_labels_int_map_file = filter_params.save_path_prefix + "_labels_map.txt"; convert_labels_string_to_int(filter_params.label_file, labels_file_to_use, mem_labels_int_map_file, filter_params.universal_label); if (filter_params.universal_label != "") @@ -1782,7 +1851,7 @@ void Index::parse_label_file(const std::string &label_file, siz { line_cnt++; } - _pts_to_labels.resize(line_cnt, std::vector()); + _location_to_labels.resize(line_cnt, std::vector()); infile.clear(); infile.seekg(0, std::ios::beg); @@ -1802,19 +1871,21 @@ void Index::parse_label_file(const std::string &label_file, siz lbls.push_back(token_as_num); _labels.insert(token_as_num); } - if (lbls.size() <= 0) - { - diskann::cout << "No label found"; - exit(-1); - } + std::sort(lbls.begin(), lbls.end()); - _pts_to_labels[line_cnt] = lbls; + _location_to_labels[line_cnt] = lbls; line_cnt++; } num_points = (size_t)line_cnt; diskann::cout << "Identified " << _labels.size() << " distinct label(s)" << std::endl; } +template +void Index::_set_universal_label(const LabelType universal_label) +{ + this->set_universal_label(std::any_cast(universal_label)); +} + template void Index::set_universal_label(const LabelT &label) { @@ -1826,9 +1897,8 @@ template void Index::build_filtered_index(const char *filename, const std::string &label_file, const size_t num_points_to_load, const std::vector &tags) { - 
_labels_file = label_file; // original label file _filtered_index = true; - _label_to_medoid_id.clear(); + _label_to_start_id.clear(); size_t num_points_labels = 0; parse_label_file(label_file, @@ -1839,7 +1909,7 @@ void Index::build_filtered_index(const char *filename, const st for (uint32_t point_id = 0; point_id < num_points_to_load; point_id++) { - for (auto label : _pts_to_labels[point_id]) + for (auto label : _location_to_labels[point_id]) { if (label != _universal_label) { @@ -1884,7 +1954,7 @@ void Index::build_filtered_index(const char *filename, const st best_medoid = cur_cnd; } } - _label_to_medoid_id[curr_label] = best_medoid; + _label_to_start_id[curr_label] = best_medoid; _medoid_counts[best_medoid]++; } @@ -2036,10 +2106,13 @@ std::pair Index::search_with_filters(const std::vector init_ids = get_init_ids(); std::shared_lock lock(_update_lock); + std::shared_lock tl(_tag_lock, std::defer_lock); + if (_dynamic_index) + tl.lock(); - if (_label_to_medoid_id.find(filter_label) != _label_to_medoid_id.end()) + if (_label_to_start_id.find(filter_label) != _label_to_start_id.end()) { - init_ids.emplace_back(_label_to_medoid_id[filter_label]); + init_ids.emplace_back(_label_to_start_id[filter_label]); } else { @@ -2047,6 +2120,9 @@ std::pair Index::search_with_filters(const << std::endl; // RKNOTE: If universal label found start there throw diskann::ANNException("No filtered medoid found. 
exitting ", -1); } + if (_dynamic_index) + tl.unlock(); + filter_vec.emplace_back(filter_label); _data_store->get_dist_fn()->preprocess_query(query, _data_store->get_dims(), scratch->aligned_query()); @@ -2061,7 +2137,23 @@ std::pair Index::search_with_filters(const { // safe because Index uses uint32_t ids internally // and IDType will be uint32_t or uint64_t - indices[pos] = (IdType)best_L_nodes[i].id; + if (_enable_tags) + { + TagT tag; + if (_location_to_tag.try_get(best_L_nodes[i].id, tag)) + { + indices[pos] = (IdType)tag; + } + else + { + continue; + } + } + else + { + indices[pos] = (IdType)best_L_nodes[i].id; + } + if (distances != nullptr) { #ifdef EXEC_ENV_OLS @@ -2211,6 +2303,7 @@ template void Indexcopy_vectors((location_t)res, (location_t)_max_points, 1); } + _frozen_pts_used++; } template int Index::enable_delete() @@ -2420,6 +2513,17 @@ template void Index void Index new_adj_list; if ((new_location[old] < _max_points) // If point continues to exist @@ -2501,8 +2606,13 @@ template void Indexget_neighbours(new_location[old]).swap(_graph_store->get_neighbours((location_t)old)); _graph_store->swap_neighbours(new_location[old], (location_t)old); + + if (_filtered_index) + { + _location_to_labels[new_location[old]].swap(_location_to_labels[old]); + } + _data_store->copy_vectors(old, new_location[old], 1); } } @@ -2524,12 +2634,21 @@ template void Indexclear_neighbours((location_t)old); } + if (_filtered_index) + { + for (size_t old = _nd; old < _max_points; old++) + { + _location_to_labels[old].clear(); + } + } + _empty_slots.clear(); + // mark all slots after _nd as empty for (auto i = _nd; i < _max_points; i++) { _empty_slots.insert((uint32_t)i); @@ -2640,12 +2759,13 @@ void Index::reposition_points(uint32_t old_location_start, uint for (uint32_t loc_offset = 0; loc_offset < num_locations; loc_offset++) { assert(_graph_store->get_neighbours(new_location_start + loc_offset).empty()); - /* _graph_store->get_neighbours(new_location_start + loc_offset) - 
.swap(_graph_store->get_neighbours(old_location_start + - loc_offset));*/ _graph_store->swap_neighbours(new_location_start + loc_offset, old_location_start + loc_offset); + if (_dynamic_index && _filtered_index) + { + _location_to_labels[new_location_start + loc_offset].swap( + _location_to_labels[old_location_start + loc_offset]); + } } - // If ranges are overlapping, make sure not to clear the newly copied // data. if (mem_clear_loc_start < new_location_start + num_locations) @@ -2661,10 +2781,12 @@ void Index::reposition_points(uint32_t old_location_start, uint for (uint32_t loc_offset = num_locations; loc_offset > 0; loc_offset--) { assert(_graph_store->get_neighbours(new_location_start + loc_offset - 1u).empty()); - /*_graph_store->get_neighbours(new_location_start + loc_offset - 1u) - .swap(_graph_store->get_neighbours(old_location_start + loc_offset - - 1u));*/ _graph_store->swap_neighbours(new_location_start + loc_offset - 1u, old_location_start + loc_offset - 1u); + if (_dynamic_index && _filtered_index) + { + _location_to_labels[new_location_start + loc_offset - 1u].swap( + _location_to_labels[old_location_start + loc_offset - 1u]); + } } // If ranges are overlapping, make sure not to clear the newly copied @@ -2691,6 +2813,17 @@ template void Index void Index::resize(size_t new_max_points) @@ -2737,9 +2870,35 @@ int Index::_insert_point(const DataType &point, const TagType t } } +template +int Index::_insert_point(const DataType &point, const TagType tag, Labelvector &labels) +{ + try + { + return this->insert_point(std::any_cast(point), std::any_cast(tag), + labels.get>()); + } + catch (const std::bad_any_cast &anycast_e) + { + throw new ANNException("Error:Trying to insert invalid data type" + std::string(anycast_e.what()), -1); + } + catch (const std::exception &e) + { + throw new ANNException("Error:" + std::string(e.what()), -1); + } +} + template int Index::insert_point(const T *point, const TagT tag) { + std::vector no_labels{0}; + return 
insert_point(point, tag, no_labels); +} + +template +int Index::insert_point(const T *point, const TagT tag, const std::vector &labels) +{ + assert(_has_built); if (tag == static_cast(0)) { @@ -2753,8 +2912,42 @@ int Index::insert_point(const T *point, const TagT tag) std::unique_lock tl(_tag_lock); std::unique_lock dl(_delete_lock); - // Find a vacant location in the data array to insert the new point auto location = reserve_location(); + if (_filtered_index) + { + if (labels.empty()) + { + release_location(location); + std::cerr << "Error: Can't insert point with tag " + std::to_string(tag) + + " . there are no labels for the point." + << std::endl; + return -1; + } + + _location_to_labels[location] = labels; + + for (LabelT label : labels) + { + if (_labels.find(label) == _labels.end()) + { + if (_frozen_pts_used >= _num_frozen_pts) + { + throw ANNException( + "Error: For dynamic filtered index, the number of frozen points should be atleast equal " + "to number of unique labels.", + -1); + } + + auto fz_location = (int)(_max_points) + _frozen_pts_used; // as first _fz_point + _labels.insert(label); + _label_to_start_id[label] = (uint32_t)fz_location; + _location_to_labels[fz_location] = {label}; + _data_store->set_vector((location_t)fz_location, point); + _frozen_pts_used++; + } + } + } + if (location == -1) { #if EXPAND_IF_FULL @@ -2792,12 +2985,13 @@ int Index::insert_point(const T *point, const TagT tag) #else return -1; #endif - } + } // cant insert as active pts >= max_pts dl.unlock(); // Insert tag and mapping to location if (_enable_tags) { + // if tags are enabled and tag is already inserted. so we can't reuse that tag. 
if (_tag_to_location.find(tag) != _tag_to_location.end()) { release_location(location); @@ -2809,20 +3003,23 @@ int Index::insert_point(const T *point, const TagT tag) } tl.unlock(); - _data_store->set_vector(location, point); + _data_store->set_vector(location, point); // update datastore // Find and add appropriate graph edges ScratchStoreManager> manager(_query_scratch); auto scratch = manager.scratch_space(); - std::vector pruned_list; + std::vector pruned_list; // it is the set best candidates to connect to this point if (_filtered_index) { + // when filtered the best_candidates will share the same label ( label_present > distance) search_for_point_and_prune(location, _indexingQueueSize, pruned_list, scratch, true, _filterIndexingQueueSize); } else { search_for_point_and_prune(location, _indexingQueueSize, pruned_list, scratch); } + assert(pruned_list.size() > 0); // should find atleast one neighbour (i.e frozen point acting as medoid) + { std::shared_lock tlock(_tag_lock, std::defer_lock); if (_conc_consolidate) @@ -2898,7 +3095,6 @@ template int Index _delete_set->insert(location); _location_to_tag.erase(location); _tag_to_location.erase(tag); - return 0; } diff --git a/src/index_factory.cpp b/src/index_factory.cpp index aa2042725..f3f29e183 100644 --- a/src/index_factory.cpp +++ b/src/index_factory.cpp @@ -57,7 +57,7 @@ std::unique_ptr> IndexFactory::construct_datastore(const Da std::unique_ptr> distance; switch (strategy) { - case diskann::DataStoreStrategy::MEMORY: + case DataStoreStrategy::MEMORY: if (m == diskann::Metric::COSINE && std::is_same::value) { distance.reset((Distance *)new AVXNormalizedCosineDistanceFloat()); @@ -97,8 +97,7 @@ std::unique_ptr IndexFactory::create_instance() (size_t)(defaults::GRAPH_SLACK_FACTOR * 1.05 * (_config->index_write_params == nullptr ? 
0 : _config->index_write_params->max_degree)); auto data_store = construct_datastore(_config->data_strategy, num_points, dim, _config->metric); - auto graph_store = - construct_graphstore(_config->graph_strategy, num_points + _config->num_frozen_pts, max_reserve_degree); + auto graph_store = construct_graphstore(_config->graph_strategy, num_points, max_reserve_degree); return std::make_unique>(*_config, std::move(data_store), std::move(graph_store)); } diff --git a/src/restapi/search_wrapper.cpp b/src/restapi/search_wrapper.cpp index 2cbefef3f..001e36d39 100644 --- a/src/restapi/search_wrapper.cpp +++ b/src/restapi/search_wrapper.cpp @@ -100,8 +100,9 @@ InMemorySearch::InMemorySearch(const std::string &baseFile, const std::string { size_t dimensions, total_points = 0; diskann::get_bin_metadata(baseFile, total_points, dimensions); + auto search_params = diskann::IndexSearchParams(search_l, num_threads); _index = std::unique_ptr>( - new diskann::Index(m, dimensions, total_points, nullptr, search_l, 0, false)); + new diskann::Index(m, dimensions, total_points, nullptr, search_params, 0, false)); _index->load(indexFile.c_str(), num_threads, search_l); } diff --git a/tests/index_write_parameters_builder_tests.cpp b/tests/index_write_parameters_builder_tests.cpp index acd5e2227..0aa798da8 100644 --- a/tests/index_write_parameters_builder_tests.cpp +++ b/tests/index_write_parameters_builder_tests.cpp @@ -14,7 +14,6 @@ BOOST_AUTO_TEST_CASE(test_build) float alpha = (float)rand(); uint32_t filter_list_size = rand(); uint32_t max_occlusion_size = rand(); - uint32_t num_frozen_points = rand(); bool saturate_graph = true; diskann::IndexWriteParametersBuilder builder(search_list_size, max_degree); @@ -22,7 +21,6 @@ BOOST_AUTO_TEST_CASE(test_build) builder.with_alpha(alpha) .with_filter_list_size(filter_list_size) .with_max_occlusion_size(max_occlusion_size) - .with_num_frozen_points(num_frozen_points) .with_num_threads(0) .with_saturate_graph(saturate_graph); @@ -34,7 +32,6 @@ 
BOOST_AUTO_TEST_CASE(test_build) BOOST_TEST(alpha == parameters.alpha); BOOST_TEST(filter_list_size == parameters.filter_list_size); BOOST_TEST(max_occlusion_size == parameters.max_occlusion_size); - BOOST_TEST(num_frozen_points == parameters.num_frozen_points); BOOST_TEST(saturate_graph == parameters.saturate_graph); BOOST_TEST(parameters.num_threads > (uint32_t)0); @@ -43,8 +40,7 @@ BOOST_AUTO_TEST_CASE(test_build) { uint32_t num_threads = rand() + 1; saturate_graph = false; - builder.with_num_threads(num_threads) - .with_saturate_graph(saturate_graph); + builder.with_num_threads(num_threads).with_saturate_graph(saturate_graph); auto parameters = builder.build(); @@ -53,7 +49,6 @@ BOOST_AUTO_TEST_CASE(test_build) BOOST_TEST(alpha == parameters.alpha); BOOST_TEST(filter_list_size == parameters.filter_list_size); BOOST_TEST(max_occlusion_size == parameters.max_occlusion_size); - BOOST_TEST(num_frozen_points == parameters.num_frozen_points); BOOST_TEST(saturate_graph == parameters.saturate_graph); BOOST_TEST(num_threads == parameters.num_threads); diff --git a/workflows/dynamic_index.md b/workflows/dynamic_index.md index ca3bfbf68..17c3fb3bf 100644 --- a/workflows/dynamic_index.md +++ b/workflows/dynamic_index.md @@ -22,6 +22,17 @@ The program then simultaneously inserts newer points drawn from the file and del in chunks of `consolidate_interval` points so that the number of active points in the index is approximately `active_window`. It terminates when the end of data file is reached, and the final index has `active_window + consolidate_interval` number of points. +The index also supports filters on steaming index, you can use `insert_point` function overloads to either insert points as before or insert points with labels. +Additional options are added to support this in `apps/test_streaming_scenario` and `apps/test_streaming_scenario` please refer to program arguments for more details. 
+ +--- +> Note +* The index does not support mixed points, that is, either all points do not have labels or all points have labels. +* You can search the built filter index (one built with filters) without filters as well. + +> WARNING: Deleting points in case of filtered build may cause the quality of Index to degrade and affect recall. +--- + `apps/test_insert_deletes_consolidate` to try inserting, lazy deletes and consolidate_delete --------------------------------------------------------------------------------------------- @@ -63,7 +74,13 @@ The arguments are as follows: 12. **--consolidate_interval**: Granularity at which insert and delete functions are called. 13. **--start_point_norm**: Set the starting node to a random point on a sphere of this radius. A reasonable choice is to set this to the average norm of the data stream. +** To build with filters add these optional parameters. +14. **--label_file**: Filter data for each point, in `.txt` format. Line `i` of the file consists of a comma-separated list of labels corresponding to point `i` in the file passed via `--data_file`. +15. **--FilteredLbuild**: If building a filtered index, we maintain a separate search list from the one provided by `--Lbuild/-L`. +16. **--num_start_points**: number of frozen points in this case should be more then number of unique labels. +17. **--universal_label**: Optionally, the label data may contain a special "universal" label. A point with the universal label can be matched against a query with any label. Note that if a point has the universal label, then the filter data must only have the universal label on the line corresponding. +18. **--label_type**: Optionally, type of label to be use its either uint or short, defaulted to `uint`. To search the generated index, use the `apps/search_memory_index` program: --------------------------------------------------------------------------- @@ -83,6 +100,9 @@ The arguments are as follows: 10. 
**--dynamic** (default false): whether the index being searched is dynamic or not. 11. **--tags** (default false): whether to search with tags. This should be used if point *i* in the ground truth file does not correspond the point in the *i*th position in the loaded index. +** to search with filters add these + +12. **--filter_label**: Filter for each query. For each query, a search is performed with this filter. Example with BIGANN: -------------------- @@ -126,7 +146,13 @@ gt_file=data/sift/gt100_learn-conc-${deletes}-${inserts} are inserted, start deleting the first 10000 points while inserting points 40000--50000. Then delete points 10000--20000 while inserting points 50000--60000 and so until the index is left with points 60000-100000. + +Generate labels for filtered build like this. Generating 50 unique labels zipf's distributed for 100K point dataset. +``` +~/DiskANN/build/apps/utils/generate_synthetic_labels --num_labels 50 --num_points 100000 --output_file data/zipf_labels_50_100K.txt --distribution_type zipf ``` + +```bash type='float' data='data/sift/sift_learn.fbin' query='data/sift/sift_query.fbin' @@ -139,8 +165,23 @@ active=20000 cons_int=10000 index=${index_prefix}.after-streaming-act${active}-cons${cons_int}-max${inserts} gt=data/sift/gt100_learn-act${active}-cons${cons_int}-max${inserts} +filter_label=1 + +## filter options +universal_label = '0' +label_file = 'data/zipf_labels_50_100K.txt' +num_start_points = 50 +gt_filtered= data/sift/gt100_learn-act${active}-cons${cons_int}-max${inserts}_wlabel_${filter_label} + +# Without Filters (build and search) ./apps/test_streaming_scenario --data_type ${type} --dist_fn l2 --data_path ${data} --index_path_prefix ${index_prefix} -R 64 -L 600 --alpha 1.2 --insert_threads ${ins_thr} --consolidate_threads ${cons_thr} --max_points_to_insert ${inserts} --active_window ${active} --consolidate_interval ${cons_int} --start_point_norm 508; ./apps/utils/compute_groundtruth --data_type ${type} --dist_fn l2 
--base_file ${index}.data --query_file ${query} --K 100 --gt_file ${gt} --tags_file ${index}.tags ./apps/search_memory_index --data_type ${type} --dist_fn l2 --index_path_prefix ${index} --result_path ${result} --query_file ${query} --gt_file ${gt} -K 10 -L 20 40 60 80 100 -T 64 --dynamic true --tags 1 -``` \ No newline at end of file + +# With filters (build and search) + +./apps/test_streaming_scenario --data_type ${type} --num_start_points ${num_start_points} --label_file ${label_file} --universal_label {universal_label} --dist_fn l2 --data_path ${data} --index_path_prefix ${index_prefix} -R 64 -L 600 --alpha 1.2 --insert_threads ${ins_thr} --consolidate_threads ${cons_thr} --max_points_to_insert ${inserts} --active_window ${active} --consolidate_interval ${cons_int} --start_point_norm 508; +./apps/utils/compute_groundtruth_for_filters --data_type ${type} --dist_fn l2 --base_file ${index}.data --query_file ${query} --K 100 --gt_file ${gt_filtered} --label_file ${label_file} --universal_label {universal_label} --filter_label {filter_label} +./apps/search_memory_index --data_type ${type} --filter_label {filter_label} --dist_fn l2 --index_path_prefix ${index} --result_path ${result} --query_file ${query} --gt_file ${gt_filtered} -K 10 -L 20 40 60 80 100 -T 64 --dynamic true --tags 1 +``` diff --git a/workflows/filtered_ssd_index.md b/workflows/filtered_ssd_index.md index 272100e6d..7457d8c9b 100644 --- a/workflows/filtered_ssd_index.md +++ b/workflows/filtered_ssd_index.md @@ -21,7 +21,7 @@ To generate an SSD-friendly index, use the `apps/build_disk_index` program. 11. **--build_PQ_bytes** (default is 0): Set to a positive value less than the dimensionality of the data to enable faster index build with PQ based distance comparisons. 12. **--use_opq**: use the flag to use OPQ rather than PQ compression. OPQ is more space efficient for some high dimensional datasets, but also needs a bit more build time. 13. 
**--label_file**: Filter data for each point, in `.txt` format. Line `i` of the file consists of a comma-separated list of filters corresponding to point `i` in the file passed via `--data_file`. -14. **--universal_label**: Optionally, the the filter data may contain a "wild-card" filter corresponding to all filters. This is referred to as a universal label. Note that if a point has the universal label, then the filter data must only have the universal label on the line corresponding to said point. +14. **--universal_label**: Optionally, the label data may contain a special "universal" label. A point with the universal label can be matched against a query with any label. Note that if a point has the universal label, then the filter data must only have the universal label on the line corresponding. 15. **--FilteredLbuild**: If building a filtered index, we maintain a separate search list from the one provided by `--Lbuild`. 16. **--filter_threshold**: Threshold to break up the existing nodes to generate new graph internally by breaking dense points where each node will have a maximum F labels. Default value is zero where no break up happens for the dense points. From dee332df38f9e2603241d7fba7f6d3290af6c80f Mon Sep 17 00:00:00 2001 From: Shawn Zhong Date: Wed, 4 Oct 2023 13:25:49 -0500 Subject: [PATCH 22/23] Fix typo in SSD_index.md (#466) --- workflows/SSD_index.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflows/SSD_index.md b/workflows/SSD_index.md index f86856796..e95ece97f 100644 --- a/workflows/SSD_index.md +++ b/workflows/SSD_index.md @@ -11,7 +11,7 @@ The arguments are as follows: 3. **--data_file**: The input data over which to build an index, in .bin format. The first 4 bytes represent number of points as an integer. The next 4 bytes represent the dimension of data as an integer. The following `n*d*sizeof(T)` bytes contain the contents of the data one data point in time. 
`sizeof(T)` is 1 for byte indices, and 4 for float indices. This will be read by the program as int8_t for signed indices, uint8_t for unsigned indices or float for float indices. 4. **--index_path_prefix**: the index will span a few files, all beginning with the specified prefix path. For example, if you provide `~/index_test` as the prefix path, build generates files such as `~/index_test_pq_pivots.bin, ~/index_test_pq_compressed.bin, ~/index_test_disk.index, ...`. There may be between 8 and 10 files generated with this prefix depending on how the index is constructed. 5. **-R (--max_degree)** (default is 64): the degree of the graph index, typically between 60 and 150. Larger R will result in larger indices and longer indexing times, but better search quality. -6. **-L (--Lbuild)** (default is 100): the size of search listduring index build. Typical values are between 75 to 200. Larger values will take more time to build but result in indices that provide higher recall for the same search complexity. Use a value for L value that is at least the value of R unless you need to build indices really quickly and can somewhat compromise on quality. +6. **-L (--Lbuild)** (default is 100): the size of search list during index build. Typical values are between 75 to 200. Larger values will take more time to build but result in indices that provide higher recall for the same search complexity. Use a value for L value that is at least the value of R unless you need to build indices really quickly and can somewhat compromise on quality. 7. **-B (--search_DRAM_budget)**: bound on the memory footprint of the index at search time in GB. Once built, the index will use up only the specified RAM limit, the rest will reside on disk. This will dictate how aggressively we compress the data vectors to store in memory. Larger will yield better performance at search time. 
For an n point index, to use b byte PQ compressed representation in memory, use `B = ((n * b) / 2^30 + (250000*(4*R + sizeof(T)*ndim)) / 2^30)`. The second term in the summation is to allow some buffer for caching about 250,000 nodes from the graph in memory while serving. If you are not sure about this term, add 0.25GB to the first term. 8. **-M (--build_DRAM_budget)**: Limit on the memory allowed for building the index in GB. If you specify a value less than what is required to build the index in one pass, the index is built using a divide and conquer approach so that sub-graphs will fit in the RAM budget. The sub-graphs are overlayed to build the overall index. This approach can be upto 1.5 times slower than building the index in one shot. Allocate as much memory as your RAM allows. 9. **-T (--num_threads)** (default is to get_omp_num_procs()): number of threads used by the index build process. Since the code is highly parallel, the indexing time improves almost linearly with the number of threads (subject to the cores available on the machine and DRAM bandwidth). @@ -34,7 +34,7 @@ The arguments are as follows: 8. **--gt_file**: The ground truth file for the queries in arg (7) and data file used in index construction. The binary file must start with *n*, the number of queries (4 bytes), followed by *d*, the number of ground truth elements per query (4 bytes), followed by `n*d` entries per query representing the d closest IDs per query in integer format, followed by `n*d` entries representing the corresponding distances (float). Total file size is `8 + 4*n*d + 4*n*d` bytes. The groundtruth file, if not available, can be calculated using the program `apps/utils/compute_groundtruth`. Use "null" if you do not have this file and if you do not want to compute recall. 9. **K**: search for *K* neighbors and measure *K*-recall@*K*, meaning the intersection between the retrieved top-*K* nearest neighbors and ground truth *K* nearest neighbors. 10. 
**result_output_prefix**: Search results will be stored in files with specified prefix, in bin format. -11. **-L (--search_list)**: A list of search_list sizes to perform search with. Larger parameters will result in slower latencies, but higher accuracies. Must be atleast the value of *K* in arg (9). +11. **-L (--search_list)**: A list of search_list sizes to perform search with. Larger parameters will result in slower latencies, but higher accuracies. Must be at least the value of *K* in arg (9). Example with BIGANN: @@ -60,7 +60,7 @@ Now build and search the index and measure the recall using ground truth compute ./apps/search_disk_index --data_type float --dist_fn l2 --index_path_prefix data/sift/disk_index_sift_learn_R32_L50_A1.2 --query_file data/sift/sift_query.fbin --gt_file data/sift/sift_query_learn_gt100 -K 10 -L 10 20 30 40 50 100 --result_path data/sift/res --num_nodes_to_cache 10000 ``` -The search might be slower on machine with remote SSDs. The output lists the quer throughput, the mean and 99.9pc latency in microseconds and mean number of 4KB IOs to disk for each `L` parameter provided. +The search might be slower on machine with remote SSDs. The output lists the query throughput, the mean and 99.9pc latency in microseconds and mean number of 4KB IOs to disk for each `L` parameter provided. 
``` L Beamwidth QPS Mean Latency 99.9 Latency Mean IOs CPU (s) Recall@10 From a5334dd89ee0cf305e8757ed1bb373258537d2d9 Mon Sep 17 00:00:00 2001 From: Huisheng Liu Date: Wed, 4 Oct 2023 11:35:49 -0700 Subject: [PATCH 23/23] add check for .enc extension to support encryption (#467) * add check for .enc extension to support encryption * check rotation_matrix file in file blobs --- include/utils.h | 25 ++++++++++++++++++++++++- src/pq.cpp | 6 ++++-- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/include/utils.h b/include/utils.h index 195dd95ef..bb03d13f1 100644 --- a/include/utils.h +++ b/include/utils.h @@ -57,7 +57,7 @@ typedef int FileHandle; #define PBSTR "||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||" #define PBWIDTH 60 -inline bool file_exists(const std::string &name, bool dirCheck = false) +inline bool file_exists_impl(const std::string &name, bool dirCheck = false) { int val; #ifndef _WINDOWS @@ -94,6 +94,29 @@ inline bool file_exists(const std::string &name, bool dirCheck = false) } } +inline bool file_exists(const std::string &name, bool dirCheck = false) +{ +#ifdef EXEC_ENV_OLS + bool exists = file_exists_impl(name, dirCheck); + if (exists) + { + return true; + } + if (!dirCheck) + { + // try with .enc extension + std::string enc_name = name + ENCRYPTED_EXTENSION; + return file_exists_impl(enc_name, dirCheck); + } + else + { + return exists; + } +#else + return file_exists_impl(name, dirCheck); +#endif +} + inline void open_file_to_write(std::ofstream &writer, const std::string &filename) { writer.exceptions(std::ofstream::failbit | std::ofstream::badbit); diff --git a/src/pq.cpp b/src/pq.cpp index 86c68ce0a..c59fc2dce 100644 --- a/src/pq.cpp +++ b/src/pq.cpp @@ -133,11 +133,13 @@ void FixedChunkPQTable::load_pq_centroid_bin(const char *pq_table_file, size_t n diskann::cout << "Loaded PQ Pivots: #ctrs: " << NUM_PQ_CENTROIDS << ", #dims: " << this->ndims << ", #chunks: " << this->n_chunks << std::endl; - if 
(file_exists(rotmat_file)) - { #ifdef EXEC_ENV_OLS + if (files.fileExists(rotmat_file)) + { diskann::load_bin(files, rotmat_file, (float *&)rotmat_tr, nr, nc); #else + if (file_exists(rotmat_file)) + { diskann::load_bin(rotmat_file, rotmat_tr, nr, nc); #endif if (nr != this->ndims || nc != this->ndims)