Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor filter code into its own class #593

Open
wants to merge 19 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
[submodule "gperftools"]
path = gperftools
url = https://github.com/gperftools/gperftools.git
[submodule "CRoaring"]
path = CRoaring
url = https://github.com/RoaringBitmap/CRoaring
25 changes: 25 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,31 @@ endif()

add_definitions(-DMKL_ILP64)


# Roaring Bitmap
#if ( NOT EXISTS "${PROJECT_SOURCE_DIR}/CRoaring/LICENSE")
# message(FATAL_ERROR "The RoaringBitmap submodule was not found. "
# " Please run 'git submodule init' followed by 'git submodule update'")
#endif()
#add_subdirectory(CRoaring)
#include_directories(CRoaring/include/roaring)

#set (CROARING_LIBRARY "${PROJECT_SOURCE_DIR}/CRoaring/build/src/Release/roaring.lib")
#add_custom_target(build_croaring DEPENDS ${CROARING_LIBRARY})
#if (MSVC)
# add_custom_command(OUTPUT ${CROARING_LIBRARY}
# COMMAND ${CMAKE_VS_MSBUILD_COMMAND} build/RoaringBitmap.sln /m /nologo
# /t:roaring /p:Configuration="Release"
# /property:Platform="x64"
# /p:PlatformToolset=v${MSVC_TOOLSET_VERSION}
# /p:WindowsTargetPlatformVersion=${CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION}
# WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/CRoaring)
#endif()
#add_library(croaring_lib STATIC IMPORTED)
#set_target_properties(croaring_lib PROPERTIES IMPORTED_LOCATION "${CROARING_LIBRARY}")



# Section for tcmalloc. The DiskANN tools are always linked to tcmalloc. For Windows, they also need to
# force-include the _tcmalloc symbol for enabling tcmalloc.
#
Expand Down
1 change: 1 addition & 0 deletions CRoaring
Submodule CRoaring added at ad487e
29 changes: 17 additions & 12 deletions apps/build_disk_index.cpp
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.

#include <omp.h>
#include <boost/program_options.hpp>
#include <omp.h>

#include "utils.h"
#include "disk_utils.h"
#include "math_utils.h"
#include "index.h"
#include "math_utils.h"
#include "partition.h"
#include "program_options_utils.hpp"
#include "utils.h"

namespace po = boost::program_options;

int main(int argc, char **argv)
{
std::string data_type, dist_fn, data_path, index_path_prefix, codebook_prefix, label_file, universal_label,
label_type;
uint32_t num_threads, R, L, disk_PQ, build_PQ, QD, Lf, filter_threshold;
uint32_t num_threads, R, L, disk_PQ, build_PQ, QD, Lf, filter_threshold, filter_bf_threshold;
float B, M;
bool append_reorder_data = false;
bool use_opq = false;
Expand Down Expand Up @@ -74,8 +74,9 @@ int main(int argc, char **argv)
optional_configs.add_options()("FilteredLbuild", po::value<uint32_t>(&Lf)->default_value(0),
program_options_utils::FILTERED_LBUILD);
optional_configs.add_options()("filter_threshold,F", po::value<uint32_t>(&filter_threshold)->default_value(0),
"Threshold to break up the existing nodes to generate new graph "
"internally where each node has a maximum F labels.");
program_options_utils::FILTER_THRESHOLD_DESCRIPTION);
optional_configs.add_options()("filter_bruteforce_threshold", po::value<uint32_t>(&filter_bf_threshold)->default_value(0),
program_options_utils::FILTER_BRUTEFORCE_THRESHOLD_DESCRIPTION);
optional_configs.add_options()("label_type", po::value<std::string>(&label_type)->default_value("uint"),
program_options_utils::LABEL_TYPE_DESCRIPTION);

Expand Down Expand Up @@ -139,22 +140,26 @@ int main(int argc, char **argv)
std::string(std::to_string(append_reorder_data)) + " " +
std::string(std::to_string(build_PQ)) + " " + std::string(std::to_string(QD));

if (filter_bf_threshold == 0) {
filter_bf_threshold = std::numeric_limits<uint32_t>::max();
}

try
{
if (label_file != "" && label_type == "ushort")
{
if (data_type == std::string("int8"))
return diskann::build_disk_index<int8_t>(data_path.c_str(), index_path_prefix.c_str(), params.c_str(),
metric, use_opq, codebook_prefix, use_filters, label_file,
universal_label, filter_threshold, Lf);
universal_label, filter_threshold, Lf, filter_bf_threshold);
else if (data_type == std::string("uint8"))
return diskann::build_disk_index<uint8_t, uint16_t>(
data_path.c_str(), index_path_prefix.c_str(), params.c_str(), metric, use_opq, codebook_prefix,
use_filters, label_file, universal_label, filter_threshold, Lf);
use_filters, label_file, universal_label, filter_threshold, Lf, filter_bf_threshold);
else if (data_type == std::string("float"))
return diskann::build_disk_index<float, uint16_t>(
data_path.c_str(), index_path_prefix.c_str(), params.c_str(), metric, use_opq, codebook_prefix,
use_filters, label_file, universal_label, filter_threshold, Lf);
use_filters, label_file, universal_label, filter_threshold, Lf, filter_bf_threshold);
else
{
diskann::cerr << "Error. Unsupported data type" << std::endl;
Expand All @@ -166,15 +171,15 @@ int main(int argc, char **argv)
if (data_type == std::string("int8"))
return diskann::build_disk_index<int8_t>(data_path.c_str(), index_path_prefix.c_str(), params.c_str(),
metric, use_opq, codebook_prefix, use_filters, label_file,
universal_label, filter_threshold, Lf);
universal_label, filter_threshold, Lf, filter_bf_threshold);
else if (data_type == std::string("uint8"))
return diskann::build_disk_index<uint8_t>(data_path.c_str(), index_path_prefix.c_str(), params.c_str(),
metric, use_opq, codebook_prefix, use_filters, label_file,
universal_label, filter_threshold, Lf);
universal_label, filter_threshold, Lf, filter_bf_threshold);
else if (data_type == std::string("float"))
return diskann::build_disk_index<float>(data_path.c_str(), index_path_prefix.c_str(), params.c_str(),
metric, use_opq, codebook_prefix, use_filters, label_file,
universal_label, filter_threshold, Lf);
universal_label, filter_threshold, Lf, filter_bf_threshold);
else
{
diskann::cerr << "Error. Unsupported data type" << std::endl;
Expand Down
8 changes: 4 additions & 4 deletions apps/build_memory_index.cpp
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.

#include <omp.h>
#include <cstring>
#include <boost/program_options.hpp>
#include <cstring>
#include <omp.h>

#include "index.h"
#include "utils.h"
#include "program_options_utils.hpp"
#include "utils.h"

#ifndef _WINDOWS
#include <sys/mman.h>
Expand All @@ -16,9 +16,9 @@
#include <Windows.h>
#endif

#include "memory_mapper.h"
#include "ann_exception.h"
#include "index_factory.h"
#include "memory_mapper.h"

namespace po = boost::program_options;

Expand Down
6 changes: 3 additions & 3 deletions apps/build_stitched_index.cpp
Original file line number Diff line number Diff line change
@@ -1,24 +1,24 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.

#include "filter_utils.h"
#include <boost/program_options.hpp>
#include <chrono>
#include <cstdio>
#include <cstring>
#include <omp.h>
#include <random>
#include <string>
#include <tuple>
#include "filter_utils.h"
#include <omp.h>
#ifndef _WINDOWS
#include <sys/uio.h>
#endif

#include "index.h"
#include "memory_mapper.h"
#include "parameters.h"
#include "utils.h"
#include "program_options_utils.hpp"
#include "utils.h"

namespace po = boost::program_options;
typedef std::tuple<std::vector<std::vector<uint32_t>>, uint64_t> stitch_indices_return_values;
Expand Down
10 changes: 5 additions & 5 deletions apps/range_search_disk_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,26 @@
// Licensed under the MIT license.

#include <atomic>
#include <boost/program_options.hpp>
#include <cstring>
#include <iomanip>
#include <omp.h>
#include <set>
#include <boost/program_options.hpp>

#include "index.h"
#include "disk_utils.h"
#include "index.h"
#include "math_utils.h"
#include "memory_mapper.h"
#include "pq_flash_index.h"
#include "partition.h"
#include "timer.h"
#include "pq_flash_index.h"
#include "program_options_utils.hpp"
#include "timer.h"

#ifndef _WINDOWS
#include "linux_aligned_file_reader.h"
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include "linux_aligned_file_reader.h"
#else
#ifdef USE_BING_INFRA
#include "bing_aligned_file_reader.h"
Expand Down
12 changes: 6 additions & 6 deletions apps/search_disk_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,21 @@
#include "common_includes.h"
#include <boost/program_options.hpp>

#include "index.h"
#include "disk_utils.h"
#include "index.h"
#include "math_utils.h"
#include "memory_mapper.h"
#include "partition.h"
#include "pq_flash_index.h"
#include "timer.h"
#include "percentile_stats.h"
#include "pq_flash_index.h"
#include "program_options_utils.hpp"
#include "timer.h"

#ifndef _WINDOWS
#include "linux_aligned_file_reader.h"
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include "linux_aligned_file_reader.h"
#else
#ifdef USE_BING_INFRA
#include "bing_aligned_file_reader.h"
Expand Down Expand Up @@ -123,8 +123,8 @@ int search_disk_index(diskann::Metric &metric, const std::string &index_path_pre
diskann::cout << "Caching " << num_nodes_to_cache << " nodes around medoid(s)" << std::endl;
_pFlashIndex->cache_bfs_levels(num_nodes_to_cache, node_list);
// if (num_nodes_to_cache > 0)
// _pFlashIndex->generate_cache_list_from_sample_queries(warmup_query_file, 15, 6, num_nodes_to_cache,
// num_threads, node_list);
// _pFlashIndex->generate_cache_list_from_sample_queries(warmup_query_file,
// 15, 6, num_nodes_to_cache, num_threads, node_list);
_pFlashIndex->load_cache_list(node_list);
node_list.clear();
node_list.shrink_to_fit();
Expand Down
14 changes: 7 additions & 7 deletions apps/search_memory_index.cpp
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.

#include <algorithm>
#include <boost/program_options.hpp>
#include <cstring>
#include <iomanip>
#include <algorithm>
#include <numeric>
#include <omp.h>
#include <set>
#include <string.h>
#include <boost/program_options.hpp>

#ifndef _WINDOWS
#include <sys/mman.h>
Expand All @@ -18,10 +18,10 @@
#endif

#include "index.h"
#include "index_factory.h"
#include "memory_mapper.h"
#include "utils.h"
#include "program_options_utils.hpp"
#include "index_factory.h"
#include "utils.h"

namespace po = boost::program_options;

Expand Down Expand Up @@ -323,9 +323,9 @@ int main(int argc, char **argv)
optional_configs.add_options()("num_threads,T",
po::value<uint32_t>(&num_threads)->default_value(omp_get_num_procs()),
program_options_utils::NUMBER_THREADS_DESCRIPTION);
optional_configs.add_options()(
"dynamic", po::value<bool>(&dynamic)->default_value(false),
"Whether the index is dynamic. Dynamic indices must have associated tags. Default false.");
optional_configs.add_options()("dynamic", po::value<bool>(&dynamic)->default_value(false),
"Whether the index is dynamic. Dynamic indices must have associated "
"tags. Default false.");
optional_configs.add_options()("tags", po::value<bool>(&tags)->default_value(false),
"Whether to search with external identifiers (tags). Default false.");
optional_configs.add_options()("fail_if_recall_below",
Expand Down
8 changes: 4 additions & 4 deletions apps/test_insert_deletes_consolidate.cpp
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.

#include <boost/program_options.hpp>
#include <future>
#include <index.h>
#include <numeric>
#include <omp.h>
#include <string.h>
#include <time.h>
#include <timer.h>
#include <boost/program_options.hpp>
#include <future>

#include "utils.h"
#include "filter_utils.h"
#include "program_options_utils.hpp"
#include "index_factory.h"
#include "program_options_utils.hpp"
#include "utils.h"

#ifndef _WINDOWS
#include <sys/mman.h>
Expand Down
10 changes: 5 additions & 5 deletions apps/test_streaming_scenario.cpp
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.

#include <abstract_index.h>
#include <boost/program_options.hpp>
#include <future>
#include <index.h>
#include <index_factory.h>
#include <numeric>
#include <omp.h>
#include <string.h>
#include <time.h>
#include <timer.h>
#include <boost/program_options.hpp>
#include <future>
#include <abstract_index.h>
#include <index_factory.h>

#include "utils.h"
#include "filter_utils.h"
#include "program_options_utils.hpp"
#include "utils.h"

#ifndef _WINDOWS
#include <sys/mman.h>
Expand Down
2 changes: 1 addition & 1 deletion apps/utils/bin_to_fvecs.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.

#include <iostream>
#include "util.h"
#include <iostream>

void block_convert(std::ifstream &writr, std::ofstream &readr, float *read_buf, float *write_buf, uint64_t npts,
uint64_t ndims)
Expand Down
2 changes: 1 addition & 1 deletion apps/utils/bin_to_tsv.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.

#include <iostream>
#include "utils.h"
#include <iostream>

template <class T>
void block_convert(std::ofstream &writer, std::ifstream &reader, T *read_buf, size_t npts, size_t ndims)
Expand Down
2 changes: 1 addition & 1 deletion apps/utils/calculate_recall.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
#include <string>
#include <vector>

#include "utils.h"
#include "disk_utils.h"
#include "utils.h"

int main(int argc, char **argv)
{
Expand Down
Loading
Loading