Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rakri/cosine bug fix #450

Merged
merged 3 commits into from
Feb 6, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/actions/generate-random/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,21 @@ runs:

echo "Generating random vectors for index"
dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_10K_norm1.0.bin -D 10 -N 10000 --norm 1.0
dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_10K_unnorm.bin -D 10 -N 10000 --rand_scaling 2.0
dist/bin/rand_data_gen --data_type int8 --output_file data/rand_int8_10D_10K_norm50.0.bin -D 10 -N 10000 --norm 50.0
dist/bin/rand_data_gen --data_type uint8 --output_file data/rand_uint8_10D_10K_norm50.0.bin -D 10 -N 10000 --norm 50.0

echo "Generating random vectors for query"
dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_1K_norm1.0.bin -D 10 -N 1000 --norm 1.0
dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_1K_unnorm.bin -D 10 -N 1000 --rand_scaling 2.0
dist/bin/rand_data_gen --data_type int8 --output_file data/rand_int8_10D_1K_norm50.0.bin -D 10 -N 1000 --norm 50.0
dist/bin/rand_data_gen --data_type uint8 --output_file data/rand_uint8_10D_1K_norm50.0.bin -D 10 -N 1000 --norm 50.0

echo "Computing ground truth for floats across l2, mips, and cosine distance functions"
dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/rand_float_10D_10K_norm1.0.bin --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --K 100
dist/bin/compute_groundtruth --data_type float --dist_fn mips --base_file data/rand_float_10D_10K_norm1.0.bin --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/mips_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --K 100
dist/bin/compute_groundtruth --data_type float --dist_fn cosine --base_file data/rand_float_10D_10K_norm1.0.bin --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/cosine_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --K 100
dist/bin/compute_groundtruth --data_type float --dist_fn cosine --base_file data/rand_float_10D_10K_unnorm.bin --query_file data/rand_float_10D_1K_unnorm.bin --gt_file data/cosine_rand_float_10D_10K_unnorm_10D_1K_unnorm_gt100 --K 100

echo "Computing ground truth for int8s across l2, mips, and cosine distance functions"
dist/bin/compute_groundtruth --data_type int8 --dist_fn l2 --base_file data/rand_int8_10D_10K_norm50.0.bin --query_file data/rand_int8_10D_1K_norm50.0.bin --gt_file data/l2_rand_int8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --K 100
Expand Down
10 changes: 10 additions & 0 deletions .github/workflows/disk-pq.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ jobs:
run: |
dist/bin/build_disk_index --data_type float --dist_fn l2 --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_oneshot -R 16 -L 32 -B 0.00003 -M 1
dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_oneshot --result_path /tmp/res --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16
- name: build and search disk index (one shot graph build, cosine, no diskPQ) (float)
if: success() || failure()
run: |
dist/bin/build_disk_index --data_type float --dist_fn cosine --data_path data/rand_float_10D_10K_unnorm.bin --index_path_prefix data/disk_index_cosine_rand_float_10D_10K_unnorm_diskfull_oneshot -R 16 -L 32 -B 0.00003 -M 1
dist/bin/search_disk_index --data_type float --dist_fn cosine --fail_if_recall_below 70 --index_path_prefix data/disk_index_cosine_rand_float_10D_10K_unnorm_diskfull_oneshot --result_path /tmp/res --query_file data/rand_float_10D_1K_unnorm.bin --gt_file data/cosine_rand_float_10D_10K_unnorm_10D_1K_unnorm_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16
- name: build and search disk index (one shot graph build, L2, no diskPQ) (int8)
if: success() || failure()
run: |
Expand Down Expand Up @@ -66,6 +71,11 @@ jobs:
run: |
dist/bin/build_disk_index --data_type float --dist_fn l2 --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_sharded -R 16 -L 32 -B 0.00003 -M 0.00006
dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_sharded --result_path /tmp/res --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16
- name: build and search disk index (sharded graph build, cosine, no diskPQ) (float)
if: success() || failure()
run: |
dist/bin/build_disk_index --data_type float --dist_fn cosine --data_path data/rand_float_10D_10K_unnorm.bin --index_path_prefix data/disk_index_cosine_rand_float_10D_10K_unnorm_diskfull_sharded -R 16 -L 32 -B 0.00003 -M 0.00006
dist/bin/search_disk_index --data_type float --dist_fn cosine --fail_if_recall_below 70 --index_path_prefix data/disk_index_cosine_rand_float_10D_10K_unnorm_diskfull_sharded --result_path /tmp/res --query_file data/rand_float_10D_1K_unnorm.bin --gt_file data/cosine_rand_float_10D_10K_unnorm_10D_1K_unnorm_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16
- name: build and search disk index (sharded graph build, L2, no diskPQ) (int8)
run: |
dist/bin/build_disk_index --data_type int8 --dist_fn l2 --data_path data/rand_int8_10D_10K_norm50.0.bin --index_path_prefix data/disk_index_l2_rand_int8_10D_10K_norm50.0_diskfull_sharded -R 16 -L 32 -B 0.00003 -M 0.00006
Expand Down
2 changes: 2 additions & 0 deletions apps/build_disk_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ int main(int argc, char **argv)
metric = diskann::Metric::L2;
else if (dist_fn == std::string("mips"))
metric = diskann::Metric::INNER_PRODUCT;
else if (dist_fn == std::string("cosine"))
metric = diskann::Metric::COSINE;
else
{
std::cout << "Error. Only l2 and mips distance functions are supported" << std::endl;
Expand Down
52 changes: 40 additions & 12 deletions apps/utils/rand_data_gen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,23 +11,31 @@

namespace po = boost::program_options;

int block_write_float(std::ofstream &writer, size_t ndims, size_t npts, float norm)
int block_write_float(std::ofstream &writer, size_t ndims, size_t npts, bool normalization, float norm,
float rand_scale)
{
auto vec = new float[ndims];

std::random_device rd{};
std::mt19937 gen{rd()};
std::normal_distribution<> normal_rand{0, 1};
std::uniform_real_distribution<> unif_dis(1.0, rand_scale);

for (size_t i = 0; i < npts; i++)
{
float sum = 0;
float scale = 1.0f;
if (rand_scale > 1.0f)
scale = (float)unif_dis(gen);
for (size_t d = 0; d < ndims; ++d)
vec[d] = (float)normal_rand(gen);
for (size_t d = 0; d < ndims; ++d)
sum += vec[d] * vec[d];
for (size_t d = 0; d < ndims; ++d)
vec[d] = vec[d] * norm / std::sqrt(sum);
vec[d] = scale * (float)normal_rand(gen);
if (normalization)
{
for (size_t d = 0; d < ndims; ++d)
sum += vec[d] * vec[d];
for (size_t d = 0; d < ndims; ++d)
vec[d] = vec[d] * norm / std::sqrt(sum);
}

writer.write((char *)vec, ndims * sizeof(float));
}
Expand Down Expand Up @@ -104,8 +112,8 @@ int main(int argc, char **argv)
{
std::string data_type, output_file;
size_t ndims, npts;
float norm;

float norm, rand_scaling;
bool normalization = false;
try
{
po::options_description desc{"Arguments"};
Expand All @@ -117,7 +125,11 @@ int main(int argc, char **argv)
"File name for saving the random vectors");
desc.add_options()("ndims,D", po::value<uint64_t>(&ndims)->required(), "Dimensoinality of the vector");
desc.add_options()("npts,N", po::value<uint64_t>(&npts)->required(), "Number of vectors");
desc.add_options()("norm", po::value<float>(&norm)->required(), "Norm of the vectors");
desc.add_options()("norm", po::value<float>(&norm)->default_value(-1.0f),
"Norm of the vectors (if not specified, vectors are not normalized)");
desc.add_options()("rand_scaling", po::value<float>(&rand_scaling)->default_value(1.0f),
"Each vector will be scaled (if not explicitly normalized) by a factor randomly chosen from "
"[1, rand_scale]. Only applicable for floating point data");
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, desc), vm);
if (vm.count("help"))
Expand All @@ -139,9 +151,20 @@ int main(int argc, char **argv)
return -1;
}

if (norm <= 0.0)
if (norm > 0.0)
{
normalization = true;
}

if (rand_scaling < 1.0)
{
std::cout << "We will only scale the vector norms randomly in [1, value], so value must be >= 1." << std::endl;
return -1;
}

if ((rand_scaling > 1.0) && (normalization == true))
{
std::cerr << "Error: Norm must be a positive number" << std::endl;
std::cout << "Data cannot be normalized and randomly scaled at same time. Use one or the other." << std::endl;
return -1;
}

Expand All @@ -155,6 +178,11 @@ int main(int argc, char **argv)
<< std::endl;
return -1;
}
if (rand_scaling > 1.0)
{
std::cout << "Data scaling only supported for floating point data." << std::endl;
return -1;
}
}

try
Expand All @@ -177,7 +205,7 @@ int main(int argc, char **argv)
size_t cblk_size = std::min(npts - i * blk_size, blk_size);
if (data_type == std::string("float"))
{
ret = block_write_float(writer, ndims, cblk_size, norm);
ret = block_write_float(writer, ndims, cblk_size, normalization, norm, rand_scaling);
}
else if (data_type == std::string("int8"))
{
Expand Down
32 changes: 25 additions & 7 deletions src/disk_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1129,11 +1129,12 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const
return -1;
}

if (!std::is_same<T, float>::value && compareMetric == diskann::Metric::INNER_PRODUCT)
if (!std::is_same<T, float>::value &&
(compareMetric == diskann::Metric::INNER_PRODUCT || compareMetric == diskann::Metric::COSINE))
{
std::stringstream stream;
stream << "DiskANN currently only supports floating point data for Max "
"Inner Product Search. "
stream << "Disk-index build currently only supports floating point data for Max "
"Inner Product Search/ cosine similarity. "
<< std::endl;
throw diskann::ANNException(stream.str(), -1);
}
Expand Down Expand Up @@ -1195,6 +1196,10 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const
std::string disk_pq_pivots_path = index_prefix_path + "_disk.index_pq_pivots.bin";
// optional, used if disk index must store pq data
std::string disk_pq_compressed_vectors_path = index_prefix_path + "_disk.index_pq_compressed.bin";
std::string prepped_base =
index_prefix_path +
"_prepped_base.bin"; // temp file for storing pre-processed base file for cosine/ mips metrics
rakri marked this conversation as resolved.
Show resolved Hide resolved
bool created_temp_file_for_processed_data = false;

// output a new base file which contains extra dimension with sqrt(1 -
// ||x||^2/M^2) for every x, M is max norm of all points. Extra space on
Expand All @@ -1205,14 +1210,26 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const
std::cout << "Using Inner Product search, so need to pre-process base "
"data into temp file. Please ensure there is additional "
"(n*(d+1)*4) bytes for storing pre-processed base vectors, "
"apart from the intermin indices and final index."
"apart from the interim indices created by DiskANN and the final index."
<< std::endl;
std::string prepped_base = index_prefix_path + "_prepped_base.bin";
data_file_to_use = prepped_base;
float max_norm_of_base = diskann::prepare_base_for_inner_products<T>(base_file, prepped_base);
std::string norm_file = disk_index_path + "_max_base_norm.bin";
diskann::save_bin<float>(norm_file, &max_norm_of_base, 1, 1);
diskann::cout << timer.elapsed_seconds_for_step("preprocessing data for inner product") << std::endl;
created_temp_file_for_processed_data = true;
}
else if (compareMetric == diskann::Metric::COSINE)
{
Timer timer;
std::cout << "Normalizing data for cosine to temporary file, please ensure there is additional "
"(n*d*4) bytes for storing normalized base vectors, "
"apart from the interim indices created by DiskANN and the final index."
<< std::endl;
data_file_to_use = prepped_base;
diskann::normalize_data_file(base_file, prepped_base);
diskann::cout << timer.elapsed_seconds_for_step("preprocessing data for cosine") << std::endl;
created_temp_file_for_processed_data = true;
}

uint32_t R = (uint32_t)atoi(param_list[0].c_str());
Expand Down Expand Up @@ -1304,7 +1321,7 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const
#if defined(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD)
MallocExtension::instance()->ReleaseFreeMemory();
#endif

// Whether the metric is cosine or inner product, we still use the L2 metric here due to the pre-processing.
timer.reset();
diskann::build_merged_vamana_index<T, LabelT>(data_file_to_use.c_str(), diskann::Metric::L2, L, R, p_val,
indexing_ram_budget, mem_index_path, medoids_path, centroids_path,
Expand Down Expand Up @@ -1345,7 +1362,8 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const
std::remove(augmented_labels_file.c_str());
std::remove(labels_file_to_use.c_str());
}

if (created_temp_file_for_processed_data)
std::remove(prepped_base.c_str());
std::remove(mem_index_path.c_str());
if (use_disk_pq)
std::remove(disk_pq_compressed_vectors_path.c_str());
Expand Down
27 changes: 16 additions & 11 deletions src/pq_flash_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,16 @@ template <typename T, typename LabelT>
PQFlashIndex<T, LabelT>::PQFlashIndex(std::shared_ptr<AlignedFileReader> &fileReader, diskann::Metric m)
: reader(fileReader), metric(m), _thread_data(nullptr)
{
diskann::Metric metric_to_invoke = m;
if (m == diskann::Metric::COSINE || m == diskann::Metric::INNER_PRODUCT)
{
if (std::is_floating_point<T>::value)
{
diskann::cout << "Cosine metric chosen for (normalized) float data."
"Changing distance to L2 to boost accuracy."
diskann::cout << "Since data is floating point, we assume that it has been appropriately pre-processed "
rakri marked this conversation as resolved.
Show resolved Hide resolved
"(normalization for cosine, and convert-to-l2 by adding extra dimension for MIPS). So we "
"shall invoke an l2 distance function."
<< std::endl;
metric = diskann::Metric::L2;
metric_to_invoke = diskann::Metric::L2;
}
else
{
Expand All @@ -49,8 +51,8 @@ PQFlashIndex<T, LabelT>::PQFlashIndex(std::shared_ptr<AlignedFileReader> &fileRe
}
}

this->_dist_cmp.reset(diskann::get_distance_function<T>(metric));
this->_dist_cmp_float.reset(diskann::get_distance_function<float>(metric));
this->_dist_cmp.reset(diskann::get_distance_function<T>(metric_to_invoke));
this->_dist_cmp_float.reset(diskann::get_distance_function<float>(metric_to_invoke));
}

template <typename T, typename LabelT> PQFlashIndex<T, LabelT>::~PQFlashIndex()
Expand Down Expand Up @@ -1292,20 +1294,23 @@ void PQFlashIndex<T, LabelT>::cached_beam_search(const T *query1, const uint64_t
float *query_float = pq_query_scratch->aligned_query_float;
float *query_rotated = pq_query_scratch->rotated_query;

// if inner product, we also normalize the query and set the last coordinate
// to 0 (this is the extra coordinate used to convert MIPS to L2 search)
if (metric == diskann::Metric::INNER_PRODUCT)
// Normalization step. For cosine, we simply normalize the query.
// For MIPS, we normalize the first d-1 dims and set the last dim to 0, since an extra coordinate was used to
// convert MIPS to L2 search.
if (metric == diskann::Metric::INNER_PRODUCT || metric == diskann::Metric::COSINE)
{
for (size_t i = 0; i < this->_data_dim - 1; i++)
uint64_t inherent_dim = (metric == diskann::Metric::COSINE) ? this->_data_dim : (uint64_t)(this->_data_dim - 1);
for (size_t i = 0; i < inherent_dim; i++)
{
aligned_query_T[i] = query1[i];
query_norm += query1[i] * query1[i];
}
aligned_query_T[this->_data_dim - 1] = 0;
if (metric == diskann::Metric::INNER_PRODUCT)
aligned_query_T[this->_data_dim - 1] = 0;

query_norm = std::sqrt(query_norm);

for (size_t i = 0; i < this->_data_dim - 1; i++)
for (size_t i = 0; i < inherent_dim; i++)
{
aligned_query_T[i] = (T)(aligned_query_T[i] / query_norm);
}
Expand Down
Loading