From f61e5df861d78da70f0bce29c8f5324575839464 Mon Sep 17 00:00:00 2001 From: rakri Date: Wed, 24 Jan 2024 05:30:32 +0000 Subject: [PATCH] added cosine unit test with unnormalized data --- .github/actions/generate-random/action.yml | 3 ++ .github/workflows/disk-pq.yml | 10 +++++ apps/utils/rand_data_gen.cpp | 52 +++++++++++++++++----- 3 files changed, 53 insertions(+), 12 deletions(-) diff --git a/.github/actions/generate-random/action.yml b/.github/actions/generate-random/action.yml index 297209d7b..2755067df 100644 --- a/.github/actions/generate-random/action.yml +++ b/.github/actions/generate-random/action.yml @@ -9,11 +9,13 @@ runs: echo "Generating random vectors for index" dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_10K_norm1.0.bin -D 10 -N 10000 --norm 1.0 + dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_10K_unnorm.bin -D 10 -N 10000 --rand_scaling 2.0 dist/bin/rand_data_gen --data_type int8 --output_file data/rand_int8_10D_10K_norm50.0.bin -D 10 -N 10000 --norm 50.0 dist/bin/rand_data_gen --data_type uint8 --output_file data/rand_uint8_10D_10K_norm50.0.bin -D 10 -N 10000 --norm 50.0 echo "Generating random vectors for query" dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_1K_norm1.0.bin -D 10 -N 1000 --norm 1.0 + dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_1K_unnorm.bin -D 10 -N 1000 --rand_scaling 2.0 dist/bin/rand_data_gen --data_type int8 --output_file data/rand_int8_10D_1K_norm50.0.bin -D 10 -N 1000 --norm 50.0 dist/bin/rand_data_gen --data_type uint8 --output_file data/rand_uint8_10D_1K_norm50.0.bin -D 10 -N 1000 --norm 50.0 @@ -21,6 +23,7 @@ runs: dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/rand_float_10D_10K_norm1.0.bin --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --K 100 dist/bin/compute_groundtruth --data_type float --dist_fn mips 
--base_file data/rand_float_10D_10K_norm1.0.bin --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/mips_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --K 100 dist/bin/compute_groundtruth --data_type float --dist_fn cosine --base_file data/rand_float_10D_10K_norm1.0.bin --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/cosine_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --K 100 + dist/bin/compute_groundtruth --data_type float --dist_fn cosine --base_file data/rand_float_10D_10K_unnorm.bin --query_file data/rand_float_10D_1K_unnorm.bin --gt_file data/cosine_rand_float_10D_10K_unnorm_10D_1K_unnorm_gt100 --K 100 echo "Computing ground truth for int8s across l2, mips, and cosine distance functions" dist/bin/compute_groundtruth --data_type int8 --dist_fn l2 --base_file data/rand_int8_10D_10K_norm50.0.bin --query_file data/rand_int8_10D_1K_norm50.0.bin --gt_file data/l2_rand_int8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --K 100 diff --git a/.github/workflows/disk-pq.yml b/.github/workflows/disk-pq.yml index 35c662184..10f160fe1 100644 --- a/.github/workflows/disk-pq.yml +++ b/.github/workflows/disk-pq.yml @@ -34,6 +34,11 @@ jobs: run: | dist/bin/build_disk_index --data_type float --dist_fn l2 --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_oneshot -R 16 -L 32 -B 0.00003 -M 1 dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_oneshot --result_path /tmp/res --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 + - name: build and search disk index (one shot graph build, cosine, no diskPQ) (float) + if: success() || failure() + run: | + dist/bin/build_disk_index --data_type float --dist_fn cosine --data_path data/rand_float_10D_10K_unnorm.bin 
--index_path_prefix data/disk_index_cosine_rand_float_10D_10K_unnorm_diskfull_oneshot -R 16 -L 32 -B 0.00003 -M 1 + dist/bin/search_disk_index --data_type float --dist_fn cosine --fail_if_recall_below 70 --index_path_prefix data/disk_index_cosine_rand_float_10D_10K_unnorm_diskfull_oneshot --result_path /tmp/res --query_file data/rand_float_10D_1K_unnorm.bin --gt_file data/cosine_rand_float_10D_10K_unnorm_10D_1K_unnorm_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 - name: build and search disk index (one shot graph build, L2, no diskPQ) (int8) if: success() || failure() run: | @@ -66,6 +71,11 @@ jobs: run: | dist/bin/build_disk_index --data_type float --dist_fn l2 --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_sharded -R 16 -L 32 -B 0.00003 -M 0.00006 dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_sharded --result_path /tmp/res --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 + - name: build and search disk index (sharded graph build, cosine, no diskPQ) (float) + if: success() || failure() + run: | + dist/bin/build_disk_index --data_type float --dist_fn cosine --data_path data/rand_float_10D_10K_unnorm.bin --index_path_prefix data/disk_index_cosine_rand_float_10D_10K_unnorm_diskfull_sharded -R 16 -L 32 -B 0.00003 -M 0.00006 + dist/bin/search_disk_index --data_type float --dist_fn cosine --fail_if_recall_below 70 --index_path_prefix data/disk_index_cosine_rand_float_10D_10K_unnorm_diskfull_sharded --result_path /tmp/res --query_file data/rand_float_10D_1K_unnorm.bin --gt_file data/cosine_rand_float_10D_10K_unnorm_10D_1K_unnorm_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 - name: build and search disk index (sharded graph build, 
L2, no diskPQ) (int8) run: | dist/bin/build_disk_index --data_type int8 --dist_fn l2 --data_path data/rand_int8_10D_10K_norm50.0.bin --index_path_prefix data/disk_index_l2_rand_int8_10D_10K_norm50.0_diskfull_sharded -R 16 -L 32 -B 0.00003 -M 0.00006 diff --git a/apps/utils/rand_data_gen.cpp b/apps/utils/rand_data_gen.cpp index a6f9305c8..e89ede800 100644 --- a/apps/utils/rand_data_gen.cpp +++ b/apps/utils/rand_data_gen.cpp @@ -11,23 +11,31 @@ namespace po = boost::program_options; -int block_write_float(std::ofstream &writer, size_t ndims, size_t npts, float norm) +int block_write_float(std::ofstream &writer, size_t ndims, size_t npts, bool normalization, float norm, + float rand_scale) { auto vec = new float[ndims]; std::random_device rd{}; std::mt19937 gen{rd()}; std::normal_distribution<> normal_rand{0, 1}; + std::uniform_real_distribution<> unif_dis(1.0, rand_scale); for (size_t i = 0; i < npts; i++) { float sum = 0; + float scale = 1.0f; + if (rand_scale > 1.0f) + scale = (float)unif_dis(gen); for (size_t d = 0; d < ndims; ++d) - vec[d] = (float)normal_rand(gen); - for (size_t d = 0; d < ndims; ++d) - sum += vec[d] * vec[d]; - for (size_t d = 0; d < ndims; ++d) - vec[d] = vec[d] * norm / std::sqrt(sum); + vec[d] = scale * (float)normal_rand(gen); + if (normalization) + { + for (size_t d = 0; d < ndims; ++d) + sum += vec[d] * vec[d]; + for (size_t d = 0; d < ndims; ++d) + vec[d] = vec[d] * norm / std::sqrt(sum); + } writer.write((char *)vec, ndims * sizeof(float)); } @@ -104,8 +112,8 @@ int main(int argc, char **argv) { std::string data_type, output_file; size_t ndims, npts; - float norm; - + float norm, rand_scaling; + bool normalization = false; try { po::options_description desc{"Arguments"}; @@ -117,7 +125,11 @@ int main(int argc, char **argv) "File name for saving the random vectors"); desc.add_options()("ndims,D", po::value(&ndims)->required(), "Dimensoinality of the vector"); desc.add_options()("npts,N", po::value(&npts)->required(), "Number of 
vectors"); - desc.add_options()("norm", po::value(&norm)->required(), "Norm of the vectors"); + desc.add_options()("norm", po::value(&norm)->default_value(-1.0f), + "Norm of the vectors (if not specified, vectors are not normalized)"); + desc.add_options()("rand_scaling", po::value(&rand_scaling)->default_value(1.0f), + "Each vector will be scaled (if not explicitly normalized) by a factor randomly chosen from " + "[1, rand_scale]. Only applicable for floating point data"); po::variables_map vm; po::store(po::parse_command_line(argc, argv, desc), vm); if (vm.count("help")) @@ -139,9 +151,20 @@ int main(int argc, char **argv) return -1; } - if (norm <= 0.0) + if (norm > 0.0) + { + normalization = true; + } + + if (rand_scaling < 1.0) + { + std::cout << "We will only scale the vector norms randomly in [1, value], so value must be >= 1." << std::endl; + return -1; + } + + if ((rand_scaling > 1.0) && (normalization == true)) { - std::cerr << "Error: Norm must be a positive number" << std::endl; + std::cout << "Data cannot be normalized and randomly scaled at same time. Use one or the other." << std::endl; return -1; } @@ -155,6 +178,11 @@ int main(int argc, char **argv) << std::endl; return -1; } + if (rand_scaling > 1.0) + { + std::cout << "Data scaling only supported for floating point data." << std::endl; + return -1; + } } try @@ -177,7 +205,7 @@ int main(int argc, char **argv) size_t cblk_size = std::min(npts - i * blk_size, blk_size); if (data_type == std::string("float")) { - ret = block_write_float(writer, ndims, cblk_size, norm); + ret = block_write_float(writer, ndims, cblk_size, normalization, norm, rand_scaling); } else if (data_type == std::string("int8")) {