Skip to content

Commit 0d7df2b

Browse files
authored
Fix undefined behaviour of char bit shifting when combining classic indices (bingmann#6)
Previously when combining multiple classic indices into a single classic index, the contents of source indices are read in as `char`. During the interleaving process, depending on the current position of the destination index, both left and right shifts on the next char could be performed. However, there are a few undefined behaviours that could affect the results depending on the platform: 1. The signedness of a `char` is an undefined behaviour. Hence when bit shifting, the usual arithmetic conversion performed on the char is undefined. The char could be promoted to either signed int or unsigned int. 2. If the char is treated as signed int, the bit shifting (both left and right) is also undefined in pre-c++20 standards. The behaviour is platform dependent. This change fixes the issue by declare the contents read from source indices as `unsigned char`.
1 parent 221772c commit 0d7df2b

File tree

2 files changed

+76
-2
lines changed

2 files changed

+76
-2
lines changed

cobs/construction/classic_index.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ void classic_combine_streams(
247247

248248
// read many blocks from each file, interleave them into new block, and
249249
// write it out
250-
std::vector<std::vector<char> > in_blocks(streams.size());
250+
std::vector<std::vector<unsigned char> > in_blocks(streams.size());
251251
for (size_t i = 0; i < streams.size(); ++i) {
252252
in_blocks[i].resize(row_bytes[i] * batch_size);
253253
}
@@ -266,7 +266,7 @@ void classic_combine_streams(
266266
// read data from streams
267267
for (size_t i = 0; i < streams.size(); ++i) {
268268
streams[i].read(
269-
in_blocks[i].data(), row_bytes[i] * this_batch);
269+
(char*)(in_blocks[i].data()), row_bytes[i] * this_batch);
270270
LOG << "stream[" << i << "] read " << streams[i].gcount();
271271
die_unequal(row_bytes[i] * this_batch,
272272
static_cast<size_t>(streams[i].gcount()));

tests/classic_index_construction.cpp

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,14 @@
55
*
66
* All rights reserved. Published under the MIT License in the LICENSE file.
77
******************************************************************************/
8+
#include <fstream>
9+
#include <algorithm>
810

911
#include <fstream>
1012
#include <algorithm>
1113

1214
#include "test_util.hpp"
15+
#include <cobs/file/classic_index_header.hpp>
1316
#include <cobs/query/classic_index/mmap_search_file.hpp>
1417
#include <cobs/util/calc_signature_size.hpp>
1518
#include <cobs/util/file.hpp>
@@ -24,6 +27,20 @@ static fs::path index_dir = base_dir / "index";
2427
static fs::path index_file = base_dir / "index.cobs_classic";
2528
static fs::path tmp_path = base_dir / "tmp";
2629

30+
// Compare two classic indices. Return true if the bloom filters of both files are the same.
31+
bool compare_classic_indices(const std::string& filename1, const std::string& filename2)
32+
{
33+
std::ifstream ifs1, ifs2;
34+
35+
cobs::deserialize_header<cobs::ClassicIndexHeader>(ifs1, filename1);
36+
cobs::deserialize_header<cobs::ClassicIndexHeader>(ifs2, filename2);
37+
38+
std::istreambuf_iterator<char> begin1(ifs1);
39+
std::istreambuf_iterator<char> begin2(ifs2);
40+
41+
return std::equal(begin1,std::istreambuf_iterator<char>(),begin2); //Second argument is end-of-range iterator
42+
}
43+
2744
// Compare two files. Return true if the contents of both files are the same.
2845
bool compare_files(const std::string& filename1, const std::string& filename2)
2946
{
@@ -220,4 +237,61 @@ TEST_F(classic_index_construction, combined_index_same_as_classic_constructed) {
220237
ASSERT_TRUE(compare_files(combined_index.string(), classic_constructed_index));
221238
}
222239

240+
TEST_F(classic_index_construction, same_documents_combined_into_same_index) {
241+
// This test starts with 18 copies of the same randomly generated document.
242+
// These documents are split in 4 groups: g1 with 1 copy, g2 with 2 copies,
243+
// g7 with 7 copies and g8 with 8 copies.
244+
// A classic index is constructed through `cobs::classic_construct` for these
245+
// 4 groups: c1, c2, c7 and c8.
246+
// We then combine these 4 classic indices in this way: c1 + c8 = c1c8 and
247+
// c2 + c7 = c2c7.
248+
// The point of this test is to verify that c1c8 and c2c7 are the same.
249+
using cobs::pad_index;
250+
fs::create_directories(index_dir);
251+
fs::create_directories(index_dir/pad_index(0));
252+
fs::create_directories(index_dir/pad_index(1));
253+
fs::create_directories(index_dir/pad_index(18));
254+
fs::create_directories(index_dir/pad_index(27));
255+
256+
// prepare 4 groups of copies of a randomly generated document
257+
std::string random_doc_src_string = cobs::random_sequence(1000, 1);
258+
auto random_doc_one_copy = generate_documents_one(random_doc_src_string, 1);
259+
auto random_doc_two_copies = std::vector<cobs::KMerBuffer<31> >(2, random_doc_one_copy[0]);
260+
auto random_doc_seven_copies = std::vector<cobs::KMerBuffer<31> >(7, random_doc_one_copy[0]);
261+
auto random_doc_eight_copies = std::vector<cobs::KMerBuffer<31> >(8, random_doc_one_copy[0]);
262+
generate_test_case(random_doc_one_copy, "random_", input_dir/pad_index(1));
263+
generate_test_case(random_doc_two_copies, "random_", input_dir/pad_index(2));
264+
generate_test_case(random_doc_seven_copies, "random_", input_dir/pad_index(7));
265+
generate_test_case(random_doc_eight_copies, "random_", input_dir/pad_index(8));
266+
267+
cobs::ClassicIndexParameters index_params;
268+
index_params.false_positive_rate = 0.001; // in order to use large signature size
269+
index_params.mem_bytes = 80;
270+
index_params.num_threads = 1;
271+
index_params.continue_ = true;
272+
273+
// generate a classic index for each document groups
274+
cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(1)),
275+
index_dir/pad_index(18)/(pad_index(1) + ".cobs_classic"),
276+
tmp_path, index_params);
277+
cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(8)),
278+
index_dir/pad_index(18)/(pad_index(8) + ".cobs_classic"),
279+
tmp_path, index_params);
280+
cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(2)),
281+
index_dir/pad_index(27)/(pad_index(2) + ".cobs_classic"),
282+
tmp_path, index_params);
283+
cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(7)),
284+
index_dir/pad_index(27)/(pad_index(7) + ".cobs_classic"),
285+
tmp_path, index_params);
286+
287+
// generate a combined index fro both classic constructed index
288+
fs::path c1c8, c2c7;
289+
cobs::classic_combine(index_dir/pad_index(18), index_dir/pad_index(0), c1c8,
290+
80, 1, false);
291+
cobs::classic_combine(index_dir/pad_index(27), index_dir/pad_index(1), c2c7,
292+
80, 1, false);
293+
294+
ASSERT_TRUE(compare_classic_indices(c1c8.string(), c2c7.string()));
295+
}
296+
223297
/******************************************************************************/

0 commit comments

Comments
 (0)