Skip to content

Commit

Permalink
Fast merging
Browse files Browse the repository at this point in the history
  • Loading branch information
jltsiren committed Nov 26, 2017
1 parent 7cd8431 commit 5e333bb
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 31 deletions.
64 changes: 39 additions & 25 deletions merge_gbwt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,6 @@ const std::string tool_name = "GBWT merging";

void printUsage(int exit_code = EXIT_SUCCESS);

size_type insert(DynamicGBWT& index, const std::string input_name, size_type batch_size);

//------------------------------------------------------------------------------

int
Expand All @@ -45,14 +43,17 @@ main(int argc, char** argv)
if(argc < 5) { printUsage(); }

size_type batch_size = DynamicGBWT::MERGE_BATCH_SIZE;
bool fast_merging = false;
std::string output;
int c = 0;
while((c = getopt(argc, argv, "b:o:")) != -1)
while((c = getopt(argc, argv, "b:fo:")) != -1)
{
switch(c)
{
case 'b':
batch_size = std::stoul(optarg); break;
case 'f':
fast_merging = true; break;
case 'o':
output = optarg; break;
case '?':
Expand All @@ -64,7 +65,6 @@ main(int argc, char** argv)

size_type input_files = argc - optind;
size_type total_inserted = 0;
std::string first_input = argv[optind]; optind++;
if(input_files <= 1 || output.empty()) { printUsage(EXIT_FAILURE); }

Version::print(std::cout, tool_name);
Expand All @@ -76,19 +76,42 @@ main(int argc, char** argv)

double start = readTimer();

DynamicGBWT index;
sdsl::load_from_file(index, first_input + DynamicGBWT::EXTENSION);
printStatistics(index, first_input);

while(optind < argc)
if(fast_merging)
{
std::string input_name = argv[optind];
total_inserted += insert(index, input_name, batch_size);
optind++;
std::vector<GBWT> indexes(argc - optind);
for(int i = optind; i < argc; i++)
{
std::string input_name = argv[i];
sdsl::load_from_file(indexes[i - optind], input_name + GBWT::EXTENSION);
printStatistics(indexes[i - optind], input_name);
total_inserted += indexes[i - optind].size();
}
GBWT merged(indexes);
sdsl::store_to_file(merged, output + GBWT::EXTENSION);
printStatistics(merged, output);
}
else
{
DynamicGBWT index;
{
std::string input_name = argv[optind];
sdsl::load_from_file(index, input_name + DynamicGBWT::EXTENSION);
printStatistics(index, input_name);
optind++;
}
while(optind < argc)
{
std::string input_name = argv[optind];
GBWT next;
sdsl::load_from_file(next, input_name + GBWT::EXTENSION);
printStatistics(next, input_name);
index.merge(next, batch_size);
total_inserted += next.size();
optind++;
}
sdsl::store_to_file(index, output + DynamicGBWT::EXTENSION);
printStatistics(index, output);
}

sdsl::store_to_file(index, output + DynamicGBWT::EXTENSION);
printStatistics(index, output);

double seconds = readTimer() - start;

Expand All @@ -109,6 +132,7 @@ printUsage(int exit_code)

std::cerr << "Usage: merge_gbwt [options] -o output input1 input2 [input3 ...]" << std::endl;
std::cerr << " -b N Use batches of N sequences for merging (default: " << DynamicGBWT::MERGE_BATCH_SIZE << ")" << std::endl;
std::cerr << " -f Fast merging algorithm (node ids must not overlap)" << std::endl;
std::cerr << " -o X Use X as the base name for output (required)" << std::endl;
std::cerr << std::endl;
std::cerr << "Use base names for the inputs and the output." << std::endl;
Expand All @@ -117,14 +141,4 @@ printUsage(int exit_code)
std::exit(exit_code);
}

/*
  Merges one additional GBWT into the dynamic index.

  Loads the compressed GBWT with base name 'input_name' (the file name is
  the base name plus GBWT::EXTENSION), prints its statistics, merges it
  into 'index' using batches of 'batch_size' sequences, and returns the
  number of sequences that were inserted.
*/
size_type
insert(DynamicGBWT& index, const std::string input_name, size_type batch_size)
{
  GBWT increment;
  sdsl::load_from_file(increment, input_name + GBWT::EXTENSION);
  printStatistics(increment, input_name);
  // Merge first; report the size of the increment afterwards.
  index.merge(increment, batch_size);
  return increment.size();
}

//------------------------------------------------------------------------------
44 changes: 38 additions & 6 deletions support.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,8 @@ RecordArray::RecordArray(const std::vector<DynamicRecord>& bwt) :
this->buildIndex(offsets);
}

RecordArray::RecordArray(const std::vector<RecordArray const*> sources, const sdsl::int_vector<0>& origins, const std::vector<size_type>& record_offsets)
RecordArray::RecordArray(const std::vector<RecordArray const*> sources, const sdsl::int_vector<0>& origins, const std::vector<size_type>& record_offsets) :
records(origins.size())
{
size_type data_size = 0;
for(auto source : sources) { data_size += source->data.size(); }
Expand Down Expand Up @@ -528,7 +529,7 @@ RecordArray::serialize(std::ostream& out, sdsl::structure_tree_node* v, std::str
// Serialize the data.
size_type data_bytes = this->data.size() * sizeof(byte_type);
sdsl::structure_tree_node* data_node =
sdsl::structure_tree::add_child(child, "data", "std::vector<gbwt::byte_type>");
sdsl::structure_tree::add_child(child, "data", "std::vector<gbwt::byte_type>");
out.write((const char*)(this->data.data()), data_bytes);
sdsl::structure_tree::add_size(data_node, data_bytes);
written_bytes += data_bytes;
Expand Down Expand Up @@ -631,8 +632,6 @@ DASamples::DASamples(const std::vector<DynamicRecord>& bwt)

DASamples::DASamples(const std::vector<DASamples const*> sources, const sdsl::int_vector<0>& origins, const std::vector<size_type>& record_offsets, const std::vector<size_type>& sequence_counts)
{
// FIXME handle ENDMARKER as a special case

// Compute statistics and build iterators over the sources.
size_type sample_count = 0, total_sequences = 0;
std::vector<size_type> sequence_offsets(sources.size(), 0);
Expand All @@ -648,9 +647,25 @@ DASamples::DASamples(const std::vector<DASamples const*> sources, const sdsl::in
}

// Compute statistics over the records and mark the sampled nodes.
// Note that the endmarker requires special treatment.
size_type record_count = 0, bwt_offsets = 0;
this->sampled_records = sdsl::bit_vector(origins.size(), 0);
for(size_type i = 0; i < origins.size(); i++)
bool sample_endmarker = false;
for(size_type origin = 0; origin < sources.size(); origin++)
{
if(sources[origin]->isSampled(ENDMARKER))
{
sample_endmarker = true;
++range_iterators[origin];
}
}
if(sample_endmarker)
{
record_count++;
bwt_offsets += total_sequences;
this->sampled_records[ENDMARKER] = 1;
}
for(size_type i = 1; i < origins.size(); i++)
{
size_type origin = origins[i];
if(origin >= sources.size()) { continue; } // No record.
Expand All @@ -672,11 +687,28 @@ DASamples::DASamples(const std::vector<DASamples const*> sources, const sdsl::in
}

// Build the bitvectors over BWT offsets and store the samples.
// The endmarker requires special treatment again.
sdsl::sd_vector_builder range_builder(bwt_offsets, record_count);
sdsl::sd_vector_builder offset_builder(bwt_offsets, sample_count);
this->array = sdsl::int_vector<0>(sample_count, 0, bit_length(total_sequences - 1));
size_type record_start = 0, curr = 0;
for(size_type i = 0; i < origins.size(); i++)
if(sample_endmarker)
{
range_builder.set(record_start);
for(size_type origin = 0; origin < sources.size(); origin++)
{
if(!(sources[origin]->isSampled(ENDMARKER))) { continue; }
while(!(sample_iterators[origin].end()) && sample_iterators[origin].offset() < range_iterators[origin].limit())
{
offset_builder.set((sample_iterators[origin]).offset() + sequence_offsets[origin]);
this->array[curr] = *(sample_iterators[origin]) + sequence_offsets[origin]; curr++;
++sample_iterators[origin];
}
++range_iterators[origin];
}
record_start += total_sequences;
}
for(size_type i = 1; i < origins.size(); i++)
{
if(!(this->isSampled(i))) { continue; }
size_type origin = origins[i];
Expand Down

0 comments on commit 5e333bb

Please sign in to comment.