diff --git a/build_gbwt.cpp b/build_gbwt.cpp index e89c0db..55fe598 100644 --- a/build_gbwt.cpp +++ b/build_gbwt.cpp @@ -56,19 +56,23 @@ main(int argc, char** argv) if(argc < 2) { printUsage(); } size_type batch_size = DynamicGBWT::INSERT_BATCH_SIZE / MILLION; - bool verify_index = false; + bool verify_index = false, both_orientations = false; std::string index_base, input_base, output_base; int c = 0; - while((c = getopt(argc, argv, "b:i:o:v")) != -1) + while((c = getopt(argc, argv, "b:fi:o:rv")) != -1) { switch(c) { case 'b': batch_size = std::stoul(optarg); break; + case 'f': + both_orientations = false; break; case 'i': index_base = optarg; break; case 'o': output_base = optarg; break; + case 'r': + both_orientations = true; break; case 'v': verify_index = true; break; case '?': @@ -94,6 +98,7 @@ main(int argc, char** argv) printHeader("Input files"); std::cout << input_files << std::endl; printHeader("Output name"); std::cout << output_base << std::endl; if(batch_size != 0) { printHeader("Batch size"); std::cout << batch_size << " million" << std::endl; } + printHeader("Orientation"); std::cout << (both_orientations ? "both" : "forward only") << std::endl; std::cout << std::endl; double start = readTimer(); @@ -111,7 +116,7 @@ main(int argc, char** argv) printHeader("Input name"); std::cout << input_base << std::endl; text_buffer_type input(input_base); input_size += input.size(); - dynamic_index.insert(input, batch_size * MILLION); + dynamic_index.insert(input, batch_size * MILLION, both_orientations); optind++; } std::cout << std::endl; @@ -126,7 +131,8 @@ main(int argc, char** argv) std::cout << "Memory usage " << inGigabytes(memoryUsage()) << " GB" << std::endl; std::cout << std::endl; - if(verify_index) + // FIXME verify both orientations + if(verify_index && !both_orientations) { std::cout << "Verifying the index..." 
<< std::endl; double verify_start = readTimer(); @@ -160,8 +166,10 @@ printUsage(int exit_code) std::cerr << "Usage: build_gbwt [options] input1 [input2 ...]" << std::endl; std::cerr << " -b N Insert in batches of N million nodes (default: " << (DynamicGBWT::INSERT_BATCH_SIZE / MILLION) << ")" << std::endl; + std::cerr << " -f Index the sequences only in forward orientation (default)" << std::endl; std::cerr << " -i X Insert the sequences into an existing index with base name X" << std::endl; std::cerr << " -o X Use base name X for output (default: the only input)" << std::endl; + std::cerr << " -r Index the sequences also in reverse orientation" << std::endl; std::cerr << " -v Verify the index after construction" << std::endl; std::cerr << std::endl; diff --git a/dynamic_gbwt.cpp b/dynamic_gbwt.cpp index cbc8e97..49b8841 100644 --- a/dynamic_gbwt.cpp +++ b/dynamic_gbwt.cpp @@ -652,7 +652,7 @@ DynamicGBWT::insert(const std::vector& text) } void -DynamicGBWT::insert(text_buffer_type& text, size_type batch_size) +DynamicGBWT::insert(text_buffer_type& text, size_type batch_size, bool both_orientations) { double start = readTimer(); @@ -675,7 +675,7 @@ DynamicGBWT::insert(text_buffer_type& text, size_type batch_size) std::vector sequence; for(size_type node : text) { - if(node == ENDMARKER) { builder.insert(sequence); sequence.clear(); } + if(node == ENDMARKER) { builder.insert(sequence, both_orientations); sequence.clear(); } else { sequence.push_back(node); } } - if(!(sequence.empty())) { builder.insert(sequence); sequence.clear(); } + if(!(sequence.empty())) { builder.insert(sequence, both_orientations); sequence.clear(); } diff --git a/include/gbwt/dynamic_gbwt.h b/include/gbwt/dynamic_gbwt.h index 967e39f..1747046 100644 --- a/include/gbwt/dynamic_gbwt.h +++ b/include/gbwt/dynamic_gbwt.h @@ -79,9 +79,10 @@ class DynamicGBWT /* Use the above to insert the sequences in batches of up to 'batch_size' nodes. Use batch - size 0 to insert the entire text at once. 
By default, the sequences are only inserted in + forward orientation. Set both_orientations = true to insert the reverse complement as well. */ - void insert(text_buffer_type& text, size_type batch_size = INSERT_BATCH_SIZE); + void insert(text_buffer_type& text, size_type batch_size = INSERT_BATCH_SIZE, bool both_orientations = false); /* Insert the sequences from the other GBWT into this. Use batch size 0 to insert all