Skip to content

Commit

Permalink
Minimizer size as compile time variable and fix issue with reading kmer size from graph during update operation
Browse files Browse the repository at this point in the history
  • Loading branch information
Guillaume Holley committed Feb 21, 2022
1 parent 5df0dce commit 9379ab7
Show file tree
Hide file tree
Showing 12 changed files with 101 additions and 409 deletions.
8 changes: 6 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@ project(Bifrost)

# Compile-time bound on the k-mer length. To allow larger k-mers, set this to a larger multiple of 32: the largest usable k is MAX_KMER_SIZE-1.
set(MAX_KMER_SIZE "32" CACHE STRING "MAX_KMER_SIZE")
# Compile-time bound on the minimizer (g-mer) length; its default is taken from MAX_KMER_SIZE.
set(MAX_GMER_SIZE "${MAX_KMER_SIZE}" CACHE STRING "MAX_GMER_SIZE")
# Architecture optimizations setting
set(COMPILATION_ARCH "native" CACHE STRING "COMPILATION_ARCH")
# AVX2 instructions setting
set(ENABLE_AVX2 "ON" CACHE STRING "ENABLE_AVX2")

# Set some default compile flags
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c11")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c11 -DXXH_NAMESPACE=BIFROST_HASH_")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -DXXH_NAMESPACE=BIFROST_HASH_")

set_property(SOURCE BlockedBloomFilter.cpp APPEND_STRING PROPERTY COMPILE_FLAGS " -funroll-loops")

Expand Down Expand Up @@ -52,4 +53,7 @@ endif(CMAKE_BUILD_TYPE MATCHES Debug)
# Report the largest usable k-mer length (one less than MAX_KMER_SIZE).
math(EXPR PRINT_MAX_KMER_SIZE "${MAX_KMER_SIZE}-1")
message("Maximum k-mer size: ${PRINT_MAX_KMER_SIZE}")

# Same report for the minimizer (g-mer) length bound.
math(EXPR PRINT_MAX_GMER_SIZE "${MAX_GMER_SIZE}-1")
message("Maximum g-mer size: ${PRINT_MAX_GMER_SIZE}")

# Descend into src/ and process its CMakeLists.txt.
add_subdirectory(src)
57 changes: 29 additions & 28 deletions src/Bifrost.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ void PrintUsage() {
cout << " -B, --bloom-bits2 Number of Bloom filter bits per k-mer with 2+ occurrences in the input files (default is 14)" << endl;
cout << " -l, --load-mbbf Input Blocked Bloom Filter file, skips filtering step (default is no input)" << endl;
cout << " -w, --write-mbbf Output Blocked Bloom Filter file (default is no output)" << endl;
cout << " -u, --chunk-size Read chunk size per thread (default is 64)" << endl << endl;

cout << " > Optional with no argument:" << endl << endl;

Expand Down Expand Up @@ -108,7 +107,7 @@ int parse_ProgramOptions(int argc, char **argv, CCDBG_Build_opt& opt) {

int option_index = 0, c;

const char* opt_string = "s:r:q:g:f:o:t:k:m:e:b:B:l:w:u:nidvcya";
const char* opt_string = "s:r:q:g:f:o:t:k:m:e:b:B:l:w:nidvcya";

static struct option long_options[] = {

Expand All @@ -126,7 +125,6 @@ int parse_ProgramOptions(int argc, char **argv, CCDBG_Build_opt& opt) {
{"bloom-bits2", required_argument, 0, 'B'},
{"load-mbbf", required_argument, 0, 'l'},
{"write-mbbf", required_argument, 0, 'w'},
{"chunk-size", required_argument, 0, 'u'},
{"inexact_search", no_argument, 0, 'n'},
{"clip-tips", no_argument, 0, 'i'},
{"del-isolated", no_argument, 0, 'd'},
Expand Down Expand Up @@ -192,9 +190,6 @@ int parse_ProgramOptions(int argc, char **argv, CCDBG_Build_opt& opt) {
case 'l':
opt.inFilenameBBF = optarg;
break;
case 'u':
opt.read_chunksize = atoi(optarg);
break;
case 'n':
opt.inexact_search = true;
break;
Expand Down Expand Up @@ -431,12 +426,6 @@ bool check_ProgramOptions(CCDBG_Build_opt& opt) {

if (opt.build){ // Check param. command build

if (opt.read_chunksize <= 0) {

cerr << "Error: Chunk size of reads to share among threads cannot be less than or equal to 0." << endl;
ret = false;
}

if (opt.outFilenameBBF.length() != 0){

FILE* fp = fopen(opt.outFilenameBBF.c_str(), "wb");
Expand Down Expand Up @@ -563,45 +552,57 @@ int main(int argc, char **argv){
}
else if (opt.update){

if (opt.filename_colors_in.size() != 0){ // If colors in or out
CCDBG_Build_opt l_opt = opt;

if (l_opt.filename_colors_in.size() != 0){ // If colors in or out

ColoredCDBG<> ccdbg1(opt.k, opt.g);
ColoredCDBG<> ccdbg2(opt.k, opt.g);
ColoredCDBG<> ccdbg1(l_opt.k, l_opt.g);

ccdbg1.read(opt.filename_graph_in, opt.filename_colors_in, opt.nb_threads, opt.verbose);
ccdbg2.buildGraph(opt);
ccdbg2.buildColors(opt);
ccdbg1.read(l_opt.filename_graph_in, l_opt.filename_colors_in, l_opt.nb_threads, l_opt.verbose);

l_opt.k = ccdbg1.getK();
l_opt.g = ccdbg1.getG();

ColoredCDBG<> ccdbg2(l_opt.k, l_opt.g);

ccdbg2.buildGraph(l_opt);
ccdbg2.buildColors(l_opt);

const size_t ccdbg1_len = ccdbg1.length();
const size_t ccdbg2_len = ccdbg2.length();

ColoredCDBG<>& ccdbg_a = (ccdbg1_len > ccdbg2_len) ? ccdbg1 : ccdbg2;
ColoredCDBG<>& ccdbg_b = (ccdbg1_len > ccdbg2_len) ? ccdbg2 : ccdbg1;

ccdbg_a.merge(move(ccdbg_b), opt.nb_threads, opt.verbose);
ccdbg_a.merge(move(ccdbg_b), l_opt.nb_threads, l_opt.verbose);

ccdbg_a.simplify(opt.deleteIsolated, opt.clipTips, opt.verbose);
ccdbg_a.write(opt.prefixFilenameOut, opt.nb_threads, opt.verbose);
ccdbg_a.simplify(l_opt.deleteIsolated, l_opt.clipTips, l_opt.verbose);
ccdbg_a.write(l_opt.prefixFilenameOut, l_opt.nb_threads, l_opt.verbose);
}
else {

CompactedDBG<> cdbg1(opt.k, opt.g);
CompactedDBG<> cdbg2(opt.k, opt.g);
CompactedDBG<> cdbg1(l_opt.k, l_opt.g);

cdbg1.read(l_opt.filename_graph_in, l_opt.nb_threads, l_opt.verbose);

l_opt.k = cdbg1.getK();
l_opt.g = cdbg1.getG();

CompactedDBG<> cdbg2(l_opt.k, l_opt.g);

cdbg1.read(opt.filename_graph_in, opt.nb_threads, opt.verbose);
cdbg2.build(opt);
cdbg2.build(l_opt);

const size_t cdbg1_len = cdbg1.length();
const size_t cdbg2_len = cdbg2.length();

CompactedDBG<>& cdbg_a = (cdbg1_len > cdbg2_len) ? cdbg1 : cdbg2;
CompactedDBG<>& cdbg_b = (cdbg1_len > cdbg2_len) ? cdbg2 : cdbg1;

cdbg_a.merge(cdbg_b, opt.nb_threads, opt.verbose);
cdbg_a.merge(cdbg_b, l_opt.nb_threads, l_opt.verbose);
cdbg_b.clear();

cdbg_a.simplify(opt.deleteIsolated, opt.clipTips, opt.verbose);
cdbg_a.write(opt.prefixFilenameOut, opt.nb_threads, opt.outputGFA, opt.verbose);
cdbg_a.simplify(l_opt.deleteIsolated, l_opt.clipTips, l_opt.verbose);
cdbg_a.write(l_opt.prefixFilenameOut, l_opt.nb_threads, l_opt.outputGFA, l_opt.verbose);
}
}
else if (opt.query){
Expand Down
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ file(GLOB headers *.h *.hpp *.hh *.tcc)
list(REMOVE_ITEM sources Bifrost.cpp)

# Hand both compile-time size limits to the compiler as preprocessor definitions.
add_definitions(-DMAX_KMER_SIZE=${MAX_KMER_SIZE} -DMAX_GMER_SIZE=${MAX_GMER_SIZE})

add_library(bifrost_static STATIC ${sources} ${headers})
add_library(bifrost_dynamic SHARED ${sources} ${headers})
Expand Down Expand Up @@ -36,3 +37,4 @@ install(TARGETS Bifrost DESTINATION bin)
# Both library flavours are installed under lib/.
install(TARGETS bifrost_dynamic bifrost_static DESTINATION lib)
# The globbed headers, plus xxhash.c, are installed under include/bifrost.
install(FILES ${headers} xxhash.c DESTINATION include/bifrost)
7 changes: 1 addition & 6 deletions src/ColoredCDBG.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,16 +160,11 @@ class ColoredCDBG : public CompactedDBG<DataAccessor<Unitig_data_t>, DataStorage

public:

/** Constructor (set up an empty colored cdBG).
* @param kmer_length is the length k of k-mers used in the graph (each unitig is of length at least k).
*/
ColoredCDBG(int kmer_length = DEFAULT_K);

/** Constructor (set up an empty colored cdBG).
* @param kmer_length is the length k of k-mers used in the graph (each unitig is of length at least k).
* @param minimizer_length is the length g of minimizers (g < k) used in the graph.
*/
ColoredCDBG(int kmer_length, int minimizer_length);
ColoredCDBG(int kmer_length = DEFAULT_K, int minimizer_length = -1);

/** Copy constructor (copy a colored cdBG).
* This function is expensive in terms of time and memory as the content of a colored and compacted
Expand Down
16 changes: 6 additions & 10 deletions src/ColoredCDBG.tcc
Original file line number Diff line number Diff line change
@@ -1,12 +1,6 @@
#ifndef BIFROST_COLOREDCDBG_TCC
#define BIFROST_COLOREDCDBG_TCC

template<typename U>
ColoredCDBG<U>::ColoredCDBG(int kmer_length) : CompactedDBG<DataAccessor<U>, DataStorage<U>>(kmer_length){

invalid = this->isInvalid();
}

template<typename U>
ColoredCDBG<U>::ColoredCDBG(int kmer_length, int minimizer_length) : CompactedDBG<DataAccessor<U>, DataStorage<U>>(kmer_length, minimizer_length){

Expand Down Expand Up @@ -816,8 +810,8 @@ void ColoredCDBG<U>::buildUnitigColors(const size_t nb_threads){

const size_t nb_locks = nb_threads * 1024;
const size_t chunk_size = 64;
const size_t max_len_seq = 1024;
const size_t thread_seq_buf_sz = chunk_size * max_len_seq;
const size_t max_len_seq = rndup(static_cast<size_t>(1024 + k_ - 1));
const size_t thread_seq_buf_sz = BUFFER_SIZE;
const size_t thread_col_buf_sz = (thread_seq_buf_sz / (k_ + 1)) + 1;

size_t prev_file_id = 0;
Expand Down Expand Up @@ -1330,8 +1324,10 @@ bool ColoredCDBG<U>::search(const vector<string>& query_filenames, const string&

const size_t k = this->getK();

const size_t max_len_seq = 1024;
const size_t thread_seq_buf_sz = 64 * max_len_seq;
//const size_t max_len_seq = 1024;
//const size_t thread_seq_buf_sz = 64 * max_len_seq;
const size_t max_len_seq = rndup(static_cast<size_t>(1024 + k - 1));
const size_t thread_seq_buf_sz = BUFFER_SIZE;

FileParser fp(query_filenames);

Expand Down
12 changes: 2 additions & 10 deletions src/CompactedDBG.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,6 @@ using namespace std;
* Print information messages during execution if true. Default is false.
* @var CDBG_Build_opt::nb_threads
* Number of threads to use for building the graph. Default is 1.
* @var CDBG_Build_opt::read_chunksize
* Number of reads a thread can read and process at a time. Default is 64.
* @var CDBG_Build_opt::nb_bits_unique_kmers_bf
* Number of Bloom filter bits per k-mer occurring at least once in the FASTA/FASTQ/GFA files of
* CDBG_Build_opt::filename_in. Default is 14.
Expand Down Expand Up @@ -132,7 +130,6 @@ struct CDBG_Build_opt {
bool verbose;

size_t nb_threads;
size_t read_chunksize;

size_t nb_bits_unique_kmers_bf;
size_t nb_bits_non_unique_kmers_bf;
Expand Down Expand Up @@ -170,7 +167,7 @@ struct CDBG_Build_opt {
vector<string> filename_query_in;

CDBG_Build_opt() : nb_threads(1), k(DEFAULT_K), g(-1), nb_bits_unique_kmers_bf(14),
nb_bits_non_unique_kmers_bf(14), read_chunksize(64), ratio_kmers(0.8),
nb_bits_non_unique_kmers_bf(14), ratio_kmers(0.8),
build(false), update(false), query(false), clipTips(false), deleteIsolated(false),
inexact_search(false), useMercyKmers(false), outputGFA(true), verbose(false) {}
};
Expand Down Expand Up @@ -324,16 +321,11 @@ class CompactedDBG {
typedef unitigIterator<U, G, false> iterator; /**< An iterator for the unitigs of the graph. No specific order is assumed. */
typedef unitigIterator<U, G, true> const_iterator; /**< A constant iterator for the unitigs of the graph. No specific order is assumed. */

/** Constructor (set up an empty compacted dBG).
* @param kmer_length is the length k of k-mers used in the graph (each unitig is of length at least k).
*/
CompactedDBG(const int kmer_length = DEFAULT_K);

/** Constructor (set up an empty compacted dBG).
* @param kmer_length is the length k of k-mers used in the graph (each unitig is of length at least k).
* @param minimizer_length is the length g of minimizers (g < k) used in the graph.
*/
CompactedDBG(const int kmer_length, const int minimizer_length);
CompactedDBG(const int kmer_length = DEFAULT_K, const int minimizer_length = -1);

/** Copy constructor (copy a compacted de Bruijn graph).
* This function is expensive in terms of time and memory as the content of a compacted
Expand Down
38 changes: 10 additions & 28 deletions src/CompactedDBG.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,6 @@ static const uint8_t bits[256] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};

template<typename U, typename G>
CompactedDBG<U, G>::CompactedDBG(const int kmer_length) : invalid(false) {

setKmerGmerLength(kmer_length);
}

template<typename U, typename G>
CompactedDBG<U, G>::CompactedDBG(const int kmer_length, const int minimizer_length) : invalid(false) {

Expand Down Expand Up @@ -279,12 +273,6 @@ bool CompactedDBG<U, G>::build(CDBG_Build_opt& opt){
construct_finished = false;
}
if (opt.read_chunksize <= 0){
cerr << "CompactedDBG::build(): Chunk size of reads to share among threads cannot be less than or equal to 0" << endl;
construct_finished = false;
}
if (opt.outFilenameBBF.length() != 0){
FILE* fp = fopen(opt.outFilenameBBF.c_str(), "wb");
Expand Down Expand Up @@ -527,12 +515,6 @@ bool CompactedDBG<U, G>::build(CDBG_Build_opt& opt){
construct_finished = false;
}

if (opt.read_chunksize <= 0){

cerr << "CompactedDBG::build(): Chunk size of reads to share among threads cannot be less than or equal to 0" << endl;
construct_finished = false;
}

if (opt.outFilenameBBF.length() != 0){

FILE* fp = fopen(opt.outFilenameBBF.c_str(), "wb");
Expand Down Expand Up @@ -1445,6 +1427,7 @@ UnitigMap<U, G> CompactedDBG<U, G>::find(const Kmer& km, const bool extremities_
while (it_min != it_min_end){

int mhr_pos = it_min.getPosition();

Minimizer minz(Minimizer(km_tmp + mhr_pos).rep());
MinimizerIndex::const_iterator it = hmap_min_unitigs.find(minz);

Expand Down Expand Up @@ -2819,8 +2802,10 @@ bool CompactedDBG<U, G>::filter(const CDBG_Build_opt& opt, const size_t nb_uniqu
size_t pos_read = 0;
size_t nb_seq = 0;

const size_t max_len_seq = 1024;
const size_t thread_seq_buf_sz = opt.read_chunksize * max_len_seq;
//const size_t max_len_seq = 1024;
//const size_t thread_seq_buf_sz = 64 * max_len_seq;
const size_t max_len_seq = rndup(static_cast<size_t>(1024 + k_ - 1));
const size_t thread_seq_buf_sz = BUFFER_SIZE;

const bool multi_threaded = (opt.nb_threads != 1);

Expand All @@ -2830,11 +2815,6 @@ bool CompactedDBG<U, G>::filter(const CDBG_Build_opt& opt, const size_t nb_uniqu

FileParser fp(filename_in);

//------------------------
//uint64_t wyp[4]; // "Secret" for wyhash
//make_secret(time(NULL), wyp); //Make secret for wyhash
//------------------------

// Main worker thread
auto worker_function = [&](char* seq_buf, const size_t seq_buf_sz) {

Expand Down Expand Up @@ -3030,8 +3010,10 @@ bool CompactedDBG<U, G>::construct(const CDBG_Build_opt& opt, const size_t nb_un
size_t len_read = 0;
size_t pos_read = 0;

const size_t max_len_seq = 1024;
const size_t thread_seq_buf_sz = opt.read_chunksize * max_len_seq;
//const size_t max_len_seq = 1024;
//const size_t thread_seq_buf_sz = 64 * max_len_seq;
const size_t max_len_seq = rndup(static_cast<size_t>(1024 + k_ - 1));
const size_t thread_seq_buf_sz = BUFFER_SIZE;

tiny_vector<Kmer, 2>* fp_candidate = nullptr;

Expand Down Expand Up @@ -8392,7 +8374,7 @@ void CompactedDBG<U, G>::setKmerGmerLength(const int kmer_length, const int mini
invalid = true;
}

if (minimizer_length >= MAX_KMER_SIZE){
if (minimizer_length >= MAX_GMER_SIZE){

// Report the bound that the check above actually compares against (MAX_GMER_SIZE, not MAX_KMER_SIZE).
cerr << "CompactedDBG::CompactedDBG(): Length g of minimizers cannot exceed or be equal to " << MAX_GMER_SIZE << endl;
invalid = true;
Expand Down
Loading

0 comments on commit 9379ab7

Please sign in to comment.