From 7a074b7989206db0fb0e89cdfde807c8f85b5b48 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Tue, 23 Feb 2021 12:01:49 -0500 Subject: [PATCH 001/350] Templatize hash table with bits-per-kmer --- include/Annotated_Kmer.hpp | 18 ++++++---- include/CdBG.hpp | 2 +- include/Kmer_Hash_Entry_API.hpp | 61 ++++++++++++++++----------------- include/Kmer_Hash_Table.hpp | 53 ++++++++++++++-------------- include/State.hpp | 12 +++---- include/globals.hpp | 10 ++++-- src/CdBG_Builder.cpp | 10 +++--- src/CdBG_GFA_Writer.cpp | 2 +- src/CdBG_Plain_Writer.cpp | 2 +- src/Kmer_Hash_Table.cpp | 30 ++++++++-------- 10 files changed, 105 insertions(+), 95 deletions(-) diff --git a/include/Annotated_Kmer.hpp b/include/Annotated_Kmer.hpp index 81919bea..30ed5532 100644 --- a/include/Annotated_Kmer.hpp +++ b/include/Annotated_Kmer.hpp @@ -8,8 +8,8 @@ #include "Kmer_Hash_Table.hpp" -// Complete k-mer information: k-mer itself and its reverse complement, -// canonical form, direction, index in corresponding sequence, and its class. +// Complete k-mer information: the k-mer itself and its reverse complement, canonical +// form, direction, index in the corresponding sequence, and its state-class. template class Annotated_Kmer: public Directed_Kmer { @@ -25,7 +25,10 @@ class Annotated_Kmer: public Directed_Kmer {} // Constructs an annotated k-mer with its complete information. - Annotated_Kmer(const Kmer& kmer, size_t kmer_idx, const Kmer_Hash_Table& hash); + // The template parameter `BITS_PER_KEY` is required to access + // the hash table `hash`. + template + Annotated_Kmer(const Kmer& kmer, size_t kmer_idx, const Kmer_Hash_Table& hash); // Copy constructs the annotated k-mer from `rhs`. Annotated_Kmer(const Annotated_Kmer& rhs) = default; @@ -34,7 +37,8 @@ class Annotated_Kmer: public Directed_Kmer // appending the next base `next_base` to the end, i.e. rolls // the k-mer by one base, sets all the relevant k-mer // information accordingly (k-mer state is set using the `hash`). - void roll_to_next_kmer(char next_base, const Kmer_Hash_Table& hash); + template + void roll_to_next_kmer(char next_base, const Kmer_Hash_Table& hash); void operator=(const Annotated_Kmer& rhs); @@ -47,13 +51,15 @@ class Annotated_Kmer: public Directed_Kmer template -inline Annotated_Kmer::Annotated_Kmer(const Kmer& kmer, const size_t kmer_idx, const Kmer_Hash_Table& hash): +template +inline Annotated_Kmer::Annotated_Kmer(const Kmer& kmer, const size_t kmer_idx, const Kmer_Hash_Table& hash): Directed_Kmer(kmer), idx_(kmer_idx), state_class_(hash[this->canonical_].state_class()) {} template -inline void Annotated_Kmer::roll_to_next_kmer(const char next_base, const Kmer_Hash_Table& hash) +template +inline void Annotated_Kmer::roll_to_next_kmer(const char next_base, const Kmer_Hash_Table& hash) { Directed_Kmer::roll_to_next_kmer(next_base); diff --git a/include/CdBG.hpp b/include/CdBG.hpp index 1e974f97..18e0e138 100644 --- a/include/CdBG.hpp +++ b/include/CdBG.hpp @@ -25,7 +25,7 @@ class CdBG private: const Build_Params params; // Required parameters wrapped in one object. - Kmer_Hash_Table Vertices; // The hash table for the vertices (canonical k-mers) of the de Bruijn graph. + Kmer_Hash_Table Vertices; // The hash table for the vertices (canonical k-mers) of the de Bruijn graph. // Minimum size of a partition to be processed by one thread. 
static constexpr uint16_t PARTITION_SIZE_THRESHOLD = 1; diff --git a/include/Kmer_Hash_Entry_API.hpp b/include/Kmer_Hash_Entry_API.hpp index b13f1578..b2200ed7 100644 --- a/include/Kmer_Hash_Entry_API.hpp +++ b/include/Kmer_Hash_Entry_API.hpp @@ -8,20 +8,29 @@ #include "State.hpp" -template class Kmer_Hash_Table; +template class Kmer_Hash_Table; // Wrapper class acting as an API to the entries of the bitvector used as hash table for k-mers. +template class Kmer_Hash_Entry_API +{}; + + +// Instantiation of the API class used in reference-dBG compaction. +template <> +class Kmer_Hash_Entry_API { - template + template friend class Kmer_Hash_Table; + typedef compact::iterator_imp::lhs_setter bitvector_entry_t; + private: // Position information (base pointer and offset) for the bitvector entry. - cuttlefish::bitvector_entry_t bv_entry; + bitvector_entry_t bv_entry; // Value read from the bitvector entry when the object is constructed; is immutable. const State state_read; @@ -31,48 +40,36 @@ class Kmer_Hash_Entry_API // Constructs an API to the bitvector entry `bv_entry`. - Kmer_Hash_Entry_API(const cuttlefish::bitvector_entry_t& bv_entry); + Kmer_Hash_Entry_API(const bitvector_entry_t& bv_entry): + bv_entry(bv_entry), state_read(bv_entry) + { + state = state_read; + } // Returns the state value read when the object was constructed. - cuttlefish::state_code_t get_read_state() const; + cuttlefish::state_code_t get_read_state() const + { + return state_read.get_state(); + } // Returns the value of the mutable state value wrapped inside the API, // i.e. the state value that had been read at the object creation, and then // possibly have been modified. - cuttlefish::state_code_t get_current_state() const; + cuttlefish::state_code_t get_current_state() const + { + return state.get_state(); + } public: // Returns a reference to the mutable copy of the wrapped state value. - State& get_state(); + State& get_state() + { + return state; + } }; -inline Kmer_Hash_Entry_API::Kmer_Hash_Entry_API(const cuttlefish::bitvector_entry_t& bv_entry): - bv_entry(bv_entry), state_read(bv_entry) -{ - state = state_read; -} - - -inline cuttlefish::state_code_t Kmer_Hash_Entry_API::get_read_state() const -{ - return state_read.get_state(); -} - - -inline cuttlefish::state_code_t Kmer_Hash_Entry_API::get_current_state() const -{ - return state.get_state(); -} - - -inline State& Kmer_Hash_Entry_API::get_state() -{ - return state; -} - - #endif diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index 4d4b1852..9f8cb6b0 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -17,13 +17,15 @@ template class CdBG; -template +template class Kmer_Hash_Table { friend class CdBG; typedef boomphf::mphf, Kmer_Hasher> mphf_t; // The MPH function type. + typedef compact::ts_vector> bitvector_t; + private: // Lowest bits/elem is achieved with gamma = 1, higher values lead to larger mphf but faster construction/query. @@ -32,17 +34,19 @@ class Kmer_Hash_Table // The MPH function. mphf_t* mph = NULL; - // The values (`State`) collection for the hash table; - // keys (`kmer_t`) are passed to the MPHF, and the resulting function-value is used as index in the values table. - cuttlefish::bitvector_t hash_table; + // The buckets collection (raw `State` representations) for the hash table structure. + // Keys (`Kmer`) are passed to the MPHF, and the resulting function-value is used as index into the buckets table. 
+ bitvector_t hash_table; - // Number of locks for thread-safe access to the bitvector `hash_table`. + // Number of locks for mutually exclusive access for threads to the same indices into the bitvector `hash_table`. + // TODO: increase locks and check note at the end about the false `const` issue. constexpr static uint64_t lock_count{65536}; // Number of contiguous entries of the bitvector that each lock is assigned to. + // TODO: try making it `const`. uint64_t lock_range_size; - // The locks to maintain mutually exclusive access for threads, to the bitvector `hash_table`. + // The locks to maintain mutually exclusive access for threads to the same indices into the bitvector `hash_table`. std::array locks_; @@ -72,12 +76,10 @@ class Kmer_Hash_Table // Returns an API to the entry (in the hash table) for a k-mer hashing // to the bucket number `bucket_id` of the hash table. The API wraps // the hash table position and the state value at that position. - Kmer_Hash_Entry_API operator[](uint64_t bucket_id); + Kmer_Hash_Entry_API operator[](uint64_t bucket_id); -public: - Kmer_Hash_Table() - {} +public: // Constructs a minimal perfect hash function (specifically, the BBHash) for // the collection of k-mers present at the KMC database at path `kmc_db_path`, @@ -88,51 +90,52 @@ class Kmer_Hash_Table // Returns an API to the entry (in the hash table) for the key `kmer`. The API // wraps the hash table position and the state value at that position. - Kmer_Hash_Entry_API operator[](const Kmer& kmer); + Kmer_Hash_Entry_API operator[](const Kmer& kmer); // Returns the value (in the hash-table) for the key `kmer`. State operator[](const Kmer& kmer) const; // Attempts to update the entry (in the hash-table) for the API object according - // to its wrapped state values, and returns true or false as per success + // to its wrapped state values, and returns `true` or `false` as per success // status. If the corresponding hash table position now contains a different // state than the one that had been read earlier, then the update fails. - bool update(Kmer_Hash_Entry_API& api); + bool update(Kmer_Hash_Entry_API& api); // Clears the hash-table. Do not invoke on an unused object. void clear(); }; -template -inline uint64_t Kmer_Hash_Table::bucket_id(const Kmer& kmer) const +template +inline uint64_t Kmer_Hash_Table::bucket_id(const Kmer& kmer) const { return mph->lookup(kmer); } -template -inline Kmer_Hash_Entry_API Kmer_Hash_Table::operator[](const uint64_t bucket_id) +template +inline Kmer_Hash_Entry_API Kmer_Hash_Table::operator[](const uint64_t bucket_id) { uint64_t lidx = bucket_id / lock_range_size; locks_[lidx].lock(); - auto r = Kmer_Hash_Entry_API(hash_table[bucket_id]); + auto r = Kmer_Hash_Entry_API(hash_table[bucket_id]); locks_[lidx].unlock(); return r; } -template -inline Kmer_Hash_Entry_API Kmer_Hash_Table::operator[](const Kmer& kmer) +template +inline Kmer_Hash_Entry_API Kmer_Hash_Table::operator[](const Kmer& kmer) { - return operator[](mph->lookup(kmer)); + return operator[](bucket_id(kmer)); } -template -inline State Kmer_Hash_Table::operator[](const Kmer& kmer) const +template +inline State Kmer_Hash_Table::operator[](const Kmer& kmer) const { // NOTE: this makes the `const` a lie. Should be a better solution here. + // TODO: Design a sparse-locks collection class, moving the locks array there. Have a pointer to `Sparse_Lock` in this class. 
auto v = mph->lookup(kmer); uint64_t lidx = v / lock_range_size; auto* tp = const_cast(this); @@ -143,8 +146,8 @@ inline State Kmer_Hash_Table::operator[](const Kmer& kmer) const } -template -inline bool Kmer_Hash_Table::update(Kmer_Hash_Entry_API& api) +template +inline bool Kmer_Hash_Table::update(Kmer_Hash_Entry_API& api) { auto it = &(api.bv_entry); uint64_t lidx = (std::distance(hash_table.begin(), it)) / lock_range_size; diff --git a/include/State.hpp b/include/State.hpp index b56a94f9..0580b3ad 100644 --- a/include/State.hpp +++ b/include/State.hpp @@ -10,16 +10,16 @@ #include -template class Kmer_Hash_Table; -class Kmer_Hash_Entry_API; +template class Kmer_Hash_Table; +template class Kmer_Hash_Entry_API; class State { - template + template friend class Kmer_Hash_Table; - friend class Kmer_Hash_Entry_API; + friend class Kmer_Hash_Entry_API; private: @@ -31,7 +31,7 @@ class State State(cuttlefish::state_code_t code); // Constructs a `State` from the state stored at the bitvector entry `bv_entry`. - State(const cuttlefish::bitvector_entry_t& bv_entry); + State(const cuttlefish::ref_bitvector_entry_t& bv_entry); // Sets the DNA base 2-bit encoding at the bits b1 and b0 of `code`. // Requirement: the two bits must be zero before the call, for consistent behavior. @@ -111,7 +111,7 @@ inline State::State(const cuttlefish::state_code_t code): } -inline State::State(const cuttlefish::bitvector_entry_t& bv_entry) +inline State::State(const cuttlefish::ref_bitvector_entry_t& bv_entry) { // CAS vector `fetch` does not work. // bv_entry.fetch_val(vertex_code); diff --git a/include/globals.hpp b/include/globals.hpp index 78a5d03f..24c542b4 100644 --- a/include/globals.hpp +++ b/include/globals.hpp @@ -18,6 +18,10 @@ #define INSTANTIATE(z, k, class_name) template class class_name<2 * k + 1>; #define ENUMERATE(count, instantiator, class_name) BOOST_PP_REPEAT(count, instantiator, class_name) + +#define INSTANTIATE_PER_BIT(z, k, class_name) template class class_name<2 * k + 1, 5>;// template class class_name<2 * k + 1, 6>; +#define ENUMERATE_PER_BIT(count, instantiator, class_name) BOOST_PP_REPEAT(count, instantiator, class_name) + // BOOST_PP_REPEAT reference: https://www.boost.org/doc/libs/1_55_0/libs/preprocessor/doc/ref/repeat.html @@ -62,9 +66,9 @@ namespace cuttlefish }; - constexpr uint8_t BITS_PER_KMER = 5; - typedef compact::ts_vector> bitvector_t; - typedef compact::iterator_imp::lhs_setter bitvector_entry_t; + constexpr uint8_t BITS_PER_REF_KMER = 5; + typedef compact::ts_vector> ref_bitvector_t; + typedef compact::iterator_imp::lhs_setter ref_bitvector_entry_t; typedef std::shared_ptr logger_t; diff --git a/src/CdBG_Builder.cpp b/src/CdBG_Builder.cpp index cbf12fe5..de94d2a8 100644 --- a/src/CdBG_Builder.cpp +++ b/src/CdBG_Builder.cpp @@ -250,7 +250,7 @@ bool CdBG::process_loop(const Directed_Kmer& kmer, const Directed_Kmer& { // Fetch the entry for `kmer_hat`. const Kmer& kmer_hat = kmer.canonical(); - Kmer_Hash_Entry_API hash_table_entry = Vertices[kmer_hat]; + Kmer_Hash_Entry_API hash_table_entry = Vertices[kmer_hat]; State& state = hash_table_entry.get_state(); state = State(Vertex(cuttlefish::State_Class::multi_in_multi_out)); @@ -273,7 +273,7 @@ bool CdBG::process_leftmost_kmer(const Directed_Kmer& kmer, const Directed const Kmer& next_kmer_hat = next_kmer.canonical(); // Fetch the entry for `kmer_hat`. 
- Kmer_Hash_Entry_API hash_table_entry = Vertices[kmer_hat]; + Kmer_Hash_Entry_API hash_table_entry = Vertices[kmer_hat]; State& state = hash_table_entry.get_state(); // The k-mer is already classified as a complex node. @@ -375,7 +375,7 @@ bool CdBG::process_rightmost_kmer(const Directed_Kmer& kmer, const char pr const cuttlefish::dir_t dir = kmer.dir(); // Fetch the entry for `kmer_hat`. - Kmer_Hash_Entry_API hash_table_entry = Vertices[kmer_hat]; + Kmer_Hash_Entry_API hash_table_entry = Vertices[kmer_hat]; State& state = hash_table_entry.get_state(); // The k-mer is already classified as a complex node. @@ -475,7 +475,7 @@ bool CdBG::process_internal_kmer(const Directed_Kmer& kmer, const Directed const Kmer& next_kmer_hat = next_kmer.canonical(); // Fetch the hash table entry for `kmer_hat`. - Kmer_Hash_Entry_API hash_table_entry = Vertices[kmer_hat]; + Kmer_Hash_Entry_API hash_table_entry = Vertices[kmer_hat]; State& state = hash_table_entry.get_state(); // The k-mer is already classified as a complex node. @@ -591,7 +591,7 @@ bool CdBG::process_isolated_kmer(const Directed_Kmer& kmer) const Kmer& kmer_hat = kmer.canonical(); // Fetch the hash table entry for `kmer_hat`. - Kmer_Hash_Entry_API hash_table_entry = Vertices[kmer_hat]; + Kmer_Hash_Entry_API hash_table_entry = Vertices[kmer_hat]; State& state = hash_table_entry.get_state(); diff --git a/src/CdBG_GFA_Writer.cpp b/src/CdBG_GFA_Writer.cpp index 0513c046..a18b5b4f 100644 --- a/src/CdBG_GFA_Writer.cpp +++ b/src/CdBG_GFA_Writer.cpp @@ -277,7 +277,7 @@ void CdBG::output_gfa_unitig(const uint16_t thread_id, const char* const seq, // k-mer, irrespective of which direction the unitig may be traversed at. const Kmer min_flanking_kmer = std::min(start_kmer.canonical(), end_kmer.canonical()); const uint64_t bucket_id = Vertices.bucket_id(min_flanking_kmer); - Kmer_Hash_Entry_API hash_table_entry = Vertices[bucket_id]; + Kmer_Hash_Entry_API hash_table_entry = Vertices[bucket_id]; State& state = hash_table_entry.get_state(); // Name the GFA segment with the hash value of the first k-mer of the canonical form unitig. diff --git a/src/CdBG_Plain_Writer.cpp b/src/CdBG_Plain_Writer.cpp index e6643c8d..1a957017 100644 --- a/src/CdBG_Plain_Writer.cpp +++ b/src/CdBG_Plain_Writer.cpp @@ -140,7 +140,7 @@ void CdBG::output_plain_unitig(const uint16_t thread_id, const char* const se // For a particular unitig, always query the same well-defined canonical flanking // k-mer, irrespective of which direction the unitig may be traversed at. const Kmer min_flanking_kmer = std::min(start_kmer.canonical(), end_kmer.canonical()); - Kmer_Hash_Entry_API hash_table_entry = Vertices[min_flanking_kmer]; + Kmer_Hash_Entry_API hash_table_entry = Vertices[min_flanking_kmer]; State& state = hash_table_entry.get_state(); if(state.is_outputted()) diff --git a/src/Kmer_Hash_Table.cpp b/src/Kmer_Hash_Table.cpp index 7c858a65..cfde8e3d 100644 --- a/src/Kmer_Hash_Table.cpp +++ b/src/Kmer_Hash_Table.cpp @@ -10,8 +10,8 @@ #include -template -void Kmer_Hash_Table::build_mph_function(const Kmer_Container& kmer_container, const uint16_t thread_count, const std::string& working_dir_path, const std::string& mph_file_path) +template +void Kmer_Hash_Table::build_mph_function(const Kmer_Container& kmer_container, const uint16_t thread_count, const std::string& working_dir_path, const std::string& mph_file_path) { // The serialized BBHash file (saved from some earlier execution) exists. 
struct stat buffer; @@ -51,8 +51,8 @@ void Kmer_Hash_Table::build_mph_function(const Kmer_Container& kmer_contai } -template -void Kmer_Hash_Table::load_mph_function(const std::string& file_path) +template +void Kmer_Hash_Table::load_mph_function(const std::string& file_path) { std::ifstream input(file_path.c_str(), std::ifstream::in); if(input.fail()) @@ -68,8 +68,8 @@ void Kmer_Hash_Table::load_mph_function(const std::string& file_path) } -template -void Kmer_Hash_Table::save_mph_function(const std::string& file_path) const +template +void Kmer_Hash_Table::save_mph_function(const std::string& file_path) const { std::ofstream output(file_path.c_str(), std::ofstream::out); if(output.fail()) @@ -84,8 +84,8 @@ void Kmer_Hash_Table::save_mph_function(const std::string& file_path) const } -template -void Kmer_Hash_Table::save_hash_buckets(const std::string& file_path) const +template +void Kmer_Hash_Table::save_hash_buckets(const std::string& file_path) const { std::ofstream output(file_path.c_str(), std::ofstream::out); if(output.fail()) @@ -100,8 +100,8 @@ void Kmer_Hash_Table::save_hash_buckets(const std::string& file_path) const } -template -void Kmer_Hash_Table::load_hash_buckets(const std::string& file_path) +template +void Kmer_Hash_Table::load_hash_buckets(const std::string& file_path) { std::ifstream input(file_path.c_str(), std::ifstream::in); if(input.fail()) @@ -117,8 +117,8 @@ void Kmer_Hash_Table::load_hash_buckets(const std::string& file_path) } -template -void Kmer_Hash_Table::construct(const std::string& kmc_db_path, const uint16_t thread_count, const std::string& working_dir_path, const std::string& mph_file_path) +template +void Kmer_Hash_Table::construct(const std::string& kmc_db_path, const uint16_t thread_count, const std::string& working_dir_path, const std::string& mph_file_path) { std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); @@ -155,8 +155,8 @@ void Kmer_Hash_Table::construct(const std::string& kmc_db_path, const uint16_ } -template -void Kmer_Hash_Table::clear() +template +void Kmer_Hash_Table::clear() { if(mph != NULL) delete mph; @@ -170,4 +170,4 @@ void Kmer_Hash_Table::clear() // Template instantiations for the required specializations. -ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Kmer_Hash_Table) +ENUMERATE_PER_BIT(INSTANCE_COUNT, INSTANTIATE_PER_BIT, Kmer_Hash_Table) From 1f9c599299c91dad80c254418b4ab97ed5171a5a Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Wed, 24 Feb 2021 11:42:45 -0500 Subject: [PATCH 002/350] Add read-space state class rudiments --- include/DNA.hpp | 16 ++++++ include/State_Read_Space.hpp | 101 +++++++++++++++++++++++++++++++++++ include/globals.hpp | 7 +++ src/State.cpp | 2 + 4 files changed, 126 insertions(+) create mode 100644 include/State_Read_Space.hpp diff --git a/include/DNA.hpp b/include/DNA.hpp index 9483e9e6..4a373d78 100644 --- a/include/DNA.hpp +++ b/include/DNA.hpp @@ -13,6 +13,7 @@ namespace DNA // Note that, this is not possible to change this mapping w/o modifications to the // interfacing of our code with the KMC api. This mapping is essential for some // crucial performance hacks in the interfacing. + // TODO: consider having it as `enum class`. enum Base: uint8_t { A = 0b00, // 0 @@ -21,6 +22,21 @@ namespace DNA T = 0b11, // 3 N = 0b100 // 4 }; + + + // E = 0, A = 1, C = 2, G = 3, T = 4, N = 7. + // Implementation of the state class for the read de Bruijn graph uses intricacies + // of this mapping (and the underlying transition function of the DFA) heavily. 
Do + // not alter the mapping without updating the state-class. + enum class Extended_Base: uint8_t + { + E = 0b000, // 0 + A = 0b001, // 1 + C = 0b010, // 2 + G = 0b011, // 3 + T = 0b100, // 4 + N = 0b111, // 7 + }; } diff --git a/include/State_Read_Space.hpp b/include/State_Read_Space.hpp new file mode 100644 index 00000000..98ffb7cf --- /dev/null +++ b/include/State_Read_Space.hpp @@ -0,0 +1,101 @@ + +#ifndef STATE_READ_SPACE_HPP +#define STATE_READ_SPACE_HPP + + + +#include "globals.hpp" + + +// Class for a state in the state-space of the automata in read de Bruijn graphs. +class State_Read_Space +{ + typedef DNA::Extended_Base edge_encoding_t; + +private: + + cuttlefish::state_code_t code; // Numeric code of the state. + + static constexpr uint8_t BITS_PER_SIDE = 3; // Number of bits required to `Extended_Base`-encode edges incident to a side. + static constexpr uint8_t FRONT_IDX = BITS_PER_SIDE; // Starting index of the three bits encoding the front-incident edge. + static constexpr uint8_t BACK_IDX = 0; // Starting index of the three bits encoding the back-incident edge. + + // Bitmask to extract the edge-encoding from some side. Has to be shifted to appropriate index before extraction. + static constexpr uint8_t SIDE_MASK = (1 << BITS_PER_SIDE) - 1; + + // Bitmask used to extract the 'Extended_Base`-encoding of the edge(s) incident to the front side of a vertex. + static constexpr cuttlefish::state_code_t FRONT_MASK = SIDE_MASK << FRONT_IDX; + + // Bitmask used to extract the 'Extended_Base`-encoding of the edge(s) incident to the back side of a vertex. + static constexpr cuttlefish::state_code_t BACK_MASK = SIDE_MASK << BACK_IDX; + + // State code for vertices that has been outputted. + // TODO: Use a well-thought-out value as the marker. + static constexpr cuttlefish::state_code_t OUTPUTTED = static_cast((0b101 << FRONT_IDX) | 0b101 << BACK_IDX); + + + // Sets the back-encoding of the state to the `Extended_Base`-encoding `edge`. + // Requirement: except while for setting `Extended_Base::N`, the bits must be zero beforehand. + void set_back_encoding(edge_encoding_t edge); + + // Sets the front-encoding of the state to the `Extended_Base`-encoding `edge`. + // Requirement: except while for setting `Extended_Base::N`, the bits must be zero beforehand. + void set_front_encoding(edge_encoding_t edge); + + +public: + + // Constructs the state of a vertex having both its sides unvisited. + constexpr State_Read_Space(); + + // Returns `true` iff some vertex having this state has been outputted. + bool is_outputted() const; + + // Returns the `Extended_Base`-encoding of the edge(s) incident to the side + // `side` of a vertex having this state. + edge_encoding_t edge_at(cuttlefish::side_t side) const; + + // Updates the `Extended_Base` encoding of the side `side` of this state, with + // `edge`. For optimization purposes, only certain edge-updates have defined + // behavior: empty-to-rest and unique-to-multi. 
+ void update_edge_at(cuttlefish::side_t side, edge_encoding_t edge); +}; + + +inline constexpr State_Read_Space::State_Read_Space(): + code{(static_cast(edge_encoding_t::E) << FRONT_IDX) | static_cast(edge_encoding_t::E)} +{} + + +inline void State_Read_Space::set_back_encoding(edge_encoding_t edge) +{ + code |= (static_cast(edge) << BACK_IDX); +} + + +inline void State_Read_Space::set_front_encoding(edge_encoding_t edge) +{ + code |= (static_cast(edge) << FRONT_IDX); +} + + +inline bool State_Read_Space::is_outputted() const +{ + return code == OUTPUTTED; +} + + +inline State_Read_Space::edge_encoding_t State_Read_Space::edge_at(const cuttlefish::side_t side) const +{ + return static_cast(side == cuttlefish::side_t::front ? (code & FRONT_MASK) >> FRONT_IDX : (code & BACK_MASK) >> BACK_IDX); +} + + +inline void State_Read_Space::update_edge_at(const cuttlefish::side_t side, const edge_encoding_t edge) +{ + side == cuttlefish::side_t::front ? set_front_encoding(edge) : set_back_encoding(edge); +} + + + +#endif diff --git a/include/globals.hpp b/include/globals.hpp index 24c542b4..0c59e982 100644 --- a/include/globals.hpp +++ b/include/globals.hpp @@ -66,6 +66,13 @@ namespace cuttlefish }; + typedef enum class Side: bool + { + front = false, + back = true + } side_t; + + constexpr uint8_t BITS_PER_REF_KMER = 5; typedef compact::ts_vector> ref_bitvector_t; typedef compact::iterator_imp::lhs_setter ref_bitvector_entry_t; diff --git a/src/State.cpp b/src/State.cpp index c87c69b2..cce81a3d 100644 --- a/src/State.cpp +++ b/src/State.cpp @@ -73,6 +73,7 @@ State::State(const Vertex& vertex) } +// TODO: Replace switch-case with `code |= static_cast(base)`. inline void State::set_nibble_lower_half(const cuttlefish::base_t base) { switch(base) @@ -99,6 +100,7 @@ inline void State::set_nibble_lower_half(const cuttlefish::base_t base) } +// TODO: Replace switch-case with `code |= (static_cast(base) << 2)`. inline void State::set_nibble_upper_half(const cuttlefish::base_t base) { switch(base) From d3c1e2f24dc07b0a6748cd47ae54a9093aa875be Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Wed, 24 Feb 2021 12:36:51 -0500 Subject: [PATCH 003/350] Have hash table entry API for read-space states --- include/Kmer_Hash_Entry_API.hpp | 57 ++++++++++++++++++++++++++++++++- include/State_Read_Space.hpp | 22 +++++++++++++ include/globals.hpp | 2 ++ 3 files changed, 80 insertions(+), 1 deletion(-) diff --git a/include/Kmer_Hash_Entry_API.hpp b/include/Kmer_Hash_Entry_API.hpp index b2200ed7..189aa1df 100644 --- a/include/Kmer_Hash_Entry_API.hpp +++ b/include/Kmer_Hash_Entry_API.hpp @@ -6,6 +6,7 @@ #include "globals.hpp" #include "State.hpp" +#include "State_Read_Space.hpp" template class Kmer_Hash_Table; @@ -35,7 +36,7 @@ class Kmer_Hash_Entry_API // Value read from the bitvector entry when the object is constructed; is immutable. const State state_read; - // Value read from the bitvector entry when the object is constrcuted; is mutable. + // Value read from the bitvector entry when the object is constructed; is mutable. State state; @@ -71,5 +72,59 @@ class Kmer_Hash_Entry_API }; +// Instantiation of the API class used in read-dBG compaction. +template <> +class Kmer_Hash_Entry_API +{ + template + friend class Kmer_Hash_Table; + + typedef compact::iterator_imp::lhs_setter bitvector_entry_t; + + +private: + + // Position information (base pointer and offset) for the bitvector entry. + bitvector_entry_t bv_entry; + + // Value read from the bitvector entry when the object is constructed; is immutable. 
+ const State_Read_Space state_read; + + // Value read from the bitvector entry when the object is constructed; is mutable. + State_Read_Space state; + + + // Constructs an API to the bitvector entry `bv_entry`. + Kmer_Hash_Entry_API(const bitvector_entry_t& bv_entry): + bv_entry(bv_entry), state_read(bv_entry) + { + state = state_read; + } + + // Returns the state value read when the object was constructed. + cuttlefish::state_code_t get_read_state() const + { + return state_read.get_state(); + } + + // Returns the value of the mutable state value wrapped inside the API, + // i.e. the state value that had been read at the object creation, and then + // possibly have been modified. + cuttlefish::state_code_t get_current_state() const + { + return state.get_state(); + } + + +public: + + // Returns a reference to the mutable copy of the wrapped state value. + State_Read_Space& get_state() + { + return state; + } +}; + + #endif diff --git a/include/State_Read_Space.hpp b/include/State_Read_Space.hpp index 98ffb7cf..4b7c4826 100644 --- a/include/State_Read_Space.hpp +++ b/include/State_Read_Space.hpp @@ -7,9 +7,14 @@ #include "globals.hpp" +template class Kmer_Hash_Entry_API; + + // Class for a state in the state-space of the automata in read de Bruijn graphs. class State_Read_Space { + friend class Kmer_Hash_Entry_API; + typedef DNA::Extended_Base edge_encoding_t; private: @@ -34,6 +39,9 @@ class State_Read_Space static constexpr cuttlefish::state_code_t OUTPUTTED = static_cast((0b101 << FRONT_IDX) | 0b101 << BACK_IDX); + // Constructs a state that wraps the provided numeric value `code`. + State_Read_Space(cuttlefish::state_code_t code); + // Sets the back-encoding of the state to the `Extended_Base`-encoding `edge`. // Requirement: except while for setting `Extended_Base::N`, the bits must be zero beforehand. void set_back_encoding(edge_encoding_t edge); @@ -42,6 +50,9 @@ class State_Read_Space // Requirement: except while for setting `Extended_Base::N`, the bits must be zero beforehand. void set_front_encoding(edge_encoding_t edge); + // Returns the wrapped state-code value. 
+ cuttlefish::state_code_t get_state() const; + public: @@ -67,6 +78,11 @@ inline constexpr State_Read_Space::State_Read_Space(): {} +inline State_Read_Space::State_Read_Space(const cuttlefish::state_code_t code): + code(code) +{} + + inline void State_Read_Space::set_back_encoding(edge_encoding_t edge) { code |= (static_cast(edge) << BACK_IDX); @@ -79,6 +95,12 @@ inline void State_Read_Space::set_front_encoding(edge_encoding_t edge) } +inline cuttlefish::state_code_t State_Read_Space::get_state() const +{ + return code; +} + + inline bool State_Read_Space::is_outputted() const { return code == OUTPUTTED; diff --git a/include/globals.hpp b/include/globals.hpp index 0c59e982..a581ca83 100644 --- a/include/globals.hpp +++ b/include/globals.hpp @@ -77,6 +77,8 @@ namespace cuttlefish typedef compact::ts_vector> ref_bitvector_t; typedef compact::iterator_imp::lhs_setter ref_bitvector_entry_t; + constexpr uint8_t BITS_PER_READ_KMER = 6; + typedef std::shared_ptr logger_t; } From dcedd7708ab7e95e073ec557607cca6307302732 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 1 Mar 2021 11:36:43 -0500 Subject: [PATCH 004/350] Have some local consts --- include/Kmer_Hash_Table.hpp | 17 +++++++++-------- src/Kmer_Hash_Table.cpp | 10 ++++------ 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index 9f8cb6b0..60942872 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -32,6 +32,7 @@ class Kmer_Hash_Table constexpr static double GAMMA_FACTOR = 2.0; // The MPH function. + // TODO: Initialize with `std::nullptr`. mphf_t* mph = NULL; // The buckets collection (raw `State` representations) for the hash table structure. @@ -116,9 +117,9 @@ inline uint64_t Kmer_Hash_Table::bucket_id(const Kmer& kmer) template inline Kmer_Hash_Entry_API Kmer_Hash_Table::operator[](const uint64_t bucket_id) { - uint64_t lidx = bucket_id / lock_range_size; + const uint64_t lidx = bucket_id / lock_range_size; locks_[lidx].lock(); - auto r = Kmer_Hash_Entry_API(hash_table[bucket_id]); + const Kmer_Hash_Entry_API r(hash_table[bucket_id]); locks_[lidx].unlock(); return r; } @@ -136,11 +137,11 @@ inline State Kmer_Hash_Table::operator[](const Kmer& kmer) c { // NOTE: this makes the `const` a lie. Should be a better solution here. // TODO: Design a sparse-locks collection class, moving the locks array there. Have a pointer to `Sparse_Lock` in this class. 
- auto v = mph->lookup(kmer); - uint64_t lidx = v / lock_range_size; + const auto v = mph->lookup(kmer); + const uint64_t lidx = v / lock_range_size; auto* tp = const_cast(this); const_castlocks_[lidx])>(tp->locks_[lidx]).lock(); - auto ve = State(hash_table[v]); + const State ve(hash_table[v]); const_castlocks_[lidx])>(tp->locks_[lidx]).unlock(); return ve; } @@ -149,10 +150,10 @@ inline State Kmer_Hash_Table::operator[](const Kmer& kmer) c template inline bool Kmer_Hash_Table::update(Kmer_Hash_Entry_API& api) { - auto it = &(api.bv_entry); - uint64_t lidx = (std::distance(hash_table.begin(), it)) / lock_range_size; + const auto it = &(api.bv_entry); + const uint64_t lidx = (std::distance(hash_table.begin(), it)) / lock_range_size; locks_[lidx].lock(); - bool success = (api.bv_entry == api.get_read_state()); + const bool success = (api.bv_entry == api.get_read_state()); if (success) { api.bv_entry = api.get_current_state(); } diff --git a/src/Kmer_Hash_Table.cpp b/src/Kmer_Hash_Table.cpp index cfde8e3d..b7e28281 100644 --- a/src/Kmer_Hash_Table.cpp +++ b/src/Kmer_Hash_Table.cpp @@ -1,7 +1,5 @@ #include "Kmer_Hash_Table.hpp" -//#include "Kmer_Iterator.hpp" -// #include "Kmer_Buffered_Iterator.hpp" #include "Kmer_SPMC_Iterator.hpp" #include @@ -30,7 +28,7 @@ void Kmer_Hash_Table::build_mph_function(const Kmer_Container::construct(const std::string& kmc_db_path, // Open a container over the k-mer database. - Kmer_Container kmer_container(kmc_db_path); + const Kmer_Container kmer_container(kmc_db_path); const uint64_t kmer_count = kmer_container.size(); std::cout << "Total number of k-mers in the set (KMC database): " << kmer_count << ".\n"; @@ -144,13 +142,13 @@ void Kmer_Hash_Table::construct(const std::string& kmc_db_path, std::cout << "Allocated hash table buckets for the k-mers. Total size: " << hash_table.bytes() / (1024 * 1024) << " MB.\n"; - uint64_t total_mem = (total_bits / 8) + hash_table.bytes(); // in bytes + const uint64_t total_mem = (total_bits / 8) + hash_table.bytes(); // in bytes std::cout << "Total memory usage by the hash table: " << total_mem / (1024 * 1024) << " MB." " Bits per k-mer: " << (total_mem * 8.0) / kmer_count << ".\n"; std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); - double elapsed_seconds = std::chrono::duration_cast>(t_end - t_start).count(); + const double elapsed_seconds = std::chrono::duration_cast>(t_end - t_start).count(); std::cout << "Done allocating the hash table. Time taken = " << elapsed_seconds << " seconds.\n"; } From 8328f17a54caf2648defcd72f0b1333ac0b0e007 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 1 Mar 2021 11:38:29 -0500 Subject: [PATCH 005/350] Instantiate hash table per bit --- include/globals.hpp | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/include/globals.hpp b/include/globals.hpp index a581ca83..43d16a97 100644 --- a/include/globals.hpp +++ b/include/globals.hpp @@ -10,21 +10,6 @@ #include "boost/preprocessor/repetition/repeat.hpp" -// The macro `INSTANCE_COUNT` must be set exactly to `(MAX_K + 1) / 2` for a required maximum k-value. -// Also, the `MAX_K` value must be odd (as the k-values used in the algorithm) for correct results. 
-#ifndef INSTANCE_COUNT - #define INSTANCE_COUNT 32 -#endif - -#define INSTANTIATE(z, k, class_name) template class class_name<2 * k + 1>; -#define ENUMERATE(count, instantiator, class_name) BOOST_PP_REPEAT(count, instantiator, class_name) - -#define INSTANTIATE_PER_BIT(z, k, class_name) template class class_name<2 * k + 1, 5>;// template class class_name<2 * k + 1, 6>; -#define ENUMERATE_PER_BIT(count, instantiator, class_name) BOOST_PP_REPEAT(count, instantiator, class_name) - -// BOOST_PP_REPEAT reference: https://www.boost.org/doc/libs/1_55_0/libs/preprocessor/doc/ref/repeat.html - - // Forward declarations of the type of the bitvector used and the type to access its entries (mutable). namespace compact { @@ -84,5 +69,21 @@ namespace cuttlefish } +// The macro `INSTANCE_COUNT` must be set exactly to `(MAX_K + 1) / 2` for a required maximum k-value. +// Also, the `MAX_K` value must be odd (as the k-values used in the algorithm) for correct results. +#ifndef INSTANCE_COUNT + #define INSTANCE_COUNT 32 +#endif + +#define INSTANTIATE(z, k, class_name) template class class_name<2 * k + 1>; +#define ENUMERATE(count, instantiator, class_name) BOOST_PP_REPEAT(count, instantiator, class_name) + +#define INSTANTIATE_PER_BIT(z, k, class_name) template class class_name<2 * k + 1, cuttlefish::BITS_PER_REF_KMER>;\ + template class class_name<2 * k + 1, cuttlefish::BITS_PER_READ_KMER>; +#define ENUMERATE_PER_BIT(count, instantiator, class_name) BOOST_PP_REPEAT(count, instantiator, class_name) + +// BOOST_PP_REPEAT reference: https://www.boost.org/doc/libs/1_55_0/libs/preprocessor/doc/ref/repeat.html + + #endif From 7eccb27f05a32838420eea8bf371f8fc093daae7 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 1 Mar 2021 13:23:13 -0500 Subject: [PATCH 006/350] Fix compilation --- include/globals.hpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/include/globals.hpp b/include/globals.hpp index 43d16a97..3dbb8539 100644 --- a/include/globals.hpp +++ b/include/globals.hpp @@ -10,6 +10,13 @@ #include "boost/preprocessor/repetition/repeat.hpp" +// The macro `INSTANCE_COUNT` must be set exactly to `(MAX_K + 1) / 2` for a required maximum k-value. +// Also, the `MAX_K` value must be odd (as the k-values used in the algorithm) for correct results. +#ifndef INSTANCE_COUNT + #define INSTANCE_COUNT 32 +#endif + + // Forward declarations of the type of the bitvector used and the type to access its entries (mutable). namespace compact { @@ -69,11 +76,7 @@ namespace cuttlefish } -// The macro `INSTANCE_COUNT` must be set exactly to `(MAX_K + 1) / 2` for a required maximum k-value. -// Also, the `MAX_K` value must be odd (as the k-values used in the algorithm) for correct results. -#ifndef INSTANCE_COUNT - #define INSTANCE_COUNT 32 -#endif +// Metaprogramming macro-loops for instantiating required template instances. 
#define INSTANTIATE(z, k, class_name) template class class_name<2 * k + 1>; #define ENUMERATE(count, instantiator, class_name) BOOST_PP_REPEAT(count, instantiator, class_name) From f0cbdf227efcd495cdc8479e8cb0d4995032e83e Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 1 Mar 2021 15:34:52 -0500 Subject: [PATCH 007/350] Resolve paranoia on locks --- include/Kmer_Hash_Table.hpp | 5 ++-- include/Spin_Lock.hpp | 52 +++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 include/Spin_Lock.hpp diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index 60942872..ba1884b3 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -11,7 +11,7 @@ #include "Kmer_Hasher.hpp" #include "compact_vector/compact_vector.hpp" #include "Kmer_Hash_Entry_API.hpp" -#include "SpinLock/SpinLock.hpp" +#include "Spin_Lock.hpp" template class CdBG; @@ -48,7 +48,8 @@ class Kmer_Hash_Table uint64_t lock_range_size; // The locks to maintain mutually exclusive access for threads to the same indices into the bitvector `hash_table`. - std::array locks_; + // std::array locks_; + std::array locks_; // Builds the minimal perfect hash function `mph` over the set of diff --git a/include/Spin_Lock.hpp b/include/Spin_Lock.hpp new file mode 100644 index 00000000..7836b543 --- /dev/null +++ b/include/Spin_Lock.hpp @@ -0,0 +1,52 @@ + +#ifndef SPIN_LOCK_HPP +#define SPIN_LOCK_HPP + + + +#include + + +// A lightweight lock-free mutex class. +// It is based on `std::atomic_flag`, which is guaranteed to be a lock-free atomic construct . +// Reference: https://en.cppreference.com/w/cpp/atomic/atomic_flag +class Spin_Lock +{ +private: + + std::atomic_flag lock_{ATOMIC_FLAG_INIT}; + + +public: + + // Acquires the lock for mutually-exlcusive access to it. + void lock(); + + // Releases the lock, giving up the exclusive access to it. + void unlock(); +}; + + +inline void Spin_Lock::lock() +{ + // Due to the memory access order `memory_order_acquire`, no reads or writes in the current thread can be + // reordered before this load of the variable `lock_` (enforced by the compiler and the processor) — + // ensuring that memory-access instructions after a `lock` invokation stays after it. + + while(lock_.test_and_set(std::memory_order_acquire)) + ;// while(lock_.test(std::memory_order_relaxed)); // C++20 optimization to avoid the redundant stores from the spinning `test_and_set`. +} + + +inline void Spin_Lock::unlock() +{ + // Due to the memory access order `memory_order_release`, no reads or writes in the current thread can be + // reordered after this store of the variable `lock_` (enforced by the compiler and the processor) — + // ensuring that memory-access instructions before an `unlock` invokation stays before it. + + lock_.clear(std::memory_order_release); +} + + + +#endif From 2b8c51acc238077abace316325f69fe7d65ef2f9 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 1 Mar 2021 19:31:04 -0500 Subject: [PATCH 008/350] Have sparse-locks collection --- include/Sparse_Lock.hpp | 64 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 include/Sparse_Lock.hpp diff --git a/include/Sparse_Lock.hpp b/include/Sparse_Lock.hpp new file mode 100644 index 00000000..63cc32c6 --- /dev/null +++ b/include/Sparse_Lock.hpp @@ -0,0 +1,64 @@ + +#ifndef SPARSE_LOCK_HPP +#define SPARSE_LOCK_HPP + + + +#include + + +// A collection of locks, of type `T_Lock`. 
+// Intended to be used when a set of sparsely distributed locks over some index range is required. +template +class Sparse_Lock +{ +private: + + // Each lock is assigned a power-of-two number of entries to guard. + + const size_t num_entries; // Number of entries to guard. + const uint8_t lg_per_lock_range; // Base-2 log of the number of entries assigned to each lock. + const size_t per_lock_range; // Number of contiguous entries (indices) that each lock is assigned to. + const size_t num_locks; // Number of locks in the collection. + std::vector lock_; // The collection of locks. + + +public: + + // Constructs a sparse-lock collection consisting of `lock_count` locks, for `range_size` number of entries. + Sparse_Lock(size_t range_size, size_t lock_count); + + // Acquires lock for the entry with index `idx`. + void lock(size_t idx); + + // Releases lock for the entry with index `idx`. + void unlock(size_t idx); +}; + + +template +inline Sparse_Lock::Sparse_Lock(const size_t range_size, const size_t lock_count): + num_entries(range_size), + lg_per_lock_range(static_cast(std::floor(std::log2((num_entries + lock_count - 1) / lock_count)))), + per_lock_range(static_cast(1) << lg_per_lock_range), + num_locks((num_entries + per_lock_range - 1) / per_lock_range), + lock_(num_locks) +{} + + +template +inline void Sparse_Lock::lock(const size_t idx) +{ + lock_[idx >> lg_per_lock_range].lock(); +} + + +template +inline void Sparse_Lock::unlock(const size_t idx) +{ + lock_[idx >> lg_per_lock_range].unlock(); +} + + + +#endif From b1887d1a671509b477f080f72a08d0c97a79aa80 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Tue, 2 Mar 2021 09:55:59 -0500 Subject: [PATCH 009/350] Use the sparse-lock class Addresses Rob's note (on a `const` method): "this makes the `const` a lie. Should be a better solution here." --- include/Kmer_Hash_Table.hpp | 54 +++++++++++++++++++++---------------- src/Kmer_Hash_Table.cpp | 16 ++++++++++- 2 files changed, 46 insertions(+), 24 deletions(-) diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index ba1884b3..dd146f98 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -12,6 +12,7 @@ #include "compact_vector/compact_vector.hpp" #include "Kmer_Hash_Entry_API.hpp" #include "Spin_Lock.hpp" +#include "Sparse_Lock.hpp" template class CdBG; @@ -43,13 +44,9 @@ class Kmer_Hash_Table // TODO: increase locks and check note at the end about the false `const` issue. constexpr static uint64_t lock_count{65536}; - // Number of contiguous entries of the bitvector that each lock is assigned to. - // TODO: try making it `const`. - uint64_t lock_range_size; // The locks to maintain mutually exclusive access for threads to the same indices into the bitvector `hash_table`. - // std::array locks_; - std::array locks_; + Sparse_Lock* sparse_lock_ptr{nullptr}; // Builds the minimal perfect hash function `mph` over the set of @@ -105,6 +102,9 @@ class Kmer_Hash_Table // Clears the hash-table. Do not invoke on an unused object. void clear(); + + // Destructs the hash table. 
+ ~Kmer_Hash_Table(); }; @@ -118,10 +118,10 @@ inline uint64_t Kmer_Hash_Table::bucket_id(const Kmer& kmer) template inline Kmer_Hash_Entry_API Kmer_Hash_Table::operator[](const uint64_t bucket_id) { - const uint64_t lidx = bucket_id / lock_range_size; - locks_[lidx].lock(); + sparse_lock_ptr->lock(bucket_id); const Kmer_Hash_Entry_API r(hash_table[bucket_id]); - locks_[lidx].unlock(); + sparse_lock_ptr->unlock(bucket_id); + return r; } @@ -136,29 +136,37 @@ inline Kmer_Hash_Entry_API Kmer_Hash_Table::opera template inline State Kmer_Hash_Table::operator[](const Kmer& kmer) const { - // NOTE: this makes the `const` a lie. Should be a better solution here. - // TODO: Design a sparse-locks collection class, moving the locks array there. Have a pointer to `Sparse_Lock` in this class. - const auto v = mph->lookup(kmer); - const uint64_t lidx = v / lock_range_size; - auto* tp = const_cast(this); - const_castlocks_[lidx])>(tp->locks_[lidx]).lock(); - const State ve(hash_table[v]); - const_castlocks_[lidx])>(tp->locks_[lidx]).unlock(); - return ve; + const uint64_t bucket = bucket_id(kmer); + + sparse_lock_ptr->lock(bucket); + const State state(hash_table[bucket]); + sparse_lock_ptr->unlock(bucket); + + return state; } template inline bool Kmer_Hash_Table::update(Kmer_Hash_Entry_API& api) { - const auto it = &(api.bv_entry); - const uint64_t lidx = (std::distance(hash_table.begin(), it)) / lock_range_size; - locks_[lidx].lock(); + // const auto it = &(api.bv_entry); + // const uint64_t lidx = (std::distance(hash_table.begin(), it)) / lock_range_size; + // locks_[lidx].lock(); + // const bool success = (api.bv_entry == api.get_read_state()); + // if (success) { + // api.bv_entry = api.get_current_state(); + // } + // locks_[lidx].unlock(); + // return success; + + const uint64_t bucket = std::distance(hash_table.begin(), &(api.bv_entry)); + + sparse_lock_ptr->lock(bucket); const bool success = (api.bv_entry == api.get_read_state()); - if (success) { + if(success) api.bv_entry = api.get_current_state(); - } - locks_[lidx].unlock(); + sparse_lock_ptr->unlock(bucket); + return success; } diff --git a/src/Kmer_Hash_Table.cpp b/src/Kmer_Hash_Table.cpp index b7e28281..e58b6196 100644 --- a/src/Kmer_Hash_Table.cpp +++ b/src/Kmer_Hash_Table.cpp @@ -126,7 +126,7 @@ void Kmer_Hash_Table::construct(const std::string& kmc_db_path, const uint64_t kmer_count = kmer_container.size(); std::cout << "Total number of k-mers in the set (KMC database): " << kmer_count << ".\n"; - lock_range_size = uint64_t(std::ceil(double(kmer_count) / lock_count)); + sparse_lock_ptr = new Sparse_Lock(kmer_count, lock_count); // Build the minimal perfect hash function. @@ -160,12 +160,26 @@ void Kmer_Hash_Table::clear() delete mph; mph = NULL; + + + if(sparse_lock_ptr != nullptr) + delete sparse_lock_ptr; + + sparse_lock_ptr = nullptr; + // hash_table.clear(); hash_table.resize(0); } +template +Kmer_Hash_Table::~Kmer_Hash_Table() +{ + clear(); +} + + // Template instantiations for the required specializations. 
ENUMERATE_PER_BIT(INSTANCE_COUNT, INSTANTIATE_PER_BIT, Kmer_Hash_Table) From 75ecadd078467fe19273ae17f7efc14d6281efdc Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Tue, 2 Mar 2021 10:58:06 -0500 Subject: [PATCH 010/350] Move out non-class-specific method --- include/CdBG.hpp | 4 ---- include/utility.hpp | 3 +++ src/CdBG.cpp | 15 +-------------- src/utility.cpp | 14 ++++++++++++++ 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/include/CdBG.hpp b/include/CdBG.hpp index 18e0e138..9764635f 100644 --- a/include/CdBG.hpp +++ b/include/CdBG.hpp @@ -93,10 +93,6 @@ class CdBG double logger_flush_time = 0; - // Removes the k-mer set (KMC database) with the path prefix `kmc_file_pref`. - void remove_kmer_set(const std::string& kmc_file_pref) const; - - /* Build methods */ // TODO: rename the "classify" methods with appropriate terminology that are consistent with the theory. diff --git a/include/utility.hpp b/include/utility.hpp index e8c91a95..686e28e3 100644 --- a/include/utility.hpp +++ b/include/utility.hpp @@ -19,6 +19,9 @@ bool file_prefix_exists(const std::string& path, const std::string& prefix); // Returns a string that is a copy of `s` but has all the whitespaces removed. std::string remove_whitespaces(const char* s); +// Removes the k-mer set (KMC database) with the path prefix `kmc_file_pref`. +void remove_kmer_set(const std::string& kmc_file_pref); + #endif diff --git a/src/CdBG.cpp b/src/CdBG.cpp index c575afac..5d4f9f37 100644 --- a/src/CdBG.cpp +++ b/src/CdBG.cpp @@ -1,5 +1,6 @@ #include "CdBG.hpp" +#include "utility.hpp" template @@ -35,20 +36,6 @@ void CdBG::construct() } -template -void CdBG::remove_kmer_set(const std::string& kmc_file_pref) const -{ - const std::string kmc_file1_path(kmc_file_pref + ".kmc_pre"); - const std::string kmc_file2_path(kmc_file_pref + ".kmc_suf"); - - if(std::remove(kmc_file1_path.c_str()) || std::remove(kmc_file2_path.c_str())) - { - std::cerr << "Error removing the KMC database file from path prefix " << kmc_file_pref << ". Aborting.\n"; - std::exit(EXIT_FAILURE); - } -} - - template size_t CdBG::search_valid_kmer(const char* const seq, const size_t left_end, const size_t right_end) const { diff --git a/src/utility.cpp b/src/utility.cpp index 40d3662e..3a0b1aac 100644 --- a/src/utility.cpp +++ b/src/utility.cpp @@ -6,6 +6,7 @@ #include #include #include +#include std::string get_random_string(const size_t len) @@ -64,3 +65,16 @@ std::string remove_whitespaces(const char* s) return str; } + + +void remove_kmer_set(const std::string& kmc_file_pref) +{ + const std::string kmc_file1_path(kmc_file_pref + ".kmc_pre"); + const std::string kmc_file2_path(kmc_file_pref + ".kmc_suf"); + + if(std::remove(kmc_file1_path.c_str()) || std::remove(kmc_file2_path.c_str())) + { + std::cerr << "Error removing the KMC database file from path prefix " << kmc_file_pref << ". 
Aborting.\n"; + std::exit(EXIT_FAILURE); + } +} From 9052c3e8c9c348f64b3edd91b001094430164216 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Tue, 2 Mar 2021 11:36:26 -0500 Subject: [PATCH 011/350] Expose k-mer iterator consructor --- include/Kmer_SPMC_Iterator.hpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/include/Kmer_SPMC_Iterator.hpp b/include/Kmer_SPMC_Iterator.hpp index 60626850..3a3c61c0 100644 --- a/include/Kmer_SPMC_Iterator.hpp +++ b/include/Kmer_SPMC_Iterator.hpp @@ -32,11 +32,6 @@ struct Consumer_Data template class Kmer_SPMC_Iterator { - friend class Kmer_Container; - - -public: - typedef Kmer_SPMC_Iterator iterator; @@ -68,11 +63,6 @@ class Kmer_SPMC_Iterator volatile Task_Status* task_status{nullptr}; // Collection of the task statuses of the consumers. - // Constructs an iterator for the provided container `kmer_container`, on either - // its beginning or its ending position, based on `at_begin` and `at_end`. The - // iterator is to support `consumer_count` number of different consumers. - Kmer_SPMC_Iterator(const Kmer_Container* kmer_container, size_t consumer_count, bool at_begin = true, bool at_end = false); - // Opens the k-mer database file with the path prefix `db_path`. void open_kmer_database(const std::string& db_path); @@ -90,6 +80,11 @@ class Kmer_SPMC_Iterator public: + // Constructs an iterator for the provided container `kmer_container`, on either + // its beginning or its ending position, based on `at_begin` and `at_end`. The + // iterator is to support `consumer_count` number of different consumers. + Kmer_SPMC_Iterator(const Kmer_Container* kmer_container, size_t consumer_count, bool at_begin = true, bool at_end = false); + // Copy constructs an iterator from another one `other`. // Note: this should be prohibited, like the `operator=`. But the BBHash code // requires this to be implemented. From a3360f9cd515f5eb53781acf9bf54766823b9127 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Tue, 2 Mar 2021 19:35:40 -0500 Subject: [PATCH 012/350] Add CdBG-constructor skeleton --- include/Read_CdBG_Constructor.hpp | 45 ++++++++++++++++++ include/Task_Params.hpp | 14 ++++++ include/Thread_Pool.hpp | 11 +++-- src/CMakeLists.txt | 1 + src/Read_CdBG_Constructor.cpp | 47 ++++++++++++++++++ src/Thread_Pool.cpp | 79 +++++++++++++++++++++++-------- 6 files changed, 174 insertions(+), 23 deletions(-) create mode 100644 include/Read_CdBG_Constructor.hpp create mode 100644 src/Read_CdBG_Constructor.cpp diff --git a/include/Read_CdBG_Constructor.hpp b/include/Read_CdBG_Constructor.hpp new file mode 100644 index 00000000..e1c26b4a --- /dev/null +++ b/include/Read_CdBG_Constructor.hpp @@ -0,0 +1,45 @@ + +#ifndef READ_CDBG_CONSTRUCTOR_HPP +#define READ_CDBG_CONSTRUCTOR_HPP + + + +#include "globals.hpp" +#include "Kmer_Hash_Table.hpp" +#include "Build_Params.hpp" +#include "Thread_Pool.hpp" + + +// A class to construct read-compacted de Bruijn graphs. +template +class Read_CdBG_Constructor +{ + friend class Thread_Pool; + +private: + + const Build_Params params; // Required parameters (wrapped inside). + Kmer_Hash_Table& hash_table; // Hash table for the vertices (canonical k-mers) of the graph. + + + // Distributes the DFA-states computation task to the worker threads in the thread pool `thread_pool`. + void distribute_states_computation(Thread_Pool& thread_pool); + + // Processes the edges provided to the thread with id `thread_id`, i.e. makes state-transitions for + // the DFA as per the edges provided to that thread. 
+ void process_edges(uint16_t thread_id); + + +public: + + // Consructs a read-CdBG builder object, with the required parameters wrapped in `params`, and uses + // the Cuttlefish hash table `hash_table`. + Read_CdBG_Constructor(const Build_Params& params, Kmer_Hash_Table& hash_table); + + // Computes the states of the DFA in the de Bruijn graph. + void compute_DFA_states(); +}; + + + +#endif diff --git a/include/Task_Params.hpp b/include/Task_Params.hpp index 42b98283..f9f8561c 100644 --- a/include/Task_Params.hpp +++ b/include/Task_Params.hpp @@ -45,5 +45,19 @@ struct Output_Task_Params }; +// Wrapper over the parameters for the DFA-states computation task for read-dBGs. +struct Compute_States_Read_Space_Params +{ + uint16_t thread_id; + + + Compute_States_Read_Space_Params() {} + + Compute_States_Read_Space_Params(const uint16_t thread_id): + thread_id(thread_id) + {} +}; + + #endif diff --git a/include/Thread_Pool.hpp b/include/Thread_Pool.hpp index 1589b9fc..6906cec5 100644 --- a/include/Thread_Pool.hpp +++ b/include/Thread_Pool.hpp @@ -30,6 +30,7 @@ class Thread_Pool output_plain, output_gfa, output_gfa_reduced, + compute_states_read_space, }; @@ -48,7 +49,7 @@ class Thread_Pool const uint16_t thread_count; // The de Bruijn graph object that this thread pool is operating with. - CdBG* const cdbg; + void* const dBG; // The type of task that this thread pool will execute. const Task_Type task_type; @@ -62,6 +63,7 @@ class Thread_Pool // Collection of the task parameters for each thread. std::vector classify_params; std::vector output_params; + std::vector compute_states_read_space_params; // Marks the thread number `thread_id` as busy with some task. @@ -80,8 +82,8 @@ class Thread_Pool // Constructs a thread pool with `thread_count` number of threads to operate - // on the de Brujin graph `cdbg` for tasks of type `task_type`. - Thread_Pool(uint16_t thread_count, CdBG* cdbg, Task_Type task_type); + // on the de Brujin graph `dBG` for tasks of type `task_type`. + Thread_Pool(uint16_t thread_count, void* dBG, Task_Type task_type); // Returns the id (number) of an idle thread from the pool. uint16_t get_idle_thread() const; @@ -95,6 +97,9 @@ class Thread_Pool // Assigns an outputting task to the thread number `thread_id` with the provided parameters. void assign_output_task(uint16_t thread_id, const char* seq, size_t seq_len, size_t left_end, size_t right_end); + // Assigns a DFA-states computation task to the thread number `thread_id`. + void assign_compute_states_read_space_task(uint16_t thread_id); + // Waits until all the threads in the pool have completed their active tasks. void wait_completion() const; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f68ee3e3..61d81310 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -20,6 +20,7 @@ set(PROJECT_SRC CdBG_Plain_Writer.cpp CdBG_GFA_Writer.cpp CdBG_GFA_Reduced_Writer.cpp + Read_CdBG_Constructor.cpp Validator.cpp Validator_Hash_Table.cpp Sequence_Validator.cpp diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp new file mode 100644 index 00000000..d68fd039 --- /dev/null +++ b/src/Read_CdBG_Constructor.cpp @@ -0,0 +1,47 @@ + +#include "Read_CdBG_Constructor.hpp" + + +template +Read_CdBG_Constructor::Read_CdBG_Constructor(const Build_Params& params, Kmer_Hash_Table& hash_table): + params(params), + hash_table(hash_table) +{} + + +template +void Read_CdBG_Constructor::compute_DFA_states() +{ + // Construct a thread pool. 
+ const uint16_t thread_count = params.thread_count(); + Thread_Pool thread_pool(thread_count, this, Thread_Pool::Task_Type::compute_states_read_space); + + // Multi-threaded computation. + distribute_states_computation(thread_pool); + thread_pool.close(); +} + + +template +void Read_CdBG_Constructor::distribute_states_computation(Thread_Pool& thread_pool) +{ + const uint16_t thread_count = params.thread_count(); + + for(uint16_t t_id = 0; t_id < thread_count; ++t_id) + { + const uint16_t idle_thread_id = thread_pool.get_idle_thread(); + thread_pool.assign_compute_states_read_space_task(idle_thread_id); + } +} + + +template +void Read_CdBG_Constructor::process_edges(const uint16_t thread_id) +{ + (void)thread_id; +} + + + +// Template instantiations for the required specializations. +ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Read_CdBG_Constructor) diff --git a/src/Thread_Pool.cpp b/src/Thread_Pool.cpp index 59107d8f..75adb771 100644 --- a/src/Thread_Pool.cpp +++ b/src/Thread_Pool.cpp @@ -1,14 +1,15 @@ #include "Thread_Pool.hpp" #include "CdBG.hpp" +#include "Read_CdBG_Constructor.hpp" #include template -Thread_Pool::Thread_Pool(const uint16_t thread_count, CdBG* const cdbg, const Task_Type task_type): +Thread_Pool::Thread_Pool(const uint16_t thread_count, void* const dBG, const Task_Type task_type): thread_count(thread_count), - cdbg(cdbg), + dBG(dBG), task_type(task_type), task_status(new volatile Task_Status[thread_count]) { @@ -18,10 +19,26 @@ Thread_Pool::Thread_Pool(const uint16_t thread_count, CdBG* const cdbg, co // Resize the parameters collections. - if(task_type == Task_Type::classification) + switch(task_type) + { + case Task_Type::classification: classify_params.resize(thread_count); - else + break; + + case Task_Type::output_plain: + case Task_Type::output_gfa: + case Task_Type::output_gfa_reduced: output_params.resize(thread_count); + break; + + case Task_Type::compute_states_read_space: + compute_states_read_space_params.resize(thread_count); + break; + + default: + std::cerr << "Unrecognized task type encountered in thread pool. Aborting.\n"; + std::exit(EXIT_FAILURE); + } // Launch the threads. @@ -46,18 +63,36 @@ void Thread_Pool::task(const uint16_t thread_id) // Some task is available for the thread number `thread_id`. 
if(task_status[thread_id] == Task_Status::available) { - if(task_type == Task_Type::classification) - { - const Classification_Task_Params& params = classify_params[thread_id]; - cdbg->process_substring(params.seq, params.seq_len, params.left_end, params.right_end); - } - else + switch(task_type) { - const Output_Task_Params& params = output_params[thread_id]; - if(task_type == Task_Type::output_plain) - cdbg->output_plain_off_substring(params.thread_id, params.seq, params.seq_len, params.left_end, params.right_end); - else // `task_type == Task_Type::output_gfa` - cdbg->output_gfa_off_substring(params.thread_id, params.seq, params.seq_len, params.left_end, params.right_end); + case Task_Type::classification: + { + const Classification_Task_Params& params = classify_params[thread_id]; + static_cast*>(dBG)->process_substring(params.seq, params.seq_len, params.left_end, params.right_end); + } + break; + + case Task_Type::output_plain: + { + const Output_Task_Params& params = output_params[thread_id]; + static_cast*>(dBG)->output_plain_off_substring(params.thread_id, params.seq, params.seq_len, params.left_end, params.right_end); + } + break; + + case Task_Type::output_gfa: + case Task_Type::output_gfa_reduced: + { + const Output_Task_Params& params = output_params[thread_id]; + static_cast*>(dBG)->output_gfa_off_substring(params.thread_id, params.seq, params.seq_len, params.left_end, params.right_end); + } + break; + + case Task_Type::compute_states_read_space: + { + const Compute_States_Read_Space_Params& params = compute_states_read_space_params[thread_id]; + static_cast*>(dBG)->process_edges(params.thread_id); + } + break; } @@ -109,6 +144,15 @@ void Thread_Pool::assign_output_task(const uint16_t thread_id, const char* co } +template +void Thread_Pool::assign_compute_states_read_space_task(const uint16_t thread_id) +{ + compute_states_read_space_params[thread_id] = Compute_States_Read_Space_Params(thread_id); + + assign_task(thread_id); +} + + template void Thread_Pool::assign_task(const uint16_t thread_id) { @@ -168,11 +212,6 @@ void Thread_Pool::close() delete[] task_status; - - if(task_type == Task_Type::classification) - classify_params.clear(); - else - output_params.clear(); } From 1872f65935b573347c9f92ff032307c3c34c2691 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 4 Mar 2021 11:36:08 -0500 Subject: [PATCH 013/350] Remove redundant macro --- include/globals.hpp | 2 +- src/Kmer_Hash_Table.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/globals.hpp b/include/globals.hpp index 3dbb8539..3e5d8f6d 100644 --- a/include/globals.hpp +++ b/include/globals.hpp @@ -83,7 +83,7 @@ namespace cuttlefish #define INSTANTIATE_PER_BIT(z, k, class_name) template class class_name<2 * k + 1, cuttlefish::BITS_PER_REF_KMER>;\ template class class_name<2 * k + 1, cuttlefish::BITS_PER_READ_KMER>; -#define ENUMERATE_PER_BIT(count, instantiator, class_name) BOOST_PP_REPEAT(count, instantiator, class_name) + // BOOST_PP_REPEAT reference: https://www.boost.org/doc/libs/1_55_0/libs/preprocessor/doc/ref/repeat.html diff --git a/src/Kmer_Hash_Table.cpp b/src/Kmer_Hash_Table.cpp index e58b6196..b6313ecb 100644 --- a/src/Kmer_Hash_Table.cpp +++ b/src/Kmer_Hash_Table.cpp @@ -182,4 +182,4 @@ Kmer_Hash_Table::~Kmer_Hash_Table() // Template instantiations for the required specializations. 
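 // (For reference: `ENUMERATE(count, instantiator, class_name)` is a BOOST_PP_REPEAT
 // loop, expanding to `instantiator(z, 0, class_name) instantiator(z, 1, class_name)
 // ... instantiator(z, count - 1, class_name)`. With `INSTANTIATE_PER_BIT` as the
 // instantiator, the x = 0 step alone yields:
 //
 //   template class Kmer_Hash_Table<1, cuttlefish::BITS_PER_REF_KMER>;
 //   template class Kmer_Hash_Table<1, cuttlefish::BITS_PER_READ_KMER>;
 //
 // and so on, for every odd k-value up to 2 * (count - 1) + 1.)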
-ENUMERATE_PER_BIT(INSTANCE_COUNT, INSTANTIATE_PER_BIT, Kmer_Hash_Table) +ENUMERATE(INSTANCE_COUNT, INSTANTIATE_PER_BIT, Kmer_Hash_Table) From 418860e5bb7e3e2d4f78a91b2bbf54f3a3ee1840 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 4 Mar 2021 12:09:15 -0500 Subject: [PATCH 014/350] Describe the meta-programming macros explicitly High time this everytime-confusing arcane syntax gets some documentation. --- include/globals.hpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/include/globals.hpp b/include/globals.hpp index 3e5d8f6d..b280a12b 100644 --- a/include/globals.hpp +++ b/include/globals.hpp @@ -78,9 +78,17 @@ namespace cuttlefish // Metaprogramming macro-loops for instantiating required template instances. -#define INSTANTIATE(z, k, class_name) template class class_name<2 * k + 1>; +// Given some `x`, explicitly instantiates the class `class_name` for the template parameter `k` with `2x + 1`; +// i.e. it is an instantiator for odd k-values. +#define INSTANTIATE(z, x, class_name) template class class_name<2 * x + 1>; + +// Enumerates all the explicit instantiations of the template class `class_name` using `instantiator`, for all +// `x` in `[0, count)`. The `x`-value is used as appropriate by `instantiator`. #define ENUMERATE(count, instantiator, class_name) BOOST_PP_REPEAT(count, instantiator, class_name) +// Given some `x`, explicitly instantiates two instances of the class `class_name`, with the template parameters +// `k` = `2x + 1`, and `BITS_PER_KEY` with `BITS_PER_REF_KMER` and `BITS_PER_READ_KMER` for alternate instances; +// i.e. it is an instantiator for odd k-values and all the different bits-per-key requirements. #define INSTANTIATE_PER_BIT(z, k, class_name) template class class_name<2 * k + 1, cuttlefish::BITS_PER_REF_KMER>;\ template class class_name<2 * k + 1, cuttlefish::BITS_PER_READ_KMER>; From ce9dd54fdef5042a621eabeb7e4cb79fcac8ac90 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 4 Mar 2021 16:28:16 -0500 Subject: [PATCH 015/350] Add tentative user-params for read dBG --- include/Build_Params.hpp | 70 +++++++++++++++++++++++++++++++------ include/Reference_Input.hpp | 7 ++++ src/CdBG.cpp | 4 +-- src/CdBG_Builder.cpp | 2 +- src/main.cpp | 13 +++++-- 5 files changed, 79 insertions(+), 17 deletions(-) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index ab8f0406..9a4e9b06 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -16,9 +16,11 @@ class Build_Params { private: + const bool is_read_graph_; // Whether to build a read- or a reference-compacted de Bruijn graph. const Reference_Input reference_input_; // Collection of the input references. const uint16_t k_; // The k parameter for the edge-centric de Bruijn graph to be compacted. - const std::string kmc_db_path_; // Path to the KMC database containing the k-mer set. + const std::string vertex_db_path_; // Path to the KMC database containing the vertices (canonical k-mers). + const std::string edge_db_path_; // Path to the KMC database containing the edges (canonical (k + 1)-mers). const uint16_t thread_count_; // Number of threads to work with. const std::string& output_file_path_; // Path to the output file. const cuttlefish::Output_Format output_format_; // Output format (0: txt, 1: GFAv1, 2: GFAv2). @@ -31,11 +33,13 @@ class Build_Params public: // Constructs a parameters wrapper object with the self-explanatory parameters. 
- Build_Params( const std::vector& ref_paths, + Build_Params( const bool is_read_graph, + const std::vector& ref_paths, const std::vector& list_paths, const std::vector& dir_paths, const uint16_t k, - const std::string& kmc_db_path, + const std::string& vertex_db_path, + const std::string& edge_db_path, const uint16_t thread_count, const std::string& output_file_path, const uint8_t output_format, @@ -43,9 +47,11 @@ class Build_Params const bool remove_kmc_db, const std::string& mph_file_path, const std::string& buckets_file_path): + is_read_graph_(is_read_graph), reference_input_(ref_paths, list_paths, dir_paths), k_(k), - kmc_db_path_(kmc_db_path), + vertex_db_path_(vertex_db_path), + edge_db_path_(edge_db_path), thread_count_(thread_count), output_file_path_(output_file_path), output_format_(cuttlefish::Output_Format(output_format)), @@ -56,6 +62,13 @@ class Build_Params {} + // Returns the boolean flag to whether to build a read- or a reference-compacted de Bruijn graph. + bool is_read_graph() const + { + return is_read_graph_; + } + + // Returns the reference input collections. const Reference_Input& reference_input() const { @@ -70,10 +83,17 @@ class Build_Params } - // Returns the path to the KMC database. - const std::string& kmc_db_path() const + // Returns the path to the vertex database. + const std::string& vertex_db_path() const + { + return vertex_db_path_; + } + + + // Returns the path to the edge database. + const std::string& edge_db_path() const { - return kmc_db_path_; + return edge_db_path_; } @@ -133,12 +153,40 @@ class Build_Params inline bool Build_Params::is_valid() const { + bool valid = true; + + + // Check if read and reference de Bruijn graphs parameters are being mixed with. + if(is_read_graph_) + { + if(!reference_input_.empty()) + { + std::cout << "No reference is to be provided for a compacted read de Bruijn graph construction.\n"; + valid = false; + } + + if(edge_db_path_.empty()) + { + std::cout << "The path prefix to the KMC-database for edges (i.e. (k + 1)-mers) is required.\n"; + valid = false; + } + } + else + { + if(!edge_db_path_.empty()) + { + std::cout << "No edge (i.e. (k + 1)-mer) database is required for a compacted reference de Bruijn graph construction.\n"; + valid = false; + } + } + + // Even `k` values are not consistent with the theory. // Also, `k` needs to be in the range `[1, MAX_K]`. if((k_ & 1) == 0 || (k_ > cuttlefish::MAX_K)) { std::cout << "The k-mer length (k) needs to be odd and within " << cuttlefish::MAX_K << ".\n"; - return false; + valid = false; } @@ -147,7 +195,7 @@ inline bool Build_Params::is_valid() const if(num_threads > 0 && thread_count_ > num_threads) { std::cout << "At most " << num_threads << " concurrent threads are supported at the machine.\n"; - return false; + valid = false; } @@ -155,11 +203,11 @@ inline bool Build_Params::is_valid() const if(output_format_ >= cuttlefish::num_op_formats) { std::cout << "Invalid output file format.\n"; - return false; + valid = false; } - return true; + return valid; } diff --git a/include/Reference_Input.hpp b/include/Reference_Input.hpp index 28c94d3c..970b8eb4 100644 --- a/include/Reference_Input.hpp +++ b/include/Reference_Input.hpp @@ -48,6 +48,13 @@ class Reference_Input { return dir_paths_; } + + + // Returns whether the reference collection is empty or not. 
+ bool empty() const + { + return ref_paths_.empty() && list_paths_.empty() && dir_paths_.empty(); + } }; diff --git a/src/CdBG.cpp b/src/CdBG.cpp index 5d4f9f37..ab27b837 100644 --- a/src/CdBG.cpp +++ b/src/CdBG.cpp @@ -15,11 +15,11 @@ template void CdBG::construct() { std::cout << "\nConstructing the minimal perfect hash function (MPHF).\n"; - Vertices.construct(params.kmc_db_path(), params.thread_count(), params.working_dir_path(), params.mph_file_path()); + Vertices.construct(params.vertex_db_path(), params.thread_count(), params.working_dir_path(), params.mph_file_path()); if(params.remove_kmc_db()) { - remove_kmer_set(params.kmc_db_path()); + remove_kmer_set(params.vertex_db_path()); std::cout << "\nRemoved the KMC database from disk.\n"; } diff --git a/src/CdBG_Builder.cpp b/src/CdBG_Builder.cpp index de94d2a8..8b18f270 100644 --- a/src/CdBG_Builder.cpp +++ b/src/CdBG_Builder.cpp @@ -608,7 +608,7 @@ bool CdBG::process_isolated_kmer(const Directed_Kmer& kmer) template void CdBG::print_state_class_dist() const { - const std::string& kmc_db_path = params.kmc_db_path(); + const std::string& kmc_db_path = params.vertex_db_path(); Kmer_Container kmers(kmc_db_path); auto it_beg = kmers.begin(); diff --git a/src/main.cpp b/src/main.cpp index c53f799a..7c658827 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -19,13 +19,15 @@ // Driver function for the CdBG build. void build(int argc, char** argv) { - cxxopts::Options options("cuttlefish build", "Efficiently construct the compacted de Bruijn graph from references"); + cxxopts::Options options("cuttlefish build", "Efficiently construct the compacted de Bruijn graph from references or reads"); options.add_options() + ("read", "construct a compacted read de Bruijn graph") ("r,refs", "reference files", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) ("l,lists", "reference file lists", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) ("d,dirs", "reference file directories", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) ("k,kmer_len", "k-mer length", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::K))) - ("s,kmc_db", "set of k-mers (KMC database) prefix", cxxopts::value()) + ("s,kmc_db", "set of vertices, i.e. k-mers (KMC database) prefix", cxxopts::value()) + ("e,edge_db", "set of edges, i.e. 
(k + 1)-mers (KMC database) prefix", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("t,threads", "number of threads to use", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::THREAD_COUNT))) ("o,output", "output file", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("f,format", "output format (0: txt, 1: GFA 1.0, 2: GFA 2.0, 3: GFA-reduced)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::OP_FORMAT))) @@ -44,11 +46,13 @@ void build(int argc, char** argv) return; } + const bool is_read_graph = result["read"].as(); const auto refs = result["refs"].as>(); const auto lists = result["lists"].as>(); const auto dirs = result["dirs"].as>(); const auto k = result["kmer_len"].as(); const auto kmer_database = result["kmc_db"].as(); + const auto edge_database = result["edge_db"].as(); const auto thread_count = result["threads"].as(); const auto output_file = result["output"].as(); const auto format = result["format"].as(); @@ -57,7 +61,7 @@ void build(int argc, char** argv) const auto mph_file = result["mph"].as(); const auto buckets_file = result["buckets"].as(); - const Build_Params params(refs, lists, dirs, k, kmer_database, thread_count, output_file, format, working_dir, remove_kmc_db, mph_file, buckets_file); + const Build_Params params(is_read_graph, refs, lists, dirs, k, kmer_database, edge_database, thread_count, output_file, format, working_dir, remove_kmc_db, mph_file, buckets_file); if(!params.is_valid()) { std::cerr << "Invalid input configuration. Aborting.\n"; @@ -143,7 +147,10 @@ void validate(int argc, char** argv) int main(int argc, char** argv) { if(argc < 2) + { std::cout << "Usage:\ncuttlefish [OPTIONS]" << std::endl; + std::cout << "Supported commands: `build` and `validate`." << std::endl; + } else { const std::string command = argv[1]; From dfb09bdb1a47f647c8dd1638d6db0b12d5297419 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 4 Mar 2021 22:15:04 -0500 Subject: [PATCH 016/350] Templatize template application Batshit to the amateur.. --- include/Application.hpp | 64 ++++++++++++++++++++--------------------- src/main.cpp | 4 +-- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/include/Application.hpp b/include/Application.hpp index 6133247d..c21b82f4 100644 --- a/include/Application.hpp +++ b/include/Application.hpp @@ -4,24 +4,24 @@ -#include "CdBG.hpp" #include "Validator.hpp" #include "Build_Params.hpp" #include "Validation_Params.hpp" // The top-level application class for the compaction algorithm. -template +template typename T_App> class Application { private: // Pointer to an application instance of the next `Application` class in the top-down hierarchy (on `k`). - Application* const app_next_level; + Application* const app_next_level; - // Pointer to a `CdBG` object that operates with the k-value `k`. - CdBG* const cdbg; + // Pointer to a driver object that operates with the k-value `k`. + T_App* const app; + // TODO: Make the validator member generic, like `T_App`. // Pointer to a `Validator` object that operates with the k-value `k`. Validator* const validator; @@ -46,12 +46,12 @@ class Application }; -template <> -class Application<1> +template typename T_App> +class Application<1, T_App> { private: - CdBG<1>* const cdbg; + T_App<1>* const app; Validator<1>* const validator; @@ -59,21 +59,21 @@ class Application<1> public: Application(const Build_Params& params): - cdbg(params.k() == 1 ? new CdBG<1>(params) : nullptr), + app(params.k() == 1 ? 
new T_App<1>(params) : nullptr), validator(nullptr) {} Application(const Validation_Params& params): - cdbg(nullptr), + app(nullptr), validator(params.k() == 1 ? new Validator<1>(params) : nullptr) {} ~Application() { - if(cdbg != nullptr) - delete cdbg; + if(app != nullptr) + delete app; if(validator != nullptr) delete validator; @@ -82,8 +82,8 @@ class Application<1> void execute() const { - if(cdbg != nullptr) - cdbg->construct(); + if(app != nullptr) + app->construct(); else { std::cerr << "The provided k is not valid. Aborting.\n"; @@ -103,47 +103,47 @@ class Application<1> }; -template -inline Application::Application(const Build_Params& params): - app_next_level(new Application(params)), - cdbg(params.k() == k ? new CdBG(params) : nullptr), +template typename T_App> +inline Application::Application(const Build_Params& params): + app_next_level(new Application(params)), + app(params.k() == k ? new T_App(params) : nullptr), validator(nullptr) {} -template -inline Application::Application(const Validation_Params& params): - app_next_level(new Application(params)), - cdbg(nullptr), +template typename T_App> +inline Application::Application(const Validation_Params& params): + app_next_level(new Application(params)), + app(nullptr), validator(params.k() == k ? new Validator(params): nullptr) {} -template -inline Application::~Application() +template typename T_App> +inline Application::~Application() { delete app_next_level; - if(cdbg != nullptr) - delete cdbg; + if(app != nullptr) + delete app; if(validator != nullptr) delete validator; } -template -inline void Application::execute() const +template typename T_App> +inline void Application::execute() const { - if(cdbg != nullptr) - cdbg->construct(); + if(app != nullptr) + app->construct(); else app_next_level->execute(); } -template -inline bool Application::validate() const +template typename T_App> +inline bool Application::validate() const { if(validator != nullptr) return validator->validate(); diff --git a/src/main.cpp b/src/main.cpp index 7c658827..70f8a52b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -73,7 +73,7 @@ void build(int argc, char** argv) std::cout << "\nConstructing the compacted de Bruijn graph for k = " << k << ".\n"; - const Application app(params); + const Application app(params); app.execute(); std::cout << "\nConstructed the compacted de Bruijn graph at " << output_file << ".\n"; @@ -132,7 +132,7 @@ void validate(int argc, char** argv) std::cout << "\nValidating the compacted de Bruijn graph for k = " << k << "\n"; - const Application app(params); + const Application app(params); std::cout << (app.validate() ? 
"\nValidation successful" : "\nValidation failed") << std::endl; } catch(const std::exception& e) From a870e13867fb80984f5454ed880edadef6b55cf2 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Sun, 7 Mar 2021 19:24:10 -0500 Subject: [PATCH 017/350] Have execution framework for read dBG --- include/Build_Params.hpp | 4 +++- include/Read_CdBG.hpp | 34 ++++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 1 + src/Read_CdBG.cpp | 18 ++++++++++++++++++ src/main.cpp | 14 +++++++++----- 5 files changed, 65 insertions(+), 6 deletions(-) create mode 100644 include/Read_CdBG.hpp create mode 100644 src/Read_CdBG.cpp diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index 9a4e9b06..8bd531fc 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -4,12 +4,14 @@ +#include "globals.hpp" #include "Reference_Input.hpp" #include "Output_Format.hpp" #include #include #include +#include class Build_Params @@ -156,7 +158,7 @@ inline bool Build_Params::is_valid() const bool valid = true; - // Check if read and reference de Bruijn graphs parameters are being mixed with. + // Check if read and reference de Bruijn graph parameters are being mixed with. if(is_read_graph_) { if(!reference_input_.empty()) diff --git a/include/Read_CdBG.hpp b/include/Read_CdBG.hpp new file mode 100644 index 00000000..d707218f --- /dev/null +++ b/include/Read_CdBG.hpp @@ -0,0 +1,34 @@ + +#ifndef READ_CDBG_HPP +#define READ_CDBG_HPP + + + +#include "Build_Params.hpp" +#include "Kmer_Hash_Table.hpp" + + +// Read de Bruijn graph class to support the compaction algorithm. +template +class Read_CdBG +{ +private: + + const Build_Params params; // Required parameters (wrapped inside). + Kmer_Hash_Table hash_table; // Hash table for the vertices (canonical k-mers) of the graph. + + +public: + + // Constructs a `Read_CdBG` object with the parameters required for the construction of + // the compacted representation of the underlying read de Bruijn graph wrapped in `params`. + Read_CdBG(const Build_Params& params); + + // Constructs the compacted read de Bruijn graph, employing the parameters received + // with the object-constructor. + void construct(); +}; + + + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 61d81310..f04b08a4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -20,6 +20,7 @@ set(PROJECT_SRC CdBG_Plain_Writer.cpp CdBG_GFA_Writer.cpp CdBG_GFA_Reduced_Writer.cpp + Read_CdBG.cpp Read_CdBG_Constructor.cpp Validator.cpp Validator_Hash_Table.cpp diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp new file mode 100644 index 00000000..6edf13b0 --- /dev/null +++ b/src/Read_CdBG.cpp @@ -0,0 +1,18 @@ + +#include "Read_CdBG.hpp" + + +template +Read_CdBG::Read_CdBG(const Build_Params& params): + params(params) +{} + + +template +void Read_CdBG::construct() +{} + + + +// Template instantiations for the required instances. +ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Read_CdBG) diff --git a/src/main.cpp b/src/main.cpp index 70f8a52b..c14b46e9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,6 +1,7 @@ #include "Input_Defaults.hpp" #include "CdBG.hpp" +#include "Read_CdBG.hpp" #include "Validator.hpp" #include "Build_Params.hpp" #include "Validation_Params.hpp" @@ -69,14 +70,17 @@ void build(int argc, char** argv) } std::cout.precision(3); - - std::cout << "\nConstructing the compacted de Bruijn graph for k = " << k << ".\n"; - const Application app(params); - app.execute(); + const std::string dBG_type = (params.is_read_graph() ? 
"read" : "reference"); + + std::cout << "\nConstructing the " << dBG_type << " compacted de Bruijn graph for k = " << k << ".\n"; + + const Application app_ref_dBG(params); + const Application app_read_dBG(params); + params.is_read_graph() ? app_read_dBG.execute() : app_ref_dBG.execute(); - std::cout << "\nConstructed the compacted de Bruijn graph at " << output_file << ".\n"; + std::cout << "\nConstructed the " << dBG_type << " compacted de Bruijn graph at " << output_file << ".\n"; } catch(const std::exception& e) { From 774c923217dd3c9e78c24315b4819b1b1ba81b43 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Sun, 7 Mar 2021 19:28:21 -0500 Subject: [PATCH 018/350] Correct template-terminology specializations -> instances --- src/CdBG.cpp | 2 +- src/CdBG_Builder.cpp | 2 +- src/CdBG_GFA_Reduced_Writer.cpp | 2 +- src/CdBG_GFA_Writer.cpp | 2 +- src/CdBG_Plain_Writer.cpp | 2 +- src/CdBG_Writer.cpp | 2 +- src/Kmer_Hash_Table.cpp | 2 +- src/Kmers_Validator.cpp | 2 +- src/Sequence_Validator.cpp | 2 +- src/Thread_Pool.cpp | 2 +- src/Validator.cpp | 2 +- src/Validator_Hash_Table.cpp | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/CdBG.cpp b/src/CdBG.cpp index ab27b837..b790e511 100644 --- a/src/CdBG.cpp +++ b/src/CdBG.cpp @@ -67,5 +67,5 @@ size_t CdBG::search_valid_kmer(const char* const seq, const size_t left_end, -// Template instantiations for the required specializations. +// Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, CdBG) diff --git a/src/CdBG_Builder.cpp b/src/CdBG_Builder.cpp index 8b18f270..bd94b314 100644 --- a/src/CdBG_Builder.cpp +++ b/src/CdBG_Builder.cpp @@ -627,5 +627,5 @@ void CdBG::print_state_class_dist() const -// Template instantiations for the required specializations. +// Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, CdBG) diff --git a/src/CdBG_GFA_Reduced_Writer.cpp b/src/CdBG_GFA_Reduced_Writer.cpp index ba37d662..0d131238 100644 --- a/src/CdBG_GFA_Reduced_Writer.cpp +++ b/src/CdBG_GFA_Reduced_Writer.cpp @@ -125,5 +125,5 @@ void CdBG::write_sequence_tiling(Job_Queue& job -// Template instantiations for the required specializations. +// Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, CdBG) diff --git a/src/CdBG_GFA_Writer.cpp b/src/CdBG_GFA_Writer.cpp index a18b5b4f..48d4b2de 100644 --- a/src/CdBG_GFA_Writer.cpp +++ b/src/CdBG_GFA_Writer.cpp @@ -926,5 +926,5 @@ void CdBG::remove_temp_files(const uint64_t file_id) const -// Template instantiations for the required specializations. +// Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, CdBG) diff --git a/src/CdBG_Plain_Writer.cpp b/src/CdBG_Plain_Writer.cpp index 1a957017..c9e22ff9 100644 --- a/src/CdBG_Plain_Writer.cpp +++ b/src/CdBG_Plain_Writer.cpp @@ -189,5 +189,5 @@ void CdBG::write_path(const uint16_t thread_id, const char* const seq, const -// Template instantiations for the required specializations. +// Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, CdBG) diff --git a/src/CdBG_Writer.cpp b/src/CdBG_Writer.cpp index 93a3bdff..acc9298a 100644 --- a/src/CdBG_Writer.cpp +++ b/src/CdBG_Writer.cpp @@ -756,5 +756,5 @@ void CdBG::flush_path_loggers() -// Template instantiations for the required specializations. +// Template instantiations for the required instances. 
ENUMERATE(INSTANCE_COUNT, INSTANTIATE, CdBG) diff --git a/src/Kmer_Hash_Table.cpp b/src/Kmer_Hash_Table.cpp index b6313ecb..b220c4db 100644 --- a/src/Kmer_Hash_Table.cpp +++ b/src/Kmer_Hash_Table.cpp @@ -181,5 +181,5 @@ Kmer_Hash_Table::~Kmer_Hash_Table() -// Template instantiations for the required specializations. +// Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE_PER_BIT, Kmer_Hash_Table) diff --git a/src/Kmers_Validator.cpp b/src/Kmers_Validator.cpp index 6f8a7a14..2d7b4762 100644 --- a/src/Kmers_Validator.cpp +++ b/src/Kmers_Validator.cpp @@ -82,5 +82,5 @@ void Validator::validate_kmer_set(bool& result) const -// Template instantiations for the required specializations. +// Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Validator) diff --git a/src/Sequence_Validator.cpp b/src/Sequence_Validator.cpp index e78028c5..575ae87d 100644 --- a/src/Sequence_Validator.cpp +++ b/src/Sequence_Validator.cpp @@ -252,5 +252,5 @@ bool Validator::walk_unitig(const char* const seq, const size_t seq_len, cons -// Template instantiations for the required specializations. +// Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Validator) diff --git a/src/Thread_Pool.cpp b/src/Thread_Pool.cpp index 75adb771..7321710c 100644 --- a/src/Thread_Pool.cpp +++ b/src/Thread_Pool.cpp @@ -216,5 +216,5 @@ void Thread_Pool::close() -// Template instantiations for the required specializations. +// Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Thread_Pool) diff --git a/src/Validator.cpp b/src/Validator.cpp index 4bd26008..98f54792 100644 --- a/src/Validator.cpp +++ b/src/Validator.cpp @@ -91,5 +91,5 @@ size_t Validator::search_valid_kmer(const char* const seq, const size_t seq_l -// Template instantiations for the required specializations. +// Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Validator) diff --git a/src/Validator_Hash_Table.cpp b/src/Validator_Hash_Table.cpp index 64d2c8f6..1c7dc1ce 100644 --- a/src/Validator_Hash_Table.cpp +++ b/src/Validator_Hash_Table.cpp @@ -69,5 +69,5 @@ void Validator::clear() -// Template instantiations for the required specializations. +// Template instantiations for the required instances. 
ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Validator) From 94f593db99f9980e27652812be2c4048a6ddd92b Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Sun, 7 Mar 2021 19:50:53 -0500 Subject: [PATCH 019/350] Construct MPHF --- src/Read_CdBG.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 6edf13b0..16802e34 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -10,7 +10,13 @@ Read_CdBG::Read_CdBG(const Build_Params& params): template void Read_CdBG::construct() -{} +{ + std::cout << "\nConstructing the minimal perfect hash function (MPHF) over the vertex set.\n"; + hash_table.construct(params.vertex_db_path(), params.thread_count(), params.working_dir_path(), params.mph_file_path()); + + + hash_table.clear(); +} From 57086070df12da285a69099c4909ef98365b371b Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 8 Mar 2021 17:33:56 -0500 Subject: [PATCH 020/350] Save memory in peeking k-mer db --- src/Kmer_Container.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Kmer_Container.cpp b/src/Kmer_Container.cpp index 504d6e6e..67b70916 100644 --- a/src/Kmer_Container.cpp +++ b/src/Kmer_Container.cpp @@ -9,7 +9,7 @@ Kmer_Container::Kmer_Container(const std::string& kmc_file_path): kmc_file_path(kmc_file_path) { CKMCFile kmer_database; - if(!kmer_database.OpenForListing(kmc_file_path)) + if(!kmer_database.open_for_listing_unbuffered(kmc_file_path)) { std::cout << "Error opening KMC database files with prefix " << kmc_file_path << ". Aborting.\n"; std::exit(EXIT_FAILURE); @@ -119,5 +119,5 @@ void Kmer_Container::load_kmers(std::vector>& kmers) const -// Template instantiations for the required specializations. -ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Kmer_Container) +// Template instantiations for the required instances. 
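+// (`INSTANTIATE_ALL`, defined in a later patch of this series, instantiates one odd and
+// one even k-value per repetition step; e.g. its x = 0 step produces
+// `template class Kmer_Container<1>; template class Kmer_Container<2>;`. The even
+// instances are what the (k + 1)-mer, i.e. edge, databases require, as k itself stays odd.)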
+ENUMERATE(INSTANCE_COUNT, INSTANTIATE_ALL, Kmer_Container) From 4ce11a43d0a5ffa17abd4eb904dc4425151d7b36 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 8 Mar 2021 18:27:46 -0500 Subject: [PATCH 021/350] Fix iterator test code --- src/test.cpp | 37 +++++++++---------------------------- 1 file changed, 9 insertions(+), 28 deletions(-) diff --git a/src/test.cpp b/src/test.cpp index 58f0feff..8832cf8a 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -395,7 +395,8 @@ void test_SPMC_iterator_performance(const char* const db_path, const size_t cons { Kmer_Container kmer_container(db_path); - Kmer_SPMC_Iterator it(kmer_container.spmc_begin(consumer_count)); + // Kmer_SPMC_Iterator it(kmer_container.spmc_begin(consumer_count)); + Kmer_SPMC_Iterator it(&kmer_container, consumer_count); it.launch_production(); std::cout << "\nProduction ongoing\n"; @@ -403,10 +404,10 @@ void test_SPMC_iterator_performance(const char* const db_path, const size_t cons std::vector> T(consumer_count); std::vector> max_kmer(consumer_count); + std::atomic ctr{0}; for(size_t i = 0; i < consumer_count; ++i) { const size_t consumer_id = i; - std::atomic ctr{0}; auto& mk = max_kmer[consumer_id]; T[consumer_id].reset( new std::thread([&kmer_container, &it, &mk, &ctr, consumer_id]() @@ -427,29 +428,8 @@ void test_SPMC_iterator_performance(const char* const db_path, const size_t cons std::cerr << "parsed " << ctr << " k-mers\n"; } } - /* - if(it.task_available(consumer_id)) {// && it.value_at(consumer_id, kmer)) { - // - { - auto* ts = it.thread_state_for(consumer_id); - while (ts->kmers_parsed < ts->kmers_available) - { - kmerdb->parse_kmer(ts->pref_idx, ts->suff_idx, ts->buffer, - ts->kmers_parsed * kmerdb->suff_record_size(), kmer); - - max_kmer = std::max(max_kmer, kmer); - //auto v = ctr++; - //if (v % 10000000 == 1) - //{ - // std::cerr << "parsed " << ctr << " k-mers\n"; - //} - ts->kmers_parsed++; - } - it.set_pending(consumer_id); - } - } - */ - // max_kmer[i] = std::max(max_kmer[i], kmer); + + ctr += local_count; mk = max_kmer; } ) @@ -465,6 +445,7 @@ void test_SPMC_iterator_performance(const char* const db_path, const size_t cons //for (size_t i = 0; i < consumer_count; ++i) { // global_max = std::max(global_max, max_kmer[i]); //} + std::cout << "Parsed " << ctr << " k-mers\n"; std::cout << "Max k-mer: " << std::max_element(max_kmer.begin(), max_kmer.end())->string_label() << "\n"; } @@ -595,11 +576,11 @@ int main(int argc, char** argv) // count_kmers_in_unitigs(argv[1], atoi(argv[2])); - // static constexpr uint16_t k = 25; - // static const size_t consumer_count = std::atoi(argv[2]); + static constexpr uint16_t k = 26; + static const size_t consumer_count = std::atoi(argv[2]); // test_buffered_iterator_performance(argv[1]); - // test_SPMC_iterator_performance(argv[1], consumer_count); + test_SPMC_iterator_performance(argv[1], consumer_count); return 0; From 897eb68aad2381bcaaad836589a4fd2b38b916aa Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 8 Mar 2021 18:29:43 -0500 Subject: [PATCH 022/350] Add skeleton for states computing --- include/Read_CdBG_Constructor.hpp | 6 +++++- include/globals.hpp | 5 +++++ src/Read_CdBG.cpp | 4 ++++ src/Read_CdBG_Constructor.cpp | 26 +++++++++++++++++++++----- 4 files changed, 35 insertions(+), 6 deletions(-) diff --git a/include/Read_CdBG_Constructor.hpp b/include/Read_CdBG_Constructor.hpp index e1c26b4a..12acaddd 100644 --- a/include/Read_CdBG_Constructor.hpp +++ b/include/Read_CdBG_Constructor.hpp @@ -8,9 +8,11 @@ #include "Kmer_Hash_Table.hpp" #include "Build_Params.hpp" 
#include "Thread_Pool.hpp" +#include "Kmer_Container.hpp" +#include "Kmer_SPMC_Iterator.hpp" -// A class to construct read-compacted de Bruijn graphs. +// A class to construct compacted read de Bruijn graphs. template class Read_CdBG_Constructor { @@ -20,6 +22,8 @@ class Read_CdBG_Constructor const Build_Params params; // Required parameters (wrapped inside). Kmer_Hash_Table& hash_table; // Hash table for the vertices (canonical k-mers) of the graph. + const Kmer_Container edge_container; // Wrapper container for the edge-database. + Kmer_SPMC_Iterator edge_parser; // Parser for the edges from the edge-database. // Distributes the DFA-states computation task to the worker threads in the thread pool `thread_pool`. diff --git a/include/globals.hpp b/include/globals.hpp index b280a12b..c497c3b8 100644 --- a/include/globals.hpp +++ b/include/globals.hpp @@ -92,6 +92,11 @@ namespace cuttlefish #define INSTANTIATE_PER_BIT(z, k, class_name) template class class_name<2 * k + 1, cuttlefish::BITS_PER_REF_KMER>;\ template class class_name<2 * k + 1, cuttlefish::BITS_PER_READ_KMER>; +// Given some `x`, explicitly instantiates two instances of the class `class_name`, using the template parameter `k` +// with `2x + 1` and `2x + 2`, i.e. it is an instantiator for both odd and even k-values. +#define INSTANTIATE_ALL(z, x, class_name) template class class_name<2 * x + 1>;\ + template class class_name<2 * x + 2>; + // BOOST_PP_REPEAT reference: https://www.boost.org/doc/libs/1_55_0/libs/preprocessor/doc/ref/repeat.html diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 16802e34..74eb010e 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -1,5 +1,6 @@ #include "Read_CdBG.hpp" +#include "Read_CdBG_Constructor.hpp" template @@ -14,6 +15,9 @@ void Read_CdBG::construct() std::cout << "\nConstructing the minimal perfect hash function (MPHF) over the vertex set.\n"; hash_table.construct(params.vertex_db_path(), params.thread_count(), params.working_dir_path(), params.mph_file_path()); + std::cout << "\nComputing the DFA states.\n"; + Read_CdBG_Constructor cdBg_constructor(params, hash_table); + cdBg_constructor.compute_DFA_states(); hash_table.clear(); } diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index d68fd039..675688ed 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -5,8 +5,12 @@ template Read_CdBG_Constructor::Read_CdBG_Constructor(const Build_Params& params, Kmer_Hash_Table& hash_table): params(params), - hash_table(hash_table) -{} + hash_table(hash_table), + edge_container(params.edge_db_path()), + edge_parser(&edge_container, params.thread_count()) +{ + std::cout << "Total number of distinct edges: " << edge_container.size() << ".\n"; +} template @@ -16,8 +20,16 @@ void Read_CdBG_Constructor::compute_DFA_states() const uint16_t thread_count = params.thread_count(); Thread_Pool thread_pool(thread_count, this, Thread_Pool::Task_Type::compute_states_read_space); - // Multi-threaded computation. + // Launch the reading (and parsing per demand) of the edges from disk. + edge_parser.launch_production(); + + // Launch (multi-threaded) computation of the states. distribute_states_computation(thread_pool); + + // Wait for the edges to be depleted from the database. + edge_parser.seize_production(); + + // Wait for the consumer threads to finish parsing and processing the edges. 
thread_pool.close(); } @@ -38,10 +50,14 @@ void Read_CdBG_Constructor::distribute_states_computation(Thread_Pool& thr template void Read_CdBG_Constructor::process_edges(const uint16_t thread_id) { - (void)thread_id; + Kmer edge; + + while(edge_parser.tasks_expected(thread_id)) + if(edge_parser.value_at(thread_id, edge)) + {} } -// Template instantiations for the required specializations. +// Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Read_CdBG_Constructor) From cea18b04b01cc3538b6bb0998a79c8583c212313 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 8 Mar 2021 20:31:27 -0500 Subject: [PATCH 023/350] Help debug --- src/Read_CdBG.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 74eb010e..dbd944cc 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -12,8 +12,8 @@ Read_CdBG::Read_CdBG(const Build_Params& params): template void Read_CdBG::construct() { - std::cout << "\nConstructing the minimal perfect hash function (MPHF) over the vertex set.\n"; - hash_table.construct(params.vertex_db_path(), params.thread_count(), params.working_dir_path(), params.mph_file_path()); + // std::cout << "\nConstructing the minimal perfect hash function (MPHF) over the vertex set.\n"; + // hash_table.construct(params.vertex_db_path(), params.thread_count(), params.working_dir_path(), params.mph_file_path()); std::cout << "\nComputing the DFA states.\n"; Read_CdBG_Constructor cdBg_constructor(params, hash_table); From f1c5f394db6d1d1fa22190625cf355088cd7d8e8 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Tue, 9 Mar 2021 12:49:38 -0500 Subject: [PATCH 024/350] Add hash table constructor --- include/Kmer_Container.hpp | 3 +++ include/Kmer_Hash_Table.hpp | 14 ++++++++++++-- src/CdBG.cpp | 5 +++-- src/Kmer_Container.cpp | 8 ++++++++ src/Kmer_Hash_Table.cpp | 21 ++++++++++++++------- src/Read_CdBG.cpp | 3 ++- 6 files changed, 42 insertions(+), 12 deletions(-) diff --git a/include/Kmer_Container.hpp b/include/Kmer_Container.hpp index 3c0196f7..00f977bc 100644 --- a/include/Kmer_Container.hpp +++ b/include/Kmer_Container.hpp @@ -43,6 +43,9 @@ class Kmer_Container // Returns the number of k-mers present in the underlying k-mer database. uint64_t size() const; + // Returns the number of k-mers present in the k-mer database with path prefix `kmc_db_path`. + static uint64_t size(const std::string& kmc_db_path); + // Returns an iterator pointing to the beginning of the underlying k-mer // database. diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index dd146f98..d31f18c4 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -32,6 +32,12 @@ class Kmer_Hash_Table // Lowest bits/elem is achieved with gamma = 1, higher values lead to larger mphf but faster construction/query. constexpr static double GAMMA_FACTOR = 2.0; + // Path to the underlying k-mer database, over which the hash table is constructed. + const std::string& kmc_db_path; + + // Number of keys (`Kmer`s) in the hash table. + const uint64_t kmer_count; + // The MPH function. // TODO: Initialize with `std::nullptr`. mphf_t* mph = NULL; @@ -54,7 +60,7 @@ class Kmer_Hash_Table // with `mph_file_path` being the file to use for BBHash build // using `thread_count` number of threads. Uses the directory // at `working_dir_path` to store temporary files. 
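    // (With this change, hash table construction is split in two: the constructor takes
    // the database path, and `construct()` does the heavy lifting. The resulting call
    // pattern, as used by the graph builders further below in this patch, is roughly:
    //
    //   Kmer_Hash_Table<k, BITS_PER_KEY> vertices(params.vertex_db_path());
    //   vertices.construct(params.thread_count(), params.working_dir_path(), params.mph_file_path());
    // )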
- void build_mph_function(const Kmer_Container& kmer_container, uint16_t thread_count, const std::string& working_dir_path, const std::string& mph_file_path); + void build_mph_function(uint16_t thread_count, const std::string& working_dir_path, const std::string& mph_file_path); // Loads an MPH function from the file at `file_path` into `mph`. void load_mph_function(const std::string& file_path); @@ -80,12 +86,16 @@ class Kmer_Hash_Table public: + // Constructs a `Kmer_Hash_Table` object, where the hash table is to be built + // over the KMC database with path prefix `kmc_db_path`. + Kmer_Hash_Table(const std::string& kmc_db_path); + // Constructs a minimal perfect hash function (specifically, the BBHash) for // the collection of k-mers present at the KMC database at path `kmc_db_path`, // using up-to `thread_count` number of threads. If a non-empty path is passed // with `mph_file_path`, either an MPH is loaded from there (instead of building // from scratch), or the newly built MPH is saved there. - void construct(const std::string& kmc_db_path, uint16_t thread_count, const std::string& working_dir_path, const std::string& mph_file_path); + void construct(uint16_t thread_count, const std::string& working_dir_path, const std::string& mph_file_path); // Returns an API to the entry (in the hash table) for the key `kmer`. The API // wraps the hash table position and the state value at that position. diff --git a/src/CdBG.cpp b/src/CdBG.cpp index b790e511..e34670f7 100644 --- a/src/CdBG.cpp +++ b/src/CdBG.cpp @@ -5,7 +5,8 @@ template CdBG::CdBG(const Build_Params& params): - params(params) + params(params), + Vertices(params.vertex_db_path()) { Kmer::set_k(params.k()); } @@ -15,7 +16,7 @@ template void CdBG::construct() { std::cout << "\nConstructing the minimal perfect hash function (MPHF).\n"; - Vertices.construct(params.vertex_db_path(), params.thread_count(), params.working_dir_path(), params.mph_file_path()); + Vertices.construct(params.thread_count(), params.working_dir_path(), params.mph_file_path()); if(params.remove_kmc_db()) { diff --git a/src/Kmer_Container.cpp b/src/Kmer_Container.cpp index 67b70916..9feb2a2d 100644 --- a/src/Kmer_Container.cpp +++ b/src/Kmer_Container.cpp @@ -54,6 +54,14 @@ uint64_t Kmer_Container::size() const } +template +uint64_t Kmer_Container::size(const std::string& kmc_db_path) +{ + const Kmer_Container kmer_container(kmc_db_path); + return kmer_container.size(); +} + + template typename Kmer_Container::iterator Kmer_Container::begin() const { diff --git a/src/Kmer_Hash_Table.cpp b/src/Kmer_Hash_Table.cpp index b220c4db..d11c046e 100644 --- a/src/Kmer_Hash_Table.cpp +++ b/src/Kmer_Hash_Table.cpp @@ -9,7 +9,14 @@ template -void Kmer_Hash_Table::build_mph_function(const Kmer_Container& kmer_container, const uint16_t thread_count, const std::string& working_dir_path, const std::string& mph_file_path) +Kmer_Hash_Table::Kmer_Hash_Table(const std::string& kmc_db_path): + kmc_db_path(kmc_db_path), + kmer_count{Kmer_Container::size(kmc_db_path)} +{} + + +template +void Kmer_Hash_Table::build_mph_function(const uint16_t thread_count, const std::string& working_dir_path, const std::string& mph_file_path) { // The serialized BBHash file (saved from some earlier execution) exists. struct stat buffer; @@ -24,12 +31,15 @@ void Kmer_Hash_Table::build_mph_function(const Kmer_Container kmer_container(kmc_db_path); + // Build the MPHF. 
std::cout << "Building the MPHF from the k-mer database " << kmer_container.container_location() << ".\n"; // auto data_iterator = boomphf::range(kmer_container.buf_begin(), kmer_container.buf_end()); const auto data_iterator = boomphf::range(kmer_container.spmc_begin(thread_count), kmer_container.spmc_end(thread_count)); - mph = new mphf_t(kmer_container.size(), data_iterator, working_dir_path, thread_count, GAMMA_FACTOR); + mph = new mphf_t(kmer_count, data_iterator, working_dir_path, thread_count, GAMMA_FACTOR); std::cout << "Built the MPHF in memory.\n"; @@ -116,21 +126,18 @@ void Kmer_Hash_Table::load_hash_buckets(const std::string& file template -void Kmer_Hash_Table::construct(const std::string& kmc_db_path, const uint16_t thread_count, const std::string& working_dir_path, const std::string& mph_file_path) +void Kmer_Hash_Table::construct(const uint16_t thread_count, const std::string& working_dir_path, const std::string& mph_file_path) { std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); - // Open a container over the k-mer database. - const Kmer_Container kmer_container(kmc_db_path); - const uint64_t kmer_count = kmer_container.size(); std::cout << "Total number of k-mers in the set (KMC database): " << kmer_count << ".\n"; sparse_lock_ptr = new Sparse_Lock(kmer_count, lock_count); // Build the minimal perfect hash function. - build_mph_function(kmer_container, thread_count, working_dir_path, mph_file_path); + build_mph_function(thread_count, working_dir_path, mph_file_path); const uint64_t total_bits = mph->totalBitSize(); std::cout << "\nTotal MPHF size: " << total_bits / (8 * 1024 * 1024) << " MB." diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index dbd944cc..593471f8 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -5,7 +5,8 @@ template Read_CdBG::Read_CdBG(const Build_Params& params): - params(params) + params(params), + hash_table(params.vertex_db_path()) {} From 0e5e38e57b0f8dc8a2b903dc78a3451d22c3f8da Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Tue, 9 Mar 2021 13:14:13 -0500 Subject: [PATCH 025/350] Better design choice for (hidden) locks T* const -> mutable T --- include/Kmer_Hash_Table.hpp | 15 +++++++-------- src/Kmer_Hash_Table.cpp | 11 ++--------- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index d31f18c4..5d646b0a 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -50,9 +50,8 @@ class Kmer_Hash_Table // TODO: increase locks and check note at the end about the false `const` issue. constexpr static uint64_t lock_count{65536}; - // The locks to maintain mutually exclusive access for threads to the same indices into the bitvector `hash_table`. 
- Sparse_Lock* sparse_lock_ptr{nullptr}; + mutable Sparse_Lock sparse_lock; // Builds the minimal perfect hash function `mph` over the set of @@ -128,9 +127,9 @@ inline uint64_t Kmer_Hash_Table::bucket_id(const Kmer& kmer) template inline Kmer_Hash_Entry_API Kmer_Hash_Table::operator[](const uint64_t bucket_id) { - sparse_lock_ptr->lock(bucket_id); + sparse_lock.lock(bucket_id); const Kmer_Hash_Entry_API r(hash_table[bucket_id]); - sparse_lock_ptr->unlock(bucket_id); + sparse_lock.unlock(bucket_id); return r; } @@ -148,9 +147,9 @@ inline State Kmer_Hash_Table::operator[](const Kmer& kmer) c { const uint64_t bucket = bucket_id(kmer); - sparse_lock_ptr->lock(bucket); + sparse_lock.lock(bucket); const State state(hash_table[bucket]); - sparse_lock_ptr->unlock(bucket); + sparse_lock.unlock(bucket); return state; } @@ -171,11 +170,11 @@ inline bool Kmer_Hash_Table::update(Kmer_Hash_Entry_APIlock(bucket); + sparse_lock.lock(bucket); const bool success = (api.bv_entry == api.get_read_state()); if(success) api.bv_entry = api.get_current_state(); - sparse_lock_ptr->unlock(bucket); + sparse_lock.unlock(bucket); return success; } diff --git a/src/Kmer_Hash_Table.cpp b/src/Kmer_Hash_Table.cpp index d11c046e..9d7d4e89 100644 --- a/src/Kmer_Hash_Table.cpp +++ b/src/Kmer_Hash_Table.cpp @@ -11,7 +11,8 @@ template Kmer_Hash_Table::Kmer_Hash_Table(const std::string& kmc_db_path): kmc_db_path(kmc_db_path), - kmer_count{Kmer_Container::size(kmc_db_path)} + kmer_count{Kmer_Container::size(kmc_db_path)}, + sparse_lock(kmer_count, lock_count) {} @@ -132,8 +133,6 @@ void Kmer_Hash_Table::construct(const uint16_t thread_count, co std::cout << "Total number of k-mers in the set (KMC database): " << kmer_count << ".\n"; - - sparse_lock_ptr = new Sparse_Lock(kmer_count, lock_count); // Build the minimal perfect hash function. @@ -168,12 +167,6 @@ void Kmer_Hash_Table::clear() mph = NULL; - - if(sparse_lock_ptr != nullptr) - delete sparse_lock_ptr; - - sparse_lock_ptr = nullptr; - // hash_table.clear(); hash_table.resize(0); From cbfda4ffac1720c7ae52a10fd59db707aa796424 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 15 Mar 2021 12:13:39 -0400 Subject: [PATCH 026/350] Update minor comments --- include/CdBG.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/CdBG.hpp b/include/CdBG.hpp index 9764635f..92af122a 100644 --- a/include/CdBG.hpp +++ b/include/CdBG.hpp @@ -415,11 +415,12 @@ class CdBG public: - // Constructs a `CdBG` object with the parameters wrapped at `params`. + // Constructs a `CdBG` object with the parameters required for the construction of the + // compacted representation of the underlying reference de Bruijn graph wrapped in `params`. CdBG(const Build_Params& params); - // Constructs the compacted de Bruijn graph using up-to `thread_count` threads, and - // outputs the maximal unitigs into the file named `output_file_name`. + // Constructs the compacted reference de Bruijn graph, employing the parameters received + // with the object-constructor. 
void construct(); }; From f7f998ed429d0f47de72b6746c998c5c8523a97b Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 15 Mar 2021 15:26:25 -0400 Subject: [PATCH 027/350] Reinstate MPHF construction --- src/Read_CdBG.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 593471f8..cab9d2a1 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -13,8 +13,8 @@ Read_CdBG::Read_CdBG(const Build_Params& params): template void Read_CdBG::construct() { - // std::cout << "\nConstructing the minimal perfect hash function (MPHF) over the vertex set.\n"; - // hash_table.construct(params.vertex_db_path(), params.thread_count(), params.working_dir_path(), params.mph_file_path()); + std::cout << "\nConstructing the minimal perfect hash function (MPHF) over the vertex set.\n"; + hash_table.construct(params.thread_count(), params.working_dir_path(), params.mph_file_path()); std::cout << "\nComputing the DFA states.\n"; Read_CdBG_Constructor cdBg_constructor(params, hash_table); From 5c4c48b6c275cef73fa8b8dc91a72a3c52e7f217 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 15 Mar 2021 15:27:07 -0400 Subject: [PATCH 028/350] Add states computation timer --- src/Read_CdBG_Constructor.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index 675688ed..d36792b2 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -1,6 +1,8 @@ #include "Read_CdBG_Constructor.hpp" +#include "chrono" + template Read_CdBG_Constructor::Read_CdBG_Constructor(const Build_Params& params, Kmer_Hash_Table& hash_table): @@ -16,6 +18,9 @@ Read_CdBG_Constructor::Read_CdBG_Constructor(const Build_Params& params, Kmer template void Read_CdBG_Constructor::compute_DFA_states() { + std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); + + // Construct a thread pool. const uint16_t thread_count = params.thread_count(); Thread_Pool thread_pool(thread_count, this, Thread_Pool::Task_Type::compute_states_read_space); @@ -31,6 +36,11 @@ void Read_CdBG_Constructor::compute_DFA_states() // Wait for the consumer threads to finish parsing and processing the edges. thread_pool.close(); + + + std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); + double elapsed_seconds = std::chrono::duration_cast>(t_end - t_start).count(); + std::cout << "Done computing the DFA states. Time taken = " << elapsed_seconds << " seconds.\n"; } From 9d36a9ca28c3a674c8f1b5b2edf29cbabc785e77 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Tue, 16 Mar 2021 20:58:22 -0400 Subject: [PATCH 029/350] Static-link jemalloc --- CMakeLists.txt | 27 +++++++++++++++++++++++++++ src/CMakeLists.txt | 7 +++++++ 2 files changed, 34 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ff6b711..a0ad3549 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,6 +54,33 @@ set(THREADS_PREFER_PTHREAD_FLAG TRUE) # The BBHash library uses `pthread`. # the `kseq` library to gzipped compressed files. find_package(ZLIB REQUIRED) +# Search and load setting for the `jemalloc` library. It provides scalable concurrency support +# and better avoidance of fragmentation. 
+set(FAST_MALLOC_LIB "") +find_package(jemalloc) +if(jemalloc_FOUND) + message("Found the Jemalloc library") + set(FAST_MALLOC_LIB ${JEMALLOC_LIBRARIES}) +else() + message("Build system is fetching and installing jemalloc") + + include(ExternalProject) + + ExternalProject_Add(libjemalloc + DOWNLOAD_DIR ${CMAKE_SOURCE_DIR}/external + DOWNLOAD_COMMAND curl -k -L https://github.com/jemalloc/jemalloc/archive/5.2.1.tar.gz -o jemalloc-5.2.1.tar.gz + && tar -xzf jemalloc-5.2.1.tar.gz + + SOURCE_DIR ${CMAKE_SOURCE_DIR}/external/jemalloc-5.2.1 + BUILD_IN_SOURCE TRUE + INSTALL_DIR ${CMAKE_SOURCE_DIR}/external/install + CONFIGURE_COMMAND sh -c "CC=${CMAKE_C_COMPILER} ./autogen.sh --disable-debug --enable-static --prefix= --silent" + INSTALL_COMMAND cp -r lib / && cp -r include / + ) + + set(FAST_MALLOC_LIB ${CMAKE_SOURCE_DIR}/external/install/lib/libjemalloc.a) +endif() + # The `Debug` configuration optimizes the program for debugging and enables full debug information. # The `Release` configuration enables most compiler optimizations for speed and defines `NDEBUG` diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f04b08a4..1dfce584 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -52,6 +52,13 @@ target_compile_options(core PRIVATE ${WARNING_FLAGS} ${SUPPRESS_WARNING_FLAGS} $ add_executable(${PROJECT_NAME} main.cpp) +# Link the core library to the `jemalloc` library, for better `malloc` support. +target_link_libraries(core PRIVATE ${FAST_MALLOC_LIB}) + +# Link the core library to the `dl` library, required in using dynamic shared object. +# Needed by `jemalloc`. +target_link_libraries(core PRIVATE ${CMAKE_DL_LIBS}) + # Link the core library to the threads package in the platform. target_link_libraries(core PRIVATE ${CMAKE_THREAD_LIBS_INIT}) From 6eb4c19cae684e78af9f63c66393adb73ffad509 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Wed, 17 Mar 2021 11:21:19 -0400 Subject: [PATCH 030/350] Update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 9c98bf59..5f3c03c7 100644 --- a/.gitignore +++ b/.gitignore @@ -37,3 +37,4 @@ bin/ # Miscellaneous .vscode/ +external/ From 59f5a8bb8b044931a5d1ce8415ede1c1cc109c1f Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Wed, 17 Mar 2021 12:11:54 -0400 Subject: [PATCH 031/350] Track processed edge count --- include/Read_CdBG_Constructor.hpp | 4 ++++ src/Read_CdBG_Constructor.cpp | 12 +++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/Read_CdBG_Constructor.hpp b/include/Read_CdBG_Constructor.hpp index 12acaddd..51a6188f 100644 --- a/include/Read_CdBG_Constructor.hpp +++ b/include/Read_CdBG_Constructor.hpp @@ -25,6 +25,10 @@ class Read_CdBG_Constructor const Kmer_Container edge_container; // Wrapper container for the edge-database. Kmer_SPMC_Iterator edge_parser; // Parser for the edges from the edge-database. + // Members required to keep track of the total number of edges processed across different threads. + mutable Spin_Lock lock; + mutable uint64_t edges_processed = 0; + // Distributes the DFA-states computation task to the worker threads in the thread pool `thread_pool`. void distribute_states_computation(Thread_Pool& thread_pool); diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index d36792b2..a60d82ed 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -37,6 +37,8 @@ void Read_CdBG_Constructor::compute_DFA_states() // Wait for the consumer threads to finish parsing and processing the edges. 
thread_pool.close(); + std::cout << "Number of processed egdes: " << edges_processed << "\n"; + std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); double elapsed_seconds = std::chrono::duration_cast>(t_end - t_start).count(); @@ -61,10 +63,18 @@ template void Read_CdBG_Constructor::process_edges(const uint16_t thread_id) { Kmer edge; + uint64_t edge_count = 0; while(edge_parser.tasks_expected(thread_id)) if(edge_parser.value_at(thread_id, edge)) - {} + { + edge_count++; + } + + lock.lock(); + std::cout << "Thread " << thread_id << " processed " << edge_count << " edges.\n"; // Temporary log. TODO: remove. + edges_processed += edge_count; + lock.unlock(); } From 5355972cbebb164b33429912883465cc979af0cd Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Wed, 17 Mar 2021 20:47:58 -0400 Subject: [PATCH 032/350] Add extractability of vertices from edges --- include/Kmer.hpp | 58 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/include/Kmer.hpp b/include/Kmer.hpp index 9dd73a69..84771fb1 100644 --- a/include/Kmer.hpp +++ b/include/Kmer.hpp @@ -13,9 +13,18 @@ #include +// Defining this macro states our intent that only odd k-values will be used for de Bruijn graph vertices. +// Hence, extraction of k-mers from (k + 1)-mers — vertices from edges — will only happen when k is odd. +#define ODD_K + + template class Kmer: public DNA_Utility { + // Make k-mers friend for (k + 1)-mer, so that de Bruijn graph vertices, i.e. k-mers, + // may access private information (the raw data) from edges, i.e. (k + 1)-mers. + friend class Kmer; + private: // Number of 64-bit integers required to compactly represent the underlying k-mer with 2-bits/base encoding. @@ -80,6 +89,14 @@ class Kmer: public DNA_Utility // Gets the k-mer from its KMC raw-binary representation. void from_KMC_data(const uint64_t* kmc_data); + // Gets the k-mer that is a prefix of the provided + // (k + 1)-mer `k_plus_1_mer`. + void from_prefix(const Kmer& k_plus_1_mer); + + // Gets the k-mer that is a suffix of the provided + // (k + 1)-mer `k_plus_1_mer`. + void from_suffix(const Kmer& k_plus_1_mer); + // Returns the reverese complement of the k-mer. Kmer reverse_complement() const; @@ -113,9 +130,10 @@ class Kmer: public DNA_Utility // Returns the string label of the k-mer. std::string string_label() const; - // For debugging purposes. + // Prints the literal representation of the K-mer `kmer` to the + // stream `ostream`. template - friend std::ostream& operator<<(std::ostream& out, const Kmer& kmer); + friend std::ostream& operator<<(std::ostream& out, const Kmer& kmer); }; @@ -268,6 +286,39 @@ inline void Kmer::from_KMC_data(const uint64_t* const kmc_data) } +template +inline void Kmer::from_prefix(const Kmer& k_plus_1_mer) +{ + // Note: `Kmer` and `Kmer` always have the same number of words (i.e. `NUM_INTS`) for odd k-values. + // The only time that they have different numbers of words is when `k` is a multiple of 32. In such cases, + // a (k + 1)-mer contains just one base in its highest index word, and a k-mer's words are fully packed. + + std::memcpy(kmer_data, k_plus_1_mer.kmer_data, NUM_INTS * sizeof(uint64_t)); + right_shift(); // Clear the LSN of the (k + 1)-mer, from the k-mer. + + #ifndef ODD_K // The following `if` conditional can only be `true` when `k` is a multiple of 32. 
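+    // For illustration: with `NUM_INTS = (k + 31) / 32`, an even k = 32 needs 1 word for the k-mer but 2
+    // for the 33-mer edge, so the edge's lone base in its highest word must be folded in as the k-mer's MSN;
+    // for any odd k (e.g. 31 or 63) the two word-counts coincide and no such fix-up is required.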
+ constexpr uint16_t kp1_NUM_INTS = ((k + 1) + 31) / 32; + if(kp1_NUM_INTS != NUM_INTS) // Fetch the only base not copied from the (k + 1)-mer as the MSN for this k-mer. + kmer_data[NUM_INTS - 1] |= (k_plus_1_mer.kmer_data[kp1_NUM_INTS - 1] << 62); + #endif +} + + +template +inline void Kmer::from_suffix(const Kmer& k_plus_1_mer) +{ + std::memcpy(kmer_data, k_plus_1_mer.kmer_data, NUM_INTS * sizeof(uint64_t)); + + #ifndef ODD_K // The following `if` conditional can only be `true` when `k` is a multiple of 32. + constexpr uint16_t kp1_NUM_INTS = ((k + 1) + 31) / 32; + if(kp1_NUM_INTS != NUM_INTS) // The only base not copied from the (k + 1)-mer isn't required to be fetched — it will be cleared out anyways. + return; + #endif + + kmer_data[NUM_INTS - 1] &= Kmer::CLEAR_MSN_MASK; // Clear the MSN of the (k + 1)-mer from this k-mer. +} + + template inline Kmer Kmer::reverse_complement() const { @@ -330,6 +381,9 @@ inline void Kmer::roll_to_next_kmer(const char next_base, Kmer& rev_compl) { const DNA::Base mapped_base = map_base(next_base); + // Logically, since a left shift moves the MSN out of the length `k` boundary, the clearing of the base + // may seem redundant. But, the `to_u64` hashing method implementation works with bytes — not clearing + // out this base breaks the consistency of the hashing. kmer_data[NUM_INTS - 1] &= CLEAR_MSN_MASK; left_shift(); kmer_data[0] |= mapped_base; From 127a6f6833a529228a1610cae7d110a81da01038 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 18 Mar 2021 21:07:59 -0400 Subject: [PATCH 033/350] Have no executing test code --- src/test.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/test.cpp b/src/test.cpp index 8832cf8a..3e6e6e28 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -576,11 +576,11 @@ int main(int argc, char** argv) // count_kmers_in_unitigs(argv[1], atoi(argv[2])); - static constexpr uint16_t k = 26; - static const size_t consumer_count = std::atoi(argv[2]); + // static constexpr uint16_t k = 26; + // static const size_t consumer_count = std::atoi(argv[2]); // test_buffered_iterator_performance(argv[1]); - test_SPMC_iterator_performance(argv[1], consumer_count); + // test_SPMC_iterator_performance(argv[1], consumer_count); return 0; From 8af5429d86fd5fa5ebd78be1f78f482c773e0079 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 18 Mar 2021 21:09:03 -0400 Subject: [PATCH 034/350] Put reverse complements for all byte configs --- include/DNA_Utility.hpp | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/include/DNA_Utility.hpp b/include/DNA_Utility.hpp index f4ecd3c5..2f7feafa 100644 --- a/include/DNA_Utility.hpp +++ b/include/DNA_Utility.hpp @@ -77,6 +77,28 @@ class DNA_Utility 1, 1, 1, 1, 1, 1, 1, 1 // 120 - 127 }; + // TODO: Move these new k-mer specific (and not DNA-base specific) stuffs to a separate class. + // Reverse complement (in the `DNA::Base` representation) of all possible bytes. 
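+    // A byte packs four 2-bit bases, the lowest base in bits 0-1. Its reverse complement reverses the order
+    // of the four bases and complements each one (b -> b ^ 0b11); e.g. entry 1 below: byte 0b00000001
+    // (C A A A, lowest base first) maps to 0b10111111 = 191 (T T T G).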
+ static constexpr uint8_t REVERSE_COMPLEMENT_BYTE[256] = + { + 255, 191, 127, 63, 239, 175, 111, 47, 223, 159, 95, 31, 207, 143, 79, 15, + 251, 187, 123, 59, 235, 171, 107, 43, 219, 155, 91, 27, 203, 139, 75, 11, + 247, 183, 119, 55, 231, 167, 103, 39, 215, 151, 87, 23, 199, 135, 71, 7, + 243, 179, 115, 51, 227, 163, 99, 35, 211, 147, 83, 19, 195, 131, 67, 3, + 254, 190, 126, 62, 238, 174, 110, 46, 222, 158, 94, 30, 206, 142, 78, 14, + 250, 186, 122, 58, 234, 170, 106, 42, 218, 154, 90, 26, 202, 138, 74, 10, + 246, 182, 118, 54, 230, 166, 102, 38, 214, 150, 86, 22, 198, 134, 70, 6, + 242, 178, 114, 50, 226, 162, 98, 34, 210, 146, 82, 18, 194, 130, 66, 2, + 253, 189, 125, 61, 237, 173, 109, 45, 221, 157, 93, 29, 205, 141, 77, 13, + 249, 185, 121, 57, 233, 169, 105, 41, 217, 153, 89, 25, 201, 137, 73, 9, + 245, 181, 117, 53, 229, 165, 101, 37, 213, 149, 85, 21, 197, 133, 69, 5, + 241, 177, 113, 49, 225, 161, 97, 33, 209, 145, 81, 17, 193, 129, 65, 1, + 252, 188, 124, 60, 236, 172, 108, 44, 220, 156, 92, 28, 204, 140, 76, 12, + 248, 184, 120, 56, 232, 168, 104, 40, 216, 152, 88, 24, 200, 136, 72, 8, + 244, 180, 116, 52, 228, 164, 100, 36, 212, 148, 84, 20, 196, 132, 68, 4, + 240, 176, 112, 48, 224, 160, 96, 32, 208, 144, 80, 16, 192, 128, 64, 0 + }; + public: @@ -109,6 +131,13 @@ class DNA_Utility { return base <= 'T' ? base : (base - ('a' - 'A')); } + + // Returns the reverse completement byte of the 4-mer `byte`; + // both are to be in the `DNA::Base` representation. + static uint8_t reverse_complement(const uint8_t byte) + { + return REVERSE_COMPLEMENT_BYTE[byte]; + } }; From 6ea02a1884e9caa13e18da7f1542e15f62b9b96f Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 18 Mar 2021 21:12:41 -0400 Subject: [PATCH 035/350] Add generic left-shifting of k-mers (fails compilation) --- include/Kmer.hpp | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/include/Kmer.hpp b/include/Kmer.hpp index 84771fb1..581695fd 100644 --- a/include/Kmer.hpp +++ b/include/Kmer.hpp @@ -46,6 +46,10 @@ class Kmer: public DNA_Utility // Right-shifts the collection of the bits at the `kmer_data` array by one base (2-bits). void right_shift(); + // Left-shifts the collection of the bits at the `kmer_data` array by `N` bases (2N-bits). + template void left_shift(char(*)[N != 0] = 0); + template void left_shift(char(*)[N == 0] = 0); + public: @@ -161,6 +165,28 @@ inline void Kmer::right_shift() } +template +template +inline void Kmer::left_shift(char(*)[N != 0]) +{ + // static_assert(N > 0 && N < 4, "invalid bit-shift amount"); + + constexpr uint16_t num_bit_shift = (N > 0 ? 
2 * N : 1); + constexpr uint64_t mask_MSNs = ((static_cast(1) << num_bit_shift) - 1) << (64 - num_bit_shift); + + for(uint16_t idx = NUM_INTS - 1; idx > 0; --idx) + kmer_data[idx] = (kmer_data[idx] << num_bit_shift) | ((kmer_data[idx - 1] & mask_MSNs) >> (64 - num_bit_shift)); + + kmer_data[0] <<= num_bit_shift; +} + + +template +template +inline void Kmer::left_shift(char(*)[N == 0]) +{} + + template inline uint64_t Kmer::to_u64(uint64_t seed) const { From eaa40efac6074efc888f623d0a0d1ea9c92c2ad6 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 18 Mar 2021 22:35:25 -0400 Subject: [PATCH 036/350] Fix left-shifter's compile issue Bjarne plz --- include/Kmer.hpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/include/Kmer.hpp b/include/Kmer.hpp index 581695fd..5ffb5147 100644 --- a/include/Kmer.hpp +++ b/include/Kmer.hpp @@ -46,9 +46,9 @@ class Kmer: public DNA_Utility // Right-shifts the collection of the bits at the `kmer_data` array by one base (2-bits). void right_shift(); - // Left-shifts the collection of the bits at the `kmer_data` array by `N` bases (2N-bits). - template void left_shift(char(*)[N != 0] = 0); - template void left_shift(char(*)[N == 0] = 0); + // Left-shifts the collection of the bits at the `kmer_data` array by `B` bases (2B-bits). + template void left_shift(char(*)[B != 0] = 0); + template void left_shift(char(*)[B == 0] = 0); public: @@ -166,12 +166,12 @@ inline void Kmer::right_shift() template -template -inline void Kmer::left_shift(char(*)[N != 0]) +template +inline void Kmer::left_shift(char(*)[B != 0]) { - // static_assert(N > 0 && N < 4, "invalid bit-shift amount"); + static_assert(B < 32, "invalid bit-shift amount"); - constexpr uint16_t num_bit_shift = (N > 0 ? 2 * N : 1); + constexpr uint16_t num_bit_shift = 2 * B; constexpr uint64_t mask_MSNs = ((static_cast(1) << num_bit_shift) - 1) << (64 - num_bit_shift); for(uint16_t idx = NUM_INTS - 1; idx > 0; --idx) @@ -182,8 +182,8 @@ inline void Kmer::left_shift(char(*)[N != 0]) template -template -inline void Kmer::left_shift(char(*)[N == 0]) +template +inline void Kmer::left_shift(char(*)[B == 0]) {} From 5e28dd867b5a89745593421e5885143112e1b98c Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Sun, 21 Mar 2021 13:33:21 -0400 Subject: [PATCH 037/350] Update random string generator --- include/utility.hpp | 6 ++++-- src/utility.cpp | 23 +++++++---------------- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/include/utility.hpp b/include/utility.hpp index 686e28e3..6c1a90de 100644 --- a/include/utility.hpp +++ b/include/utility.hpp @@ -6,8 +6,10 @@ #include -// Returns a random string of length `len`. -std::string get_random_string(size_t len); +// Returns a random string of length `len`, using characters from `alphabet`. +std::string get_random_string(size_t len, const char* alphabet = "0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz"); // Returns `true` iff `pref` is a prefix of `s`. 
bool is_prefix(const std::string& s, const std::string& pref); diff --git a/src/utility.cpp b/src/utility.cpp index 3a0b1aac..6bb44e76 100644 --- a/src/utility.cpp +++ b/src/utility.cpp @@ -9,24 +9,15 @@ #include -std::string get_random_string(const size_t len) +std::string get_random_string(const size_t len, const char* const alphabet) { - static const char alphabet[] = - "0123456789" - "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz"; - char *s = new char[len + 1]; - - const unsigned seed = time(NULL); - srand(seed); - for (size_t i = 0; i < len; ++i) - s[i] = alphabet[(std::rand() % (sizeof(alphabet) - 1))]; - - s[len] = '\0'; - + std::string str; + str.reserve(len); - const std::string str(s); - delete s; + const unsigned int seed = static_cast(std::time(NULL)); + std::srand(seed); + for (size_t i = 0; i < len; ++i) + str += alphabet[(std::rand() % (sizeof(alphabet) - 1))]; return str; } From 2a2e71ac8f33e0d9ac8e3e25a2dbd3e56c2a531d Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Sun, 21 Mar 2021 16:54:32 -0400 Subject: [PATCH 038/350] Add byte-wise reverse complementing --- include/Kmer.hpp | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/include/Kmer.hpp b/include/Kmer.hpp index 5ffb5147..177899a0 100644 --- a/include/Kmer.hpp +++ b/include/Kmer.hpp @@ -104,6 +104,10 @@ class Kmer: public DNA_Utility // Returns the reverese complement of the k-mer. Kmer reverse_complement() const; + // Gets the k-mer that is the reverse complement of + // the provided k-mer `other`. + void as_reverse_complement(const Kmer& kmer); + // Returns true iff the bitwise encoding of this k-mer is lesser to the // encoding of the other k-mer `rhs`. bool operator<(const Kmer& rhs) const; @@ -348,6 +352,8 @@ inline void Kmer::from_suffix(const Kmer& k_plus_1_mer) template inline Kmer Kmer::reverse_complement() const { + // TODO: define the method using `as_reverse_complement`. + Kmer kmer(*this); Kmer rev_compl; @@ -366,6 +372,38 @@ inline Kmer Kmer::reverse_complement() const } +template +inline void Kmer::as_reverse_complement(const Kmer& other) +{ + // Working with bytes instead of 64-bit words at a time. + + uint8_t* const rev_compl = reinterpret_cast(kmer_data); + const uint8_t* const data = reinterpret_cast(other.kmer_data); + + + // Get the reverse complement for the fully packed bytes. + + constexpr uint16_t packed_byte_count = k / 4; + + for(uint16_t byte_idx = 0; byte_idx < packed_byte_count; ++byte_idx) + rev_compl[packed_byte_count - 1 - byte_idx] = DNA_Utility::reverse_complement(data[byte_idx]); + + + // Get the reverse complement for the only byte that might be partially packed (possible for the highest-indexed byte only). 
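+    // (E.g. with k = 6: the fully-packed byte above contributes the complements of bases 0 through 3 in
+    // reversed order; the 2-base left shift below then frees the two lowest slots for the complements of
+    // bases 5 and 4, taken from the partially-filled highest byte.)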
+ + constexpr uint16_t rem_base_count = (k & 3); + if(rem_base_count == 0) // if constexpr(rem_base_count == 0) // C++17 compile-time optimization + return; + + rev_compl[packed_byte_count] = 0; + left_shift(); + + for(int i = 0; i < rem_base_count; ++i) + rev_compl[0] |= (DNA_Utility::complement(DNA::Base((data[packed_byte_count] & (0b11 << (2 * i))) >> (2 * i))) + << (2 * (rem_base_count - 1 - i))); +} + + template inline bool Kmer::operator<(const Kmer& rhs) const { From ffe1f3d43fad59649280bfd91ccb4d4736ab93d3 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Sun, 21 Mar 2021 17:15:32 -0400 Subject: [PATCH 039/350] Define reverse complement bytes --- src/DNA_Utility.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/DNA_Utility.cpp b/src/DNA_Utility.cpp index 35343fa8..1a54742c 100644 --- a/src/DNA_Utility.cpp +++ b/src/DNA_Utility.cpp @@ -6,3 +6,4 @@ constexpr DNA::Base DNA_Utility::MAPPED_BASE[128]; constexpr DNA::Base DNA_Utility::COMPLEMENTED_BASE[5]; constexpr char DNA_Utility::COMPLEMENTED_CHAR[128]; constexpr bool DNA_Utility::IS_PLACEHOLDER[128]; +constexpr uint8_t DNA_Utility::REVERSE_COMPLEMENT_BYTE[256]; From 7abea627b54ea063e6c28a02a76bc01cef12f982 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 22 Mar 2021 19:00:02 -0400 Subject: [PATCH 040/350] Add random k-mer generator --- include/Kmer.hpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/Kmer.hpp b/include/Kmer.hpp index 177899a0..93c3aca5 100644 --- a/include/Kmer.hpp +++ b/include/Kmer.hpp @@ -5,6 +5,7 @@ #include "DNA_Utility.hpp" +#include "utility.hpp" #include "kmc_api/kmc_file.h" #include "xxHash/xxh3.h" @@ -138,6 +139,9 @@ class Kmer: public DNA_Utility // Returns the string label of the k-mer. std::string string_label() const; + // Returns a randomly generated k-mer. + static Kmer random_kmer(); + // Prints the literal representation of the K-mer `kmer` to the // stream `ostream`. template @@ -516,6 +520,13 @@ inline std::string Kmer::string_label() const } +template +inline Kmer Kmer::random_kmer() +{ + return Kmer(get_random_string(k, "ACGT")); +} + + template std::ostream& operator<<(std::ostream& out, const Kmer& kmer) { From 7ac3b2f4d7e7433de2607b0969b07046a905afe0 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Tue, 30 Mar 2021 16:24:27 -0400 Subject: [PATCH 041/350] Add Base --> Extended_Base mapping --- include/DNA_Utility.hpp | 14 ++++++++++++++ src/DNA_Utility.cpp | 1 + 2 files changed, 15 insertions(+) diff --git a/include/DNA_Utility.hpp b/include/DNA_Utility.hpp index 2f7feafa..5d29d30c 100644 --- a/include/DNA_Utility.hpp +++ b/include/DNA_Utility.hpp @@ -99,6 +99,13 @@ class DNA_Utility 240, 176, 112, 48, 224, 160, 96, 32, 208, 144, 80, 16, 192, 128, 64, 0 }; + // Mapped `DNA::Extended_Base` for the corresponding `DNA::Base`, i.e. + // a mapping from [0(A) — T(3)] to [1(A) — 4(T)]. + static constexpr DNA::Extended_Base MAPPED_EXTENDED_BASE[4] = + { + DNA::Extended_Base::A, DNA::Extended_Base::C, DNA::Extended_Base::G, DNA::Extended_Base::T + }; + public: @@ -138,6 +145,13 @@ class DNA_Utility { return REVERSE_COMPLEMENT_BYTE[byte]; } + + // Returns the mapping `DNA::Extended_Base` representation of the + // `DNA::Base` representation `base`. 
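+    // (Note the +1 offset: the concrete bases occupy the values 1 to 4 of `DNA::Extended_Base`, leaving
+    // the value 0 for an encoding that does not denote a real base.)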
+ static DNA::Extended_Base map_extended_base(const DNA::Base base) + { + return MAPPED_EXTENDED_BASE[base]; + } }; diff --git a/src/DNA_Utility.cpp b/src/DNA_Utility.cpp index 1a54742c..e3158554 100644 --- a/src/DNA_Utility.cpp +++ b/src/DNA_Utility.cpp @@ -7,3 +7,4 @@ constexpr DNA::Base DNA_Utility::COMPLEMENTED_BASE[5]; constexpr char DNA_Utility::COMPLEMENTED_CHAR[128]; constexpr bool DNA_Utility::IS_PLACEHOLDER[128]; constexpr uint8_t DNA_Utility::REVERSE_COMPLEMENT_BYTE[256]; +constexpr DNA::Extended_Base DNA_Utility::MAPPED_EXTENDED_BASE[4]; From ad10ff29c91590ac3baa2fa0956b673fb0545cc2 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Tue, 30 Mar 2021 17:01:58 -0400 Subject: [PATCH 042/350] Add edge instance class --- include/Edge.hpp | 102 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 include/Edge.hpp diff --git a/include/Edge.hpp b/include/Edge.hpp new file mode 100644 index 00000000..f5ac84ae --- /dev/null +++ b/include/Edge.hpp @@ -0,0 +1,102 @@ + +#ifndef EDGE_HPP +#define EDGE_HPP + + + + +#include "globals.hpp" + +#include + + +// Class for an instance of a bidirected edge. +// NB: for some (k + 1)-mer `e`, `e` and `e_bar` denote the same bidirected edge `e_hat`; +// but these being different (k + 1)-mers, they are treated as different instances of the +// same edge. Semantically, the underlying edges are the same. This edge instance is in the +// tuple form `(u, s_\hat{u}, v, s_\hat{v})`. +template +class Edge +{ +private: + + Kmer e_; // The edge (k + 1)-mer (need not be in canonical form). + Kmer u_; // One endpoint of the edge `e_hat` — source endpoint of the `e` form. + Kmer v_; // One endpoint of the edge `e_hat` — sink endpoint of the `e` form. + Kmer u_hat_; // Canonical form of `u`. + Kmer v_hat_; // Canonical form of `v`. + cuttlefish::side_t s_u_hat_; // The side of the vertex `u_hat` to which this edge instance is incident to. + cuttlefish::side_t s_v_hat_; // The side of the vertex `v_hat` to which this edge instance is incident to. + + +public: + + // Returns a mutable reference to the edge (k + 1)-mer. + Kmer& e(); + + // Reconfigures the edge data. i.e. sets the relevant information of + // the edge from the underlying (k + 1)-mer. Must be used whenever the + // edge (k + 1)-mer (updatable using `e()`) is modified. + void reconfigure(); + + // Returns `true` iff the edge is a loop. + bool is_loop() const; + + // Returns the `DNA::Extended_Base` base-encoding of the underlying edge, + // from the point-of-view of the vertex `u_hat`. + cuttlefish::edge_encoding_t edge_encoding_u() const; + + // Returns the `DNA::Extended_Base` base-encoding of the underlying edge, + // from the point-of-view of the vertex `v_hat`. + cuttlefish::edge_encoding_t edge_encoding_v() const; +}; + + +template +inline Kmer& Edge::e() +{ + return e_; +} + + +template +inline void Edge::reconfigure() +{ + u_.from_prefix(e_), + v_.from_suffix(e_); + + u_hat_.as_reverse_complement(u_), + v_hat_.as_reverse_complement(v_); + + s_u_hat_ = (u_ == u_hat_ ? cuttlefish::side_t::back : cuttlefish::side_t::front); + s_v_hat_ = (v_ == v_hat_ ? cuttlefish::side_t::front : cuttlefish::side_t::back); +} + + +template +inline bool Edge::is_loop() const +{ + return u_hat_ == v_hat_; +} + + +template +inline cuttlefish::edge_encoding_t Edge::edge_encoding_u() const +{ + const DNA::Base base = (s_u_hat_ == cuttlefish::side_t::back ? 
e_.back() : DNA_Utility::complement(e_.back())); + + return DNA_Utility::map_extended_base(base); +} + + +template +inline cuttlefish::edge_encoding_t Edge::edge_encoding_v() const +{ + const DNA::Base base = (s_v_hat_ == cuttlefish::side_t::front ? e_.front() : DNA_Utility::complement(e_.front())); + + return DNA_Utility::map_extended_base(base); +} + + + +#endif From 9db4f3f5b4dce27f080377781fd5db64792d8248 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Tue, 30 Mar 2021 17:02:59 -0400 Subject: [PATCH 043/350] Commit missed stuffs --- include/Kmer.hpp | 33 +++++++++++++++++++++++++++++++++ include/State_Read_Space.hpp | 4 ++-- include/globals.hpp | 1 + 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/include/Kmer.hpp b/include/Kmer.hpp index 93c3aca5..4f7b521f 100644 --- a/include/Kmer.hpp +++ b/include/Kmer.hpp @@ -119,6 +119,16 @@ class Kmer: public DNA_Utility // Returns true iff this k-mer is not identical to the other k-mer `rhs`. bool operator!=(const Kmer& rhs) const; + // Returns the `DNA::Base` (2-bit) encoding of the character at the front, + // i.e. at the first index of the literal representation. For a k-mer + // `n_{k - 1} ... n_1 n_0`, this is the base `n_{k - 1}`. + DNA::Base front() const; + + // Returns the `DNA::Base` (2-bit) encoding of the character at the back, + // i.e. at the last index of the literal representation. For a k-mer + // `n_{k - 1} ... n_1 n_0`, this is the base `n_0`. + DNA::Base back() const; + // Returns `true` iff the k-mer is in the forward direction relative to // the other k-mer `kmer_hat`. bool in_forward(const Kmer& kmer_hat) const; @@ -437,6 +447,29 @@ inline bool Kmer::operator!=(const Kmer& rhs) const } +template +inline DNA::Base Kmer::front() const +{ + // Relative index of the most significant nucleotide in it's 64-bit word. + constexpr uint16_t rel_idx_MSN = 2 * ((k - 1) % 32); + + // Mask to extract the most significant nucelotide. + constexpr uint64_t mask_MSN = (static_cast(0b11) << rel_idx_MSN); + + return DNA::Base((kmer_data[NUM_INTS - 1] & mask_MSN) >> rel_idx_MSN); +} + + +template +inline DNA::Base Kmer::back() const +{ + // Mask to extract the least significant nucleotide. + constexpr uint64_t mask_LSN = static_cast(0b11); + + return DNA::Base(kmer_data[0] & mask_LSN); +} + + template inline bool Kmer::in_forward(const Kmer& kmer_hat) const { diff --git a/include/State_Read_Space.hpp b/include/State_Read_Space.hpp index 4b7c4826..6a74a8a3 100644 --- a/include/State_Read_Space.hpp +++ b/include/State_Read_Space.hpp @@ -14,8 +14,8 @@ template class Kmer_Hash_Entry_API; class State_Read_Space { friend class Kmer_Hash_Entry_API; - - typedef DNA::Extended_Base edge_encoding_t; + + typedef DNA::Extended_Base edge_encoding_t; // TODO: replace this with `cuttlefish::edge_encoding_t`. 
private: diff --git a/include/globals.hpp b/include/globals.hpp index c497c3b8..049f5cea 100644 --- a/include/globals.hpp +++ b/include/globals.hpp @@ -42,6 +42,7 @@ namespace cuttlefish typedef bool dir_t; typedef DNA::Base base_t; + typedef DNA::Extended_Base edge_encoding_t; typedef uint8_t state_code_t; From 4aee613c230d7d7092c1da835f2e0fde6f6d3d56 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 1 Apr 2021 12:38:48 -0400 Subject: [PATCH 044/350] Add more access methods for edge instances --- include/Edge.hpp | 60 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 53 insertions(+), 7 deletions(-) diff --git a/include/Edge.hpp b/include/Edge.hpp index f5ac84ae..402506ec 100644 --- a/include/Edge.hpp +++ b/include/Edge.hpp @@ -21,8 +21,8 @@ class Edge private: Kmer e_; // The edge (k + 1)-mer (need not be in canonical form). - Kmer u_; // One endpoint of the edge `e_hat` — source endpoint of the `e` form. - Kmer v_; // One endpoint of the edge `e_hat` — sink endpoint of the `e` form. + Kmer u_; // One endpoint k-mer of this edge instance — source k-mer of the `e` form. + Kmer v_; // One endpoint k-mer of this edge instance — sink k-mer of the `e` form. Kmer u_hat_; // Canonical form of `u`. Kmer v_hat_; // Canonical form of `v`. cuttlefish::side_t s_u_hat_; // The side of the vertex `u_hat` to which this edge instance is incident to. @@ -34,14 +34,32 @@ class Edge // Returns a mutable reference to the edge (k + 1)-mer. Kmer& e(); - // Reconfigures the edge data. i.e. sets the relevant information of - // the edge from the underlying (k + 1)-mer. Must be used whenever the - // edge (k + 1)-mer (updatable using `e()`) is modified. - void reconfigure(); + // Configures the edge data. i.e. sets the relevant information of the + // edge from the underlying (k + 1)-mer. Must be used whenever the edge + // (k + 1)-mer (updatable using `e()`) is modified. + void configure(); // Returns `true` iff the edge is a loop. bool is_loop() const; + // Returns the vertex (i.e. canonical k-mer) `u_hat` — which corresponds + // to the source endpoint k-mer `u` of this edge instance `e`. + const Kmer& u_hat() const; + + // Returns the vertex (i.e. canonical k-mer) `v_hat` — which corresponds + // to the sink endpoint k-mer `v` of this edge instance `e`. + const Kmer& v_hat() const; + + // Returns the side of the vertex (i.e. canonical k-mer) `u_hat` — which + // corresponds to the source endpoint k-mer `u` of this edge instance `e` + // — to which this edge instance `e` is incident to. + cuttlefish::side_t s_u_hat() const; + + // Returns the side of the vertex (i.e. canonical k-mer) `v_hat` — which + // corresponds to the sink endpoint k-mer `v` of this edge instance `e` + // — to which this edge instance `e` is incident to. + cuttlefish::side_t s_v_hat() const; + // Returns the `DNA::Extended_Base` base-encoding of the underlying edge, // from the point-of-view of the vertex `u_hat`. 
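    // (A concrete example with k = 3: for the edge instance e = ACGG, u = ACG is already canonical, so the
    // instance is incident to the back of u_hat = ACG with the encoding of 'G' = e.back(); v = CGG has the
    // canonical form v_hat = CCG, so the instance is incident to the back of v_hat with the encoding of
    // complement('A') = 'T'.)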
cuttlefish::edge_encoding_t edge_encoding_u() const; @@ -60,7 +78,7 @@ inline Kmer& Edge::e() template -inline void Edge::reconfigure() +inline void Edge::configure() { u_.from_prefix(e_), v_.from_suffix(e_); @@ -80,6 +98,34 @@ inline bool Edge::is_loop() const } +template +inline const Kmer& Edge::u_hat() const +{ + return u_hat_; +} + + +template +inline const Kmer& Edge::v_hat() const +{ + return v_hat_; +} + + +template +inline cuttlefish::side_t Edge::s_u_hat() const +{ + return s_u_hat_; +} + + +template +inline cuttlefish::side_t Edge::s_v_hat() const +{ + return s_v_hat_; +} + + template inline cuttlefish::edge_encoding_t Edge::edge_encoding_u() const { From 6697f4f8f694db679017213af8833363268010f3 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 1 Apr 2021 17:00:06 -0400 Subject: [PATCH 045/350] Note high-priority TODO --- include/Kmer.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/Kmer.hpp b/include/Kmer.hpp index 4f7b521f..6adc9d5d 100644 --- a/include/Kmer.hpp +++ b/include/Kmer.hpp @@ -38,6 +38,11 @@ class Kmer: public DNA_Utility // A k-mer `n_{k - 1} ... n_1 n_0` is stored in the array `kmer_data` such that, `kmer_data[0]` // stores the suffix `n_63 ... n_0`, then `kmer_data[1]` stores `n_127 ... n_64`, and so on. // That is, the suffix is aligned with a byte boundary. + // TODO: reverse this store-order of the data — i.e. `n_{k - 1}` as the least significant base + // (so stored in `kmer_data[0]`) and `n_0` as the most significant base (so stored in `kmer_data[0]`). + // This would optimize at least the following: + // i) `from_KMC_data` () — aligning with the KMC data alignment and thus `memcpy` instead of bit-twiddling; + // ii) `operator<` — `memcmp` instead of highest-to-lowest index looping comparison. uint64_t kmer_data[NUM_INTS]; From 0fda8278f91fedc459f95d83e66779328538bcca Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 1 Apr 2021 17:54:45 -0400 Subject: [PATCH 046/350] Fix semantic bug --- include/Edge.hpp | 25 +++++++++++++++---------- include/Kmer.hpp | 12 ++++++++++++ 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/include/Edge.hpp b/include/Edge.hpp index 402506ec..dd3077e7 100644 --- a/include/Edge.hpp +++ b/include/Edge.hpp @@ -12,7 +12,7 @@ // Class for an instance of a bidirected edge. // NB: for some (k + 1)-mer `e`, `e` and `e_bar` denote the same bidirected edge `e_hat`; -// but these being different (k + 1)-mers, they are treated as different instances of the +// but these being different (k + 1)-mers, they are treated as different *instances* of the // same edge. Semantically, the underlying edges are the same. This edge instance is in the // tuple form `(u, s_\hat{u}, v, s_\hat{v})`. template @@ -23,8 +23,10 @@ class Edge Kmer e_; // The edge (k + 1)-mer (need not be in canonical form). Kmer u_; // One endpoint k-mer of this edge instance — source k-mer of the `e` form. Kmer v_; // One endpoint k-mer of this edge instance — sink k-mer of the `e` form. - Kmer u_hat_; // Canonical form of `u`. - Kmer v_hat_; // Canonical form of `v`. + Kmer u_bar_; // Reverse complement of `u`. + Kmer v_bar_; // Reverse complement of `v`. + const Kmer* u_hat_ptr; // Pointer to the canonical form of the k-mer `u`, i.e. ptr to `min(u, u_bar)`. + const Kmer* v_hat_ptr; // Pointer to the canonical form of the k-mer `v`, i.e. ptr to `min(v, v_bar)`. cuttlefish::side_t s_u_hat_; // The side of the vertex `u_hat` to which this edge instance is incident to. 
cuttlefish::side_t s_v_hat_; // The side of the vertex `v_hat` to which this edge instance is incident to. @@ -83,32 +85,35 @@ inline void Edge::configure() u_.from_prefix(e_), v_.from_suffix(e_); - u_hat_.as_reverse_complement(u_), - v_hat_.as_reverse_complement(v_); + u_bar_.as_reverse_complement(u_), + v_bar_.as_reverse_complement(v_); - s_u_hat_ = (u_ == u_hat_ ? cuttlefish::side_t::back : cuttlefish::side_t::front); - s_v_hat_ = (v_ == v_hat_ ? cuttlefish::side_t::front : cuttlefish::side_t::back); + u_hat_ptr = Kmer::canonical(u_, u_bar_), + v_hat_ptr = Kmer::canonical(v_, v_bar_); + + s_u_hat_ = (&u_ == u_hat_ptr ? cuttlefish::side_t::back : cuttlefish::side_t::front); + s_v_hat_ = (&v_ == v_hat_ptr ? cuttlefish::side_t::front : cuttlefish::side_t::back); } template inline bool Edge::is_loop() const { - return u_hat_ == v_hat_; + return *u_hat_ptr == *v_hat_ptr; } template inline const Kmer& Edge::u_hat() const { - return u_hat_; + return *u_hat_ptr; } template inline const Kmer& Edge::v_hat() const { - return v_hat_; + return *v_hat_ptr; } diff --git a/include/Kmer.hpp b/include/Kmer.hpp index 6adc9d5d..ad954732 100644 --- a/include/Kmer.hpp +++ b/include/Kmer.hpp @@ -151,6 +151,11 @@ class Kmer: public DNA_Utility // Returns the canonical version of the k-mer. Kmer canonical() const; + // Given a k-mer `kmer` and its reverse complement `rev_compl`, + // returns a pointer to one of these, which represents the + // canonical form. + static const Kmer* canonical(const Kmer& kmer, const Kmer& rev_compl); + // Returns the string label of the k-mer. std::string string_label() const; @@ -513,6 +518,13 @@ inline Kmer Kmer::canonical() const } +template +inline const Kmer* Kmer::canonical(const Kmer& kmer, const Kmer& rev_compl) +{ + return kmer < rev_compl ? &kmer : &rev_compl; +} + + template inline std::string Kmer::string_label() const { From dc42640f5b1a8e09e1516408f61dc9c40b8965a3 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 1 Apr 2021 18:22:25 -0400 Subject: [PATCH 047/350] Add DFA-states computation --- include/Read_CdBG_Constructor.hpp | 102 ++++++++++++++++++++++++++++++ include/State_Read_Space.hpp | 9 +++ src/Read_CdBG_Constructor.cpp | 15 ++++- 3 files changed, 124 insertions(+), 2 deletions(-) diff --git a/include/Read_CdBG_Constructor.hpp b/include/Read_CdBG_Constructor.hpp index 51a6188f..28a7c5a1 100644 --- a/include/Read_CdBG_Constructor.hpp +++ b/include/Read_CdBG_Constructor.hpp @@ -6,6 +6,7 @@ #include "globals.hpp" #include "Kmer_Hash_Table.hpp" +#include "State_Read_Space.hpp" #include "Build_Params.hpp" #include "Thread_Pool.hpp" #include "Kmer_Container.hpp" @@ -37,6 +38,26 @@ class Read_CdBG_Constructor // the DFA as per the edges provided to that thread. void process_edges(uint16_t thread_id); + // For the vertex `v`, adds information of the incidence of an `e_v`-encoded edge to its side `s_v` + // — making the appropriate state transition for the DFA of `v`. Returns `false` iff an attempted + // state transition failed. + bool add_incident_edge(const Kmer& v, cuttlefish::side_t s_v, cuttlefish::edge_encoding_t e_v); + + // For the vertex `v`, adds information of the incidence of a looping edge that connects its side + // `s_u` to its side `s_v`, which may or may not be the same sides — making the appropriate state + // transition for the DFA of `v`. Returns `false` iff an attempted state transition failed. 
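+    // (For instance, with k = 3 the edge instance ACGT has u = ACG and v = CGT, each other's reverse
+    // complements; both endpoints thus resolve to the vertex ACG, and both to its back side, giving a
+    // one-sided loop. When the two sides differ, the loop is a crossing one.)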
+ bool add_loop(const Kmer& u, cuttlefish::side_t s_u, cuttlefish::side_t s_v); + + // For the vertex `v`, adds information of the incidence of a loop that connects its different + // sides — making the appropriate state transition for the DFA of `v`. Returns `false` iff an + // attempted state transition failed. + bool add_crossing_loop(const Kmer& v); + + // For the vertex `v`, adds information of the incidence of a loop that connects its side `s_v` + // to that side itself — making the appropriate state transition for the DFA of `v`. Returns + // `false` iff an attempted state transition failed. + bool add_one_sided_loop(const Kmer& v, cuttlefish::side_t s_v); + public: @@ -49,5 +70,86 @@ class Read_CdBG_Constructor }; +template +inline bool Read_CdBG_Constructor::add_incident_edge(const Kmer& v, const cuttlefish::side_t s_v, const cuttlefish::edge_encoding_t e_v) +{ + // Fetch the hash table entry for the DFA of the vertex `v`. + + Kmer_Hash_Entry_API bucket = hash_table[v]; + State_Read_Space& state = bucket.get_state(); + cuttlefish::edge_encoding_t e_curr = state.edge_at(s_v); + + // If we've already discarded the incidence information for this side, then a self-transition happens. + if(e_curr == cuttlefish::edge_encoding_t::N) + return true; // Early return w/o updating the same value again is safe — see the note at the end of the method. + + + const cuttlefish::edge_encoding_t e_old = e_curr; + if(e_curr == cuttlefish::edge_encoding_t::E) // This side of the vertex is encountered for the first time. + e_curr = e_v; + else if(e_curr != e_v) // This side has been visited earlier with a different edge — discard the incidence information. + e_curr = cuttlefish::edge_encoding_t::N; + + + // We can get away without updating the same value again, because — (1) even if this DFA's state changes + // in the hash table by the time this method completes, making no updates at this point is theoretically + // equivalent to returning instantaneously as soon as the hash table value had been read; and also (2) the + // ordering of the edges processed does not matter in the algorithm. + if(e_curr == e_old) + return true; + + state.update_edge_at(s_v, e_curr); + return hash_table.update(bucket); +} + + +template +inline bool Read_CdBG_Constructor::add_loop(const Kmer& v, const cuttlefish::side_t s_u, const cuttlefish::side_t s_v) +{ + return s_u == s_v ? add_one_sided_loop(v, s_u) : add_crossing_loop(v); +} + + +template +inline bool Read_CdBG_Constructor::add_crossing_loop(const Kmer& v) +{ + // Fetch the hash table entry for the DFA of the vertex `v`. + + Kmer_Hash_Entry_API bucket = hash_table[v]; + State_Read_Space& state = bucket.get_state(); + const cuttlefish::edge_encoding_t e_front = state.edge_at(cuttlefish::side_t::front); + const cuttlefish::edge_encoding_t e_back = state.edge_at(cuttlefish::side_t::back); + + const State_Read_Space state_old = state; + + if(e_front != cuttlefish::edge_encoding_t::N) + state.update_edge_at(cuttlefish::side_t::front, cuttlefish::edge_encoding_t::N); + + if(e_back != cuttlefish::edge_encoding_t::N) + state.update_edge_at(cuttlefish::side_t::back, cuttlefish::edge_encoding_t::N); + + // We can get away without updating the same value again: see detailed comment in `add_incident_edge`. + return state == state_old ? true : hash_table.update(bucket); +} + + +template +inline bool Read_CdBG_Constructor::add_one_sided_loop(const Kmer& v, const cuttlefish::side_t s_v) +{ + // Fetch the hash table entry for the vertex `v`. 
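+    // (As in the other transition methods, the concluding `hash_table.update` may fail if another thread
+    // has modified this entry after it was read here; callers simply retry the whole transition, as the
+    // `while(!add_...)` loops in `process_edges` do.)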
+ + Kmer_Hash_Entry_API bucket = hash_table[v]; + State_Read_Space& state = bucket.get_state(); + const cuttlefish::edge_encoding_t e_v = state.edge_at(s_v); + + // We can get away without updating the same value again: see detailed comment in `add_incident_edge`. + if(e_v == cuttlefish::edge_encoding_t::N) + return true; + + state.update_edge_at(s_v, cuttlefish::edge_encoding_t::N); + return hash_table.update(bucket); +} + + #endif diff --git a/include/State_Read_Space.hpp b/include/State_Read_Space.hpp index 6a74a8a3..40b9b166 100644 --- a/include/State_Read_Space.hpp +++ b/include/State_Read_Space.hpp @@ -70,6 +70,9 @@ class State_Read_Space // `edge`. For optimization purposes, only certain edge-updates have defined // behavior: empty-to-rest and unique-to-multi. void update_edge_at(cuttlefish::side_t side, edge_encoding_t edge); + + // Returns `true` iff the underlying code is the same as that one of `rhs`. + bool operator==(const State_Read_Space& rhs) const; }; @@ -119,5 +122,11 @@ inline void State_Read_Space::update_edge_at(const cuttlefish::side_t side, cons } +inline bool State_Read_Space::operator==(const State_Read_Space& rhs) const +{ + return code == rhs.code; +} + + #endif diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index a60d82ed..6405d1e1 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -1,5 +1,6 @@ #include "Read_CdBG_Constructor.hpp" +#include "Edge.hpp" #include "chrono" @@ -62,12 +63,22 @@ void Read_CdBG_Constructor::distribute_states_computation(Thread_Pool& thr template void Read_CdBG_Constructor::process_edges(const uint16_t thread_id) { - Kmer edge; + Edge e; uint64_t edge_count = 0; while(edge_parser.tasks_expected(thread_id)) - if(edge_parser.value_at(thread_id, edge)) + if(edge_parser.value_at(thread_id, e.e())) { + e.configure(); // A new edge (k + 1)-mer has been parsed; set the relevant k-mer and sides information. + + if(e.is_loop()) + while(!add_loop(e.u_hat(), e.s_u_hat(), e.s_v_hat())); + else + { + while(!add_incident_edge(e.u_hat(), e.s_u_hat(), e.edge_encoding_u())); + while(!add_incident_edge(e.v_hat(), e.s_v_hat(), e.edge_encoding_v())); + } + edge_count++; } From 63d0bb8439d0ab84eb0dcc117c7bb56bdba26842 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 5 Apr 2021 11:53:39 -0400 Subject: [PATCH 048/350] Collapse common type --- include/State_Read_Space.hpp | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/include/State_Read_Space.hpp b/include/State_Read_Space.hpp index 40b9b166..de31cc90 100644 --- a/include/State_Read_Space.hpp +++ b/include/State_Read_Space.hpp @@ -14,8 +14,6 @@ template class Kmer_Hash_Entry_API; class State_Read_Space { friend class Kmer_Hash_Entry_API; - - typedef DNA::Extended_Base edge_encoding_t; // TODO: replace this with `cuttlefish::edge_encoding_t`. private: @@ -44,11 +42,11 @@ class State_Read_Space // Sets the back-encoding of the state to the `Extended_Base`-encoding `edge`. // Requirement: except while for setting `Extended_Base::N`, the bits must be zero beforehand. - void set_back_encoding(edge_encoding_t edge); + void set_back_encoding(cuttlefish::edge_encoding_t edge); // Sets the front-encoding of the state to the `Extended_Base`-encoding `edge`. // Requirement: except while for setting `Extended_Base::N`, the bits must be zero beforehand. - void set_front_encoding(edge_encoding_t edge); + void set_front_encoding(cuttlefish::edge_encoding_t edge); // Returns the wrapped state-code value. 
cuttlefish::state_code_t get_state() const; @@ -64,12 +62,12 @@ class State_Read_Space // Returns the `Extended_Base`-encoding of the edge(s) incident to the side // `side` of a vertex having this state. - edge_encoding_t edge_at(cuttlefish::side_t side) const; + cuttlefish::edge_encoding_t edge_at(cuttlefish::side_t side) const; // Updates the `Extended_Base` encoding of the side `side` of this state, with // `edge`. For optimization purposes, only certain edge-updates have defined // behavior: empty-to-rest and unique-to-multi. - void update_edge_at(cuttlefish::side_t side, edge_encoding_t edge); + void update_edge_at(cuttlefish::side_t side, cuttlefish::edge_encoding_t edge); // Returns `true` iff the underlying code is the same as that one of `rhs`. bool operator==(const State_Read_Space& rhs) const; @@ -77,7 +75,7 @@ class State_Read_Space inline constexpr State_Read_Space::State_Read_Space(): - code{(static_cast(edge_encoding_t::E) << FRONT_IDX) | static_cast(edge_encoding_t::E)} + code{(static_cast(cuttlefish::edge_encoding_t::E) << FRONT_IDX) | static_cast(cuttlefish::edge_encoding_t::E)} {} @@ -86,13 +84,13 @@ inline State_Read_Space::State_Read_Space(const cuttlefish::state_code_t code): {} -inline void State_Read_Space::set_back_encoding(edge_encoding_t edge) +inline void State_Read_Space::set_back_encoding(cuttlefish::edge_encoding_t edge) { code |= (static_cast(edge) << BACK_IDX); } -inline void State_Read_Space::set_front_encoding(edge_encoding_t edge) +inline void State_Read_Space::set_front_encoding(cuttlefish::edge_encoding_t edge) { code |= (static_cast(edge) << FRONT_IDX); } @@ -110,13 +108,13 @@ inline bool State_Read_Space::is_outputted() const } -inline State_Read_Space::edge_encoding_t State_Read_Space::edge_at(const cuttlefish::side_t side) const +inline cuttlefish::edge_encoding_t State_Read_Space::edge_at(const cuttlefish::side_t side) const { - return static_cast(side == cuttlefish::side_t::front ? (code & FRONT_MASK) >> FRONT_IDX : (code & BACK_MASK) >> BACK_IDX); + return static_cast(side == cuttlefish::side_t::front ? (code & FRONT_MASK) >> FRONT_IDX : (code & BACK_MASK) >> BACK_IDX); } -inline void State_Read_Space::update_edge_at(const cuttlefish::side_t side, const edge_encoding_t edge) +inline void State_Read_Space::update_edge_at(const cuttlefish::side_t side, const cuttlefish::edge_encoding_t edge) { side == cuttlefish::side_t::front ? 
set_front_encoding(edge) : set_back_encoding(edge); } From c2a49cb677ccffd28c9c7016e9ca54c757462830 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Tue, 6 Apr 2021 18:20:36 -0400 Subject: [PATCH 049/350] Add skeleton extractor for CdBG vertices --- include/Read_CdBG_Extractor.hpp | 54 ++++++++++++++++++++++ include/Task_Params.hpp | 8 ++-- include/Thread_Pool.hpp | 8 ++-- src/CMakeLists.txt | 1 + src/Read_CdBG_Constructor.cpp | 2 +- src/Read_CdBG_Extractor.cpp | 81 +++++++++++++++++++++++++++++++++ src/Thread_Pool.cpp | 16 +++++-- 7 files changed, 157 insertions(+), 13 deletions(-) create mode 100644 include/Read_CdBG_Extractor.hpp create mode 100644 src/Read_CdBG_Extractor.cpp diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp new file mode 100644 index 00000000..0bbbfea8 --- /dev/null +++ b/include/Read_CdBG_Extractor.hpp @@ -0,0 +1,54 @@ + +#ifndef READ_CDBG_EXTRACTPR_HPP +#define READ_CDBG_EXTRACTOR_HPP + + + +#include "globals.hpp" +#include "Kmer_Hash_Table.hpp" +#include "Kmer_Container.hpp" +#include "Kmer_SPMC_Iterator.hpp" +#include "Build_Params.hpp" +#include "Spin_Lock.hpp" +#include "Thread_Pool.hpp" + + +// A class to extract the vertices from a compacted de Bruin graph — which are the maximal unitigs of some ordinary de Bruijn graph. +template +class Read_CdBG_Extractor +{ + friend class Thread_Pool; + +private: + + const Build_Params params; // Required parameters (wrapped inside). + Kmer_Hash_Table& hash_table; // Hash table for the vertices (i.e. canonical k-mers) of the original (uncompacted) de Bruijn graph. + const Kmer_Container vertex_container; // Wrapper container for the vertex-database. + Kmer_SPMC_Iterator vertex_parser; // Parser for the vertices from the vertex-database. + + // Members required to keep track of the total number of vertices processed across different worker (i.e. extractor) threads. + mutable Spin_Lock lock; + mutable uint64_t vertices_processed = 0; + + + // Distributes the maximal unitigs extraction task to the worker threads in the thread pool `thread_pool`. + void distribute_unipaths_extraction(Thread_Pool& thread_pool); + + // Processes the vertices provided to the thread with id `thread_id`, i.e. builds the maximal unitigs from + // the flanking vertices provided to that thread. + void process_vertices(uint16_t thread_id); + + +public: + + // Constructs a vertex-extractor object for some compacted read de Bruijn graph, with the required + // parameters wrapped inside `params`, and uses the Cuttlefish hash table `hash_table`. + Read_CdBG_Extractor(const Build_Params& params, Kmer_Hash_Table& hash_table); + + // Extracts the maximal unitigs of the de Bruijn graph. + void extract_maximal_unitigs(); +}; + + + +#endif diff --git a/include/Task_Params.hpp b/include/Task_Params.hpp index f9f8561c..e9cd8b1d 100644 --- a/include/Task_Params.hpp +++ b/include/Task_Params.hpp @@ -45,15 +45,15 @@ struct Output_Task_Params }; -// Wrapper over the parameters for the DFA-states computation task for read-dBGs. -struct Compute_States_Read_Space_Params +// Wrapper over the parameters for the DFA-states computation and the maximal unitigs extraction tasks for read-dBGs. 
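+// (Both tasks need only the id of the worker thread they run on, hence this single one-field wrapper,
+// which replaces the earlier `Compute_States_Read_Space_Params`.)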
+struct Read_dBG_Compaction_Params { uint16_t thread_id; - Compute_States_Read_Space_Params() {} + Read_dBG_Compaction_Params() {} - Compute_States_Read_Space_Params(const uint16_t thread_id): + Read_dBG_Compaction_Params(const uint16_t thread_id): thread_id(thread_id) {} }; diff --git a/include/Thread_Pool.hpp b/include/Thread_Pool.hpp index 6906cec5..76645de4 100644 --- a/include/Thread_Pool.hpp +++ b/include/Thread_Pool.hpp @@ -31,6 +31,7 @@ class Thread_Pool output_gfa, output_gfa_reduced, compute_states_read_space, + extract_unipaths_read_space, }; @@ -63,7 +64,7 @@ class Thread_Pool // Collection of the task parameters for each thread. std::vector classify_params; std::vector output_params; - std::vector compute_states_read_space_params; + std::vector read_dBG_compaction_params; // Marks the thread number `thread_id` as busy with some task. @@ -80,7 +81,6 @@ class Thread_Pool public: - // Constructs a thread pool with `thread_count` number of threads to operate // on the de Brujin graph `dBG` for tasks of type `task_type`. Thread_Pool(uint16_t thread_count, void* dBG, Task_Type task_type); @@ -97,8 +97,8 @@ class Thread_Pool // Assigns an outputting task to the thread number `thread_id` with the provided parameters. void assign_output_task(uint16_t thread_id, const char* seq, size_t seq_len, size_t left_end, size_t right_end); - // Assigns a DFA-states computation task to the thread number `thread_id`. - void assign_compute_states_read_space_task(uint16_t thread_id); + // Assigns a read-dBG compaction task (either DFA-states computation or maximal unitigs extraction) to the thread number `thread_id`. + void assign_read_dBG_compaction_task(uint16_t thread_id); // Waits until all the threads in the pool have completed their active tasks. void wait_completion() const; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1dfce584..0a954661 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -22,6 +22,7 @@ set(PROJECT_SRC CdBG_GFA_Reduced_Writer.cpp Read_CdBG.cpp Read_CdBG_Constructor.cpp + Read_CdBG_Extractor.cpp Validator.cpp Validator_Hash_Table.cpp Sequence_Validator.cpp diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index 6405d1e1..b5a67fa4 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -55,7 +55,7 @@ void Read_CdBG_Constructor::distribute_states_computation(Thread_Pool& thr for(uint16_t t_id = 0; t_id < thread_count; ++t_id) { const uint16_t idle_thread_id = thread_pool.get_idle_thread(); - thread_pool.assign_compute_states_read_space_task(idle_thread_id); + thread_pool.assign_read_dBG_compaction_task(idle_thread_id); } } diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp new file mode 100644 index 00000000..8d747ca6 --- /dev/null +++ b/src/Read_CdBG_Extractor.cpp @@ -0,0 +1,81 @@ + +#include "Read_CdBG_Extractor.hpp" + + +template +Read_CdBG_Extractor::Read_CdBG_Extractor(const Build_Params& params, Kmer_Hash_Table& hash_table): + params(params), + hash_table(hash_table), + vertex_container(params.vertex_db_path()), + vertex_parser(&vertex_container, params.thread_count()) +{ + std::cout << "Number of distinct vertices: " << vertex_container.size() << ".\n"; +} + + +template +void Read_CdBG_Extractor::extract_maximal_unitigs() +{ + std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); + + + // Construct a thread pool. 
+ const uint16_t thread_count = params.thread_count(); + Thread_Pool thread_pool(thread_count, this, Thread_Pool::Task_Type::extract_unipaths_read_space); + + // Launch the reading (and parsing per demand) of the vertices from disk. + vertex_parser.launch_production(); + + // Launch (multi-thread) extraction of the maximal unitigs. + distribute_unipaths_extraction(thread_pool); + + // Wait for the vertices to be deplted from the database. + vertex_parser.seize_production(); + + // Wait for the consumer threads to finish parsing and processing edges. + thread_pool.close(); + + std::cout << "Number of processed vertices: " << vertices_processed << ".\n"; + + + std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); + double elapsed_seconds = std::chrono::duration_cast>(t_end - t_start).count(); + std::cout << "Done extracting the maximal unitigs. Time taken = " << elapsed_seconds << " seconds.\n"; +} + + +template +void Read_CdBG_Extractor::distribute_unipaths_extraction(Thread_Pool& thread_pool) +{ + const uint16_t thread_count = params.thread_count(); + + for(uint16_t t_id = 0; t_id < thread_count; ++t_id) + { + const uint16_t idle_thread_id = thread_pool.get_idle_thread(); + thread_pool.assign_read_dBG_compaction_task(idle_thread_id); + } +} + + +template +void Read_CdBG_Extractor::process_vertices(const uint16_t thread_id) +{ + Kmer v; + uint64_t vertex_count = 0; + + while(vertex_parser.tasks_expected(thread_id)) + if(vertex_parser.value_at(thread_id, v)) + { + vertex_count++; + } + + lock.lock(); + std::cout << "Thread " << thread_id << " processed " << vertex_count << " vertices.\n"; // TODO: remove. + vertices_processed += vertex_count; + lock.unlock(); +} + + + +// Template instantiations for the required instances. 
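+// (Presumably `ENUMERATE(INSTANCE_COUNT, INSTANTIATE, ...)` expands to one explicit instantiation of
+// `Read_CdBG_Extractor` for each supported k-value, so that the member definitions in this translation
+// unit are emitted for all of them.)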
+ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Read_CdBG_Extractor) diff --git a/src/Thread_Pool.cpp b/src/Thread_Pool.cpp index 7321710c..a3382e40 100644 --- a/src/Thread_Pool.cpp +++ b/src/Thread_Pool.cpp @@ -2,6 +2,7 @@ #include "Thread_Pool.hpp" #include "CdBG.hpp" #include "Read_CdBG_Constructor.hpp" +#include "Read_CdBG_Extractor.hpp" #include @@ -32,7 +33,8 @@ Thread_Pool::Thread_Pool(const uint16_t thread_count, void* const dBG, const break; case Task_Type::compute_states_read_space: - compute_states_read_space_params.resize(thread_count); + case Task_Type::extract_unipaths_read_space: + read_dBG_compaction_params.resize(thread_count); break; default: @@ -89,10 +91,16 @@ void Thread_Pool::task(const uint16_t thread_id) case Task_Type::compute_states_read_space: { - const Compute_States_Read_Space_Params& params = compute_states_read_space_params[thread_id]; + const Read_dBG_Compaction_Params& params = read_dBG_compaction_params[thread_id]; static_cast*>(dBG)->process_edges(params.thread_id); } break; + + case Task_Type::extract_unipaths_read_space: + { + const Read_dBG_Compaction_Params& params = read_dBG_compaction_params[thread_id]; + static_cast*>(dBG)->process_vertices(params.thread_id); + } } @@ -145,9 +153,9 @@ void Thread_Pool::assign_output_task(const uint16_t thread_id, const char* co template -void Thread_Pool::assign_compute_states_read_space_task(const uint16_t thread_id) +void Thread_Pool::assign_read_dBG_compaction_task(const uint16_t thread_id) { - compute_states_read_space_params[thread_id] = Compute_States_Read_Space_Params(thread_id); + read_dBG_compaction_params[thread_id] = Read_dBG_Compaction_Params(thread_id); assign_task(thread_id); } From 615e86b8d15c6641393bccc71094bf679bd06887 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Tue, 6 Apr 2021 18:31:47 -0400 Subject: [PATCH 050/350] =?UTF-8?q?Have=20Extended=5FBase=20=E2=80=94>=20B?= =?UTF-8?q?ase=20mapping?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/DNA_Utility.hpp | 14 ++++++++++++++ src/DNA_Utility.cpp | 1 + 2 files changed, 15 insertions(+) diff --git a/include/DNA_Utility.hpp b/include/DNA_Utility.hpp index 5d29d30c..7225017e 100644 --- a/include/DNA_Utility.hpp +++ b/include/DNA_Utility.hpp @@ -106,6 +106,13 @@ class DNA_Utility DNA::Extended_Base::A, DNA::Extended_Base::C, DNA::Extended_Base::G, DNA::Extended_Base::T }; + // Mapped `DNA::Base` for the corresponding `DNA::Extended_Base`, i.e. + // a mapping from [1(A) — 4(3)] to [0(A) — 3(T)]. + static constexpr DNA::Base REVERSE_MAPPED_EXTENDED_BASE[5] = + { + DNA::Base::N, DNA::Base::A, DNA::Base::C, DNA::Base::G, DNA::Base::T + }; + public: @@ -152,6 +159,13 @@ class DNA_Utility { return MAPPED_EXTENDED_BASE[base]; } + + // Returns the mapping `DNA::Base` representation of the + // `DNA::Extended_Base` representation `extended_base`. 
+ static DNA::Base map_base(const DNA::Extended_Base extended_base) + { + return REVERSE_MAPPED_EXTENDED_BASE[static_cast(extended_base)]; + } }; diff --git a/src/DNA_Utility.cpp b/src/DNA_Utility.cpp index e3158554..625f8246 100644 --- a/src/DNA_Utility.cpp +++ b/src/DNA_Utility.cpp @@ -8,3 +8,4 @@ constexpr char DNA_Utility::COMPLEMENTED_CHAR[128]; constexpr bool DNA_Utility::IS_PLACEHOLDER[128]; constexpr uint8_t DNA_Utility::REVERSE_COMPLEMENT_BYTE[256]; constexpr DNA::Extended_Base DNA_Utility::MAPPED_EXTENDED_BASE[4]; +constexpr DNA::Base DNA_Utility::REVERSE_MAPPED_EXTENDED_BASE[5]; From cd12b32429312451c078bb5b386ecb9d2b142b51 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Tue, 6 Apr 2021 21:12:13 -0400 Subject: [PATCH 051/350] Add extractor driver --- src/Read_CdBG.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index cab9d2a1..d9dbcfa6 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -1,6 +1,7 @@ #include "Read_CdBG.hpp" #include "Read_CdBG_Constructor.hpp" +#include "Read_CdBG_Extractor.hpp" template @@ -20,6 +21,10 @@ void Read_CdBG::construct() Read_CdBG_Constructor cdBg_constructor(params, hash_table); cdBg_constructor.compute_DFA_states(); + std::cout << "\nExtracting the maximal unitigs.\n"; + Read_CdBG_Extractor cdBg_extractor(params, hash_table); + cdBg_extractor.extract_maximal_unitigs(); + hash_table.clear(); } From f8eca5c15acc8d1f23634589f4a85067e5b2c138 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Fri, 9 Apr 2021 15:13:51 -0400 Subject: [PATCH 052/350] Have more k-mer rolling interface --- include/Kmer.hpp | 38 +++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/include/Kmer.hpp b/include/Kmer.hpp index ad954732..6a46193b 100644 --- a/include/Kmer.hpp +++ b/include/Kmer.hpp @@ -139,11 +139,23 @@ class Kmer: public DNA_Utility bool in_forward(const Kmer& kmer_hat) const; // Transforms this k-mer by chopping off the first base and - // appending the next base `next_base` to the end, i.e. - // rolls the k-mer by one base. Also sets the passed reverse - // complement `rev_compl` of the k-mer accordingly. + // appending the next base character `next_base` to the end, + // i.e. rolls the k-mer by one base. Also sets the passed + // reverse complement `rev_compl` of the k-mer accordingly. void roll_to_next_kmer(char next_base, Kmer& rev_compl); + // Transforms this k-mer by chopping off the first base and + // appending the next base `base` to the end, i.e. rolls + // the k-mer by one base. Also sets the passed reverse + // complement `rev_compl` of the k-mer accordingly. + void roll_to_next_kmer(DNA::Base base, Kmer& rev_compl); + + // Transforms this k-mer by chopping off the first base and + // appending the next base coded with the edge encoding `edge`, + // i.e. rolls the k-mer by one base. Also sets the passed + // reverse complement `rev_compl` of the k-mer accordingly. + void roll_to_next_kmer(DNA::Extended_Base edge, Kmer& rev_compl); + // Returns the canonical version of the k-mer, comparing it to its // reverse complement `rev_compl`. 
Kmer canonical(const Kmer& rev_compl) const; @@ -492,15 +504,31 @@ inline void Kmer::roll_to_next_kmer(const char next_base, Kmer& rev_compl) { const DNA::Base mapped_base = map_base(next_base); + roll_to_next_kmer(mapped_base, rev_compl); +} + + +template +inline void Kmer::roll_to_next_kmer(const DNA::Base base, Kmer& rev_compl) +{ // Logically, since a left shift moves the MSN out of the length `k` boundary, the clearing of the base // may seem redundant. But, the `to_u64` hashing method implementation works with bytes — not clearing // out this base breaks the consistency of the hashing. kmer_data[NUM_INTS - 1] &= CLEAR_MSN_MASK; left_shift(); - kmer_data[0] |= mapped_base; + kmer_data[0] |= base; rev_compl.right_shift(); - rev_compl.kmer_data[NUM_INTS - 1] |= (uint64_t(complement(mapped_base)) << (2 * ((k - 1) & 31))); + rev_compl.kmer_data[NUM_INTS - 1] |= (uint64_t(complement(base)) << (2 * ((k - 1) & 31))); +} + + +template +inline void Kmer::roll_to_next_kmer(const DNA::Extended_Base edge, Kmer& rev_compl) +{ + const DNA::Base mapped_base = map_base(edge); + + roll_to_next_kmer(mapped_base, rev_compl); } From eabaf4a22c6a307b2d292bf9325ca06baa102690 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Sun, 11 Apr 2021 18:41:05 -0400 Subject: [PATCH 053/350] Make hash table a functor --- include/Kmer_Hash_Table.hpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index 5d646b0a..668267e2 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -96,6 +96,9 @@ class Kmer_Hash_Table // from scratch), or the newly built MPH is saved there. void construct(uint16_t thread_count, const std::string& working_dir_path, const std::string& mph_file_path); + // Returns the hash value of the k-mer `kmer`. + uint64_t operator()(const Kmer& kmer) const; + // Returns an API to the entry (in the hash table) for the key `kmer`. The API // wraps the hash table position and the state value at that position. Kmer_Hash_Entry_API operator[](const Kmer& kmer); @@ -124,6 +127,13 @@ inline uint64_t Kmer_Hash_Table::bucket_id(const Kmer& kmer) } +template +inline uint64_t Kmer_Hash_Table::operator()(const Kmer& kmer) const +{ + return bucket_id(kmer); +} + + template inline Kmer_Hash_Entry_API Kmer_Hash_Table::operator[](const uint64_t bucket_id) { From 3f2ed7d7c4d7de8af661a212cafb8887d3f6be75 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Sun, 11 Apr 2021 19:37:51 -0400 Subject: [PATCH 054/350] Add edge endpoint data structure --- include/Endpoint.hpp | 163 +++++++++++++++++++++++++++++++++++++++++++ include/Kmer.hpp | 34 +++++++++ 2 files changed, 197 insertions(+) create mode 100644 include/Endpoint.hpp diff --git a/include/Endpoint.hpp b/include/Endpoint.hpp new file mode 100644 index 00000000..6f572ce3 --- /dev/null +++ b/include/Endpoint.hpp @@ -0,0 +1,163 @@ + +#ifndef ENDPOINT_HPP +#define ENDPOINT_HPP + + + + +#include "Kmer.hpp" +#include "globals.hpp" +#include "Kmer_Hash_Table.hpp" + +#include + + +// A class denoting an endpoint of a bidirected edge instance. +template +class Endpoint +{ +private: + + Kmer kmer_; // The endpoint k-mer spelled by the edge instance. + Kmer kmer_bar_; // Reverse complement of the k-mer spelled by the edge instance. + const Kmer* kmer_hat_ptr; // Pointer to the canonical form of the endpoint k-mer. + cuttlefish::side_t s; // The side of the endpoint vertex to which the edge instance is incident to. 
+ cuttlefish::edge_encoding_t e; // The `DNA::Extended_Base` encoding of the edge instance incident to this endpoint. + uint64_t h; // Hash value of the vertex, i.e. canonical k-mer. + + + // Constructs an `Endpoint` object that appears in the form `kmer` in an edge instance, and + // is the source (i.e. prefix) of that edge iff `is_source` is true — which decides the edge + // incidence side to the corresponding vertex. Also gets the hash value of the vertex using + // the hash table `hash`. The sole application of this constructor is get a specific side of + // a vertex where the edge incidence information is to be discarded, hence no edge-encoding + // is provided with the constructor, although the class has such a member. + Endpoint(const Kmer& kmer, bool is_source, const Kmer_Hash_Table& hash); + + // Returns the side of the associated vertex to which the edge instance corresponding to this + // endpoint is incident to, if this endpoint is the source endpoint of the edge. + cuttlefish::side_t exit_side() const; + + // Returns the side of the associated vertex to which the edge instance corresponding to this + // endpoint is incident to, if this endpoint is the sink endpoint of the edge. + cuttlefish::side_t entrance_side() const; + + // Returns the `DNA::Extended_Base` encoding of the edge `e` corresponding to this endpoint, + // given the endpoint is the source endpoint of the edge. + cuttlefish::edge_encoding_t exit_edge(const Kmer& e) const; + + // Returns the `DNA::Extended_Base` encoding of the edge `e` corresponding to this endpoint, + // given the endpoint is the sink endpoint of the edge. + cuttlefish::edge_encoding_t entrance_edge(const Kmer& e) const; + + +public: + + // Constructs an empty endpoint. + Endpoint() + {} + + // Configures the endpoint with the source (i.e. prefix) k-mer of the edge (k + 1)-mer `e`; + // and uses the hash table `hash` to get the hash value of the vertex. + void from_prefix(const Kmer& e, const Kmer_Hash_Table& hash); + + // Configures the endpoint with the sink (i.e. suffix) k-mer of the edge (k + 1)-mer `e`; + // and uses the hash table `hash` to get the hash value of the vertex. + void from_suffix(const Kmer& e, const Kmer_Hash_Table& hash); + + // Returns the neighboring endpoint of this endpoint that's connected with an edge encoded + // with the code `e`, from the point-of-view of this endpoint. Uses the hash table `hash` + // to get the hash value of the corresponding neighbor vertex. + Endpoint neighbor_endpoint(cuttlefish::edge_encoding_t e, const Kmer_Hash_Table& hash); +}; + + +template +inline Endpoint::Endpoint(const Kmer& kmer, const bool is_source, const Kmer_Hash_Table& hash): + kmer_(kmer) +{ + kmer_bar_.as_reverse_complement(kmer_); + kmer_hat_ptr = Kmer::canonical(kmer_, kmer_bar_); + + s = (is_source ? 
exit_side() : entrance_side()); + + h = hash(*kmer_hat_ptr); +} + + +template +inline void Endpoint::from_prefix(const Kmer& e, const Kmer_Hash_Table& hash) +{ + kmer_.from_prefix(e); + kmer_bar_.as_reverse_complement(kmer_); + kmer_hat_ptr = Kmer::canonical(kmer_, kmer_bar_); + + s = exit_side(); + this->e = exit_edge(e); + + h = hash(*kmer_hat_ptr); +} + + +template +inline void Endpoint::from_suffix(const Kmer& e, const Kmer_Hash_Table& hash) +{ + kmer_.from_suffix(e); + kmer_bar_.as_reverse_complement(kmer_); + kmer_hat_ptr = Kmer::canonical(kmer_, kmer_bar_); + + s = entrance_side(); + this->e = entrance_edge(e); + + h = hash(*kmer_hat_ptr); +} + + +template +inline cuttlefish::side_t Endpoint::exit_side() const +{ + return &kmer_ == kmer_hat_ptr ? cuttlefish::side_t::back : cuttlefish::side_t::front; +} + + +template +inline cuttlefish::side_t Endpoint::entrance_side() const +{ + return &kmer_ == kmer_hat_ptr ? cuttlefish::side_t::front : cuttlefish::side_t::back; +} + + +template +inline cuttlefish::edge_encoding_t Endpoint::exit_edge(const Kmer& e) const +{ + return DNA_Utility::map_extended_base(s == cuttlefish::side_t::back ? + e.back() : DNA_Utility::complement(e.back())); +} + + +template +inline cuttlefish::edge_encoding_t Endpoint::entrance_edge(const Kmer& e) const +{ + return DNA_Utility::map_extended_base(s == cuttlefish::side_t::front ? + e.front() : DNA_Utility::complement(e.front())); +} + + +template +inline Endpoint Endpoint::neighbor_endpoint(const cuttlefish::edge_encoding_t e, const Kmer_Hash_Table& hash) +{ + Kmer kmer(*kmer_hat_ptr); + + if(s == cuttlefish::side_t::back) + { + kmer.roll_forward(e); + return Endpoint(kmer, false, hash); + } + + kmer.roll_backward(e); + return Endpoint(kmer, true, hash); +} + + + +#endif diff --git a/include/Kmer.hpp b/include/Kmer.hpp index 6a46193b..0c827c95 100644 --- a/include/Kmer.hpp +++ b/include/Kmer.hpp @@ -156,6 +156,16 @@ class Kmer: public DNA_Utility // reverse complement `rev_compl` of the k-mer accordingly. void roll_to_next_kmer(DNA::Extended_Base edge, Kmer& rev_compl); + // Transforms this k-mer by chopping off the first base and + // appending the base coded with the edge encoding `edge` to + // the end, i.e. rolls the k-mer to the "right" by one base. + void roll_forward(DNA::Extended_Base edge); + + // Transforms this k-mer by chopping off the last base and + // appending the base coded with the edge encoding `edge` to + // the beginning, i.e. rolls the k-mer to the "left" by one base. + void roll_backward(DNA::Extended_Base edge); + // Returns the canonical version of the k-mer, comparing it to its // reverse complement `rev_compl`. Kmer canonical(const Kmer& rev_compl) const; @@ -532,6 +542,30 @@ inline void Kmer::roll_to_next_kmer(const DNA::Extended_Base edge, Kmer& r } +template +inline void Kmer::roll_forward(const DNA::Extended_Base edge) +{ + const DNA::Base mapped_base = map_base(edge); + + kmer_data[NUM_INTS - 1] &= CLEAR_MSN_MASK; + left_shift<1>(); + kmer_data[0] |= static_cast(mapped_base); +} + + +template +inline void Kmer::roll_backward(const DNA::Extended_Base edge) +{ + // Relative index of the most significant nucleotide in it's 64-bit word. 
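    // (Worked example: with k = 63 the k-mer occupies two 64-bit words; its most significant
    //  nucleotide, base index k - 1 = 62, then sits at bit offset 2 * ((63 - 1) % 32) = 60 of
    //  the last word, which is what the expression below computes.)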
+ constexpr uint16_t rel_idx_MSN = 2 * ((k - 1) % 32); + + const DNA::Extended_Base mapped_base = map_base(edge); + + right_shift(); + kmer_data[NUM_INTS - 1] |= (static_cast(mapped_base) << rel_idx_MSN); +} + + template inline Kmer Kmer::canonical(const Kmer& rev_compl) const { From f329110ecc4bd6c7b97abac3ec9be9e672a0c82a Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 12 Apr 2021 18:17:14 -0400 Subject: [PATCH 055/350] Fix type bug --- include/Kmer.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/Kmer.hpp b/include/Kmer.hpp index 0c827c95..25ea8fe9 100644 --- a/include/Kmer.hpp +++ b/include/Kmer.hpp @@ -559,7 +559,7 @@ inline void Kmer::roll_backward(const DNA::Extended_Base edge) // Relative index of the most significant nucleotide in it's 64-bit word. constexpr uint16_t rel_idx_MSN = 2 * ((k - 1) % 32); - const DNA::Extended_Base mapped_base = map_base(edge); + const DNA::Base mapped_base = map_base(edge); right_shift(); kmer_data[NUM_INTS - 1] |= (static_cast(mapped_base) << rel_idx_MSN); From 184361ed27f9afa29ff0c5944ff8f12f3f8df967 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 12 Apr 2021 19:10:01 -0400 Subject: [PATCH 056/350] Expose access to buckets through hash values --- include/Kmer_Hash_Table.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index 668267e2..368cd787 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -77,11 +77,6 @@ class Kmer_Hash_Table // supposed to store value items for the key `kmer`. uint64_t bucket_id(const Kmer& kmer) const; - // Returns an API to the entry (in the hash table) for a k-mer hashing - // to the bucket number `bucket_id` of the hash table. The API wraps - // the hash table position and the state value at that position. - Kmer_Hash_Entry_API operator[](uint64_t bucket_id); - public: @@ -99,6 +94,11 @@ class Kmer_Hash_Table // Returns the hash value of the k-mer `kmer`. uint64_t operator()(const Kmer& kmer) const; + // Returns an API to the entry (in the hash table) for a k-mer hashing + // to the bucket number `bucket_id` of the hash table. The API wraps + // the hash table position and the state value at that position. + Kmer_Hash_Entry_API operator[](uint64_t bucket_id); + // Returns an API to the entry (in the hash table) for the key `kmer`. The API // wraps the hash table position and the state value at that position. Kmer_Hash_Entry_API operator[](const Kmer& kmer); From 412ee2f551ec0d8cff56c82a8ed33ba65d8f4d1d Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 12 Apr 2021 19:14:08 -0400 Subject: [PATCH 057/350] Have more interface to endpoints --- include/Endpoint.hpp | 45 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/include/Endpoint.hpp b/include/Endpoint.hpp index 6f572ce3..37290186 100644 --- a/include/Endpoint.hpp +++ b/include/Endpoint.hpp @@ -68,7 +68,20 @@ class Endpoint // Returns the neighboring endpoint of this endpoint that's connected with an edge encoded // with the code `e`, from the point-of-view of this endpoint. Uses the hash table `hash` // to get the hash value of the corresponding neighbor vertex. - Endpoint neighbor_endpoint(cuttlefish::edge_encoding_t e, const Kmer_Hash_Table& hash); + Endpoint neighbor_endpoint(cuttlefish::edge_encoding_t e, const Kmer_Hash_Table& hash) const; + + // Returns the canonical form of the associated vertex. 
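Patches 053 and 056 together let a caller hash a k-mer once with `hash_table(kmer)` and then reach its bucket any number of times with `hash_table[h]`, which is exactly what `Endpoint` (and later `Directed_Vertex`) exploit by caching the hash value `h`. A usage sketch of that pattern; the template parameter list and the state API are assumed here as in the surrounding patches, and only members shown in these patches are called:

    // Sketch only: marks the side `side` of the vertex `kmer` as discarded.
    template <uint16_t k, uint8_t BITS_PER_KEY>
    bool mark_side_discarded(Kmer_Hash_Table<k, BITS_PER_KEY>& hash_table, const Kmer<k>& kmer, const cuttlefish::side_t side)
    {
        const uint64_t h = hash_table(kmer);    // hash the k-mer once (the functor from patch 053),
        auto bucket = hash_table[h];            // then access its bucket through the hash value (patch 056).

        auto& state = bucket.get_state();
        state.update_edge_at(side, cuttlefish::edge_encoding_t::N);
        return hash_table.update(bucket);       // `false` asks the caller to retry the whole operation
    }
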
+ const Kmer& canonical() const; + + // Returns the side of the endpoint to which the corresponding edge is incident to. + cuttlefish::side_t side() const; + + // Returns the `DNA::Extended_Base` encoding of the corresponding edge incident to + // the endpoint. + cuttlefish::edge_encoding_t edge() const; + + // Returns the hash value of the vertex associated to this endpoint. + uint64_t hash() const; }; @@ -144,7 +157,7 @@ inline cuttlefish::edge_encoding_t Endpoint::entrance_edge(const Kmer& template -inline Endpoint Endpoint::neighbor_endpoint(const cuttlefish::edge_encoding_t e, const Kmer_Hash_Table& hash) +inline Endpoint Endpoint::neighbor_endpoint(const cuttlefish::edge_encoding_t e, const Kmer_Hash_Table& hash) const { Kmer kmer(*kmer_hat_ptr); @@ -159,5 +172,33 @@ inline Endpoint Endpoint::neighbor_endpoint(const cuttlefish::edge_encodin } +template +inline const Kmer& Endpoint::canonical() const +{ + return *kmer_hat_ptr; +} + + +template +inline cuttlefish::side_t Endpoint::side() const +{ + return s; +} + + +template +inline cuttlefish::edge_encoding_t Endpoint::edge() const +{ + return e; +} + + +template +inline uint64_t Endpoint::hash() const +{ + return h; +} + + #endif From c8ae3a522eb501e0332245ff6f9d5bdaa4c2618d Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Tue, 13 Apr 2021 11:49:46 -0400 Subject: [PATCH 058/350] Propagate discarding of incidence --- include/Edge.hpp | 112 +++++---------------- include/Read_CdBG_Constructor.hpp | 159 ++++++++++++++++++++---------- src/Read_CdBG_Constructor.cpp | 37 +++++-- 3 files changed, 163 insertions(+), 145 deletions(-) diff --git a/include/Edge.hpp b/include/Edge.hpp index dd3077e7..04e141a4 100644 --- a/include/Edge.hpp +++ b/include/Edge.hpp @@ -6,6 +6,8 @@ #include "globals.hpp" +#include "Endpoint.hpp" +#include "Kmer_Hash_Table.hpp" #include @@ -21,14 +23,8 @@ class Edge private: Kmer e_; // The edge (k + 1)-mer (need not be in canonical form). - Kmer u_; // One endpoint k-mer of this edge instance — source k-mer of the `e` form. - Kmer v_; // One endpoint k-mer of this edge instance — sink k-mer of the `e` form. - Kmer u_bar_; // Reverse complement of `u`. - Kmer v_bar_; // Reverse complement of `v`. - const Kmer* u_hat_ptr; // Pointer to the canonical form of the k-mer `u`, i.e. ptr to `min(u, u_bar)`. - const Kmer* v_hat_ptr; // Pointer to the canonical form of the k-mer `v`, i.e. ptr to `min(v, v_bar)`. - cuttlefish::side_t s_u_hat_; // The side of the vertex `u_hat` to which this edge instance is incident to. - cuttlefish::side_t s_v_hat_; // The side of the vertex `v_hat` to which this edge instance is incident to. + Endpoint u_; // One endpoint k-mer of this edge instance — source of the `e` form. + Endpoint v_; // One endpoint k-mer of this edge instance — sink of the `e` form. public: @@ -36,39 +32,20 @@ class Edge // Returns a mutable reference to the edge (k + 1)-mer. Kmer& e(); - // Configures the edge data. i.e. sets the relevant information of the - // edge from the underlying (k + 1)-mer. Must be used whenever the edge - // (k + 1)-mer (updatable using `e()`) is modified. - void configure(); + // Returns the source endpoint `u` of the edge instance. + const Endpoint& u() const; - // Returns `true` iff the edge is a loop. - bool is_loop() const; - - // Returns the vertex (i.e. canonical k-mer) `u_hat` — which corresponds - // to the source endpoint k-mer `u` of this edge instance `e`. - const Kmer& u_hat() const; - - // Returns the vertex (i.e. 
canonical k-mer) `v_hat` — which corresponds - // to the sink endpoint k-mer `v` of this edge instance `e`. - const Kmer& v_hat() const; - - // Returns the side of the vertex (i.e. canonical k-mer) `u_hat` — which - // corresponds to the source endpoint k-mer `u` of this edge instance `e` - // — to which this edge instance `e` is incident to. - cuttlefish::side_t s_u_hat() const; + // Returns the sink endpoint `v` of the edge instance. + const Endpoint& v() const; - // Returns the side of the vertex (i.e. canonical k-mer) `v_hat` — which - // corresponds to the sink endpoint k-mer `v` of this edge instance `e` - // — to which this edge instance `e` is incident to. - cuttlefish::side_t s_v_hat() const; + // Configures the edge data, i.e. sets the relevant information of the + // edge from the underlying (k + 1)-mer. Uses the hash table `hash` to + // get the hash values of the endpoint vertices. Must be used whenever + // the edge (k + 1)-mer (updatable using `e()`) is modified. + void configure(const Kmer_Hash_Table& hash); - // Returns the `DNA::Extended_Base` base-encoding of the underlying edge, - // from the point-of-view of the vertex `u_hat`. - cuttlefish::edge_encoding_t edge_encoding_u() const; - - // Returns the `DNA::Extended_Base` base-encoding of the underlying edge, - // from the point-of-view of the vertex `v_hat`. - cuttlefish::edge_encoding_t edge_encoding_v() const; + // Returns `true` iff the edge is a loop. + bool is_loop() const; }; @@ -80,72 +57,31 @@ inline Kmer& Edge::e() template -inline void Edge::configure() -{ - u_.from_prefix(e_), - v_.from_suffix(e_); - - u_bar_.as_reverse_complement(u_), - v_bar_.as_reverse_complement(v_); - - u_hat_ptr = Kmer::canonical(u_, u_bar_), - v_hat_ptr = Kmer::canonical(v_, v_bar_); - - s_u_hat_ = (&u_ == u_hat_ptr ? cuttlefish::side_t::back : cuttlefish::side_t::front); - s_v_hat_ = (&v_ == v_hat_ptr ? cuttlefish::side_t::front : cuttlefish::side_t::back); -} - - -template -inline bool Edge::is_loop() const +inline const Endpoint& Edge::u() const { - return *u_hat_ptr == *v_hat_ptr; + return u_; } template -inline const Kmer& Edge::u_hat() const +inline const Endpoint& Edge::v() const { - return *u_hat_ptr; + return v_; } template -inline const Kmer& Edge::v_hat() const +inline void Edge::configure(const Kmer_Hash_Table& hash) { - return *v_hat_ptr; + u_.from_prefix(e_, hash), + v_.from_suffix(e_, hash); } template -inline cuttlefish::side_t Edge::s_u_hat() const -{ - return s_u_hat_; -} - - -template -inline cuttlefish::side_t Edge::s_v_hat() const -{ - return s_v_hat_; -} - - -template -inline cuttlefish::edge_encoding_t Edge::edge_encoding_u() const -{ - const DNA::Base base = (s_u_hat_ == cuttlefish::side_t::back ? e_.back() : DNA_Utility::complement(e_.back())); - - return DNA_Utility::map_extended_base(base); -} - - -template -inline cuttlefish::edge_encoding_t Edge::edge_encoding_v() const +inline bool Edge::is_loop() const { - const DNA::Base base = (s_v_hat_ == cuttlefish::side_t::front ? 
e_.front() : DNA_Utility::complement(e_.front())); - - return DNA_Utility::map_extended_base(base); + return u_.canonical() == v_.canonical(); } diff --git a/include/Read_CdBG_Constructor.hpp b/include/Read_CdBG_Constructor.hpp index 28a7c5a1..52553930 100644 --- a/include/Read_CdBG_Constructor.hpp +++ b/include/Read_CdBG_Constructor.hpp @@ -7,6 +7,7 @@ #include "globals.hpp" #include "Kmer_Hash_Table.hpp" #include "State_Read_Space.hpp" +#include "Endpoint.hpp" #include "Build_Params.hpp" #include "Thread_Pool.hpp" #include "Kmer_Container.hpp" @@ -38,25 +39,45 @@ class Read_CdBG_Constructor // the DFA as per the edges provided to that thread. void process_edges(uint16_t thread_id); - // For the vertex `v`, adds information of the incidence of an `e_v`-encoded edge to its side `s_v` - // — making the appropriate state transition for the DFA of `v`. Returns `false` iff an attempted + // Adds the information of an incident edge `e` to the side `s` of some vertex `v`, all wrapped + // inside the edge-endpoint object `endpoint` — making the appropriate state transitions for the + // DFA of `v`. Also stores the edge encodings of the incidence information of the side `s` before + // and after to this addition, in `e_old` and `e_new` respectively. Returns `false` iff an + // attempted state transition failed. + bool add_incident_edge(const Endpoint& endpoint, cuttlefish::edge_encoding_t& e_old, cuttlefish::edge_encoding_t& e_new); + + // Adds the information of an incident loop that connects the two different endpoints of some + // vertex `v`, wrapped inside the edge-endpoint object `endpoint` — making the appropriate state + // transition for the DFA of `v`. Also stores the edge encodings of the incidence information of + // the front and the back sides before this addition, in `e_front` and `e_back` respectively. + // Returns `false` iff an attempted state transition failed. + bool add_crossing_loop(const Endpoint& endpoint, cuttlefish::edge_encoding_t& e_front, cuttlefish::edge_encoding_t& e_back); + + // Adds the information of an incident loop for some vertex `v` that connects its side `s` to + // the side itself, all wrapped inside the edge-endpoint object `endpoint` — making the + // appropriate state transition for the DFA of `v`. Also stores the edge encoding of the incidence + // information of the side `s` before this addition, in `e_old`. Returns `false` iff an attempted // state transition failed. - bool add_incident_edge(const Kmer& v, cuttlefish::side_t s_v, cuttlefish::edge_encoding_t e_v); + bool add_one_sided_loop(const Endpoint& endpoint, cuttlefish::edge_encoding_t& e_old); - // For the vertex `v`, adds information of the incidence of a looping edge that connects its side - // `s_u` to its side `s_v`, which may or may not be the same sides — making the appropriate state - // transition for the DFA of `v`. Returns `false` iff an attempted state transition failed. - bool add_loop(const Kmer& u, cuttlefish::side_t s_u, cuttlefish::side_t s_v); + // If the endpoint object `v_end` connects to some neighboring endpoint `w_end` through a unique + // edge encoded with `e`, then discards the incidence information of `w_end` — making the + // appropriate state transition for the corresponding neighboring vertex `w`. + void propagate_discard(const Endpoint& v_end, cuttlefish::edge_encoding_t e); - // For the vertex `v`, adds information of the incidence of a loop that connects its different - // sides — making the appropriate state transition for the DFA of `v`. 
Returns `false` iff an - // attempted state transition failed. - bool add_crossing_loop(const Kmer& v); + // For two neighboring endpoints `u_end` and `v_end`, discards the incidence information from + // `v_end`. Also discards information from any other neighboring side of `u_end` that may have + // connected to it through a unique edge encoded with `e`. Makes the appropriate state transitions + // for these neighbors of `u_end`. + void propagate_discard(const Endpoint& u_end, const Endpoint& v_end, cuttlefish::edge_encoding_t e); + + // Discards the incidence information of the endpoint `v_end`. Returns `false` iff an attempted + // state transition failed. + bool discard_side(const Endpoint& v_end); - // For the vertex `v`, adds information of the incidence of a loop that connects its side `s_v` - // to that side itself — making the appropriate state transition for the DFA of `v`. Returns - // `false` iff an attempted state transition failed. - bool add_one_sided_loop(const Kmer& v, cuttlefish::side_t s_v); + // Discards the incidence information of some endpoint `w_end` that connects to the endpoint + // `v_end` through the unique edge encoded with `e` — making the appropriate state transition. + void discard_neighbor_side(const Endpoint& v, cuttlefish::edge_encoding_t e); public: @@ -71,61 +92,53 @@ class Read_CdBG_Constructor template -inline bool Read_CdBG_Constructor::add_incident_edge(const Kmer& v, const cuttlefish::side_t s_v, const cuttlefish::edge_encoding_t e_v) +inline bool Read_CdBG_Constructor::add_incident_edge(const Endpoint& endpoint, cuttlefish::edge_encoding_t& e_old, cuttlefish::edge_encoding_t& e_new) { - // Fetch the hash table entry for the DFA of the vertex `v`. + // Fetch the hash table entry for the vertex associated to the endpoint. - Kmer_Hash_Entry_API bucket = hash_table[v]; + Kmer_Hash_Entry_API bucket = hash_table[endpoint.hash()]; State_Read_Space& state = bucket.get_state(); - cuttlefish::edge_encoding_t e_curr = state.edge_at(s_v); + e_new = e_old = state.edge_at(endpoint.side()); + // If we've already discarded the incidence information for this side, then a self-transition happens. - if(e_curr == cuttlefish::edge_encoding_t::N) + if(e_old == cuttlefish::edge_encoding_t::N) return true; // Early return w/o updating the same value again is safe — see the note at the end of the method. - - const cuttlefish::edge_encoding_t e_old = e_curr; - if(e_curr == cuttlefish::edge_encoding_t::E) // This side of the vertex is encountered for the first time. - e_curr = e_v; - else if(e_curr != e_v) // This side has been visited earlier with a different edge — discard the incidence information. - e_curr = cuttlefish::edge_encoding_t::N; + if(e_old == cuttlefish::edge_encoding_t::E) // This side of the vertex is observed for the first time. + e_new = endpoint.edge(); + else if(e_old != endpoint.edge()) // This side has been visited earlier, but with a different edge — discard the incidence information. + e_new = cuttlefish::edge_encoding_t::N; // We can get away without updating the same value again, because — (1) even if this DFA's state changes // in the hash table by the time this method completes, making no updates at this point is theoretically // equivalent to returning instantaneously as soon as the hash table value had been read; and also (2) the // ordering of the edges processed does not matter in the algorithm. 
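The complete per-side transition implemented by `add_incident_edge` is small enough to state as a pure function: a side never seen before (`E`) adopts the incoming edge's encoding, a repeat of the same encoding changes nothing, and a mismatch, or a side already discarded (`N`), yields `N`. A self-contained sketch of just that rule, with a stand-in enum in place of `cuttlefish::edge_encoding_t`:

    #include <cassert>

    enum class Edge_Enc { A, C, G, T, N /* discarded */, E /* empty: not seen yet */ };

    // New encoding for a vertex side currently recorded as `e_cur`, after observing an incident
    // edge encoded as `e_inc` at that side.
    constexpr Edge_Enc transition(const Edge_Enc e_cur, const Edge_Enc e_inc)
    {
        if(e_cur == Edge_Enc::N) return Edge_Enc::N;    // discarded is an absorbing state
        if(e_cur == Edge_Enc::E) return e_inc;          // first edge observed at this side
        return e_cur == e_inc ? e_cur : Edge_Enc::N;    // the unique edge again, or a conflict
    }

    int main()
    {
        assert(transition(Edge_Enc::E, Edge_Enc::G) == Edge_Enc::G);
        assert(transition(Edge_Enc::G, Edge_Enc::G) == Edge_Enc::G);
        assert(transition(Edge_Enc::G, Edge_Enc::T) == Edge_Enc::N);
        assert(transition(Edge_Enc::N, Edge_Enc::A) == Edge_Enc::N);
        return 0;
    }
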
- if(e_curr == e_old) + if(e_new == e_old) return true; - state.update_edge_at(s_v, e_curr); + state.update_edge_at(endpoint.side(), e_new); return hash_table.update(bucket); } template -inline bool Read_CdBG_Constructor::add_loop(const Kmer& v, const cuttlefish::side_t s_u, const cuttlefish::side_t s_v) -{ - return s_u == s_v ? add_one_sided_loop(v, s_u) : add_crossing_loop(v); -} - - -template -inline bool Read_CdBG_Constructor::add_crossing_loop(const Kmer& v) +inline bool Read_CdBG_Constructor::add_crossing_loop(const Endpoint& endpoint, cuttlefish::edge_encoding_t& e_front, cuttlefish::edge_encoding_t& e_back) { - // Fetch the hash table entry for the DFA of the vertex `v`. - - Kmer_Hash_Entry_API bucket = hash_table[v]; + // Fetch the hash table entry for the DFA of vertex associated to the endpoint. + + Kmer_Hash_Entry_API bucket = hash_table[endpoint.hash()]; State_Read_Space& state = bucket.get_state(); - const cuttlefish::edge_encoding_t e_front = state.edge_at(cuttlefish::side_t::front); - const cuttlefish::edge_encoding_t e_back = state.edge_at(cuttlefish::side_t::back); + e_front = state.edge_at(cuttlefish::side_t::front); + e_back = state.edge_at(cuttlefish::side_t::back); const State_Read_Space state_old = state; - if(e_front != cuttlefish::edge_encoding_t::N) + if(e_front != cuttlefish::edge_encoding_t::N) // Discard the front-incidence information, if not done already. state.update_edge_at(cuttlefish::side_t::front, cuttlefish::edge_encoding_t::N); - if(e_back != cuttlefish::edge_encoding_t::N) + if(e_back != cuttlefish::edge_encoding_t::N) // Discard the back-incidence information, if not done already. state.update_edge_at(cuttlefish::side_t::back, cuttlefish::edge_encoding_t::N); // We can get away without updating the same value again: see detailed comment in `add_incident_edge`. @@ -134,22 +147,68 @@ inline bool Read_CdBG_Constructor::add_crossing_loop(const Kmer& v) template -inline bool Read_CdBG_Constructor::add_one_sided_loop(const Kmer& v, const cuttlefish::side_t s_v) +inline bool Read_CdBG_Constructor::add_one_sided_loop(const Endpoint& endpoint, cuttlefish::edge_encoding_t& e_old) { - // Fetch the hash table entry for the vertex `v`. + // Fetch the hash table entry for the vertex associated to the endpoint. - Kmer_Hash_Entry_API bucket = hash_table[v]; + Kmer_Hash_Entry_API bucket = hash_table[endpoint.hash()]; State_Read_Space& state = bucket.get_state(); - const cuttlefish::edge_encoding_t e_v = state.edge_at(s_v); + e_old = state.edge_at(endpoint.side()); // We can get away without updating the same value again: see detailed comment in `add_incident_edge`. - if(e_v == cuttlefish::edge_encoding_t::N) + if(e_old == cuttlefish::edge_encoding_t::N) // The incidence information has already been discarded. return true; - - state.update_edge_at(s_v, cuttlefish::edge_encoding_t::N); + + // Discard the incidence information. + state.update_edge_at(endpoint.side(), cuttlefish::edge_encoding_t::N); + return hash_table.update(bucket); +} + + +template +inline void Read_CdBG_Constructor::propagate_discard(const Endpoint& v_end, const cuttlefish::edge_encoding_t e) +{ + if(e != cuttlefish::edge_encoding_t::E && e != cuttlefish::edge_encoding_t::N) // The incident edge is unique. + discard_neighbor_side(v_end, e); +} + + +template +inline void Read_CdBG_Constructor::propagate_discard(const Endpoint& u_end, const Endpoint& v_end, const cuttlefish::edge_encoding_t e) +{ + while(!discard_side(v_end)); // Discard the neighbor `v_end`. 
+ + propagate_discard(u_end, e); // Discard the other neighbor. +} + + +template +inline bool Read_CdBG_Constructor::discard_side(const Endpoint& v_end) +{ + // Fetch the hash table entry for the DFA of the vertex associated to the endpoint. + + Kmer_Hash_Entry_API bucket = hash_table[v_end.hash()]; + State_Read_Space& state = bucket.get_state(); + const cuttlefish::edge_encoding_t e_curr = state.edge_at(v_end.side()); + + // We can get away without updating the same value again: see detailed comment in `add_incident_edge`. + if(e_curr == cuttlefish::edge_encoding_t::N) // The incidende information has already been discarded. + return true; + + // Discard the incidence information. + state.update_edge_at(v_end.side(), cuttlefish::edge_encoding_t::N); return hash_table.update(bucket); } +template +inline void Read_CdBG_Constructor::discard_neighbor_side(const Endpoint& v_end, const cuttlefish::edge_encoding_t e) +{ + const Endpoint w = v_end.neighbor_endpoint(e, hash_table); // Get the neighboring endpoint connected with `e`. + + while(!discard_side(w)); // Discard the incidence information off that neighbor. +} + + #endif diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index b5a67fa4..ac9ab0f9 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -63,20 +63,43 @@ void Read_CdBG_Constructor::distribute_states_computation(Thread_Pool& thr template void Read_CdBG_Constructor::process_edges(const uint16_t thread_id) { - Edge e; - uint64_t edge_count = 0; + // Data locations to be reused per each edge processed. + Edge e; // For the edges to be processed one-by-one. + cuttlefish::edge_encoding_t e_front, e_back; // Edges incident to the front and to the back of a vertex with a crossing loop. + cuttlefish::edge_encoding_t e_u_old, e_u_new; // Edges incident to some particular side of a vertex `u`, before and after the addition of a new edge. + cuttlefish::edge_encoding_t e_v_old, e_v_new; // Edges incident to some particular side of a vertex `v`, before and after the addition of a new edge. + + uint64_t edge_count = 0; // Number of edges processed by this thread. while(edge_parser.tasks_expected(thread_id)) if(edge_parser.value_at(thread_id, e.e())) { - e.configure(); // A new edge (k + 1)-mer has been parsed; set the relevant k-mer and sides information. + e.configure(hash_table); // A new edge (k + 1)-mer has been parsed; set information for its two endpoints. if(e.is_loop()) - while(!add_loop(e.u_hat(), e.s_u_hat(), e.s_v_hat())); - else + if(e.u().side() != e.v().side()) // It is a crossing loop. + { + while(!add_crossing_loop(e.u(), e_front, e_back)); + + propagate_discard(e.u(), e.u().side() == cuttlefish::side_t::front ? e_front : e_back); + propagate_discard(e.v(), e.v().side() == cuttlefish::side_t::front ? e_front : e_back); + } + else // A one-sided loop. + { + while(!add_one_sided_loop(e.u(), e_u_old)); + + propagate_discard(e.u(), e_u_old); + } + else // It connects two endpoints `u` and `v` of two distinct vertex. 
{ - while(!add_incident_edge(e.u_hat(), e.s_u_hat(), e.edge_encoding_u())); - while(!add_incident_edge(e.v_hat(), e.s_v_hat(), e.edge_encoding_v())); + while(!add_incident_edge(e.u(), e_u_old, e_u_new)); + while(!add_incident_edge(e.v(), e_v_old, e_v_new)); + + if(e_u_new == cuttlefish::edge_encoding_t::N) + propagate_discard(e.u(), e.v(), e_u_old); + + if(e_v_new == cuttlefish::edge_encoding_t::N) + propagate_discard(e.v(), e.u(), e_v_old); } edge_count++; From 22b82e0060c9089e838e30c2d8a4168b47210b11 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 15 Apr 2021 18:18:20 -0400 Subject: [PATCH 059/350] Add directed vertex data structure --- include/Directed_Vertex.hpp | 118 ++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 include/Directed_Vertex.hpp diff --git a/include/Directed_Vertex.hpp b/include/Directed_Vertex.hpp new file mode 100644 index 00000000..30677d80 --- /dev/null +++ b/include/Directed_Vertex.hpp @@ -0,0 +1,118 @@ + +#ifndef DIRECTED_VERTEX_HPP +#define DIRECTED_VERTEX_HPP + + + +#include "Kmer.hpp" +#include "globals.hpp" +#include "Kmer_Hash_Table.hpp" + +#include + + +// A class denoting an instance of a vertex. It's "directed" in the sense that the k-mer +// observed for the vertex is in a particular orientation — although a vertex `v` has an +// unambiguous canonical k-mer `v_hat`, the vertex can be observed in two different k-mer +// forms: `v_hat` and `{v_hat}_bar` — the class keeps track of the particular k-mer form +// observed for the vertex instance. +template +class Directed_Vertex +{ +private: + + Kmer kmer_; // The observed k-mer for the vertex. + Kmer kmer_bar_; // Reverse complement of the k-mer observed for the vertex. + const Kmer* kmer_hat_ptr; // Pointer to the canonical form of the k-mer associated to the vertex. + uint64_t h; // Hash value of the vertex, i.e. hash of the canonical k-mer. + + // Initialize the data of the class once the observed k-mer `kmer_` is set. + void init(const Kmer_Hash_Table& hash); + + +public: + + // Constructs an empty vertex. + Directed_Vertex() + {} + + // Constructs a vertex observed for the k-mer `kmer`. Gets the hash value of the vertex using + // the hash table `hash`. + Directed_Vertex(const Kmer& kmer, const Kmer_Hash_Table& hash); + + // Returns `true` iff the k-mer observed for the vertex is in its canonical form. + bool in_canonical_form() const; + + // Configures the vertex with the source (i.e. prefix) k-mer of the edge (k + 1)-mer `e`; + // and uses the hash table `hash` to get the hash value of the vertex. + void from_prefix(const Kmer& e, const Kmer_Hash_Table& hash); + + // Configures the vertex with the sink (i.e. suffix) k-mer of the edge (k + 1)-mer `e`; + // and uses the hash table `hash` to get the hash value of the vertex. + void from_suffix(const Kmer& e, const Kmer_Hash_Table& hash); + + // Returns the canonical form of the vertex. + const Kmer& canonical() const; + + // Returns the hash value of the vertex. 
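Like `Endpoint`, the new `Directed_Vertex` stores both orientations of the observed k-mer and only a pointer to whichever one is canonical, so orientation queries reduce to pointer identity. A minimal standalone sketch of that pattern, with plain integers standing in for k-mers:

    #include <cassert>

    struct Directed_Value
    {
        int fwd, bwd;       // stand-ins for the observed k-mer and its reverse complement
        const int* canon;   // points at whichever of the two is canonical (here: the smaller)

        Directed_Value(const int f, const int b): fwd(f), bwd(b), canon(f <= b ? &fwd : &bwd) {}

        bool in_canonical_form() const { return canon == &fwd; }   // orientation from pointer identity

        // NB: since `canon` points into the object itself, a copy must recompute it against its own
        // members; this is why a later patch in the series gives `Directed_Vertex` an explicit copy
        // constructor instead of relying on the default one.
    };

    int main()
    {
        const Directed_Value v(3, 7), w(9, 4);
        assert(v.in_canonical_form() && !w.in_canonical_form());
        assert(*v.canon == 3 && *w.canon == 4);
        return 0;
    }
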
+ uint64_t hash() const; +}; + + +template +inline void Directed_Vertex::init(const Kmer_Hash_Table& hash) +{ + kmer_bar_.as_reverse_complement(kmer_); + kmer_hat_ptr = Kmer::canonical(kmer_, kmer_bar_); + + h = hash(*kmer_hat_ptr); +} + + +template +inline Directed_Vertex::Directed_Vertex(const Kmer& kmer, const Kmer_Hash_Table& hash): + kmer_(kmer) +{ + init(hash); +} + + +template +inline bool Directed_Vertex::in_canonical_form() const +{ + return &kmer_ == kmer_hat_ptr; +} + + +template +inline void Directed_Vertex::from_prefix(const Kmer& e, const Kmer_Hash_Table& hash) +{ + kmer_.from_prefix(e); + init(hash); +} + + +template +inline void Directed_Vertex::from_suffix(const Kmer& e, const Kmer_Hash_Table& hash) +{ + kmer_.from_suffix(e); + init(hash); +} + + +template +inline const Kmer& Directed_Vertex::canonical() const +{ + return *kmer_hat_ptr; +} + + +template +inline uint64_t Directed_Vertex::hash() const +{ + return h; +} + + + +#endif From 9fc67c67b228e2060e6d0bc18ca1ec1f1b9dc0b2 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 19 Apr 2021 18:09:03 -0400 Subject: [PATCH 060/350] Eliminate coexistence of multi parser buffers --- include/Read_CdBG_Constructor.hpp | 17 +++++++++-------- include/Read_CdBG_Extractor.hpp | 6 ++---- include/Task_Params.hpp | 4 +++- include/Thread_Pool.hpp | 9 ++++----- src/Read_CdBG_Constructor.cpp | 26 +++++++++++++------------- src/Read_CdBG_Extractor.cpp | 24 ++++++++++++------------ src/Thread_Pool.cpp | 9 +++++---- 7 files changed, 48 insertions(+), 47 deletions(-) diff --git a/include/Read_CdBG_Constructor.hpp b/include/Read_CdBG_Constructor.hpp index 52553930..9d59cdbe 100644 --- a/include/Read_CdBG_Constructor.hpp +++ b/include/Read_CdBG_Constructor.hpp @@ -24,20 +24,21 @@ class Read_CdBG_Constructor const Build_Params params; // Required parameters (wrapped inside). Kmer_Hash_Table& hash_table; // Hash table for the vertices (canonical k-mers) of the graph. - const Kmer_Container edge_container; // Wrapper container for the edge-database. - Kmer_SPMC_Iterator edge_parser; // Parser for the edges from the edge-database. - + // Members required to keep track of the total number of edges processed across different threads. mutable Spin_Lock lock; mutable uint64_t edges_processed = 0; - // Distributes the DFA-states computation task to the worker threads in the thread pool `thread_pool`. - void distribute_states_computation(Thread_Pool& thread_pool); + // Distributes the DFA-states computation task — disperses the graph edges (i.e. (k + 1)-mers) + // parsed by the parser `edge_parser` to the worker threads in the thread pool `thread_pool`, + // for the edges to be processed by making appropriate state transitions for their endpoints. + void distribute_states_computation(Kmer_SPMC_Iterator* edge_parser, Thread_Pool& thread_pool); - // Processes the edges provided to the thread with id `thread_id`, i.e. makes state-transitions for - // the DFA as per the edges provided to that thread. - void process_edges(uint16_t thread_id); + // Processes the edges provided to the thread with id `thread_id` from the parser `edge_parser`, + // i.e. makes state-transitions for the DFA of the vertices `u` and `v` for each bidirected edge + // `(u, v)` provided to that thread. 
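The consumption side of the parser follows the polling contract visible in `process_edges` and `process_vertices`: `tasks_expected(thread_id)` says whether more items may still arrive for this thread, while `value_at(thread_id, out)` may decline to produce one on a given call, hence the `if` nested inside the `while`. A standalone sketch of that loop shape against a toy stand-in parser:

    #include <cstdint>
    #include <cassert>

    // Toy stand-in for the SPMC parser: serves the numbers 1..n, but only yields a value on
    // every other poll, to mimic the real iterator's "nothing ready yet" case.
    struct Toy_Parser
    {
        uint64_t next = 1, last;
        bool ready = false;

        explicit Toy_Parser(const uint64_t n): last(n) {}

        bool tasks_expected(uint16_t) const { return next <= last; }

        bool value_at(uint16_t, uint64_t& out)
        {
            ready = !ready;
            if(!ready) return false;    // producer not ready: the caller simply polls again
            out = next++;
            return true;
        }
    };

    int main()
    {
        Toy_Parser parser(5);
        uint64_t item, count = 0, sum = 0;

        while(parser.tasks_expected(0))     // same while / if shape as the worker threads use
            if(parser.value_at(0, item))
                count++, sum += item;

        assert(count == 5 && sum == 15);
        return 0;
    }
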
+ void process_edges(Kmer_SPMC_Iterator* edge_parser, uint16_t thread_id); // Adds the information of an incident edge `e` to the side `s` of some vertex `v`, all wrapped // inside the edge-endpoint object `endpoint` — making the appropriate state transitions for the diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 0bbbfea8..0a3051a1 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -23,8 +23,6 @@ class Read_CdBG_Extractor const Build_Params params; // Required parameters (wrapped inside). Kmer_Hash_Table& hash_table; // Hash table for the vertices (i.e. canonical k-mers) of the original (uncompacted) de Bruijn graph. - const Kmer_Container vertex_container; // Wrapper container for the vertex-database. - Kmer_SPMC_Iterator vertex_parser; // Parser for the vertices from the vertex-database. // Members required to keep track of the total number of vertices processed across different worker (i.e. extractor) threads. mutable Spin_Lock lock; @@ -32,11 +30,11 @@ class Read_CdBG_Extractor // Distributes the maximal unitigs extraction task to the worker threads in the thread pool `thread_pool`. - void distribute_unipaths_extraction(Thread_Pool& thread_pool); + void distribute_unipaths_extraction(Kmer_SPMC_Iterator* vertex_parser, Thread_Pool& thread_pool); // Processes the vertices provided to the thread with id `thread_id`, i.e. builds the maximal unitigs from // the flanking vertices provided to that thread. - void process_vertices(uint16_t thread_id); + void process_vertices(Kmer_SPMC_Iterator* vertex_parser, uint16_t thread_id); public: diff --git a/include/Task_Params.hpp b/include/Task_Params.hpp index e9cd8b1d..cf6a9046 100644 --- a/include/Task_Params.hpp +++ b/include/Task_Params.hpp @@ -48,12 +48,14 @@ struct Output_Task_Params // Wrapper over the parameters for the DFA-states computation and the maximal unitigs extraction tasks for read-dBGs. struct Read_dBG_Compaction_Params { + void* parser; uint16_t thread_id; Read_dBG_Compaction_Params() {} - Read_dBG_Compaction_Params(const uint16_t thread_id): + Read_dBG_Compaction_Params(void* const parser, const uint16_t thread_id): + parser(parser), thread_id(thread_id) {} }; diff --git a/include/Thread_Pool.hpp b/include/Thread_Pool.hpp index 76645de4..1ab7eeb8 100644 --- a/include/Thread_Pool.hpp +++ b/include/Thread_Pool.hpp @@ -13,9 +13,6 @@ #include -template class CdBG; - - // A basic thread pool class to support avoidance of latency incurred with frequent // construction and destruction of threads throughout the compaction algorithm. template @@ -97,8 +94,10 @@ class Thread_Pool // Assigns an outputting task to the thread number `thread_id` with the provided parameters. void assign_output_task(uint16_t thread_id, const char* seq, size_t seq_len, size_t left_end, size_t right_end); - // Assigns a read-dBG compaction task (either DFA-states computation or maximal unitigs extraction) to the thread number `thread_id`. - void assign_read_dBG_compaction_task(uint16_t thread_id); + // Assigns a read-dBG compaction task, either DFA-states computation or maximal unitigs extraction, + // to the thread number `thread_id`; the edges (i.e. (k + 1)-mers) or vertices (i.e. k-mers), + // respectively, are parsed using `parser`. + void assign_read_dBG_compaction_task(void* parser, uint16_t thread_id); // Waits until all the threads in the pool have completed their active tasks. 
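`Read_dBG_Compaction_Params` carries the parser as a `void*` because the same pool structure serves both read-dBG tasks, one consuming (k + 1)-mer edges and the other k-mer vertices; each task body casts the pointer back to the concrete iterator type it expects, as `Thread_Pool::task` does when dispatching these tasks. A minimal standalone sketch of that type-erased hand-off, with toy stand-ins for the two parser types:

    #include <cstdint>
    #include <cassert>

    struct Edge_Parser   { uint64_t edges    = 42; };   // stand-in for the (k + 1)-mer iterator
    struct Vertex_Parser { uint64_t vertices = 7;  };   // stand-in for the k-mer iterator

    struct Task_Params { void* parser = nullptr; uint16_t thread_id = 0; };

    // The states-computation task knows it was handed an edge parser, ...
    uint64_t run_states_task(const Task_Params& p)  { return static_cast<Edge_Parser*>(p.parser)->edges; }

    // ... and the extraction task knows it was handed a vertex parser.
    uint64_t run_extract_task(const Task_Params& p) { return static_cast<Vertex_Parser*>(p.parser)->vertices; }

    int main()
    {
        Edge_Parser ep;
        Vertex_Parser vp;

        assert(run_states_task({&ep, 0}) == 42);
        assert(run_extract_task({&vp, 0}) == 7);
        return 0;
    }
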
void wait_completion() const; diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index ac9ab0f9..4c5d05d8 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -8,12 +8,8 @@ template Read_CdBG_Constructor::Read_CdBG_Constructor(const Build_Params& params, Kmer_Hash_Table& hash_table): params(params), - hash_table(hash_table), - edge_container(params.edge_db_path()), - edge_parser(&edge_container, params.thread_count()) -{ - std::cout << "Total number of distinct edges: " << edge_container.size() << ".\n"; -} + hash_table(hash_table) +{} template @@ -27,10 +23,14 @@ void Read_CdBG_Constructor::compute_DFA_states() Thread_Pool thread_pool(thread_count, this, Thread_Pool::Task_Type::compute_states_read_space); // Launch the reading (and parsing per demand) of the edges from disk. + const Kmer_Container edge_container(params.edge_db_path()); // Wrapper container for the edge-database. + Kmer_SPMC_Iterator edge_parser(&edge_container, params.thread_count()); // Parser for the edges from the edge-database. + std::cout << "Total number of distinct edges: " << edge_container.size() << ".\n"; + edge_parser.launch_production(); // Launch (multi-threaded) computation of the states. - distribute_states_computation(thread_pool); + distribute_states_computation(&edge_parser, thread_pool); // Wait for the edges to be depleted from the database. edge_parser.seize_production(); @@ -38,7 +38,7 @@ void Read_CdBG_Constructor::compute_DFA_states() // Wait for the consumer threads to finish parsing and processing the edges. thread_pool.close(); - std::cout << "Number of processed egdes: " << edges_processed << "\n"; + std::cout << "Number of processed edges: " << edges_processed << "\n"; std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); @@ -48,20 +48,20 @@ void Read_CdBG_Constructor::compute_DFA_states() template -void Read_CdBG_Constructor::distribute_states_computation(Thread_Pool& thread_pool) +void Read_CdBG_Constructor::distribute_states_computation(Kmer_SPMC_Iterator* const edge_parser, Thread_Pool& thread_pool) { const uint16_t thread_count = params.thread_count(); for(uint16_t t_id = 0; t_id < thread_count; ++t_id) { const uint16_t idle_thread_id = thread_pool.get_idle_thread(); - thread_pool.assign_read_dBG_compaction_task(idle_thread_id); + thread_pool.assign_read_dBG_compaction_task(edge_parser, idle_thread_id); } } template -void Read_CdBG_Constructor::process_edges(const uint16_t thread_id) +void Read_CdBG_Constructor::process_edges(Kmer_SPMC_Iterator* const edge_parser, const uint16_t thread_id) { // Data locations to be reused per each edge processed. Edge e; // For the edges to be processed one-by-one. @@ -71,8 +71,8 @@ void Read_CdBG_Constructor::process_edges(const uint16_t thread_id) uint64_t edge_count = 0; // Number of edges processed by this thread. - while(edge_parser.tasks_expected(thread_id)) - if(edge_parser.value_at(thread_id, e.e())) + while(edge_parser->tasks_expected(thread_id)) + if(edge_parser->value_at(thread_id, e.e())) { e.configure(hash_table); // A new edge (k + 1)-mer has been parsed; set information for its two endpoints. 
diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 8d747ca6..fc493b62 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -5,12 +5,8 @@ template Read_CdBG_Extractor::Read_CdBG_Extractor(const Build_Params& params, Kmer_Hash_Table& hash_table): params(params), - hash_table(hash_table), - vertex_container(params.vertex_db_path()), - vertex_parser(&vertex_container, params.thread_count()) -{ - std::cout << "Number of distinct vertices: " << vertex_container.size() << ".\n"; -} + hash_table(hash_table) +{} template @@ -24,10 +20,14 @@ void Read_CdBG_Extractor::extract_maximal_unitigs() Thread_Pool thread_pool(thread_count, this, Thread_Pool::Task_Type::extract_unipaths_read_space); // Launch the reading (and parsing per demand) of the vertices from disk. + const Kmer_Container vertex_container(params.vertex_db_path()); // Wrapper container for the vertex-database. + Kmer_SPMC_Iterator vertex_parser(&vertex_container, params.thread_count()); // Parser for the vertices from the vertex-database. + std::cout << "Number of distinct vertices: " << vertex_container.size() << ".\n"; + vertex_parser.launch_production(); // Launch (multi-thread) extraction of the maximal unitigs. - distribute_unipaths_extraction(thread_pool); + distribute_unipaths_extraction(&vertex_parser, thread_pool); // Wait for the vertices to be deplted from the database. vertex_parser.seize_production(); @@ -45,26 +45,26 @@ void Read_CdBG_Extractor::extract_maximal_unitigs() template -void Read_CdBG_Extractor::distribute_unipaths_extraction(Thread_Pool& thread_pool) +void Read_CdBG_Extractor::distribute_unipaths_extraction(Kmer_SPMC_Iterator* const vertex_parser, Thread_Pool& thread_pool) { const uint16_t thread_count = params.thread_count(); for(uint16_t t_id = 0; t_id < thread_count; ++t_id) { const uint16_t idle_thread_id = thread_pool.get_idle_thread(); - thread_pool.assign_read_dBG_compaction_task(idle_thread_id); + thread_pool.assign_read_dBG_compaction_task(vertex_parser, idle_thread_id); } } template -void Read_CdBG_Extractor::process_vertices(const uint16_t thread_id) +void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const vertex_parser, const uint16_t thread_id) { Kmer v; uint64_t vertex_count = 0; - while(vertex_parser.tasks_expected(thread_id)) - if(vertex_parser.value_at(thread_id, v)) + while(vertex_parser->tasks_expected(thread_id)) + if(vertex_parser->value_at(thread_id, v)) { vertex_count++; } diff --git a/src/Thread_Pool.cpp b/src/Thread_Pool.cpp index a3382e40..a25c3cb7 100644 --- a/src/Thread_Pool.cpp +++ b/src/Thread_Pool.cpp @@ -1,5 +1,6 @@ #include "Thread_Pool.hpp" +#include "Kmer_SPMC_Iterator.hpp" #include "CdBG.hpp" #include "Read_CdBG_Constructor.hpp" #include "Read_CdBG_Extractor.hpp" @@ -92,14 +93,14 @@ void Thread_Pool::task(const uint16_t thread_id) case Task_Type::compute_states_read_space: { const Read_dBG_Compaction_Params& params = read_dBG_compaction_params[thread_id]; - static_cast*>(dBG)->process_edges(params.thread_id); + static_cast*>(dBG)->process_edges(static_cast*>(params.parser), params.thread_id); } break; case Task_Type::extract_unipaths_read_space: { const Read_dBG_Compaction_Params& params = read_dBG_compaction_params[thread_id]; - static_cast*>(dBG)->process_vertices(params.thread_id); + static_cast*>(dBG)->process_vertices(static_cast*>(params.parser), params.thread_id); } } @@ -153,9 +154,9 @@ void Thread_Pool::assign_output_task(const uint16_t thread_id, const char* co template -void 
Thread_Pool::assign_read_dBG_compaction_task(const uint16_t thread_id) +void Thread_Pool::assign_read_dBG_compaction_task(void* const parser, const uint16_t thread_id) { - read_dBG_compaction_params[thread_id] = Read_dBG_Compaction_Params(thread_id); + read_dBG_compaction_params[thread_id] = Read_dBG_Compaction_Params(parser, thread_id); assign_task(thread_id); } From 6e18fadefa640049a985c33ff7ae2e3acf455c15 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 19 Apr 2021 19:00:24 -0400 Subject: [PATCH 061/350] Revisit descriptor comments for extractor --- include/Read_CdBG_Extractor.hpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 0a3051a1..fa67a6b3 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -29,11 +29,14 @@ class Read_CdBG_Extractor mutable uint64_t vertices_processed = 0; - // Distributes the maximal unitigs extraction task to the worker threads in the thread pool `thread_pool`. + // Distributes the maximal unitigs extraction task — disperses the graph vertices (i.e. k-mers) + // parsed by the parser `vertex_parser` to the worker threads in the thread pool `thread_pool`, + // for the unitpath-flanking vertices to be identified and the corresponding unipaths to be extracted. void distribute_unipaths_extraction(Kmer_SPMC_Iterator* vertex_parser, Thread_Pool& thread_pool); - // Processes the vertices provided to the thread with id `thread_id`, i.e. builds the maximal unitigs from - // the flanking vertices provided to that thread. + // Processes the vertices provided to the thread with id `thread_id` from the parser `vertex_parser`, + // i.e. for each vertex `v` provided to that thread, identifies whether it is a unipath-flanking + // vertex, and if it is, then piece-wise constructs the corresponding unipath. void process_vertices(Kmer_SPMC_Iterator* vertex_parser, uint16_t thread_id); From ac92b1d18ce56a029238f5f4eb8841bc62f40210 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 19 Apr 2021 21:55:28 -0400 Subject: [PATCH 062/350] Add flanking status computer --- include/Read_CdBG_Extractor.hpp | 49 +++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index fa67a6b3..e7eeb460 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -39,6 +39,26 @@ class Read_CdBG_Extractor // vertex, and if it is, then piece-wise constructs the corresponding unipath. void process_vertices(Kmer_SPMC_Iterator* vertex_parser, uint16_t thread_id); + // Returns `true` iff some vertex `v` with the provided state `state` is a flanking vertex for + // the maximal unitig containing it. If it is, then stores the side of `v` that is opposite to + // the flanking side, i.e. the side extending the unipath `p` containing `v`, to `unipath_side`. + // If `p` is trivial, then the side stored is implementation-specific. + // NB: unless for flanking vertices that are branching, this function cannot possibly be defined + // — for non-branching vertices, it's not possible to compute whether they are flanking solely + // from their states. Nevertheless, given that the information-discarding heuristic for branching + // k-mers has been implemented in the DFA states computation phase, this method correctly computes + // the flanking-status for some vertex from just its state. 
+ static bool is_flanking_state(State_Read_Space state, cuttlefish::side_t& unipath_side); + + // Returns `true` iff the vertex-side `side` for a vertex with state `state` flanks the maximal + // unitig containing the vertex. + // NB: this method is only applicable when information-discarding is propagated to the neighbors + // from branching vertices. In absence of the implementation for the heuristic, this function + // cannot possibly be defined from solely the parameters `state` and `side` — for non-branching + // vertices, it's not possible to compute whether some side of them they are flanking solely + // from their states — a vertex `v` is also required. + static bool is_flanking_side(State_Read_Space state, cuttlefish::side_t side); + public: @@ -51,5 +71,34 @@ class Read_CdBG_Extractor }; +template +inline bool Read_CdBG_Extractor::is_flanking_state(const State_Read_Space state, cuttlefish::side_t& unipath_side) +{ + if(is_flanking_side(state, cuttlefish::side_t::front)) + { + unipath_side = cuttlefish::side_t::back; + return true; + } + + if(is_flanking_side(state, cuttlefish::side_t::back)) + { + unipath_side = cuttlefish::side_t::front; + return true; + } + + + return false; +} + + +template +inline bool Read_CdBG_Extractor::is_flanking_side(const State_Read_Space state, const cuttlefish::side_t side) +{ + const cuttlefish::edge_encoding_t edge = state.edge_at(side); + + return edge == cuttlefish::edge_encoding_t::N || edge == cuttlefish::edge_encoding_t::E; +} + + #endif From 853a76082484fb1714042e6f30d3b22d4aaaba14 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Wed, 21 Apr 2021 18:45:53 -0400 Subject: [PATCH 063/350] Correct documentation --- include/Read_CdBG_Extractor.hpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index e7eeb460..c2f06aae 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -40,15 +40,15 @@ class Read_CdBG_Extractor void process_vertices(Kmer_SPMC_Iterator* vertex_parser, uint16_t thread_id); // Returns `true` iff some vertex `v` with the provided state `state` is a flanking vertex for - // the maximal unitig containing it. If it is, then stores the side of `v` that is opposite to - // the flanking side, i.e. the side extending the unipath `p` containing `v`, to `unipath_side`. - // If `p` is trivial, then the side stored is implementation-specific. + // the maximal unitig containing it. // NB: unless for flanking vertices that are branching, this function cannot possibly be defined // — for non-branching vertices, it's not possible to compute whether they are flanking solely - // from their states. Nevertheless, given that the information-discarding heuristic for branching - // k-mers has been implemented in the DFA states computation phase, this method correctly computes - // the flanking-status for some vertex from just its state. - static bool is_flanking_state(State_Read_Space state, cuttlefish::side_t& unipath_side); + // from their states. Nevertheless, given that the heuristic of propagation of information- + // discarding from branching k-mers has been implemented in the DFA states computation phase, + // this method correctly computes the flanking-status for some vertex from just its state. + // (The information-discarding propagation heuristic turns the flanking non-branching vertices + // into branching ones.) 
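With the discard-propagation heuristic in place, the flanking test really is decidable from the state alone: a side flanks its maximal unitig exactly when it records no usable edge, i.e. its encoding is `E` (never seen) or `N` (discarded), and a vertex flanks when either of its sides does. A self-contained sketch of the two predicates, again with stand-in types:

    #include <cassert>

    enum class Edge_Enc { A, C, G, T, N /* discarded */, E /* empty */ };
    enum class Side { front, back };

    struct State { Edge_Enc front_enc, back_enc; };   // stand-in for State_Read_Space

    bool is_flanking_side(const State& s, const Side side)
    {
        const Edge_Enc e = (side == Side::front ? s.front_enc : s.back_enc);
        return e == Edge_Enc::N || e == Edge_Enc::E;    // no unique extension through this side
    }

    bool is_flanking_state(const State& s)              // the vertex flanks its maximal unitig
    {
        return is_flanking_side(s, Side::front) || is_flanking_side(s, Side::back);
    }

    int main()
    {
        assert( is_flanking_state({Edge_Enc::N, Edge_Enc::G}));    // front discarded: the unitig ends here
        assert(!is_flanking_state({Edge_Enc::A, Edge_Enc::G}));    // unique edges on both sides: internal vertex
        return 0;
    }
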
+ static bool is_flanking_state(State_Read_Space state); // Returns `true` iff the vertex-side `side` for a vertex with state `state` flanks the maximal // unitig containing the vertex. From b1528a93beda47279fb5ae1152d3158cbd9547d2 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Wed, 21 Apr 2021 19:24:04 -0400 Subject: [PATCH 064/350] Better compute reverse complement --- include/Kmer.hpp | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/include/Kmer.hpp b/include/Kmer.hpp index 25ea8fe9..757df638 100644 --- a/include/Kmer.hpp +++ b/include/Kmer.hpp @@ -402,17 +402,7 @@ inline Kmer Kmer::reverse_complement() const Kmer kmer(*this); Kmer rev_compl; - - constexpr uint64_t mask_LSN = uint64_t(0b11); - - for(uint16_t idx = 0; idx < k; ++idx) - { - rev_compl.left_shift(); - rev_compl.kmer_data[0] |= complement(DNA::Base(kmer.kmer_data[0] & mask_LSN)); - - kmer.right_shift(); - } - + rev_compl.as_reverse_complement(*this); return rev_compl; } From 77767314968d04b2056564abaec77b169b197232 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Wed, 21 Apr 2021 22:39:25 -0400 Subject: [PATCH 065/350] Rectify partial commit --- include/Kmer.hpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/Kmer.hpp b/include/Kmer.hpp index 757df638..1d68aab5 100644 --- a/include/Kmer.hpp +++ b/include/Kmer.hpp @@ -398,9 +398,6 @@ inline void Kmer::from_suffix(const Kmer& k_plus_1_mer) template inline Kmer Kmer::reverse_complement() const { - // TODO: define the method using `as_reverse_complement`. - - Kmer kmer(*this); Kmer rev_compl; rev_compl.as_reverse_complement(*this); From e8732ba83802862cb8bd03d711e53282b38d35e7 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 22 Apr 2021 11:56:07 -0400 Subject: [PATCH 066/350] Better code --- include/Annotated_Kmer.hpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/include/Annotated_Kmer.hpp b/include/Annotated_Kmer.hpp index 30ed5532..1db05191 100644 --- a/include/Annotated_Kmer.hpp +++ b/include/Annotated_Kmer.hpp @@ -71,10 +71,7 @@ inline void Annotated_Kmer::roll_to_next_kmer(const char next_base, const Kme template inline void Annotated_Kmer::operator=(const Annotated_Kmer& rhs) { - this->kmer_ = rhs.kmer_; - this->rev_compl_ = rhs.rev_compl_; - this->canonical_ = rhs.canonical_; - this->dir_ = rhs.dir_; + Directed_Kmer::operator=(rhs); idx_ = rhs.idx_; state_class_ = rhs.state_class_; From 28023ce82590d6f51a4f421eb56dc6bec3ffd163 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 22 Apr 2021 17:30:05 -0400 Subject: [PATCH 067/350] Add extraction algorithm --- include/DNA_Utility.hpp | 12 +++++++ include/Directed_Vertex.hpp | 60 ++++++++++++++++++++++++++++++++ include/Kmer.hpp | 12 +++++++ include/Kmer_Hash_Entry_API.hpp | 14 +++++--- include/Read_CdBG_Extractor.hpp | 61 +++++++++++++++++++++------------ src/DNA_Utility.cpp | 1 + src/Read_CdBG_Extractor.cpp | 52 ++++++++++++++++++++++++++-- 7 files changed, 185 insertions(+), 27 deletions(-) diff --git a/include/DNA_Utility.hpp b/include/DNA_Utility.hpp index 7225017e..641d006f 100644 --- a/include/DNA_Utility.hpp +++ b/include/DNA_Utility.hpp @@ -41,6 +41,12 @@ class DNA_Utility T, G, C, A, N }; + // Mapped ASCII characters for the `DNA::Base` notations. 
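`MAPPED_CHAR`, defined just below, is the decoding counterpart of the existing character-to-base mapping: it turns a 2-bit base code back into its ASCII letter, which the extractor needs when spelling unitigs out as strings. A standalone sketch of decoding a packed k-mer with such a table (k <= 32, one 64-bit word, most significant base first):

    #include <cstdint>
    #include <string>
    #include <cassert>

    constexpr char MAPPED_CHAR[4] = { 'A', 'C', 'G', 'T' };   // same table as below

    std::string decode(uint64_t kmer, const unsigned k)
    {
        std::string label(k, 'N');
        for(unsigned i = 0; i < k; ++i, kmer >>= 2)
            label[k - 1 - i] = MAPPED_CHAR[kmer & 0x3];       // peel bases off the low end

        return label;
    }

    int main()
    {
        assert(decode(0b00011011, 4) == "ACGT");    // A = 0, C = 1, G = 2, T = 3
        return 0;
    }
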
+ static constexpr char MAPPED_CHAR[4] = + { + 'A', 'C', 'G', 'T' + }; + // DNA-complement characters for the ASCII characters in the range [0, 127] static constexpr char COMPLEMENTED_CHAR[128] = { @@ -127,6 +133,12 @@ class DNA_Utility { return COMPLEMENTED_BASE[base]; } + + // Returns the mapping character of the nucleobase `base`. + static char map_char(const DNA::Base base) + { + return MAPPED_CHAR[static_cast(base)]; + } // Returns the DNA-complement (upper-case) character of the character `base`. static char complement(const char base) diff --git a/include/Directed_Vertex.hpp b/include/Directed_Vertex.hpp index 30677d80..d491bb2b 100644 --- a/include/Directed_Vertex.hpp +++ b/include/Directed_Vertex.hpp @@ -40,6 +40,9 @@ class Directed_Vertex // the hash table `hash`. Directed_Vertex(const Kmer& kmer, const Kmer_Hash_Table& hash); + // Copy constructs the vertex from `rhs`. + Directed_Vertex(const Directed_Vertex& rhs); + // Returns `true` iff the k-mer observed for the vertex is in its canonical form. bool in_canonical_form() const; @@ -51,11 +54,28 @@ class Directed_Vertex // and uses the hash table `hash` to get the hash value of the vertex. void from_suffix(const Kmer& e, const Kmer_Hash_Table& hash); + // Returns the observed k-mer for the vertex. + const Kmer& kmer() const; + + // Returns the reverse complement of the observed k-mer for the vertex. + const Kmer& kmer_bar() const; + // Returns the canonical form of the vertex. const Kmer& canonical() const; // Returns the hash value of the vertex. uint64_t hash() const; + + // Transforms this vertex to another by chopping off the first base from the associated + // observed k-mer, and appending the nucleobase `b` to the end, i.e. effecitively + // rolling the associated k-mer by one base "forward". The hash table `hash` is used + // to get the hash value of the new vertex. + void roll_forward(cuttlefish::base_t b, const Kmer_Hash_Table& hash); + + // Returns the side of the vertex which is to be the incidence side of some bidirected + // edge instance if this vertex instance were to be the source vertex (i.e. prefix k-mer) + // of that edge. + cuttlefish::side_t exit_side() const; }; @@ -77,6 +97,15 @@ inline Directed_Vertex::Directed_Vertex(const Kmer& kmer, const Kmer_Hash_ } +template +inline Directed_Vertex::Directed_Vertex(const Directed_Vertex& rhs): + kmer_(rhs.kmer_), + kmer_bar_(rhs.kmer_bar_), + kmer_hat_ptr(Kmer::canonical(kmer_, kmer_bar_)), + h(rhs.h) +{} + + template inline bool Directed_Vertex::in_canonical_form() const { @@ -100,6 +129,20 @@ inline void Directed_Vertex::from_suffix(const Kmer& e, const Kmer_Has } +template +inline const Kmer& Directed_Vertex::kmer() const +{ + return kmer_; +} + + +template +inline const Kmer& Directed_Vertex::kmer_bar() const +{ + return kmer_bar_; +} + + template inline const Kmer& Directed_Vertex::canonical() const { @@ -114,5 +157,22 @@ inline uint64_t Directed_Vertex::hash() const } +template +inline void Directed_Vertex::roll_forward(const cuttlefish::base_t b, const Kmer_Hash_Table& hash) +{ + kmer_.roll_to_next_kmer(b, kmer_bar_); + kmer_hat_ptr = Kmer::canonical(kmer_, kmer_bar_); + + h = hash(*kmer_hat_ptr); +} + + +template +inline cuttlefish::side_t Directed_Vertex::exit_side() const +{ + return &kmer_ == kmer_hat_ptr ? 
cuttlefish::side_t::back : cuttlefish::side_t::front; +} + + #endif diff --git a/include/Kmer.hpp b/include/Kmer.hpp index 1d68aab5..43a5111d 100644 --- a/include/Kmer.hpp +++ b/include/Kmer.hpp @@ -124,6 +124,11 @@ class Kmer: public DNA_Utility // Returns true iff this k-mer is not identical to the other k-mer `rhs`. bool operator!=(const Kmer& rhs) const; + // Returns true iff the bitwise encoding of this k-mer is either equal + // or greater to the other k-mer `rhs`. The current encoding corresponds + // to the lexical ordering. + bool operator>=(const Kmer& rhs) const; + // Returns the `DNA::Base` (2-bit) encoding of the character at the front, // i.e. at the first index of the literal representation. For a k-mer // `n_{k - 1} ... n_1 n_0`, this is the base `n_{k - 1}`. @@ -466,6 +471,13 @@ inline bool Kmer::operator!=(const Kmer& rhs) const } +template +inline bool Kmer::operator>=(const Kmer& rhs) const +{ + return !operator<(rhs); +} + + template inline DNA::Base Kmer::front() const { diff --git a/include/Kmer_Hash_Entry_API.hpp b/include/Kmer_Hash_Entry_API.hpp index 189aa1df..06b0cdcb 100644 --- a/include/Kmer_Hash_Entry_API.hpp +++ b/include/Kmer_Hash_Entry_API.hpp @@ -91,14 +91,14 @@ class Kmer_Hash_Entry_API const State_Read_Space state_read; // Value read from the bitvector entry when the object is constructed; is mutable. - State_Read_Space state; + State_Read_Space state_; // Constructs an API to the bitvector entry `bv_entry`. Kmer_Hash_Entry_API(const bitvector_entry_t& bv_entry): bv_entry(bv_entry), state_read(bv_entry) { - state = state_read; + state_ = state_read; } // Returns the state value read when the object was constructed. @@ -112,7 +112,7 @@ class Kmer_Hash_Entry_API // possibly have been modified. cuttlefish::state_code_t get_current_state() const { - return state.get_state(); + return state_.get_state(); } @@ -121,7 +121,13 @@ class Kmer_Hash_Entry_API // Returns a reference to the mutable copy of the wrapped state value. State_Read_Space& get_state() { - return state; + return state_; + } + + // Returns a copy of the wrapped state value. + State_Read_Space state() const + { + return state_; } }; diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index c2f06aae..b995063b 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -25,8 +25,10 @@ class Read_CdBG_Extractor Kmer_Hash_Table& hash_table; // Hash table for the vertices (i.e. canonical k-mers) of the original (uncompacted) de Bruijn graph. // Members required to keep track of the total number of vertices processed across different worker (i.e. extractor) threads. - mutable Spin_Lock lock; - mutable uint64_t vertices_processed = 0; + mutable Spin_Lock lock; // Mutual exclusion lock to access various unique resources by threads spawned off this class' methods. + mutable uint64_t vertices_processed = 0; // Total number of vertices scanned from the database. + + uint64_t unipath_count = 0; // Total number of maximal unitigs extracted from the underlying graph. // Distributes the maximal unitigs extraction task — disperses the graph vertices (i.e. k-mers) @@ -39,24 +41,42 @@ class Read_CdBG_Extractor // vertex, and if it is, then piece-wise constructs the corresponding unipath. void process_vertices(Kmer_SPMC_Iterator* vertex_parser, uint16_t thread_id); - // Returns `true` iff some vertex `v` with the provided state `state` is a flanking vertex for - // the maximal unitig containing it. 
- // NB: unless for flanking vertices that are branching, this function cannot possibly be defined - // — for non-branching vertices, it's not possible to compute whether they are flanking solely - // from their states. Nevertheless, given that the heuristic of propagation of information- - // discarding from branching k-mers has been implemented in the DFA states computation phase, - // this method correctly computes the flanking-status for some vertex from just its state. - // (The information-discarding propagation heuristic turns the flanking non-branching vertices - // into branching ones.) - static bool is_flanking_state(State_Read_Space state); - - // Returns `true` iff the vertex-side `side` for a vertex with state `state` flanks the maximal - // unitig containing the vertex. - // NB: this method is only applicable when information-discarding is propagated to the neighbors - // from branching vertices. In absence of the implementation for the heuristic, this function - // cannot possibly be defined from solely the parameters `state` and `side` — for non-branching - // vertices, it's not possible to compute whether some side of them they are flanking solely - // from their states — a vertex `v` is also required. + // Extracts the maximal unitig `p` that is flanked by the vertex `v_hat` and connects to `v_hat` + // through its side `s_v_hat`. Returns `true` iff the extraction is successful, which happens when + // the k-mer `v_hat` is the first k-mer in the canonical form of `p`. Thus encountering a maximal + // unitig in its non-canonical form results in a failed extraction. + bool extract_maximal_unitig(const Kmer& v_hat, cuttlefish::side_t s_v_hat); + + // Note: The following methods are only applicable when the heuristic of information-discarding + // from branching vertices to their neighbors has been implemented in the DFA states computation + // phase. In the general case, these functions with their specified input parameters and their + // intended output values can not possibly be defined. Given just the state of a vertex, unless + // for flanking vertices that are branching, it's not possible to determine — + // i. whether it is a maximal unitig flanking vertex; + // ii. whether some specific side of it is a flanking side; + // iii. and which side of it may connect to the containing maximal unitig. + // The vertex itself, along with the hash table containing the DFA states, are required in general + // — so that the neighboring vertices can also be probed to answer these queries. Nevertheless, + // given that the heuristic of propagation of information-discarding from branching vertices has + // been implemented in the DFA states computation phase, the following method definitions can + // correctly respond to the queries given just the state. This is because the heuristic transforms + // the flanking non-branching vertices into branching ones. Thus, although their states are not + // technically "correct" as per the theoretical model — we are throwing away more information from + // the model than it already is doing — this does not affect the output for the purposes of maximal + // unitigs extraction. + + // Returns `true` iff some vertex `v` with the provided state `state` is a flanking vertex for the + // maximal unitig `p` containing it. If yes, then stores the side of `v` to `unipath_side` through + // which `v` is connected to `p`. If `p` is trivial, i.e. 
`p = v`, then the returned side is `back`, + // — necessitated by an optimization for the extraction of unipaths in their canonical forms. + // NB: this method is only applicable if the heuristic of information-propagation from branching vertices + // has been implemented in the DFA states computation phase. See the detailed comment in the class body. + static bool is_flanking_state(State_Read_Space state, cuttlefish::side_t& unipath_side); + + // Returns `true` iff the vertex-side `side` for a vertex with state `state` flanks the maximal unitig + // containing the vertex. + // NB: this method is only applicable if the heuristic of information-propagation from branching vertices + // has been implemented in the DFA states computation phase. See the detailed comment in the class body. static bool is_flanking_side(State_Read_Space state, cuttlefish::side_t side); @@ -86,7 +106,6 @@ inline bool Read_CdBG_Extractor::is_flanking_state(const State_Read_Space sta return true; } - return false; } diff --git a/src/DNA_Utility.cpp b/src/DNA_Utility.cpp index 625f8246..ccbdc7b5 100644 --- a/src/DNA_Utility.cpp +++ b/src/DNA_Utility.cpp @@ -9,3 +9,4 @@ constexpr bool DNA_Utility::IS_PLACEHOLDER[128]; constexpr uint8_t DNA_Utility::REVERSE_COMPLEMENT_BYTE[256]; constexpr DNA::Extended_Base DNA_Utility::MAPPED_EXTENDED_BASE[4]; constexpr DNA::Base DNA_Utility::REVERSE_MAPPED_EXTENDED_BASE[5]; +constexpr char DNA_Utility::MAPPED_CHAR[4]; diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index fc493b62..d7eb4f43 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -1,5 +1,6 @@ #include "Read_CdBG_Extractor.hpp" +#include "Directed_Vertex.hpp" template @@ -36,6 +37,7 @@ void Read_CdBG_Extractor::extract_maximal_unitigs() thread_pool.close(); std::cout << "Number of processed vertices: " << vertices_processed << ".\n"; + std::cout << "Number of unipaths extracted: " << unipath_count << "\n"; std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); @@ -60,22 +62,68 @@ void Read_CdBG_Extractor::distribute_unipaths_extraction(Kmer_SPMC_Iterator void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const vertex_parser, const uint16_t thread_id) { - Kmer v; - uint64_t vertex_count = 0; + // Data structures to be reused per each vertex processed. + Kmer v; // For the vertex to be processed one-by-one. + cuttlefish::side_t s_v; // The side of the vertex `v` that connects it to the maximal unitig containing it, if `v` is flanking. + State_Read_Space state; // State of the vertex `v`. + + uint64_t vertex_count = 0; // Number of vertices scanned by this thread. + uint64_t unipaths_extracted = 0; // Number of maximal unitigs successfully extracted by this thread, in the canonical form. while(vertex_parser->tasks_expected(thread_id)) if(vertex_parser->value_at(thread_id, v)) { + state = hash_table[v].state(); + + if(is_flanking_state(state, s_v)) + if(extract_maximal_unitig(v, s_v)) + unipaths_extracted++; + vertex_count++; } lock.lock(); std::cout << "Thread " << thread_id << " processed " << vertex_count << " vertices.\n"; // TODO: remove. vertices_processed += vertex_count; + unipath_count += unipaths_extracted; lock.unlock(); } +template +bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const cuttlefish::side_t s_v_hat) +{ + // Data structures to be reused per each vertex extension of the maximal unitig. 
+ cuttlefish::side_t s_v = s_v_hat; // The side of the current vertex `v` through which to extend the maximal unitig, i.e. exit `v`. + Directed_Vertex v(s_v == cuttlefish::side_t::back ? v_hat : v_hat.reverse_complement(), hash_table); // Current vertex being added to the maximal unitig. + State_Read_Space state = hash_table[v.hash()].state(); // State of the vertex `v`. + cuttlefish::edge_encoding_t e_v; // The next edge from `v` to include into the maximal unitig. + cuttlefish::base_t b_ext; // The nucleobase corresponding to the edge `e_v` and the exiting side `s_v` from `v` to add to the literal maximal unitig. + + const Directed_Vertex init_vertex(v); + // std::string unipath(init_vertex.kmer().string_label()); + + while(!is_flanking_side(state, s_v)) + { + e_v = state.edge_at(s_v); + b_ext = (s_v == cuttlefish::side_t::back ? DNA_Utility::map_base(e_v) : DNA_Utility::complement(DNA_Utility::map_base(e_v))); + + v.roll_forward(b_ext, hash_table); + s_v = v.exit_side(); + state = hash_table[v.hash()].state(); + + // unipath += Kmer::map_char(b_ext); + } + + + if(init_vertex.kmer() >= v.kmer_bar()) + return false; + + // TODO: Output the built maximal unitig. + return true; +} + + // Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Read_CdBG_Extractor) From d22538c4405b1a36ee8f779bb19370bca7e67efd Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Sun, 25 Apr 2021 18:09:35 -0400 Subject: [PATCH 068/350] Reduce hash lookup through output marking --- include/Read_CdBG_Extractor.hpp | 26 ++++++++++++++++++++++++++ include/State_Read_Space.hpp | 9 +++++++++ src/Read_CdBG_Extractor.cpp | 23 +++++++++++++++++++---- 3 files changed, 54 insertions(+), 4 deletions(-) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index b995063b..2825bcc3 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -8,6 +8,7 @@ #include "Kmer_Hash_Table.hpp" #include "Kmer_Container.hpp" #include "Kmer_SPMC_Iterator.hpp" +#include "Directed_Vertex.hpp" #include "Build_Params.hpp" #include "Spin_Lock.hpp" #include "Thread_Pool.hpp" @@ -47,6 +48,14 @@ class Read_CdBG_Extractor // unitig in its non-canonical form results in a failed extraction. bool extract_maximal_unitig(const Kmer& v_hat, cuttlefish::side_t s_v_hat); + // Marks the vertex `v` as outputted, and returns `true` iff the hash table update is successful. + bool mark_vertex(const Directed_Vertex& v); + + // Marks the two endpoint vertices of a maximal unitig `p` as outputted: the first vertex in the + // canonical form of `p`, `sign_vertex`, and the last vertex in the form, `cosign_vertex`; returns + // `true` iff the corresponding hash table updates are successful. + bool mark_flanking_vertices(const Directed_Vertex& sign_vertex, const Directed_Vertex& cosign_vertex); + // Note: The following methods are only applicable when the heuristic of information-discarding // from branching vertices to their neighbors has been implemented in the DFA states computation // phase. 
In the general case, these functions with their specified input parameters and their @@ -91,6 +100,23 @@ class Read_CdBG_Extractor }; +template +inline bool Read_CdBG_Extractor::mark_vertex(const Directed_Vertex& v) +{ + Kmer_Hash_Entry_API bucket = hash_table[v.hash()]; + bucket.get_state().mark_outputted(); + + return hash_table.update(bucket); +} + + +template +inline bool Read_CdBG_Extractor::mark_flanking_vertices(const Directed_Vertex& sign_vertex, const Directed_Vertex& cosign_vertex) +{ + return mark_vertex(sign_vertex) && (sign_vertex.hash() == cosign_vertex.hash() || mark_vertex(cosign_vertex)); +} + + template inline bool Read_CdBG_Extractor::is_flanking_state(const State_Read_Space state, cuttlefish::side_t& unipath_side) { diff --git a/include/State_Read_Space.hpp b/include/State_Read_Space.hpp index de31cc90..a686fa5e 100644 --- a/include/State_Read_Space.hpp +++ b/include/State_Read_Space.hpp @@ -69,6 +69,9 @@ class State_Read_Space // behavior: empty-to-rest and unique-to-multi. void update_edge_at(cuttlefish::side_t side, cuttlefish::edge_encoding_t edge); + // Marks the state as already been outputted. + void mark_outputted(); + // Returns `true` iff the underlying code is the same as that one of `rhs`. bool operator==(const State_Read_Space& rhs) const; }; @@ -120,6 +123,12 @@ inline void State_Read_Space::update_edge_at(const cuttlefish::side_t side, cons } +inline void State_Read_Space::mark_outputted() +{ + code = OUTPUTTED; +} + + inline bool State_Read_Space::operator==(const State_Read_Space& rhs) const { return code == rhs.code; diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index d7eb4f43..53337f5f 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -1,6 +1,5 @@ #include "Read_CdBG_Extractor.hpp" -#include "Directed_Vertex.hpp" template @@ -75,7 +74,7 @@ void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const verte { state = hash_table[v].state(); - if(is_flanking_state(state, s_v)) + if(!state.is_outputted() && is_flanking_state(state, s_v)) if(extract_maximal_unitig(v, s_v)) unipaths_extracted++; @@ -103,8 +102,15 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const const Directed_Vertex init_vertex(v); // std::string unipath(init_vertex.kmer().string_label()); - while(!is_flanking_side(state, s_v)) + while(true) { + if(state.is_outputted()) // The opposite end of the maximal unitig has been reached, and the unitig is found to have already been outputted. + return false; + + if(is_flanking_side(state, s_v)) + break; + + e_v = state.edge_at(s_v); b_ext = (s_v == cuttlefish::side_t::back ? DNA_Utility::map_base(e_v) : DNA_Utility::complement(DNA_Utility::map_base(e_v))); @@ -115,10 +121,19 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const // unipath += Kmer::map_char(b_ext); } + const Directed_Vertex& term_vertex = v; + - if(init_vertex.kmer() >= v.kmer_bar()) + if(init_vertex.kmer() >= term_vertex.kmer_bar()) // The maximal unitig has been encountered in its non-canonical form. return false; + // Mark the flanking vertices as outputted. + if(!mark_flanking_vertices(init_vertex, term_vertex)) + { + std::cerr << "Hash table update failed while marking some flanking vertices as outputted. Aborting.\n"; + std::exit(EXIT_FAILURE); + } + // TODO: Output the built maximal unitig. 
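//  [Annotation, not part of the original patch: `mark_vertex` above presumably succeeds only if
//  the targeted hash-table entry has not changed between its read and its write-back (the usual
//  read-modify-write update pattern of the `Kmer_Hash_Entry_API`), which is what makes the
//  "outputted" marking safe to attempt from multiple extractor threads concurrently.]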
return true; } From 61b7a6f6eff60fec2162d3b2447de1d900dd8fa6 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Sun, 25 Apr 2021 18:55:20 -0400 Subject: [PATCH 069/350] Reduce hash lookup by traversing unipaths only once (expected) --- include/Read_CdBG_Extractor.hpp | 16 +++++++++++----- src/Read_CdBG_Extractor.cpp | 14 +++++--------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 2825bcc3..80082059 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -48,12 +48,14 @@ class Read_CdBG_Extractor // unitig in its non-canonical form results in a failed extraction. bool extract_maximal_unitig(const Kmer& v_hat, cuttlefish::side_t s_v_hat); - // Marks the vertex `v` as outputted, and returns `true` iff the hash table update is successful. + // Marks the vertex `v` as outputted. Returns `true` iff `v` has not been marked yet and the hash + // table update is successful. bool mark_vertex(const Directed_Vertex& v); // Marks the two endpoint vertices of a maximal unitig `p` as outputted: the first vertex in the - // canonical form of `p`, `sign_vertex`, and the last vertex in the form, `cosign_vertex`; returns - // `true` iff the corresponding hash table updates are successful. + // canonical form of `p`, `sign_vertex`, and the last vertex in the form, `cosign_vertex`. Returns + // `true` iff the vertices have not been marked yet and the corresponding hash table updates are + // successful. bool mark_flanking_vertices(const Directed_Vertex& sign_vertex, const Directed_Vertex& cosign_vertex); // Note: The following methods are only applicable when the heuristic of information-discarding @@ -104,8 +106,12 @@ template inline bool Read_CdBG_Extractor::mark_vertex(const Directed_Vertex& v) { Kmer_Hash_Entry_API bucket = hash_table[v.hash()]; - bucket.get_state().mark_outputted(); - + State_Read_Space& state = bucket.get_state(); + + if(state.is_outputted()) + return false; + + state.mark_outputted(); return hash_table.update(bucket); } diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 53337f5f..8b8fb709 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -122,17 +122,13 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const } const Directed_Vertex& term_vertex = v; - - - if(init_vertex.kmer() >= term_vertex.kmer_bar()) // The maximal unitig has been encountered in its non-canonical form. - return false; + const bool in_canonical = (init_vertex.kmer() < term_vertex.kmer_bar()); + const Directed_Vertex& sign_vertex = (in_canonical ? init_vertex : term_vertex); + const Directed_Vertex& cosign_vertex = (in_canonical ? term_vertex : init_vertex); // Mark the flanking vertices as outputted. - if(!mark_flanking_vertices(init_vertex, term_vertex)) - { - std::cerr << "Hash table update failed while marking some flanking vertices as outputted. Aborting.\n"; - std::exit(EXIT_FAILURE); - } + if(!mark_flanking_vertices(sign_vertex, cosign_vertex)) + return false; // TODO: Output the built maximal unitig. 
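//  [Annotation, not part of the original patch, on the `in_canonical` test a few lines above: the
//  maximal unitig has two spellings, the one just traversed (whose first k-mer is
//  `init_vertex.kmer()`) and its reverse complement (whose first k-mer is `term_vertex.kmer_bar()`);
//  taking the canonical form to be the lexicographically smaller spelling, comparing those two
//  k-mers identifies the "sign" vertex, i.e. the endpoint whose k-mer begins the canonical form.]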
return true; From b79d7b5ffd08984f3e70e32e8444e23106f10423 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Sun, 25 Apr 2021 19:31:30 -0400 Subject: [PATCH 070/350] Expose more interface to hash table --- include/Kmer_Hash_Table.hpp | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index 368cd787..8cde5ee9 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -15,14 +15,9 @@ #include "Sparse_Lock.hpp" -template class CdBG; - - template class Kmer_Hash_Table { - friend class CdBG; - typedef boomphf::mphf, Kmer_Hasher> mphf_t; // The MPH function type. typedef compact::ts_vector> bitvector_t; @@ -67,16 +62,6 @@ class Kmer_Hash_Table // Saves the MPH function `mph` into a file at `file_path`. void save_mph_function(const std::string& file_path) const; - // Saves the hash table buckets `hash_table` into a file at `file_path`. - void save_hash_buckets(const std::string& file_path) const; - - // Loads the hash table buckets `hash_table` from the file at `file_path`. - void load_hash_buckets(const std::string& file_path); - - // Returns the id / number of the bucket in the hash table that is - // supposed to store value items for the key `kmer`. - uint64_t bucket_id(const Kmer& kmer) const; - public: @@ -91,6 +76,10 @@ class Kmer_Hash_Table // from scratch), or the newly built MPH is saved there. void construct(uint16_t thread_count, const std::string& working_dir_path, const std::string& mph_file_path); + // Returns the id / number of the bucket in the hash table that is + // supposed to store value items for the key `kmer`. + uint64_t bucket_id(const Kmer& kmer) const; + // Returns the hash value of the k-mer `kmer`. uint64_t operator()(const Kmer& kmer) const; @@ -115,6 +104,12 @@ class Kmer_Hash_Table // Clears the hash-table. Do not invoke on an unused object. void clear(); + // Saves the hash table buckets `hash_table` into a file at `file_path`. + void save_hash_buckets(const std::string& file_path) const; + + // Loads the hash table buckets `hash_table` from the file at `file_path`. + void load_hash_buckets(const std::string& file_path); + // Destructs the hash table. ~Kmer_Hash_Table(); }; From fa4f3ec1c008507129587eaec12a21cc6208c696 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Sun, 25 Apr 2021 20:42:16 -0400 Subject: [PATCH 071/350] Separate file-existence checker --- include/utility.hpp | 4 ++++ src/CdBG_Builder.cpp | 3 +-- src/utility.cpp | 8 ++++++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/include/utility.hpp b/include/utility.hpp index 6c1a90de..c4cad56a 100644 --- a/include/utility.hpp +++ b/include/utility.hpp @@ -14,6 +14,10 @@ std::string get_random_string(size_t len, const char* alphabet = "0123456789" // Returns `true` iff `pref` is a prefix of `s`. bool is_prefix(const std::string& s, const std::string& pref); +// Returns `true` iff there exists a file in the file system with the path +// `file_path`. +bool file_exists(const std::string& file_path); + // Returns `true` iff there exists some file in the file system path // `path` with its name being prefixed by `prefix`. 
bool file_prefix_exists(const std::string& path, const std::string& prefix); diff --git a/src/CdBG_Builder.cpp b/src/CdBG_Builder.cpp index bd94b314..aebafc83 100644 --- a/src/CdBG_Builder.cpp +++ b/src/CdBG_Builder.cpp @@ -18,8 +18,7 @@ void CdBG::classify_vertices() const std::string& buckets_file_path = params.buckets_file_path(); // The serialized hash table buckets (saved from some earlier execution) exists. - struct stat buffer; - if(!buckets_file_path.empty() && stat(buckets_file_path.c_str(), &buffer) == 0) + if(!buckets_file_path.empty() && file_exists(buckets_file_path)) { std::cout << "Found the hash table buckets at file " << buckets_file_path << "\n"; std::cout << "Loading the buckets.\n"; diff --git a/src/utility.cpp b/src/utility.cpp index 6bb44e76..5bc34e6d 100644 --- a/src/utility.cpp +++ b/src/utility.cpp @@ -35,6 +35,14 @@ bool is_prefix(const std::string& s, const std::string& pref) } +bool file_exists(const std::string& file_path) +{ + struct stat stat_buf; + + return stat(file_path.c_str(), &stat_buf) == 0; +} + + bool file_prefix_exists(const std::string& path, const std::string& prefix) { for(const auto& entry: ghc::filesystem::directory_iterator(path)) From a54f84644eca637083abc02bea430101638bd810 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Sun, 25 Apr 2021 20:46:26 -0400 Subject: [PATCH 072/350] Add save interface for read-dBG states --- src/Read_CdBG_Constructor.cpp | 50 ++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index 4c5d05d8..dc6875ba 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -1,6 +1,7 @@ #include "Read_CdBG_Constructor.hpp" #include "Edge.hpp" +#include "utility.hpp" #include "chrono" @@ -18,27 +19,46 @@ void Read_CdBG_Constructor::compute_DFA_states() std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); - // Construct a thread pool. - const uint16_t thread_count = params.thread_count(); - Thread_Pool thread_pool(thread_count, this, Thread_Pool::Task_Type::compute_states_read_space); + const std::string& buckets_file_path = params.buckets_file_path(); + if(!buckets_file_path.empty() && file_exists(buckets_file_path)) // The serialized hash table buckets, saved from some earlier execution, exists. + { + std::cout << "Found the hash table buckets at file " << buckets_file_path << ".\n" + "Loading the buckets.\n"; + hash_table.load_hash_buckets(buckets_file_path); + std::cout << "Loaded the buckets into memory.\n"; + } + else + { + // Construct a thread pool. + const uint16_t thread_count = params.thread_count(); + Thread_Pool thread_pool(thread_count, this, Thread_Pool::Task_Type::compute_states_read_space); - // Launch the reading (and parsing per demand) of the edges from disk. - const Kmer_Container edge_container(params.edge_db_path()); // Wrapper container for the edge-database. - Kmer_SPMC_Iterator edge_parser(&edge_container, params.thread_count()); // Parser for the edges from the edge-database. - std::cout << "Total number of distinct edges: " << edge_container.size() << ".\n"; + // Launch the reading (and parsing per demand) of the edges from disk. + const Kmer_Container edge_container(params.edge_db_path()); // Wrapper container for the edge-database. + Kmer_SPMC_Iterator edge_parser(&edge_container, params.thread_count()); // Parser for the edges from the edge-database. 
+ std::cout << "Total number of distinct edges: " << edge_container.size() << ".\n"; - edge_parser.launch_production(); + edge_parser.launch_production(); - // Launch (multi-threaded) computation of the states. - distribute_states_computation(&edge_parser, thread_pool); + // Launch (multi-threaded) computation of the states. + distribute_states_computation(&edge_parser, thread_pool); - // Wait for the edges to be depleted from the database. - edge_parser.seize_production(); + // Wait for the edges to be depleted from the database. + edge_parser.seize_production(); - // Wait for the consumer threads to finish parsing and processing the edges. - thread_pool.close(); + // Wait for the consumer threads to finish parsing and processing the edges. + thread_pool.close(); - std::cout << "Number of processed edges: " << edges_processed << "\n"; + std::cout << "Number of processed edges: " << edges_processed << "\n"; + + + if(!buckets_file_path.empty()) // Save the hash table buckets, if a file path is provided. + { + std::cout << "Saving the hash table buckets in file " << buckets_file_path << ".\n"; + hash_table.save_hash_buckets(buckets_file_path); + std::cout << "Saved the buckets in disk.\n"; + } + } std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); From 78d3fffcc9ffd636a4dac01a8279e4ce57725c38 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Wed, 28 Apr 2021 20:59:21 -0400 Subject: [PATCH 073/350] Correct documentation --- include/Read_CdBG_Extractor.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 80082059..e39e69fc 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -44,8 +44,8 @@ class Read_CdBG_Extractor // Extracts the maximal unitig `p` that is flanked by the vertex `v_hat` and connects to `v_hat` // through its side `s_v_hat`. Returns `true` iff the extraction is successful, which happens when - // the k-mer `v_hat` is the first k-mer in the canonical form of `p`. Thus encountering a maximal - // unitig in its non-canonical form results in a failed extraction. + // the maximal unitig is encountered and attempted for output-marking _first_, by some thread. If + // the attempt is successful, then the maximal unitig is extracted in its canonical form. bool extract_maximal_unitig(const Kmer& v_hat, cuttlefish::side_t s_v_hat); // Marks the vertex `v` as outputted. Returns `true` iff `v` has not been marked yet and the hash From 013cae5fb5f85e8d2affc78eb1ab7bc6cc592937 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Wed, 28 Apr 2021 22:19:19 -0400 Subject: [PATCH 074/350] Add string buffer class --- include/String_Buffer.hpp | 140 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 include/String_Buffer.hpp diff --git a/include/String_Buffer.hpp b/include/String_Buffer.hpp new file mode 100644 index 00000000..5cf0691f --- /dev/null +++ b/include/String_Buffer.hpp @@ -0,0 +1,140 @@ + +#ifndef STRING_BUFFER_HPP +#define STRING_BUFFER_HPP + + + +#include "Spin_Lock.hpp" + +#include +#include +#include + + +// A buffer class to contain contiguous strings. The buffer is to have a maximum +// capacity of `CAPACITY` (although it is non-binding when a string with length +// larger than that is added), and it flushes to a sink of type `T_sink_` when it +// overflows or is destructed. 
Writing to the provided sink (in the constructor) +// is thread-safe — w/ a limiting contention for access per sink-type, not per sink. +template +class String_Buffer +{ +private: + + std::vector buffer; // The string buffer. + T_sink_& sink; // Reference to the sink to flush the buffer content to. + + + // Flushes the buffer content to the sink, and clears the buffer. + void flush(); + + +public: + + // Constructs a string buffer object that would flush its content to `sink`. + String_Buffer(T_sink_& sink); + + // Appends the content of the string `str` to the buffer. Flushes are possible. + void operator+=(const std::string& str); + + // Destructs the buffer object, flushing it if content are present. + ~String_Buffer(); +}; + + +// Helper class to actually flush the content of the `String_Buffer` class to its +// sink of type `T_sink`. +// It's used to circumvent the C++ constraint that partial specialization of a +// a member function is not possible without partially specializing the entire +// class. We need to specialize the actual flushing mechanism to support various +// types of sinks, e.g. `std::ofstream`, `spdlog::logger` etc. +template +class String_Buffer_Flusher +{ + // Since the sole purpose of the class is to support the `String_Buffer` class + // circumvent some contraint, everything is encapsulated here as private, with + // `String_Buffer` as friend. + template friend class String_Buffer; + +private: + + // Mutual-exclusion lock to control multi-threaded access to otherwise not thread- + // safe sinks (e.g. `std::ofstream`). Note that, the lock is per sink-type, not per + // actual sink — which is a limitation. + static Spin_Lock lock; + + + // Writes `len` characters from the memory location `str_buf` to the sink `sink`. + static void write(const char* str_buf, std::size_t len, T_sink_& sink); +}; + + +template +inline String_Buffer::String_Buffer(T_sink_& sink): + sink(sink) +{ + buffer.reserve(CAPACITY); +} + + +template +inline void String_Buffer::operator+=(const std::string& str) +{ + if(buffer.size() + str.length() >= CAPACITY) + { + flush(); + + if(str.length() >= CAPACITY) + { + std::cerr << "A single output string overflows the string-buffer capacity.\n" + "Output string length: " << str.length() << ", string-buffer capacity: " << CAPACITY << ".\n" + "Please consider increasing the buffer capacity parameter in build for future use.\n"; + + buffer.reserve(str.length()); + } + } + + + buffer.insert(buffer.end(), str.begin(), str.end()); +} + + +template +inline void String_Buffer::flush() +{ + String_Buffer_Flusher::write(buffer.data(), buffer.size(), sink); + + buffer.clear(); +} + + +template +inline String_Buffer::~String_Buffer() +{ + if(!buffer.empty()) + flush(); +} + + +template <> +inline void String_Buffer_Flusher::write(const char* const str_buf, const std::size_t len, std::ofstream& output) +{ + lock.lock(); + + output.write(str_buf, len); + + if(output.fail()) + { + std::cerr << "Error writing the output. Aborting.\n"; + std::exit(EXIT_FAILURE); + } + + lock.unlock(); +} + + +template Spin_Lock String_Buffer_Flusher::lock; // Definition of the static lock of `String_Buffer_Flusher`. 
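//  [Annotation, not part of the original patch: a minimal usage sketch of the buffer defined
//  above. The file name and the capacity are illustrative, and the template-argument order
//  (capacity, then sink type) is assumed from the order in which the class comment introduces them.]
//
//      #include <fstream>
//
//      std::ofstream out("unitigs.txt");                    // the sink to flush to
//      String_Buffer<100 * 1024, std::ofstream> buf(out);   // ~100 KB in-memory buffer
//      buf += std::string("ACCGTTGCA");                     // appended; flushed on overflow
//      // any remaining content is flushed when `buf` goes out of scope.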
+ + + +#endif From 3526dc91d7a93d18343819e9efd760d5d74e6d34 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Wed, 28 Apr 2021 22:30:35 -0400 Subject: [PATCH 075/350] Write out maximal unitigs in canonical / non-canonical form --- include/Read_CdBG_Extractor.hpp | 19 ++++++++-- src/Read_CdBG_Extractor.cpp | 62 ++++++++++++++++++++++++++++++--- 2 files changed, 74 insertions(+), 7 deletions(-) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index e39e69fc..8c964d37 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -13,6 +13,8 @@ #include "Spin_Lock.hpp" #include "Thread_Pool.hpp" +#include + // A class to extract the vertices from a compacted de Bruin graph — which are the maximal unitigs of some ordinary de Bruijn graph. template @@ -25,6 +27,9 @@ class Read_CdBG_Extractor const Build_Params params; // Required parameters (wrapped inside). Kmer_Hash_Table& hash_table; // Hash table for the vertices (i.e. canonical k-mers) of the original (uncompacted) de Bruijn graph. + std::ofstream output_; // Sink for the output maximal unitigs. + static constexpr std::size_t BUFF_SZ = 100 * 1024ULL; // 100 KB. + // Members required to keep track of the total number of vertices processed across different worker (i.e. extractor) threads. mutable Spin_Lock lock; // Mutual exclusion lock to access various unique resources by threads spawned off this class' methods. mutable uint64_t vertices_processed = 0; // Total number of vertices scanned from the database. @@ -45,8 +50,9 @@ class Read_CdBG_Extractor // Extracts the maximal unitig `p` that is flanked by the vertex `v_hat` and connects to `v_hat` // through its side `s_v_hat`. Returns `true` iff the extraction is successful, which happens when // the maximal unitig is encountered and attempted for output-marking _first_, by some thread. If - // the attempt is successful, then the maximal unitig is extracted in its canonical form. - bool extract_maximal_unitig(const Kmer& v_hat, cuttlefish::side_t s_v_hat); + // the attempt is successful, then the maximal unitig is extracted in its canonical form, into the + // string `unipath` (it is overwritten). If not, `unipath` may contain partial form of the unitig. + bool extract_maximal_unitig(const Kmer& v_hat, cuttlefish::side_t s_v_hat, std::string& unipath); // Marks the vertex `v` as outputted. Returns `true` iff `v` has not been marked yet and the hash // table update is successful. @@ -58,6 +64,15 @@ class Read_CdBG_Extractor // successful. bool mark_flanking_vertices(const Directed_Vertex& sign_vertex, const Directed_Vertex& cosign_vertex); + // Clears the output file content. + void clear_output_file() const; + + // Initializes the output logger. + void init_output_logger(); + + // Closes the output logger. + void close_output_logger(); + // Note: The following methods are only applicable when the heuristic of information-discarding // from branching vertices to their neighbors has been implemented in the DFA states computation // phase. In the general case, these functions with their specified input parameters and their diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 8b8fb709..03919f2b 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -1,5 +1,10 @@ #include "Read_CdBG_Extractor.hpp" +#include "String_Buffer.hpp" + + +// Definition of static members. 
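//  [Annotation, not part of the original patch: out-of-class definitions like the one below are
//  the pre-C++17 way of providing `static constexpr` data members with a definition in case they
//  are ODR-used (e.g. bound to a reference); since C++17 such members are implicitly `inline`,
//  making the separate definition redundant. The same pattern appears for the `constexpr` arrays
//  defined in `src/DNA_Utility.cpp`.]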
+template constexpr std::size_t Read_CdBG_Extractor::BUFF_SZ; template @@ -26,6 +31,10 @@ void Read_CdBG_Extractor::extract_maximal_unitigs() vertex_parser.launch_production(); + // Clear the output file and initialize the output logger. + clear_output_file(); + init_output_logger(); + // Launch (multi-thread) extraction of the maximal unitigs. distribute_unipaths_extraction(&vertex_parser, thread_pool); @@ -35,6 +44,9 @@ void Read_CdBG_Extractor::extract_maximal_unitigs() // Wait for the consumer threads to finish parsing and processing edges. thread_pool.close(); + // Close the output logger. + close_output_logger(); + std::cout << "Number of processed vertices: " << vertices_processed << ".\n"; std::cout << "Number of unipaths extracted: " << unipath_count << "\n"; @@ -65,18 +77,29 @@ void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const verte Kmer v; // For the vertex to be processed one-by-one. cuttlefish::side_t s_v; // The side of the vertex `v` that connects it to the maximal unitig containing it, if `v` is flanking. State_Read_Space state; // State of the vertex `v`. + std::string unipath; // The extracted maximal unitig from the vertex `v`. + // TODO: maybe use `std::vector` instead of `std::string`, as `std::string` does not guarantee a fixed capacity during execution. uint64_t vertex_count = 0; // Number of vertices scanned by this thread. uint64_t unipaths_extracted = 0; // Number of maximal unitigs successfully extracted by this thread, in the canonical form. + String_Buffer output_buffer(output_); // The output buffer for maximal unitigs. + unipath.reserve(BUFF_SZ); + while(vertex_parser->tasks_expected(thread_id)) if(vertex_parser->value_at(thread_id, v)) { state = hash_table[v].state(); if(!state.is_outputted() && is_flanking_state(state, s_v)) - if(extract_maximal_unitig(v, s_v)) + if(extract_maximal_unitig(v, s_v, unipath)) + { + unipath += "\n"; + output_buffer += unipath; + // unipath.clear(); + unipaths_extracted++; + } vertex_count++; } @@ -90,7 +113,7 @@ void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const verte template -bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const cuttlefish::side_t s_v_hat) +bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const cuttlefish::side_t s_v_hat, std::string& unipath) { // Data structures to be reused per each vertex extension of the maximal unitig. cuttlefish::side_t s_v = s_v_hat; // The side of the current vertex `v` through which to extend the maximal unitig, i.e. exit `v`. @@ -100,7 +123,7 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const cuttlefish::base_t b_ext; // The nucleobase corresponding to the edge `e_v` and the exiting side `s_v` from `v` to add to the literal maximal unitig. const Directed_Vertex init_vertex(v); - // std::string unipath(init_vertex.kmer().string_label()); + unipath = init_vertex.kmer().string_label(); // TODO: optimize. while(true) { @@ -118,7 +141,7 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const s_v = v.exit_side(); state = hash_table[v.hash()].state(); - // unipath += Kmer::map_char(b_ext); + unipath += Kmer::map_char(b_ext); } const Directed_Vertex& term_vertex = v; @@ -130,11 +153,40 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const if(!mark_flanking_vertices(sign_vertex, cosign_vertex)) return false; - // TODO: Output the built maximal unitig. 
return true; } +template +void Read_CdBG_Extractor::clear_output_file() const +{ + const std::string& output_file_path = params.output_file_path(); + + std::ofstream output(output_file_path.c_str(), std::ofstream::out | std::ofstream::trunc); + if(output.fail()) + { + std::cerr << "Error opening output file " << output_file_path << ". Aborting.\n"; + std::exit(EXIT_FAILURE); + } + + output.close(); +} + + +template +void Read_CdBG_Extractor::init_output_logger() +{ + output_ = std::ofstream(params.output_file_path()); +} + + +template +void Read_CdBG_Extractor::close_output_logger() +{ + output_.close(); +} + + // Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Read_CdBG_Extractor) From b20dcd1f0f39d351a2dfd77cf3e5736a5836a08f Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 29 Apr 2021 11:11:07 -0400 Subject: [PATCH 076/350] Templatize buffer append --- include/String_Buffer.hpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/include/String_Buffer.hpp b/include/String_Buffer.hpp index 5cf0691f..ec66b668 100644 --- a/include/String_Buffer.hpp +++ b/include/String_Buffer.hpp @@ -34,8 +34,9 @@ class String_Buffer // Constructs a string buffer object that would flush its content to `sink`. String_Buffer(T_sink_& sink); - // Appends the content of the string `str` to the buffer. Flushes are possible. - void operator+=(const std::string& str); + // Appends the content of `str` to the buffer. Flushes are possible. + template + void operator+=(const T_container_& str); // Destructs the buffer object, flushing it if content are present. ~String_Buffer(); @@ -78,19 +79,20 @@ inline String_Buffer::String_Buffer(T_sink_& sink): template -inline void String_Buffer::operator+=(const std::string& str) +template +inline void String_Buffer::operator+=(const T_container_& str) { - if(buffer.size() + str.length() >= CAPACITY) + if(buffer.size() + str.size() >= CAPACITY) { flush(); - if(str.length() >= CAPACITY) + if(str.size() >= CAPACITY) { std::cerr << "A single output string overflows the string-buffer capacity.\n" - "Output string length: " << str.length() << ", string-buffer capacity: " << CAPACITY << ".\n" + "Output string length: " << str.size() << ", string-buffer capacity: " << CAPACITY << ".\n" "Please consider increasing the buffer capacity parameter in build for future use.\n"; - buffer.reserve(str.length()); + buffer.reserve(str.size()); } } From 0749646ab324606b1c6ad8d968af82f385af3e53 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 29 Apr 2021 16:39:51 -0400 Subject: [PATCH 077/350] Have better k-mer stringizator --- include/Kmer.hpp | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/include/Kmer.hpp b/include/Kmer.hpp index 43a5111d..6953dbe4 100644 --- a/include/Kmer.hpp +++ b/include/Kmer.hpp @@ -186,6 +186,10 @@ class Kmer: public DNA_Utility // Returns the string label of the k-mer. std::string string_label() const; + // Gets the string label of the k-mer into the container `label`. + template + void get_label(T_container_& label) const; + // Returns a randomly generated k-mer. static Kmer random_kmer(); @@ -631,6 +635,25 @@ inline std::string Kmer::string_label() const } +template +template +inline void Kmer::get_label(T_container_& label) const +{ + label.resize(k); + + // Get the fully packed words' representations. 
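//  [Annotation, not part of the original patch, describing the unpacking below: bases are packed
//  two bits each, with the last (rightmost) character of the label held in the lowest-order
//  bit-pair of `kmer_data[0]`; hence the base at bit-pair position p = 32 * data_idx + bit_pair_idx
//  decodes to the label character at index (k - 1) - p, which is exactly the index expression used
//  in both loops (the fully packed words first, then the partially filled highest word).]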
+ for(uint16_t data_idx = 0; data_idx < NUM_INTS - 1; ++data_idx) + for(uint16_t bit_pair_idx = 0; bit_pair_idx < 32; ++bit_pair_idx) + label[(k - 1) - ((data_idx << 5) + bit_pair_idx)] = + map_char(static_cast((kmer_data[data_idx] & (0b11ULL << (2 * bit_pair_idx))) >> (2 * bit_pair_idx))); + + // Get the partially packed (highest index) word's representation. + for(uint16_t bit_pair_idx = 0; bit_pair_idx < (k & 31); ++bit_pair_idx) + label[(k - 1) - (((NUM_INTS - 1) << 5) + bit_pair_idx)] = + map_char(static_cast((kmer_data[NUM_INTS - 1] & (0b11ULL << (2 * bit_pair_idx))) >> (2 * bit_pair_idx))); +} + + template inline Kmer Kmer::random_kmer() { From b4621d1b26eab3852320fc8820d293c51f4c0726 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 29 Apr 2021 16:54:42 -0400 Subject: [PATCH 078/350] Contain unipath stitching in vector --- include/Read_CdBG_Extractor.hpp | 2 +- src/Read_CdBG_Extractor.cpp | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 8c964d37..e70cf177 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -52,7 +52,7 @@ class Read_CdBG_Extractor // the maximal unitig is encountered and attempted for output-marking _first_, by some thread. If // the attempt is successful, then the maximal unitig is extracted in its canonical form, into the // string `unipath` (it is overwritten). If not, `unipath` may contain partial form of the unitig. - bool extract_maximal_unitig(const Kmer& v_hat, cuttlefish::side_t s_v_hat, std::string& unipath); + bool extract_maximal_unitig(const Kmer& v_hat, cuttlefish::side_t s_v_hat, std::vector& unipath); // Marks the vertex `v` as outputted. Returns `true` iff `v` has not been marked yet and the hash // table update is successful. diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 03919f2b..44d25dcb 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -77,8 +77,7 @@ void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const verte Kmer v; // For the vertex to be processed one-by-one. cuttlefish::side_t s_v; // The side of the vertex `v` that connects it to the maximal unitig containing it, if `v` is flanking. State_Read_Space state; // State of the vertex `v`. - std::string unipath; // The extracted maximal unitig from the vertex `v`. - // TODO: maybe use `std::vector` instead of `std::string`, as `std::string` does not guarantee a fixed capacity during execution. + std::vector unipath; // The extracted maximal unitig from the vertex `v`. uint64_t vertex_count = 0; // Number of vertices scanned by this thread. uint64_t unipaths_extracted = 0; // Number of maximal unitigs successfully extracted by this thread, in the canonical form. @@ -94,7 +93,7 @@ void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const verte if(!state.is_outputted() && is_flanking_state(state, s_v)) if(extract_maximal_unitig(v, s_v, unipath)) { - unipath += "\n"; + unipath.emplace_back('\n'); output_buffer += unipath; // unipath.clear(); @@ -113,7 +112,7 @@ void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const verte template -bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const cuttlefish::side_t s_v_hat, std::string& unipath) +bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const cuttlefish::side_t s_v_hat, std::vector& unipath) { // Data structures to be reused per each vertex extension of the maximal unitig. 
cuttlefish::side_t s_v = s_v_hat; // The side of the current vertex `v` through which to extend the maximal unitig, i.e. exit `v`. @@ -123,7 +122,7 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const cuttlefish::base_t b_ext; // The nucleobase corresponding to the edge `e_v` and the exiting side `s_v` from `v` to add to the literal maximal unitig. const Directed_Vertex init_vertex(v); - unipath = init_vertex.kmer().string_label(); // TODO: optimize. + init_vertex.kmer().get_label(unipath); while(true) { @@ -141,7 +140,7 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const s_v = v.exit_side(); state = hash_table[v.hash()].state(); - unipath += Kmer::map_char(b_ext); + unipath.emplace_back(Kmer::map_char(b_ext)); } const Directed_Vertex& term_vertex = v; From 726334a9d6d81676ac4bbe2fbbe3fdcf11f9dd12 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 29 Apr 2021 17:02:34 -0400 Subject: [PATCH 079/350] Better name character buffer --- ...String_Buffer.hpp => Character_Buffer.hpp} | 40 +++++++++---------- src/Read_CdBG_Extractor.cpp | 4 +- 2 files changed, 22 insertions(+), 22 deletions(-) rename include/{String_Buffer.hpp => Character_Buffer.hpp} (69%) diff --git a/include/String_Buffer.hpp b/include/Character_Buffer.hpp similarity index 69% rename from include/String_Buffer.hpp rename to include/Character_Buffer.hpp index ec66b668..d397a000 100644 --- a/include/String_Buffer.hpp +++ b/include/Character_Buffer.hpp @@ -1,6 +1,6 @@ -#ifndef STRING_BUFFER_HPP -#define STRING_BUFFER_HPP +#ifndef CHARACTER_BUFFER_HPP +#define CHARACTER_BUFFER_HPP @@ -11,17 +11,17 @@ #include -// A buffer class to contain contiguous strings. The buffer is to have a maximum +// A buffer class to contain contiguous characters. The buffer is to have a maximum // capacity of `CAPACITY` (although it is non-binding when a string with length // larger than that is added), and it flushes to a sink of type `T_sink_` when it // overflows or is destructed. Writing to the provided sink (in the constructor) // is thread-safe — w/ a limiting contention for access per sink-type, not per sink. template -class String_Buffer +class Character_Buffer { private: - std::vector buffer; // The string buffer. + std::vector buffer; // The character buffer. T_sink_& sink; // Reference to the sink to flush the buffer content to. @@ -31,31 +31,31 @@ class String_Buffer public: - // Constructs a string buffer object that would flush its content to `sink`. - String_Buffer(T_sink_& sink); + // Constructs a character buffer object that would flush its content to `sink`. + Character_Buffer(T_sink_& sink); // Appends the content of `str` to the buffer. Flushes are possible. template void operator+=(const T_container_& str); // Destructs the buffer object, flushing it if content are present. - ~String_Buffer(); + ~Character_Buffer(); }; -// Helper class to actually flush the content of the `String_Buffer` class to its +// Helper class to actually flush the content of the `Character_Buffer` class to its // sink of type `T_sink`. // It's used to circumvent the C++ constraint that partial specialization of a // a member function is not possible without partially specializing the entire // class. We need to specialize the actual flushing mechanism to support various // types of sinks, e.g. `std::ofstream`, `spdlog::logger` etc. 
template -class String_Buffer_Flusher +class Character_Buffer_Flusher { - // Since the sole purpose of the class is to support the `String_Buffer` class + // Since the sole purpose of the class is to support the `Character_Buffer` class // circumvent some contraint, everything is encapsulated here as private, with - // `String_Buffer` as friend. - template friend class String_Buffer; + // `Character_Buffer` as friend. + template friend class Character_Buffer; private: @@ -71,7 +71,7 @@ class String_Buffer_Flusher template -inline String_Buffer::String_Buffer(T_sink_& sink): +inline Character_Buffer::Character_Buffer(T_sink_& sink): sink(sink) { buffer.reserve(CAPACITY); @@ -80,7 +80,7 @@ inline String_Buffer::String_Buffer(T_sink_& sink): template template -inline void String_Buffer::operator+=(const T_container_& str) +inline void Character_Buffer::operator+=(const T_container_& str) { if(buffer.size() + str.size() >= CAPACITY) { @@ -102,16 +102,16 @@ inline void String_Buffer::operator+=(const T_container_& str template -inline void String_Buffer::flush() +inline void Character_Buffer::flush() { - String_Buffer_Flusher::write(buffer.data(), buffer.size(), sink); + Character_Buffer_Flusher::write(buffer.data(), buffer.size(), sink); buffer.clear(); } template -inline String_Buffer::~String_Buffer() +inline Character_Buffer::~Character_Buffer() { if(!buffer.empty()) flush(); @@ -119,7 +119,7 @@ inline String_Buffer::~String_Buffer() template <> -inline void String_Buffer_Flusher::write(const char* const str_buf, const std::size_t len, std::ofstream& output) +inline void Character_Buffer_Flusher::write(const char* const str_buf, const std::size_t len, std::ofstream& output) { lock.lock(); @@ -135,7 +135,7 @@ inline void String_Buffer_Flusher::write(const char* const str_bu } -template Spin_Lock String_Buffer_Flusher::lock; // Definition of the static lock of `String_Buffer_Flusher`. +template Spin_Lock Character_Buffer_Flusher::lock; // Definition of the static lock of `Character_Buffer_Flusher`. diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 44d25dcb..595f1bcd 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -1,6 +1,6 @@ #include "Read_CdBG_Extractor.hpp" -#include "String_Buffer.hpp" +#include "Character_Buffer.hpp" // Definition of static members. @@ -82,7 +82,7 @@ void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const verte uint64_t vertex_count = 0; // Number of vertices scanned by this thread. uint64_t unipaths_extracted = 0; // Number of maximal unitigs successfully extracted by this thread, in the canonical form. - String_Buffer output_buffer(output_); // The output buffer for maximal unitigs. + Character_Buffer output_buffer(output_); // The output buffer for maximal unitigs. unipath.reserve(BUFF_SZ); while(vertex_parser->tasks_expected(thread_id)) From e443da24bfa7866349326212b31e360dfae1c6fa Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 29 Apr 2021 17:57:28 -0400 Subject: [PATCH 080/350] For $#+s and giggles --- include/Kmer.hpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/Kmer.hpp b/include/Kmer.hpp index 6953dbe4..82f7a1bb 100644 --- a/include/Kmer.hpp +++ b/include/Kmer.hpp @@ -190,6 +190,9 @@ class Kmer: public DNA_Utility template void get_label(T_container_& label) const; + // Implicitly converts the k-mer to a `std::string`. + operator std::string() const; + // Returns a randomly generated k-mer. 
static Kmer random_kmer(); @@ -654,6 +657,16 @@ inline void Kmer::get_label(T_container_& label) const } +template +inline Kmer::operator std::string() const +{ + std::string label; + get_label(label); + + return label; +} + + template inline Kmer Kmer::random_kmer() { From 400b3891486d524707953ec6fbed0ff352bddd56 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 29 Apr 2021 19:44:12 -0400 Subject: [PATCH 081/350] =?UTF-8?q?More=20$=C3=9E1=C2=A3s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/Character_Buffer.hpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/include/Character_Buffer.hpp b/include/Character_Buffer.hpp index d397a000..5b007135 100644 --- a/include/Character_Buffer.hpp +++ b/include/Character_Buffer.hpp @@ -15,7 +15,7 @@ // capacity of `CAPACITY` (although it is non-binding when a string with length // larger than that is added), and it flushes to a sink of type `T_sink_` when it // overflows or is destructed. Writing to the provided sink (in the constructor) -// is thread-safe — w/ a limiting contention for access per sink-type, not per sink. +// is thread-safe. template class Character_Buffer { @@ -49,12 +49,17 @@ class Character_Buffer // a member function is not possible without partially specializing the entire // class. We need to specialize the actual flushing mechanism to support various // types of sinks, e.g. `std::ofstream`, `spdlog::logger` etc. +// Since the sole purpose of the class is to support the `Character_Buffer` class +// circumvent some contraint, everything is encapsulated in its specializations +// as private, with `Character_Buffer` as friend. template class Character_Buffer_Flusher +{}; + + +template <> +class Character_Buffer_Flusher { - // Since the sole purpose of the class is to support the `Character_Buffer` class - // circumvent some contraint, everything is encapsulated here as private, with - // `Character_Buffer` as friend. template friend class Character_Buffer; private: @@ -66,7 +71,7 @@ class Character_Buffer_Flusher // Writes `len` characters from the memory location `str_buf` to the sink `sink`. - static void write(const char* str_buf, std::size_t len, T_sink_& sink); + static void write(const char* str_buf, std::size_t len, std::ofstream& sink); }; @@ -118,7 +123,6 @@ inline Character_Buffer::~Character_Buffer() } -template <> inline void Character_Buffer_Flusher::write(const char* const str_buf, const std::size_t len, std::ofstream& output) { lock.lock(); @@ -135,7 +139,7 @@ inline void Character_Buffer_Flusher::write(const char* const str } -template Spin_Lock Character_Buffer_Flusher::lock; // Definition of the static lock of `Character_Buffer_Flusher`. +Spin_Lock Character_Buffer_Flusher::lock; // Definition of the static lock of `Character_Buffer_Flusher`. From e80e3a93478f2fe455e505834e35be34ddb6cdf7 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 29 Apr 2021 20:17:34 -0400 Subject: [PATCH 082/350] Update buffer flusher interface --- include/Character_Buffer.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/Character_Buffer.hpp b/include/Character_Buffer.hpp index 5b007135..d4651eeb 100644 --- a/include/Character_Buffer.hpp +++ b/include/Character_Buffer.hpp @@ -70,8 +70,8 @@ class Character_Buffer_Flusher static Spin_Lock lock; - // Writes `len` characters from the memory location `str_buf` to the sink `sink`. 
- static void write(const char* str_buf, std::size_t len, std::ofstream& sink); + // Writes the content of the vector `buf` to the sink `sink`. + static void write(std::vector& buf, std::ofstream& sink); }; @@ -109,7 +109,7 @@ inline void Character_Buffer::operator+=(const T_container_& template inline void Character_Buffer::flush() { - Character_Buffer_Flusher::write(buffer.data(), buffer.size(), sink); + Character_Buffer_Flusher::write(buffer, sink); buffer.clear(); } @@ -123,11 +123,11 @@ inline Character_Buffer::~Character_Buffer() } -inline void Character_Buffer_Flusher::write(const char* const str_buf, const std::size_t len, std::ofstream& output) +inline void Character_Buffer_Flusher::write(std::vector& buf, std::ofstream& output) { lock.lock(); - output.write(str_buf, len); + output.write(buf.data(), buf.size()); if(output.fail()) { From 83b2cb8137d53c207badf64139676aff98e72c81 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Fri, 30 Apr 2021 17:21:01 -0400 Subject: [PATCH 083/350] Have asynchronous parallel logger wrapper --- include/Async_Logger_Wrapper.hpp | 54 ++++++++++++++++++++++++++++++++ include/Character_Buffer.hpp | 23 ++++++++++++++ src/Async_Logger_Wrapper.cpp | 21 +++++++++++++ src/CMakeLists.txt | 1 + 4 files changed, 99 insertions(+) create mode 100644 include/Async_Logger_Wrapper.hpp create mode 100644 src/Async_Logger_Wrapper.cpp diff --git a/include/Async_Logger_Wrapper.hpp b/include/Async_Logger_Wrapper.hpp new file mode 100644 index 00000000..f2404027 --- /dev/null +++ b/include/Async_Logger_Wrapper.hpp @@ -0,0 +1,54 @@ + +#ifndef ASYNC_LOGGER_WRAPPER_HPP +#define ASYNC_LOGGER_WRAPPER_HPP + + + +#include "spdlog/async_logger.h" + +#include +#include +#include +#include + + +// A class wrapping the `spdlog` library's asynchronous logger. +class Async_Logger_Wrapper +{ +private: + + // `spdlog`'s queue size, i.e. the maximum number of log message units it can contain before a flush to sink. + static constexpr std::size_t QUEUE_CAP = 1024; + + // Number of backing worker threads for `spdlog`, i.e. the threads that actually make the writes to sink. + static constexpr uint16_t NUM_THREADS = 1; + + // `spdlog` thread pool for performing the outputting task to sink. Unless multiple distinct sinks are to be + // present (e.g. in the writing algorithm for the GFA-variants in reference dBG compaction), only one thread + // pool is needed, and it does not make much sense in having multiple backing threads in that pool. And through + // having a dedicated thread pool for the actual (aynchronous) sink-flushes, the disk-write happens in parallel + // to the algorithm operation. + std::shared_ptr tp; + + // Output logger. + std::shared_ptr logger; + + +public: + + // Initializes the `spdlog` logger wrapper that writes to a file with path `output_file_path`. + void init_logger(const std::string& output_file_path); + + // Log the passed null-terminated message `str`. 
+ void write(const char* str) const; +}; + + +inline void Async_Logger_Wrapper::write(const char* const str) const +{ + logger->info(str); +} + + + +#endif diff --git a/include/Character_Buffer.hpp b/include/Character_Buffer.hpp index d4651eeb..f3840983 100644 --- a/include/Character_Buffer.hpp +++ b/include/Character_Buffer.hpp @@ -5,6 +5,7 @@ #include "Spin_Lock.hpp" +#include "Async_Logger_Wrapper.hpp" #include #include @@ -75,6 +76,21 @@ class Character_Buffer_Flusher }; +template <> +class Character_Buffer_Flusher +{ + template friend class Character_Buffer; + +private: + + // Writes the content of the vector `buf` to the sink `sink`. Note that the vector + // `buf` is modified in the process — a null-terminator (`\0`) is appended at the + // end — which is expected to be not problematic under the assumption that the + // buffer is cleared after the write (i.e. flush). + static void write(std::vector& buf, const Async_Logger_Wrapper& sink); +}; + + template inline Character_Buffer::Character_Buffer(T_sink_& sink): sink(sink) @@ -142,5 +158,12 @@ inline void Character_Buffer_Flusher::write(std::vector& bu Spin_Lock Character_Buffer_Flusher::lock; // Definition of the static lock of `Character_Buffer_Flusher`. +inline void Character_Buffer_Flusher::write(std::vector& buf, const Async_Logger_Wrapper& sink) +{ + buf.emplace_back('\0'); + + sink.write(buf.data()); +} + #endif diff --git a/src/Async_Logger_Wrapper.cpp b/src/Async_Logger_Wrapper.cpp new file mode 100644 index 00000000..b94b2dea --- /dev/null +++ b/src/Async_Logger_Wrapper.cpp @@ -0,0 +1,21 @@ + +#include "Async_Logger_Wrapper.hpp" + +#include "spdlog/sinks/basic_file_sink.h" + + +void Async_Logger_Wrapper::init_logger(const std::string& output_file_path) +{ + // Instantiate an `spdlog` thread pool for background output operations. The logger constructed with this + // pool may contain up-to `QUEUE_CAP` log message units. If each log unit has a maximum length of `MSG_LEN`, + // then the logger can take up memory up-to `(QUEUE_CAP x MSG_LEN)` in background. + tp = std::make_shared(QUEUE_CAP, NUM_THREADS); + + // Instantiate an asynchronous `spdlog` logger that uses the thread pool `tp`, and writes to the provided + // sink file at `output_file_path`. + std::shared_ptr sink = std::make_shared(output_file_path); + logger = std::make_shared("async_output", sink, tp, spdlog::async_overflow_policy::block); + + // Set the log message pattern. + logger->set_pattern("%v"); +} diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0a954661..aaf2dd2a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -7,6 +7,7 @@ set(PROJECT_SRC xxHash/xxhash.c xxHash/xxhsum.c Parser.cpp + Async_Logger_Wrapper.cpp Thread_Pool.cpp DNA_Utility.cpp Kmer_u64.cpp From be281b0aec77b9565083c50d1234e2692c222b17 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Fri, 30 Apr 2021 18:23:47 -0400 Subject: [PATCH 084/350] Wrap output sink types --- include/Output_Sink.hpp | 74 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 include/Output_Sink.hpp diff --git a/include/Output_Sink.hpp b/include/Output_Sink.hpp new file mode 100644 index 00000000..f7aa257c --- /dev/null +++ b/include/Output_Sink.hpp @@ -0,0 +1,74 @@ + +#ifndef OUTPUT_SINK_HPP +#define OUTPUT_SINK_HPP + + + +#include "Async_Logger_Wrapper.hpp" +#include "spdlog/spdlog.h" + +#include + + +// A basic sink wrapper with minimal functionality — open, get reference to the wrapped sink, and close. 
+template +class Output_Sink +{}; + + +template <> +class Output_Sink +{ +private: + + std::ofstream output_; + + +public: + + void init_sink(const std::string& output_file_path) + { + output_ = std::ofstream(output_file_path); + } + + std::ofstream& sink() + { + return output_; + } + + void close_sink() + { + output_.close(); + } +}; + + +template <> +class Output_Sink +{ +private: + + Async_Logger_Wrapper output_; + + +public: + + void init_sink(const std::string& output_file_path) + { + output_.init_logger(output_file_path); + } + + Async_Logger_Wrapper& sink() + { + return output_; + } + + void close_sink() + { + spdlog::drop_all(); + } +}; + + + +#endif From 9fb9ddf6df314bf7752ede96ede05c43a54c0b0a Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Fri, 30 Apr 2021 18:27:44 -0400 Subject: [PATCH 085/350] Use async logger for output --- include/Read_CdBG_Extractor.hpp | 14 +++++++++----- src/Read_CdBG_Extractor.cpp | 18 +++++++++--------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index e70cf177..7a19812f 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -12,6 +12,8 @@ #include "Build_Params.hpp" #include "Spin_Lock.hpp" #include "Thread_Pool.hpp" +#include "Async_Logger_Wrapper.hpp" +#include "Output_Sink.hpp" #include @@ -27,7 +29,9 @@ class Read_CdBG_Extractor const Build_Params params; // Required parameters (wrapped inside). Kmer_Hash_Table& hash_table; // Hash table for the vertices (i.e. canonical k-mers) of the original (uncompacted) de Bruijn graph. - std::ofstream output_; // Sink for the output maximal unitigs. + // typedef std::ofstream sink_t; + typedef Async_Logger_Wrapper sink_t; + Output_Sink output_sink; // Sink for the output maximal unitigs. static constexpr std::size_t BUFF_SZ = 100 * 1024ULL; // 100 KB. // Members required to keep track of the total number of vertices processed across different worker (i.e. extractor) threads. @@ -67,11 +71,11 @@ class Read_CdBG_Extractor // Clears the output file content. void clear_output_file() const; - // Initializes the output logger. - void init_output_logger(); + // Initializes the output sink. + void init_output_sink(); - // Closes the output logger. - void close_output_logger(); + // Closes the output sink. + void close_output_sink(); // Note: The following methods are only applicable when the heuristic of information-discarding // from branching vertices to their neighbors has been implemented in the DFA states computation diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 595f1bcd..7f1aa59e 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -31,9 +31,9 @@ void Read_CdBG_Extractor::extract_maximal_unitigs() vertex_parser.launch_production(); - // Clear the output file and initialize the output logger. + // Clear the output file and initialize the output sink. clear_output_file(); - init_output_logger(); + init_output_sink(); // Launch (multi-thread) extraction of the maximal unitigs. distribute_unipaths_extraction(&vertex_parser, thread_pool); @@ -44,8 +44,8 @@ void Read_CdBG_Extractor::extract_maximal_unitigs() // Wait for the consumer threads to finish parsing and processing edges. thread_pool.close(); - // Close the output logger. - close_output_logger(); + // Close the output sink. 
+ close_output_sink(); std::cout << "Number of processed vertices: " << vertices_processed << ".\n"; std::cout << "Number of unipaths extracted: " << unipath_count << "\n"; @@ -82,7 +82,7 @@ void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const verte uint64_t vertex_count = 0; // Number of vertices scanned by this thread. uint64_t unipaths_extracted = 0; // Number of maximal unitigs successfully extracted by this thread, in the canonical form. - Character_Buffer output_buffer(output_); // The output buffer for maximal unitigs. + Character_Buffer output_buffer(output_sink.sink()); // The output buffer for maximal unitigs. unipath.reserve(BUFF_SZ); while(vertex_parser->tasks_expected(thread_id)) @@ -173,16 +173,16 @@ void Read_CdBG_Extractor::clear_output_file() const template -void Read_CdBG_Extractor::init_output_logger() +void Read_CdBG_Extractor::init_output_sink() { - output_ = std::ofstream(params.output_file_path()); + output_sink.init_sink(params.output_file_path()); } template -void Read_CdBG_Extractor::close_output_logger() +void Read_CdBG_Extractor::close_output_sink() { - output_.close(); + output_sink.close_sink(); } From 096ec15c0ef6cfd5250041271b9daf4b88aa2c42 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 3 May 2021 12:03:09 -0400 Subject: [PATCH 086/350] Separate file clearer --- include/Read_CdBG_Extractor.hpp | 3 --- include/utility.hpp | 3 +++ src/CdBG_Writer.cpp | 20 +++----------------- src/Read_CdBG_Extractor.cpp | 19 ++----------------- src/utility.cpp | 13 +++++++++++++ 5 files changed, 21 insertions(+), 37 deletions(-) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 7a19812f..9d5533d6 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -68,9 +68,6 @@ class Read_CdBG_Extractor // successful. bool mark_flanking_vertices(const Directed_Vertex& sign_vertex, const Directed_Vertex& cosign_vertex); - // Clears the output file content. - void clear_output_file() const; - // Initializes the output sink. void init_output_sink(); diff --git a/include/utility.hpp b/include/utility.hpp index c4cad56a..1e0fc5bf 100644 --- a/include/utility.hpp +++ b/include/utility.hpp @@ -28,6 +28,9 @@ std::string remove_whitespaces(const char* s); // Removes the k-mer set (KMC database) with the path prefix `kmc_file_pref`. void remove_kmer_set(const std::string& kmc_file_pref); +// Clears the content of the file at path `file_path`. +void clear_file(const std::string& file_path); + #endif diff --git a/src/CdBG_Writer.cpp b/src/CdBG_Writer.cpp index acc9298a..95a4f1d1 100644 --- a/src/CdBG_Writer.cpp +++ b/src/CdBG_Writer.cpp @@ -477,28 +477,14 @@ void CdBG::clear_output_file() const const std::string& output_file_path = params.output_file_path(); if(op_format == cuttlefish::txt || op_format == cuttlefish::gfa1 || op_format == cuttlefish::gfa2) - { - std::ofstream output(output_file_path.c_str(), std::ofstream::out | std::ofstream::trunc); - if(!output) - { - std::cerr << "Error opening output file " << output_file_path << ". 
Aborting.\n"; - std::exit(EXIT_FAILURE); - } - - output.close(); - } + clear_file(output_file_path); else if(op_format == cuttlefish::gfa_reduced) { const std::string seg_file_path(output_file_path + SEG_FILE_EXT); const std::string seq_file_path(output_file_path + SEQ_FILE_EXT); - std::ofstream output_seg(seg_file_path.c_str(), std::ofstream::out | std::ofstream::trunc), - output_seq(seq_file_path.c_str(), std::ofstream::out | std::ofstream::trunc); - if(!output_seg || !output_seq) - { - std::cerr << "Error opening output files " << seg_file_path << " and " << seq_file_path << ". Aborting.\n"; - std::exit(EXIT_FAILURE); - } + clear_file(seg_file_path); + clear_file(seq_file_path); } } diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 7f1aa59e..dd531c5d 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -1,6 +1,7 @@ #include "Read_CdBG_Extractor.hpp" #include "Character_Buffer.hpp" +#include "utility.hpp" // Definition of static members. @@ -32,7 +33,7 @@ void Read_CdBG_Extractor::extract_maximal_unitigs() vertex_parser.launch_production(); // Clear the output file and initialize the output sink. - clear_output_file(); + clear_file(params.output_file_path()); init_output_sink(); // Launch (multi-thread) extraction of the maximal unitigs. @@ -156,22 +157,6 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const } -template -void Read_CdBG_Extractor::clear_output_file() const -{ - const std::string& output_file_path = params.output_file_path(); - - std::ofstream output(output_file_path.c_str(), std::ofstream::out | std::ofstream::trunc); - if(output.fail()) - { - std::cerr << "Error opening output file " << output_file_path << ". Aborting.\n"; - std::exit(EXIT_FAILURE); - } - - output.close(); -} - - template void Read_CdBG_Extractor::init_output_sink() { diff --git a/src/utility.cpp b/src/utility.cpp index 5bc34e6d..bcb5ad24 100644 --- a/src/utility.cpp +++ b/src/utility.cpp @@ -77,3 +77,16 @@ void remove_kmer_set(const std::string& kmc_file_pref) std::exit(EXIT_FAILURE); } } + + +void clear_file(const std::string& file_path) +{ + std::ofstream file(file_path.c_str(), std::ofstream::out | std::ofstream::trunc); + if(file.fail()) + { + std::cerr << "Error opening file " << file_path << ". Aborting.\n"; + std::exit(EXIT_FAILURE); + } + + file.close(); +} From 48a7faef03547709a91e4135a8cafbc2de5061d5 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 3 May 2021 13:27:35 -0400 Subject: [PATCH 087/350] Output in canonical form --- include/Read_CdBG_Extractor.hpp | 26 ++++++++++++++++++++++++++ src/Read_CdBG_Extractor.cpp | 7 +++++-- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 9d5533d6..84a44143 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -74,6 +74,10 @@ class Read_CdBG_Extractor // Closes the output sink. void close_output_sink(); + // Replaces the character sequence `seq` in-place with its reverse complement. + template + static void reverse_complement(T_container_& seq); + // Note: The following methods are only applicable when the heuristic of information-discarding // from branching vertices to their neighbors has been implemented in the DFA states computation // phase. 
In the general case, these functions with their specified input parameters and their @@ -167,5 +171,27 @@ inline bool Read_CdBG_Extractor::is_flanking_side(const State_Read_Space stat } +template +template +inline void Read_CdBG_Extractor::reverse_complement(T_container_& seq) +{ + assert(!seq.empty()); + + auto fwd = seq.begin(); + auto bwd = seq.end() - 1; + + for(; fwd < bwd; ++fwd, --bwd) + { + std::swap(*fwd, *bwd); + + *fwd = DNA_Utility::complement(*fwd), + *bwd = DNA_Utility::complement(*bwd); + } + + if(fwd == bwd) + *fwd = DNA_Utility::complement(*fwd); +} + + #endif diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index dd531c5d..e778eaab 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -39,7 +39,7 @@ void Read_CdBG_Extractor::extract_maximal_unitigs() // Launch (multi-thread) extraction of the maximal unitigs. distribute_unipaths_extraction(&vertex_parser, thread_pool); - // Wait for the vertices to be deplted from the database. + // Wait for the vertices to be depleted from the database. vertex_parser.seize_production(); // Wait for the consumer threads to finish parsing and processing edges. @@ -75,7 +75,7 @@ template void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const vertex_parser, const uint16_t thread_id) { // Data structures to be reused per each vertex processed. - Kmer v; // For the vertex to be processed one-by-one. + Kmer v; // The vertex copy to be processed one-by-one. cuttlefish::side_t s_v; // The side of the vertex `v` that connects it to the maximal unitig containing it, if `v` is flanking. State_Read_Space state; // State of the vertex `v`. std::vector unipath; // The extracted maximal unitig from the vertex `v`. @@ -153,6 +153,9 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const if(!mark_flanking_vertices(sign_vertex, cosign_vertex)) return false; + if(!in_canonical) + reverse_complement(unipath); + return true; } From a7f678b4047fe401ac51a8893c5540357ebf0039 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 3 May 2021 21:07:25 -0400 Subject: [PATCH 088/350] Track more meta-info --- include/Read_CdBG_Extractor.hpp | 2 ++ src/Read_CdBG_Extractor.cpp | 16 ++++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 84a44143..4a661583 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -39,6 +39,8 @@ class Read_CdBG_Extractor mutable uint64_t vertices_processed = 0; // Total number of vertices scanned from the database. uint64_t unipath_count = 0; // Total number of maximal unitigs extracted from the underlying graph. + uint64_t kmer_count = 0; // Total number of k-mers in the extracted maximal unitigs. + std::size_t max_unipath_len = 0; // Length of the longest extracted maximal unitig. // Distributes the maximal unitigs extraction task — disperses the graph vertices (i.e. 
k-mers) diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index e778eaab..a5a551d0 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -49,7 +49,9 @@ void Read_CdBG_Extractor::extract_maximal_unitigs() close_output_sink(); std::cout << "Number of processed vertices: " << vertices_processed << ".\n"; - std::cout << "Number of unipaths extracted: " << unipath_count << "\n"; + std::cout << "Number of maximal unitigs: " << unipath_count << ".\n"; + std::cout << "Number of k-mers in the maximal unitigs: " << kmer_count << ".\n"; + std::cout << "Length of the longest maximal unitig: " << max_unipath_len << ".\n"; std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); @@ -81,7 +83,9 @@ void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const verte std::vector unipath; // The extracted maximal unitig from the vertex `v`. uint64_t vertex_count = 0; // Number of vertices scanned by this thread. - uint64_t unipaths_extracted = 0; // Number of maximal unitigs successfully extracted by this thread, in the canonical form. + uint64_t unipaths_extracted = 0; // Number of maximal unitigs successfully extracted by this thread. + uint64_t kmers_extracted = 0; // Number of k-mers in the extracted maximal unitigs by this thread. + std::size_t max_len = 0; // Length of the longest extracted maximal unitig by this thread. Character_Buffer output_buffer(output_sink.sink()); // The output buffer for maximal unitigs. unipath.reserve(BUFF_SZ); @@ -99,15 +103,23 @@ void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const verte // unipath.clear(); unipaths_extracted++; + kmers_extracted += (unipath.size() - 1 - (k - 1)); + if(max_len < unipath.size()) + max_len = unipath.size() - 1; } vertex_count++; } + lock.lock(); + std::cout << "Thread " << thread_id << " processed " << vertex_count << " vertices.\n"; // TODO: remove. vertices_processed += vertex_count; unipath_count += unipaths_extracted; + kmer_count += kmers_extracted; + max_unipath_len = std::max(max_unipath_len, max_len); + lock.unlock(); } From 645a1ee1b32855fcdabb84c8489300be79e67a9d Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 10 May 2021 18:25:11 -0400 Subject: [PATCH 089/350] Add minimizer structure --- include/Kmer.hpp | 93 ++++++++++++++++++++ include/Minimizer_Policy.hpp | 74 ++++++++++++++++ include/Sparse_Lock.hpp | 2 + src/CMakeLists.txt | 1 + src/Minimizer_Policy.cpp | 165 +++++++++++++++++++++++++++++++++++ 5 files changed, 335 insertions(+) create mode 100644 include/Minimizer_Policy.hpp create mode 100644 src/Minimizer_Policy.cpp diff --git a/include/Kmer.hpp b/include/Kmer.hpp index 82f7a1bb..d13297de 100644 --- a/include/Kmer.hpp +++ b/include/Kmer.hpp @@ -26,6 +26,9 @@ class Kmer: public DNA_Utility // may access private information (the raw data) from edges, i.e. (k + 1)-mers. friend class Kmer; + // Minimizers can be represented using 32-bit integers. + typedef uint32_t minimizer_t; + private: // Number of 64-bit integers required to compactly represent the underlying k-mer with 2-bits/base encoding. @@ -200,6 +203,20 @@ class Kmer: public DNA_Utility // stream `ostream`. template friend std::ostream& operator<<(std::ostream& out, const Kmer& kmer); + + // Returns the lexicographic l-minimizer for the k-mer. + template + minimizer_t minimizer() const; + + // Returns the l-minimizer for the k-mer where the vector `order` + // determines the minimizer-ordering of the l-mers, i.e. 
the order + // of the l-mer `i` is `order[i]`. + template + minimizer_t minimizer(const std::vector& order) const; + + // Accumulates the counts of the l-mers of the k-mer into `count`. + template + void count_lmers(std::vector& count) const; }; @@ -683,5 +700,81 @@ std::ostream& operator<<(std::ostream& out, const Kmer& kmer) } +template +template +inline typename Kmer::minimizer_t Kmer::minimizer() const +{ + // static_assert(l <= k); + + // TODO: SIMD? + + minimizer_t lmer = kmer_data[0] & ((1ULL << (2 * l)) - 1); + minimizer_t minmzr = lmer; + + for(uint16_t idx = l; idx < k; ++idx) + { + const uint16_t word_idx = (idx >> 5); + const uint16_t base_idx = (idx & 31); + lmer = (lmer >> 2) | + (((kmer_data[word_idx] & (0b11ULL << (2 * base_idx))) >> (2 * base_idx)) << (2 * (l - 1))); + + if(minmzr > lmer) + minmzr = lmer; + } + + + return minmzr; +} + + +template +template +inline typename Kmer::minimizer_t Kmer::minimizer(const std::vector& order) const +{ + // static_assert(l <= k); + + // TODO: SIMD? + + minimizer_t lmer = kmer_data[0] & ((1ULL << (2 * l)) - 1); + minimizer_t minmzr = lmer; + + for(uint16_t idx = l; idx < k; ++idx) + { + const uint16_t word_idx = (idx >> 5); + const uint16_t base_idx = (idx & 31); + lmer = (lmer >> 2) | + (((kmer_data[word_idx] & (0b11ULL << (2 * base_idx))) >> (2 * base_idx)) << (2 * (l - 1))); + + if(order[minmzr] > order[lmer]) + minmzr = lmer; + } + + + return minmzr; +} + + +template +template +inline void Kmer::count_lmers(std::vector& count) const +{ + // static_assert(l <= k); + + std::size_t lmer = kmer_data[0] & ((1ULL << (2 * l)) - 1); + count[lmer]++; + + for(uint16_t idx = l; idx < k; ++idx) + { + const uint16_t word_idx = (idx >> 5); + const uint16_t base_idx = (idx & 31); + + lmer = (lmer >> 2) | + (((kmer_data[word_idx] & (0b11ULL << (2 * base_idx))) >> (2 * base_idx)) << (2 * (l - 1))); + + count[lmer]++; + } +} + + #endif diff --git a/include/Minimizer_Policy.hpp b/include/Minimizer_Policy.hpp new file mode 100644 index 00000000..69f9d9cd --- /dev/null +++ b/include/Minimizer_Policy.hpp @@ -0,0 +1,74 @@ + +#ifndef MINIMIZER_POLICY_HPP +#define MINIMIZER_POLICY_HPP + + + +#include +#include +#include +#include + + +// Forward declarations. +template class Kmer_SPMC_Iterator; +class Spin_Lock; + + +// A class to manage l-minimizer related policies and functions for k-mers. +template +class Minimizer_Policy +{ +private: + + // Maximum supported length for minimizers. Note that, memory usage for the associated data structures + // may grow exponentially, as `4 ^ l` different minimizers are possible. + static constexpr uint8_t MAX_LEN = 16; + + static constexpr uint32_t NUM_LMERS = 0b1U << (2 * l); // Number of different possible `l`-mers. + std::string kmer_db_path; // Path to the underlying k-mer database. + std::vector order; // `order[i]` denotes the order of the minimizer `i` in the policy. + + + // Sets the lexicographic ordering for the l-minimizers of the k-mers. + void set_lexicographic_ordering(); + + // Sets a random ordering for the l-minimizers of the k-mers. + void set_random_ordering(); + + // Sets the frequency-based ordering for the l-minimizers of the k-mers, using up-to `thread_count` + // number of processor threads. + void set_frequency_ordering(uint16_t thread_count); + + // Counts the l-mers provided to the consumer thread with ID `thread_id` by the k-mer parser `parser`. + // The count results are stored into the vector `count` — `count[i]` is the frequency of the l-mer `i`. 
+ // The spin-lock `lock` is used for thread-safe access to `count`. + void count_lmers(Kmer_SPMC_Iterator& parser, uint16_t thread_id, std::vector& count, Spin_Lock& lock); + + // Counts the l-mer minimizers of the k-mers provided to the consumer thread with ID `thread_id` by the + // k-mer parser `parser`. The count results are stored into the vector `count` — `count[i]` is the + // frequency of the minimizer `i`. The spin-lock `lock` is used for thread-safe access to count. + void count_minimizers(Kmer_SPMC_Iterator& parser, uint16_t thread_id, std::vector& count, Spin_Lock& lock); + +public: + + // Minimizer-ordering policies supported. + enum class Policy: uint8_t + { + lexicographic, + random, + frequency, + }; + + // Constructs an l-minimizer policy object for the k-mers in the database with path `kmer_db_path`, + // ordering policy set as per `policy`. Construction may use up-to `thread_count` number of processor + // threads depending upon `policy`. + Minimizer_Policy(const std::string& kmer_db_path, Policy policy = Policy::lexicographic, uint16_t thread_count = 1); + + // Prints some statistics over the l-minimizers of the underlying k-mer set. + void print_minimizer_stats(uint16_t thread_count); +}; + + + +#endif diff --git a/include/Sparse_Lock.hpp b/include/Sparse_Lock.hpp index 63cc32c6..6703339c 100644 --- a/include/Sparse_Lock.hpp +++ b/include/Sparse_Lock.hpp @@ -4,6 +4,8 @@ +#include +#include #include diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index aaf2dd2a..992462cd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -29,6 +29,7 @@ set(PROJECT_SRC Sequence_Validator.cpp Kmers_Validator.cpp utility.cpp + Minimizer_Policy.cpp ) diff --git a/src/Minimizer_Policy.cpp b/src/Minimizer_Policy.cpp new file mode 100644 index 00000000..6f9defb1 --- /dev/null +++ b/src/Minimizer_Policy.cpp @@ -0,0 +1,165 @@ + +#include "Minimizer_Policy.hpp" +#include "Kmer_Container.hpp" +#include "Kmer_SPMC_Iterator.hpp" +#include "Spin_Lock.hpp" +#include "globals.hpp" + +#include +#include +#include +#include +#include +#include +#include + + +template +Minimizer_Policy::Minimizer_Policy(const std::string& kmer_db_path, const Policy policy, const uint16_t thread_count): + kmer_db_path(kmer_db_path), + order(NUM_LMERS) +{ + switch(policy) + { + default: + case Policy::lexicographic: + set_lexicographic_ordering(); + break; + + case Policy::random: + set_random_ordering(); + break; + + case Policy::frequency: + set_frequency_ordering(thread_count); + break; + } +} + + +template +void Minimizer_Policy::set_lexicographic_ordering() +{ + std::iota(order.begin(), order.end(), 0U); +} + + +template +void Minimizer_Policy::set_random_ordering() +{ + set_lexicographic_ordering(); + + std::shuffle(order.begin(), order.end(), std::mt19937(std::random_device()())); +} + + +template +void Minimizer_Policy::set_frequency_ordering(const uint16_t thread_count) +{ + const Kmer_Container kmer_container(kmer_db_path); + Kmer_SPMC_Iterator parser(&kmer_container, thread_count); + + + parser.launch_production(); + + std::vector count(NUM_LMERS); + Spin_Lock lock; + std::vector> T(thread_count); + + for(uint16_t thread_id = 0; thread_id < thread_count; ++thread_id) + T[thread_id].reset( + new std::thread(&Minimizer_Policy::count_lmers, this, std::ref(parser), thread_id, std::ref(count), std::ref(lock)) + ); + + parser.seize_production(); + + for(uint16_t thread_id = 0; thread_id < thread_count; ++thread_id) + T[thread_id]->join(); + + + const std::size_t max_freq_lmer = 
std::max_element(count.begin(), count.end()) - count.begin(); + std::cout << "Most frequent l-mer: " << max_freq_lmer << ".\n"; + std::cout << "Associated frequency: " << count[max_freq_lmer] << ".\n"; + + const std::size_t min_freq_lmer = std::min_element(count.begin(), count.end()) - count.begin(); + std::cout << "Least frequent l-mer: " << min_freq_lmer << ".\n"; + std::cout << "Associated frequency: " << count[min_freq_lmer] << ".\n"; + + + std::vector> freq_lmer_pair; + freq_lmer_pair.reserve(NUM_LMERS); + + for(uint32_t lmer = 0; lmer < NUM_LMERS; ++lmer) + freq_lmer_pair.emplace_back(count[lmer], lmer); + + std::sort(freq_lmer_pair.begin(), freq_lmer_pair.end()); + + + for(uint32_t idx = 0; idx < NUM_LMERS; ++idx) + order[freq_lmer_pair[idx].second] = idx; +} + + +template +void Minimizer_Policy::count_lmers(Kmer_SPMC_Iterator& parser, const uint16_t thread_id, std::vector& count, Spin_Lock& lock) +{ + std::vector local_count(NUM_LMERS); + Kmer kmer; + + while(parser.tasks_expected(thread_id)) + if(parser.value_at(thread_id, kmer)) + kmer.template count_lmers(local_count); + + lock.lock(); + std::transform(count.begin(), count.end(), local_count.begin(), count.begin(), std::plus()); + lock.unlock(); +} + + +template +void Minimizer_Policy::print_minimizer_stats(const uint16_t thread_count) +{ + const Kmer_Container kmer_container(kmer_db_path); + Kmer_SPMC_Iterator parser(&kmer_container, thread_count); + + parser.launch_production(); + + std::vector count(NUM_LMERS); + Spin_Lock lock; + std::vector> T(thread_count); + + for(uint16_t thread_id = 0; thread_id < thread_count; ++thread_id) + T[thread_id].reset( + new std::thread(&Minimizer_Policy::count_minimizers, this, std::ref(parser), thread_id, std::ref(count), std::ref(lock)) + ); + + parser.seize_production(); + + for(uint16_t thread_id = 0; thread_id < thread_count; ++thread_id) + T[thread_id]->join(); + + + const std::size_t max_freq_minmzr = std::max_element(count.begin(), count.end()) - count.begin(); + std::cout << "Most frequent l-minmizer: " << max_freq_minmzr << ".\n"; + std::cout << "Associated frequency: " << count[max_freq_minmzr] << ".\n"; + + const std::size_t min_freq_minmzr = std::min_element(count.begin(), count.end()) - count.begin(); + std::cout << "Least frequent l-minimizer: " << min_freq_minmzr << ".\n"; + std::cout << "Associated frequency: " << count[min_freq_minmzr] << ".\n"; +} + + +template +void Minimizer_Policy::count_minimizers(Kmer_SPMC_Iterator& parser, const uint16_t thread_id, std::vector& count, Spin_Lock& lock) +{ + std::vector local_count(NUM_LMERS); + Kmer kmer; + + while(parser.tasks_expected(thread_id)) + if(parser.value_at(thread_id, kmer)) + local_count[kmer.template minimizer(order)]++; + + lock.lock(); + std::transform(count.begin(), count.end(), local_count.begin(), count.begin(), std::plus()); + lock.unlock(); +} From 98890494af91cb8a7cbd36abaa047539430d7f7c Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Wed, 19 May 2021 13:17:56 -0400 Subject: [PATCH 090/350] Clean app driver header --- include/Application.hpp | 92 ++------------------------------- src/Application.cpp | 109 ++++++++++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 1 + 3 files changed, 115 insertions(+), 87 deletions(-) create mode 100644 src/Application.cpp diff --git a/include/Application.hpp b/include/Application.hpp index c21b82f4..63bdeb3b 100644 --- a/include/Application.hpp +++ b/include/Application.hpp @@ -58,99 +58,17 @@ class Application<1, T_App> public: - Application(const Build_Params& params): - 
app(params.k() == 1 ? new T_App<1>(params) : nullptr), - validator(nullptr) - {} - - - Application(const Validation_Params& params): - app(nullptr), - validator(params.k() == 1 ? new Validator<1>(params) : nullptr) - {} - - - ~Application() - { - if(app != nullptr) - delete app; - - if(validator != nullptr) - delete validator; - } - + Application(const Build_Params& params); - void execute() const - { - if(app != nullptr) - app->construct(); - else - { - std::cerr << "The provided k is not valid. Aborting.\n"; - std::exit(EXIT_FAILURE); - } - } + Application(const Validation_Params& params); + ~Application(); - bool validate() const - { - if(validator != nullptr) - return validator->validate(); + void execute() const; - std::cerr << "The provided k is not valid. Aborting.\n"; - std::exit(EXIT_FAILURE); - } + bool validate() const; }; -template typename T_App> -inline Application::Application(const Build_Params& params): - app_next_level(new Application(params)), - app(params.k() == k ? new T_App(params) : nullptr), - validator(nullptr) -{} - - -template typename T_App> -inline Application::Application(const Validation_Params& params): - app_next_level(new Application(params)), - app(nullptr), - validator(params.k() == k ? new Validator(params): nullptr) -{} - - -template typename T_App> -inline Application::~Application() -{ - delete app_next_level; - - if(app != nullptr) - delete app; - - if(validator != nullptr) - delete validator; -} - - -template typename T_App> -inline void Application::execute() const -{ - if(app != nullptr) - app->construct(); - else - app_next_level->execute(); -} - - -template typename T_App> -inline bool Application::validate() const -{ - if(validator != nullptr) - return validator->validate(); - - return app_next_level->validate(); -} - - #endif diff --git a/src/Application.cpp b/src/Application.cpp new file mode 100644 index 00000000..cc91e0eb --- /dev/null +++ b/src/Application.cpp @@ -0,0 +1,109 @@ + +#include "Application.hpp" +#include "globals.hpp" +#include "CdBG.hpp" +#include "Read_CdBG.hpp" + + +template typename T_App> +Application::Application(const Build_Params& params): + app_next_level(new Application(params)), + app(params.k() == k ? new T_App(params) : nullptr), + validator(nullptr) +{} + + +template typename T_App> +Application::Application(const Validation_Params& params): + app_next_level(new Application(params)), + app(nullptr), + validator(params.k() == k ? new Validator(params): nullptr) +{} + + +template typename T_App> +Application::~Application() +{ + delete app_next_level; + + if(app != nullptr) + delete app; + + if(validator != nullptr) + delete validator; +} + + +template typename T_App> +void Application::execute() const +{ + if(app != nullptr) + app->construct(); + else + app_next_level->execute(); +} + + +template typename T_App> +bool Application::validate() const +{ + if(validator != nullptr) + return validator->validate(); + + return app_next_level->validate(); +} + + +template typename T_App> +Application<1, T_App>::Application(const Build_Params& params): + app(params.k() == 1 ? new T_App<1>(params) : nullptr), + validator(nullptr) +{} + + +template typename T_App> +Application<1, T_App>::Application(const Validation_Params& params): + app(nullptr), + validator(params.k() == 1 ? 
new Validator<1>(params) : nullptr) +{} + + +template typename T_App> +Application<1, T_App>::~Application() +{ + if(app != nullptr) + delete app; + + if(validator != nullptr) + delete validator; +} + + +template typename T_App> +void Application<1, T_App>::execute() const +{ + if(app != nullptr) + app->construct(); + else + { + std::cerr << "The provided k is not valid. Aborting.\n"; + std::exit(EXIT_FAILURE); + } +} + + +template typename T_App> +bool Application<1, T_App>::validate() const +{ + if(validator != nullptr) + return validator->validate(); + + std::cerr << "The provided k is not valid. Aborting.\n"; + std::exit(EXIT_FAILURE); +} + + + +// Template instantiations for the required instances. +template class Application; +template class Application; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 992462cd..27d1fdf4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -6,6 +6,7 @@ set(PROJECT_SRC kmc_api/mmer.cpp xxHash/xxhash.c xxHash/xxhsum.c + Application.cpp Parser.cpp Async_Logger_Wrapper.cpp Thread_Pool.cpp From dadc6ff4029ff1ce973dce5be50a126ffabb2e78 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 20 May 2021 14:29:08 -0400 Subject: [PATCH 091/350] Misc. --- include/Endpoint.hpp | 2 ++ src/main.cpp | 22 ++++++++++++---------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/include/Endpoint.hpp b/include/Endpoint.hpp index 37290186..6ad79eda 100644 --- a/include/Endpoint.hpp +++ b/include/Endpoint.hpp @@ -18,6 +18,8 @@ class Endpoint { private: + // TODO: Refactor the class with inclusion of a `Directed_Vertex` instance, replacing four fields. + Kmer kmer_; // The endpoint k-mer spelled by the edge instance. Kmer kmer_bar_; // Reverse complement of the k-mer spelled by the edge instance. const Kmer* kmer_hat_ptr; // Pointer to the canonical form of the endpoint k-mer. diff --git a/src/main.cpp b/src/main.cpp index c14b46e9..8a6251b5 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -47,7 +47,7 @@ void build(int argc, char** argv) return; } - const bool is_read_graph = result["read"].as(); + const auto is_read_graph = result["read"].as(); const auto refs = result["refs"].as>(); const auto lists = result["lists"].as>(); const auto dirs = result["dirs"].as>(); @@ -72,15 +72,15 @@ void build(int argc, char** argv) std::cout.precision(3); - const std::string dBG_type = (params.is_read_graph() ? "read" : "reference"); + const std::string dBg_type(params.is_read_graph() ? "read" : "reference"); - std::cout << "\nConstructing the " << dBG_type << " compacted de Bruijn graph for k = " << k << ".\n"; + std::cout << "\nConstructing the compacted " << dBg_type << " de Bruijn graph for k = " << k << ".\n"; - const Application app_ref_dBG(params); - const Application app_read_dBG(params); - params.is_read_graph() ? app_read_dBG.execute() : app_ref_dBG.execute(); + params.is_read_graph() ? + Application(params).execute() : + Application(params).execute(); - std::cout << "\nConstructed the " << dBG_type << " compacted de Bruijn graph at " << output_file << ".\n"; + std::cout << "\nConstructed the " << dBg_type << " compacted de Bruijn graph at " << output_file << ".\n"; } catch(const std::exception& e) { @@ -136,8 +136,8 @@ void validate(int argc, char** argv) std::cout << "\nValidating the compacted de Bruijn graph for k = " << k << "\n"; - const Application app(params); - std::cout << (app.validate() ? "\nValidation successful" : "\nValidation failed") << std::endl; + std::cout << (Application(params).validate() ? 
+ "\nValidation successful" : "\nValidation failed") << std::endl; } catch(const std::exception& e) { @@ -157,7 +157,9 @@ int main(int argc, char** argv) } else { - const std::string command = argv[1]; + std::string command(argv[1]); + std::transform(command.begin(), command.end(), command.begin(), [](const char ch) { return std::tolower(ch); }); + if(command == "build") build(argc - 1, argv + 1); else if(command == "validate") From c3ca48c747eaff54a47787d6767c563ebdedb28b Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 20 May 2021 17:36:37 -0400 Subject: [PATCH 092/350] Better max cap for long unipaths --- include/Character_Buffer.hpp | 10 +++++----- include/Read_CdBG_Extractor.hpp | 5 ++++- src/Read_CdBG_Extractor.cpp | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/include/Character_Buffer.hpp b/include/Character_Buffer.hpp index f3840983..2cf348f4 100644 --- a/include/Character_Buffer.hpp +++ b/include/Character_Buffer.hpp @@ -103,21 +103,21 @@ template template inline void Character_Buffer::operator+=(const T_container_& str) { - if(buffer.size() + str.size() >= CAPACITY) + if(buffer.size() + str.size() >= CAPACITY) // Using `>=` since for async logging, a `\0` is inserted at the end of `buffer`. { flush(); if(str.size() >= CAPACITY) { - std::cerr << "A single output string overflows the string-buffer capacity.\n" - "Output string length: " << str.size() << ", string-buffer capacity: " << CAPACITY << ".\n" - "Please consider increasing the buffer capacity parameter in build for future use.\n"; + // std::cerr << "A single output string overflows the string-buffer capacity.\n" + // "Output string length: " << str.size() << ", string-buffer capacity: " << CAPACITY << ".\n" + // "Please consider increasing the buffer capacity parameter in build for future use.\n"; buffer.reserve(str.size()); } } - + // `std::memcpy` at the end of `buffer` does not update the size of the vector `buffer`. buffer.insert(buffer.end(), str.begin(), str.end()); } diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 4a661583..5038f08f 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -32,7 +32,10 @@ class Read_CdBG_Extractor // typedef std::ofstream sink_t; typedef Async_Logger_Wrapper sink_t; Output_Sink output_sink; // Sink for the output maximal unitigs. - static constexpr std::size_t BUFF_SZ = 100 * 1024ULL; // 100 KB. + + // TODO: give these limits more thoughts, especially their exact impact on the memory usage. + static constexpr std::size_t BUFF_SZ = 100 * 1024ULL; // 100 KB (soft limit) worth of maximal unitigs can be retained in memory, at most, before flushing. + static constexpr std::size_t SEQ_SZ = 5 * 1024ULL * 1024ULL; // 5 MB (soft limit) sized maximal unitig, at most, is constructed at a time. // Members required to keep track of the total number of vertices processed across different worker (i.e. extractor) threads. mutable Spin_Lock lock; // Mutual exclusion lock to access various unique resources by threads spawned off this class' methods. diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index a5a551d0..8b779013 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -88,7 +88,7 @@ void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const verte std::size_t max_len = 0; // Length of the longest extracted maximal unitig by this thread. Character_Buffer output_buffer(output_sink.sink()); // The output buffer for maximal unitigs. 
- unipath.reserve(BUFF_SZ); + unipath.reserve(SEQ_SZ); while(vertex_parser->tasks_expected(thread_id)) if(vertex_parser->value_at(thread_id, v)) From 8dcff69898f305f5a30ba39842883e9832a7e607 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Fri, 4 Jun 2021 18:05:11 -0400 Subject: [PATCH 093/350] Better meta-info tracking --- include/Read_CdBG_Extractor.hpp | 9 +++-- include/Unipaths_Meta_info.hpp | 60 +++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 1 + src/Read_CdBG_Extractor.cpp | 21 ++++-------- src/Unipaths_Meta_info.cpp | 44 ++++++++++++++++++++++++ 5 files changed, 116 insertions(+), 19 deletions(-) create mode 100644 include/Unipaths_Meta_info.hpp create mode 100644 src/Unipaths_Meta_info.cpp diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 5038f08f..c75fbd09 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -14,7 +14,9 @@ #include "Thread_Pool.hpp" #include "Async_Logger_Wrapper.hpp" #include "Output_Sink.hpp" +#include "Unipaths_Meta_info.hpp" +#include #include @@ -37,13 +39,10 @@ class Read_CdBG_Extractor static constexpr std::size_t BUFF_SZ = 100 * 1024ULL; // 100 KB (soft limit) worth of maximal unitigs can be retained in memory, at most, before flushing. static constexpr std::size_t SEQ_SZ = 5 * 1024ULL * 1024ULL; // 5 MB (soft limit) sized maximal unitig, at most, is constructed at a time. - // Members required to keep track of the total number of vertices processed across different worker (i.e. extractor) threads. - mutable Spin_Lock lock; // Mutual exclusion lock to access various unique resources by threads spawned off this class' methods. mutable uint64_t vertices_processed = 0; // Total number of vertices scanned from the database. + mutable Spin_Lock lock; // Mutual exclusion lock to access various unique resources by threads spawned off this class' methods. - uint64_t unipath_count = 0; // Total number of maximal unitigs extracted from the underlying graph. - uint64_t kmer_count = 0; // Total number of k-mers in the extracted maximal unitigs. - std::size_t max_unipath_len = 0; // Length of the longest extracted maximal unitig. + Unipaths_Meta_info unipaths_meta_info; // Meta-information over the extracted maximal unitigs. // Distributes the maximal unitigs extraction task — disperses the graph vertices (i.e. k-mers) diff --git a/include/Unipaths_Meta_info.hpp b/include/Unipaths_Meta_info.hpp new file mode 100644 index 00000000..df3e12cd --- /dev/null +++ b/include/Unipaths_Meta_info.hpp @@ -0,0 +1,60 @@ + +#ifndef UNIPATHS_META_INFO_HPP +#define UNIPATHS_META_INFO_HPP + + + +#include +#include + + +// A class to track meta-information over maximal unipaths extracted by some worker thread. +template +class Unipaths_Meta_info +{ +private: + + uint64_t unipath_count; // Total number of maximal unitigs. + uint64_t kmer_count; // Total number of k-mers in the maximal unitigs. + std::size_t max_len; // Length of the longest maximal unitig. + std::size_t min_len; // Length of the shortest maximal unitig. + uint64_t sum_len; // Sum length of the maximal unitigs. + + +public: + + // Constructs a meta-information tracker for maximal unitigs. + Unipaths_Meta_info(); + + // Adds information of the maximal unitig `unipath` to the tracker. + template + void add_maximal_unitig(const T_container_& unipath); + + // Aggregates the information of the tracker `other` to this tracker. + void aggregate(const Unipaths_Meta_info& other); + + // Prints the tracked information to the standard output. 
+ void print() const; +}; + + +template +template +inline void Unipaths_Meta_info::add_maximal_unitig(const T_container_& unipath) +{ + unipath_count++; + + kmer_count += unipath.size() - (k - 1); + + if(max_len < unipath.size()) + max_len = unipath.size(); + + if(min_len > unipath.size()) + min_len = unipath.size(); + + sum_len += unipath.size(); +} + + + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 27d1fdf4..ca8b6255 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -25,6 +25,7 @@ set(PROJECT_SRC Read_CdBG.cpp Read_CdBG_Constructor.cpp Read_CdBG_Extractor.cpp + Unipaths_Meta_info.cpp Validator.cpp Validator_Hash_Table.cpp Sequence_Validator.cpp diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 8b779013..16579a81 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -49,9 +49,7 @@ void Read_CdBG_Extractor::extract_maximal_unitigs() close_output_sink(); std::cout << "Number of processed vertices: " << vertices_processed << ".\n"; - std::cout << "Number of maximal unitigs: " << unipath_count << ".\n"; - std::cout << "Number of k-mers in the maximal unitigs: " << kmer_count << ".\n"; - std::cout << "Length of the longest maximal unitig: " << max_unipath_len << ".\n"; + unipaths_meta_info.print(); std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); @@ -83,9 +81,7 @@ void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const verte std::vector unipath; // The extracted maximal unitig from the vertex `v`. uint64_t vertex_count = 0; // Number of vertices scanned by this thread. - uint64_t unipaths_extracted = 0; // Number of maximal unitigs successfully extracted by this thread. - uint64_t kmers_extracted = 0; // Number of k-mers in the extracted maximal unitigs by this thread. - std::size_t max_len = 0; // Length of the longest extracted maximal unitig by this thread. + Unipaths_Meta_info extracted_unipaths_info; // Meta-information over the maximal unitigs extracted by this thread. Character_Buffer output_buffer(output_sink.sink()); // The output buffer for maximal unitigs. unipath.reserve(SEQ_SZ); @@ -98,27 +94,24 @@ void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const verte if(!state.is_outputted() && is_flanking_state(state, s_v)) if(extract_maximal_unitig(v, s_v, unipath)) { + extracted_unipaths_info.add_maximal_unitig(unipath); + unipath.emplace_back('\n'); output_buffer += unipath; // unipath.clear(); - - unipaths_extracted++; - kmers_extracted += (unipath.size() - 1 - (k - 1)); - if(max_len < unipath.size()) - max_len = unipath.size() - 1; } vertex_count++; } + // Aggregate the meta-information over the extracted maximal unitigs and the thread-executions. lock.lock(); std::cout << "Thread " << thread_id << " processed " << vertex_count << " vertices.\n"; // TODO: remove. 
+ vertices_processed += vertex_count; - unipath_count += unipaths_extracted; - kmer_count += kmers_extracted; - max_unipath_len = std::max(max_unipath_len, max_len); + unipaths_meta_info.aggregate(extracted_unipaths_info); lock.unlock(); } diff --git a/src/Unipaths_Meta_info.cpp b/src/Unipaths_Meta_info.cpp new file mode 100644 index 00000000..d6fd388f --- /dev/null +++ b/src/Unipaths_Meta_info.cpp @@ -0,0 +1,44 @@ + +#include "Unipaths_Meta_info.hpp" +#include "globals.hpp" + +#include +#include + + +template +Unipaths_Meta_info::Unipaths_Meta_info(): + unipath_count(0), + kmer_count(0), + max_len(0), + min_len(std::numeric_limits::max()), + sum_len(0) +{} + + +template +void Unipaths_Meta_info::aggregate(const Unipaths_Meta_info& other) +{ + unipath_count += other.unipath_count; + kmer_count += other.kmer_count; + + max_len = std::max(max_len, other.max_len); + min_len = std::min(min_len, other.min_len); + sum_len += other.sum_len; +} + + +template +void Unipaths_Meta_info::print() const +{ + std::cout << "Number of maximal unitigs: " << unipath_count << ".\n"; + std::cout << "Number of k-mers in the maximal unitigs: " << kmer_count << ".\n"; + std::cout << "Length of the longest maximal unitig (in bases): " << max_len << ".\n"; + std::cout << "Length of the shortest maximal unitig (in bases): " << min_len << ".\n"; + std::cout << "Sum length of the maximal unitigs (in bases): " << sum_len << ".\n"; +} + + + +// Template instantiations for the required instances. +ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Unipaths_Meta_info) From 3d684825cebac375d57ed2e5e7fd32d19e3c0e8e Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 7 Jun 2021 15:32:46 -0400 Subject: [PATCH 094/350] Output fasta --- include/Character_Buffer.hpp | 41 ++++++++++++++--- include/FASTA_Record.hpp | 82 +++++++++++++++++++++++++++++++++ include/Read_CdBG_Extractor.hpp | 7 +-- src/Read_CdBG_Extractor.cpp | 19 ++++---- 4 files changed, 131 insertions(+), 18 deletions(-) create mode 100644 include/FASTA_Record.hpp diff --git a/include/Character_Buffer.hpp b/include/Character_Buffer.hpp index 2cf348f4..92dd95b5 100644 --- a/include/Character_Buffer.hpp +++ b/include/Character_Buffer.hpp @@ -6,6 +6,7 @@ #include "Spin_Lock.hpp" #include "Async_Logger_Wrapper.hpp" +#include "FASTA_Record.hpp" #include #include @@ -26,6 +27,10 @@ class Character_Buffer T_sink_& sink; // Reference to the sink to flush the buffer content to. + // Ensures that `buffer` has enough space for additional `append_size` + // number of bytes, using flush and allocation as necessary. + void ensure_space(std::size_t append_size); + // Flushes the buffer content to the sink, and clears the buffer. void flush(); @@ -39,6 +44,11 @@ class Character_Buffer template void operator+=(const T_container_& str); + // Appends the content of the FASTA record `fasta_rec` to the buffer. Flushes + // are possible. + template + void operator+=(const FASTA_Record& fasta_rec); + // Destructs the buffer object, flushing it if content are present. ~Character_Buffer(); }; @@ -103,22 +113,41 @@ template template inline void Character_Buffer::operator+=(const T_container_& str) { - if(buffer.size() + str.size() >= CAPACITY) // Using `>=` since for async logging, a `\0` is inserted at the end of `buffer`. + ensure_space(str.size()); + + // `std::memcpy` at the end of `buffer` does not update the size of the vector `buffer`. 
+ buffer.insert(buffer.end(), str.begin(), str.end()); +} + + +template +template +inline void Character_Buffer::operator+=(const FASTA_Record& fasta_rec) +{ + ensure_space(fasta_rec.header_size() + 1 + fasta_rec.seq_size()); + + fasta_rec.append_header(buffer); // Append the header. + buffer.emplace_back('\n'); // Break-line. + fasta_rec.append_seq(buffer); // Append the sequence. +} + + +template +inline void Character_Buffer::ensure_space(const std::size_t append_size) +{ + if(buffer.size() + append_size >= CAPACITY) // Using `>=` since for async logging, a `\0` is inserted at the end of `buffer`. { flush(); - if(str.size() >= CAPACITY) + if(append_size >= CAPACITY) { // std::cerr << "A single output string overflows the string-buffer capacity.\n" // "Output string length: " << str.size() << ", string-buffer capacity: " << CAPACITY << ".\n" // "Please consider increasing the buffer capacity parameter in build for future use.\n"; - buffer.reserve(str.size()); + buffer.reserve(append_size); } } - - // `std::memcpy` at the end of `buffer` does not update the size of the vector `buffer`. - buffer.insert(buffer.end(), str.begin(), str.end()); } diff --git a/include/FASTA_Record.hpp b/include/FASTA_Record.hpp new file mode 100644 index 00000000..20ddc71d --- /dev/null +++ b/include/FASTA_Record.hpp @@ -0,0 +1,82 @@ + +#ifndef FASTA_RECORD_HPP +#define FASTA_RECORD_HPP + + + +#include "fmt/format.h" + +// ============================================================================= +// A class wrapping a basic FASTA record: the sequence of type `T_seq_` and its +// header/identifier of type `T_id`. The class is specifically designed for +// writing purposed of output maximal unitigs in the FASTA format. +template +class FASTA_Record +{ +private: + + const T_id_ id_; // Identifier for the FASTA sequence. + const T_seq_& seq_; // The FASTA sequence. + + +public: + + // Constructs a FASTA header with identifier `id` and the sequence `seq`. + // Only a constant reference to the sequence is captured, so the record's + // correctness holds as long as the referred sequence itself remains unaltered. + FASTA_Record(uint64_t id, const T_seq_& str); + + // Returns the length of the header line of the record. + std::size_t header_size() const; + + // Returns the length of the sequence of the record. + std::size_t seq_size() const; + + // Appends the header line to the vector `buffer`. + void append_header(std::vector& buffer) const; + + // Appends the FASTA sequence to the vector `buffer`. + void append_seq(std::vector& buffer) const; +}; + + +template +inline FASTA_Record::FASTA_Record(const uint64_t id, const T_seq_& seq): + id_(id), + seq_(seq) +{} + + +template +inline std::size_t FASTA_Record::header_size() const +{ + return id_.size() + static_cast(1U); // One additional byte for `>`. +} + + +template +inline std::size_t FASTA_Record::seq_size() const +{ + return seq_.size(); +} + + +template +inline void FASTA_Record::append_header(std::vector& buffer) const +{ + buffer.emplace_back('>'); + + buffer.insert(buffer.end(), id_.data(), id_.data() + id_.size()); +} + + +template +inline void FASTA_Record::append_seq(std::vector& buffer) const +{ + // `std::memcpy` at the end of `buffer` does not update the size of the vector `buffer`. 
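+    // (Same rationale as in `Character_Buffer::operator+=`: `insert` grows the size while copying.)
+    // A minimal usage sketch of the record, with hypothetical locals `seq_id` and `seq`:
+    //     FASTA_Record rec(seq_id, seq);    // wraps the ID and a reference to the sequence
+    //     output_buffer += rec;             // emits '>', the ID, a line-break, then the sequence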
+ buffer.insert(buffer.end(), seq_.begin(), seq_.end()); +} + + + +#endif diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index c75fbd09..5a3e660a 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -58,9 +58,10 @@ class Read_CdBG_Extractor // Extracts the maximal unitig `p` that is flanked by the vertex `v_hat` and connects to `v_hat` // through its side `s_v_hat`. Returns `true` iff the extraction is successful, which happens when // the maximal unitig is encountered and attempted for output-marking _first_, by some thread. If - // the attempt is successful, then the maximal unitig is extracted in its canonical form, into the - // string `unipath` (it is overwritten). If not, `unipath` may contain partial form of the unitig. - bool extract_maximal_unitig(const Kmer& v_hat, cuttlefish::side_t s_v_hat, std::vector& unipath); + // the attempt is successful, then the maximal unitig is extracted in its canonical form into + // `unipath` (it is overwritten); also, a unique ID for it is put in `id`. If not, `unipath` may + // contain partial form of the unitig, and `id` is unaltered. + bool extract_maximal_unitig(const Kmer& v_hat, cuttlefish::side_t s_v_hat, uint64_t& id, std::vector& unipath); // Marks the vertex `v` as outputted. Returns `true` iff `v` has not been marked yet and the hash // table update is successful. diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 16579a81..c1515bd9 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -1,13 +1,10 @@ #include "Read_CdBG_Extractor.hpp" +#include "FASTA_Record.hpp" #include "Character_Buffer.hpp" #include "utility.hpp" -// Definition of static members. -template constexpr std::size_t Read_CdBG_Extractor::BUFF_SZ; - - template Read_CdBG_Extractor::Read_CdBG_Extractor(const Build_Params& params, Kmer_Hash_Table& hash_table): params(params), @@ -76,9 +73,10 @@ void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const verte { // Data structures to be reused per each vertex processed. Kmer v; // The vertex copy to be processed one-by-one. - cuttlefish::side_t s_v; // The side of the vertex `v` that connects it to the maximal unitig containing it, if `v` is flanking. + cuttlefish::side_t s_v; // The side of the vertex `v` that connects it to the maximal unitig `u` containing it, if `v` is flanking. State_Read_Space state; // State of the vertex `v`. - std::vector unipath; // The extracted maximal unitig from the vertex `v`. + uint64_t id; // The unique ID of the maximal unitig `u`. + std::vector unipath; // The extracted maximal unitig `u`. uint64_t vertex_count = 0; // Number of vertices scanned by this thread. Unipaths_Meta_info extracted_unipaths_info; // Meta-information over the maximal unitigs extracted by this thread. 
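+    // (`extract_maximal_unitig` sets `id` to the hash of a fixed vertex of the unitig, so the
+    //  ID is unique over the extracted maximal unitigs and is used as the FASTA header.)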
@@ -92,12 +90,13 @@ void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const verte state = hash_table[v].state(); if(!state.is_outputted() && is_flanking_state(state, s_v)) - if(extract_maximal_unitig(v, s_v, unipath)) + if(extract_maximal_unitig(v, s_v, id, unipath)) { extracted_unipaths_info.add_maximal_unitig(unipath); unipath.emplace_back('\n'); - output_buffer += unipath; + // output_buffer += unipath; + output_buffer += FASTA_Record>(id, unipath); // unipath.clear(); } @@ -118,7 +117,7 @@ void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const verte template -bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const cuttlefish::side_t s_v_hat, std::vector& unipath) +bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const cuttlefish::side_t s_v_hat, uint64_t& id, std::vector& unipath) { // Data structures to be reused per each vertex extension of the maximal unitig. cuttlefish::side_t s_v = s_v_hat; // The side of the current vertex `v` through which to extend the maximal unitig, i.e. exit `v`. @@ -161,6 +160,8 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const if(!in_canonical) reverse_complement(unipath); + id = sign_vertex.hash(); + return true; } From a0832bdd2907906bab5ac8e001969b92cca62724 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 7 Jun 2021 18:17:03 -0400 Subject: [PATCH 095/350] Refactor --- include/Unipaths_Meta_info.hpp | 24 ++++++++++++------------ src/Unipaths_Meta_info.cpp | 30 +++++++++++++++--------------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/include/Unipaths_Meta_info.hpp b/include/Unipaths_Meta_info.hpp index df3e12cd..261a2356 100644 --- a/include/Unipaths_Meta_info.hpp +++ b/include/Unipaths_Meta_info.hpp @@ -14,11 +14,11 @@ class Unipaths_Meta_info { private: - uint64_t unipath_count; // Total number of maximal unitigs. - uint64_t kmer_count; // Total number of k-mers in the maximal unitigs. - std::size_t max_len; // Length of the longest maximal unitig. - std::size_t min_len; // Length of the shortest maximal unitig. - uint64_t sum_len; // Sum length of the maximal unitigs. + uint64_t unipath_count_; // Total number of maximal unitigs. + uint64_t kmer_count_; // Total number of k-mers in the maximal unitigs. + std::size_t max_len_; // Length of the longest maximal unitig. + std::size_t min_len_; // Length of the shortest maximal unitig. + uint64_t sum_len_; // Sum length of the maximal unitigs. 
public: @@ -42,17 +42,17 @@ template template inline void Unipaths_Meta_info::add_maximal_unitig(const T_container_& unipath) { - unipath_count++; + unipath_count_++; - kmer_count += unipath.size() - (k - 1); + kmer_count_ += unipath.size() - (k - 1); - if(max_len < unipath.size()) - max_len = unipath.size(); + if(max_len_ < unipath.size()) + max_len_ = unipath.size(); - if(min_len > unipath.size()) - min_len = unipath.size(); + if(min_len_ > unipath.size()) + min_len_ = unipath.size(); - sum_len += unipath.size(); + sum_len_ += unipath.size(); } diff --git a/src/Unipaths_Meta_info.cpp b/src/Unipaths_Meta_info.cpp index d6fd388f..410be089 100644 --- a/src/Unipaths_Meta_info.cpp +++ b/src/Unipaths_Meta_info.cpp @@ -8,34 +8,34 @@ template Unipaths_Meta_info::Unipaths_Meta_info(): - unipath_count(0), - kmer_count(0), - max_len(0), - min_len(std::numeric_limits::max()), - sum_len(0) + unipath_count_(0), + kmer_count_(0), + max_len_(0), + min_len_(std::numeric_limits::max()), + sum_len_(0) {} template void Unipaths_Meta_info::aggregate(const Unipaths_Meta_info& other) { - unipath_count += other.unipath_count; - kmer_count += other.kmer_count; + unipath_count_ += other.unipath_count_; + kmer_count_ += other.kmer_count_; - max_len = std::max(max_len, other.max_len); - min_len = std::min(min_len, other.min_len); - sum_len += other.sum_len; + max_len_ = std::max(max_len_, other.max_len_); + min_len_ = std::min(min_len_, other.min_len_); + sum_len_ += other.sum_len_; } template void Unipaths_Meta_info::print() const { - std::cout << "Number of maximal unitigs: " << unipath_count << ".\n"; - std::cout << "Number of k-mers in the maximal unitigs: " << kmer_count << ".\n"; - std::cout << "Length of the longest maximal unitig (in bases): " << max_len << ".\n"; - std::cout << "Length of the shortest maximal unitig (in bases): " << min_len << ".\n"; - std::cout << "Sum length of the maximal unitigs (in bases): " << sum_len << ".\n"; + std::cout << "Number of maximal unitigs: " << unipath_count_ << ".\n"; + std::cout << "Number of k-mers in the maximal unitigs: " << kmer_count_ << ".\n"; + std::cout << "Length of the longest maximal unitig (in bases): " << max_len_ << ".\n"; + std::cout << "Length of the shortest maximal unitig (in bases): " << min_len_ << ".\n"; + std::cout << "Sum length of the maximal unitigs (in bases): " << sum_len_ << ".\n"; } From 7c005efc29ccfdb567388279ecb124069d58a5b7 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 7 Jun 2021 18:31:16 -0400 Subject: [PATCH 096/350] Detect detached cycle-existences --- data/cycles.fa | 4 ++++ include/Unipaths_Meta_info.hpp | 3 +++ src/Read_CdBG_Extractor.cpp | 5 +++++ src/Unipaths_Meta_info.cpp | 7 +++++++ 4 files changed, 19 insertions(+) create mode 100644 data/cycles.fa diff --git a/data/cycles.fa b/data/cycles.fa new file mode 100644 index 00000000..e2d6cbf2 --- /dev/null +++ b/data/cycles.fa @@ -0,0 +1,4 @@ +>1 +AACAC +>2 +TAGATAG diff --git a/include/Unipaths_Meta_info.hpp b/include/Unipaths_Meta_info.hpp index 261a2356..32a42ecf 100644 --- a/include/Unipaths_Meta_info.hpp +++ b/include/Unipaths_Meta_info.hpp @@ -33,6 +33,9 @@ class Unipaths_Meta_info // Aggregates the information of the tracker `other` to this tracker. void aggregate(const Unipaths_Meta_info& other); + // Returns the total number of k-mers in the extracted maximal unitigs. + uint64_t kmer_count() const; + // Prints the tracked information to the standard output. 
void print() const; }; diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index c1515bd9..c2323d50 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -48,6 +48,11 @@ void Read_CdBG_Extractor::extract_maximal_unitigs() std::cout << "Number of processed vertices: " << vertices_processed << ".\n"; unipaths_meta_info.print(); + // Check for the existence of cycle(s). + if(unipaths_meta_info.kmer_count() != vertex_container.size()) + std::cout << "\nCycles disconnected from the rest of the graph are present." + " I.e. the cycles are graph components exclusively on their own.\n\n"; + std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); double elapsed_seconds = std::chrono::duration_cast>(t_end - t_start).count(); diff --git a/src/Unipaths_Meta_info.cpp b/src/Unipaths_Meta_info.cpp index 410be089..81bc4ae8 100644 --- a/src/Unipaths_Meta_info.cpp +++ b/src/Unipaths_Meta_info.cpp @@ -28,6 +28,13 @@ void Unipaths_Meta_info::aggregate(const Unipaths_Meta_info& other) } +template +uint64_t Unipaths_Meta_info::kmer_count() const +{ + return kmer_count_; +} + + template void Unipaths_Meta_info::print() const { From d986206f31cd07d7bf5381098447f25bf6c10561 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Tue, 8 Jun 2021 10:51:15 -0400 Subject: [PATCH 097/350] Reduce in-header include directives --- include/Read_CdBG_Extractor.hpp | 8 +++++--- src/Read_CdBG_Extractor.cpp | 3 +++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 5a3e660a..935fd156 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -6,12 +6,9 @@ #include "globals.hpp" #include "Kmer_Hash_Table.hpp" -#include "Kmer_Container.hpp" -#include "Kmer_SPMC_Iterator.hpp" #include "Directed_Vertex.hpp" #include "Build_Params.hpp" #include "Spin_Lock.hpp" -#include "Thread_Pool.hpp" #include "Async_Logger_Wrapper.hpp" #include "Output_Sink.hpp" #include "Unipaths_Meta_info.hpp" @@ -20,6 +17,11 @@ #include +// Forward declarations. +template class Kmer_SPMC_Iterator; +template class Thread_Pool; + + // A class to extract the vertices from a compacted de Bruin graph — which are the maximal unitigs of some ordinary de Bruijn graph. template class Read_CdBG_Extractor diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index c2323d50..19253dd5 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -1,8 +1,11 @@ #include "Read_CdBG_Extractor.hpp" +#include "Kmer_Container.hpp" +#include "Kmer_SPMC_Iterator.hpp" #include "FASTA_Record.hpp" #include "Character_Buffer.hpp" #include "utility.hpp" +#include "Thread_Pool.hpp" template From ed139c7ab62e18086f12de43c1c2288e0c5c6fa6 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Tue, 8 Jun 2021 17:49:29 -0400 Subject: [PATCH 098/350] Relocate build params validator --- include/Build_Params.hpp | 60 -------------------------------------- src/Build_Params.cpp | 62 ++++++++++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 1 + 3 files changed, 63 insertions(+), 60 deletions(-) create mode 100644 src/Build_Params.cpp diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index 8bd531fc..e44b0251 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -153,65 +153,5 @@ class Build_Params }; -inline bool Build_Params::is_valid() const -{ - bool valid = true; - - - // Check if read and reference de Bruijn graph parameters are being mixed with. 
- if(is_read_graph_) - { - if(!reference_input_.empty()) - { - std::cout << "No reference is to be provided for a compacted read de Bruijn graph construction.\n"; - valid = false; - } - - if(edge_db_path_.empty()) - { - std::cout << "The path prefix to the KMC-database for edges (i.e. (k + 1)-mers) is required.\n"; - valid = false; - } - } - else - { - if(!edge_db_path_.empty()) - { - std::cout << "No edge (i.e. (k + 1)-mer) database is required for a compacted reference de Bruijn graph construction.\n"; - valid = false; - } - } - - - // Even `k` values are not consistent with the theory. - // Also, `k` needs to be in the range `[1, MAX_K]`. - if((k_ & 1) == 0 || (k_ > cuttlefish::MAX_K)) - { - std::cout << "The k-mer length (k) needs to be odd and within " << cuttlefish::MAX_K << ".\n"; - valid = false; - } - - - // Discard unsupported thread counts. - const auto num_threads = std::thread::hardware_concurrency(); - if(num_threads > 0 && thread_count_ > num_threads) - { - std::cout << "At most " << num_threads << " concurrent threads are supported at the machine.\n"; - valid = false; - } - - - // Discard invalid output formats. - if(output_format_ >= cuttlefish::num_op_formats) - { - std::cout << "Invalid output file format.\n"; - valid = false; - } - - - return valid; -} - - #endif diff --git a/src/Build_Params.cpp b/src/Build_Params.cpp new file mode 100644 index 00000000..131ee9e7 --- /dev/null +++ b/src/Build_Params.cpp @@ -0,0 +1,62 @@ + +#include "Build_Params.hpp" + + +bool Build_Params::is_valid() const +{ + bool valid = true; + + + // Check if read and reference de Bruijn graph parameters are being mixed with. + if(is_read_graph_) + { + if(!reference_input_.empty()) + { + std::cout << "No reference is to be provided for a compacted read de Bruijn graph construction.\n"; + valid = false; + } + + if(edge_db_path_.empty()) + { + std::cout << "The path prefix to the KMC-database for edges (i.e. (k + 1)-mers) is required.\n"; + valid = false; + } + } + else + { + if(!edge_db_path_.empty()) + { + std::cout << "No edge (i.e. (k + 1)-mer) database is required for a compacted reference de Bruijn graph construction.\n"; + valid = false; + } + } + + + // Even `k` values are not consistent with the theory. + // Also, `k` needs to be in the range `[1, MAX_K]`. + if((k_ & 1) == 0 || (k_ > cuttlefish::MAX_K)) + { + std::cout << "The k-mer length (k) needs to be odd and within " << cuttlefish::MAX_K << ".\n"; + valid = false; + } + + + // Discard unsupported thread counts. + const auto num_threads = std::thread::hardware_concurrency(); + if(num_threads > 0 && thread_count_ > num_threads) + { + std::cout << "At most " << num_threads << " concurrent threads are supported at the machine.\n"; + valid = false; + } + + + // Discard invalid output formats. 
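+    // (`num_op_formats` denotes the number of supported output formats, so any format code at or
+    //  above it is rejected.)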
+ if(output_format_ >= cuttlefish::num_op_formats) + { + std::cout << "Invalid output file format.\n"; + valid = false; + } + + + return valid; +} diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ca8b6255..38e3b6f3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -6,6 +6,7 @@ set(PROJECT_SRC kmc_api/mmer.cpp xxHash/xxhash.c xxHash/xxhsum.c + Build_Params.cpp Application.cpp Parser.cpp Async_Logger_Wrapper.cpp From 72175ddda7f054cd72efaf3b0eb629fb66406e02 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Wed, 9 Jun 2021 12:31:17 -0400 Subject: [PATCH 099/350] Add DC-cycles extraction parameter Detached Chordless cycles --- include/Build_Params.hpp | 14 ++++++++++-- src/Build_Params.cpp | 48 +++++++++++++++++++++++++++++++++++----- src/main.cpp | 4 +++- 3 files changed, 57 insertions(+), 9 deletions(-) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index e44b0251..ce037d29 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -30,6 +30,7 @@ class Build_Params const bool remove_kmc_db_; // Option to remove the KMC database, once no longer required. const std::string& mph_file_path_; // Optional path to file storing an MPH over the k-mer set. const std::string& buckets_file_path_; // Optional path to file storing the hash table buckets for the k-mer set. + const bool extract_cycles_; // Option to extract detached chordless cycles from the de Bruijn graph after compaction. public: @@ -48,7 +49,8 @@ class Build_Params const std::string& working_dir_path, const bool remove_kmc_db, const std::string& mph_file_path, - const std::string& buckets_file_path): + const std::string& buckets_file_path, + const bool extract_cycles): is_read_graph_(is_read_graph), reference_input_(ref_paths, list_paths, dir_paths), k_(k), @@ -60,7 +62,8 @@ class Build_Params working_dir_path_(working_dir_path), remove_kmc_db_(remove_kmc_db), mph_file_path_(mph_file_path), - buckets_file_path_(buckets_file_path) + buckets_file_path_(buckets_file_path), + extract_cycles_(extract_cycles) {} @@ -148,6 +151,13 @@ class Build_Params } + // Returns whether the option of extracting detached chordless cycles is specified. + bool extract_cycles() const + { + return extract_cycles_; + } + + // Returns `true` iff the parameters selections are valid. bool is_valid() const; }; diff --git a/src/Build_Params.cpp b/src/Build_Params.cpp index 131ee9e7..995f264d 100644 --- a/src/Build_Params.cpp +++ b/src/Build_Params.cpp @@ -1,5 +1,6 @@ #include "Build_Params.hpp" +#include "utility.hpp" bool Build_Params::is_valid() const @@ -8,7 +9,7 @@ bool Build_Params::is_valid() const // Check if read and reference de Bruijn graph parameters are being mixed with. - if(is_read_graph_) + if(is_read_graph_) // Is a read de Bruijn graph. { if(!reference_input_.empty()) { @@ -16,25 +17,60 @@ bool Build_Params::is_valid() const valid = false; } - if(edge_db_path_.empty()) + if(!extract_cycles_) // Construction of the compacted dBG is requested, not the detached chordless cycles extraction. { - std::cout << "The path prefix to the KMC-database for edges (i.e. (k + 1)-mers) is required.\n"; - valid = false; + if(edge_db_path_.empty()) + { + std::cout << "The path prefix to the KMC-database for edges (i.e. (k + 1)-mers) is required.\n"; + valid = false; + } + } + else // Detached chordless cycles extraction is requested. + { + if(vertex_db_path_.empty()) + { + std::cout << "The path prefix to the KMC-database for vertices (i.e. 
k-mers) is required for the cycles' extraction.\n"; + valid = false; + } + + if(mph_file_path_.empty() || !file_exists(mph_file_path_)) + { + std::cout << "The Minimal Perfect Hash Function (MPHF) file (*.bbh) is required for the cycles' extraction.\n"; + valid = false; + } + + if(buckets_file_path_.empty() || !file_exists(buckets_file_path_)) + { + std::cout << "The hash table buckets file (*.cf) is required for the cycles' extraction.\n"; + valid = false; + } + + if(output_file_path_.empty() || !file_exists(output_file_path_)) + { + std::cout << "The output maximal unitigs file (*.fasta) is required for the cycles' extraction.\n"; + valid = false; + } } } - else + else // Is a reference de Bruijn graph. { if(!edge_db_path_.empty()) { std::cout << "No edge (i.e. (k + 1)-mer) database is required for a compacted reference de Bruijn graph construction.\n"; valid = false; } + + if(extract_cycles_) + { + std::cout << "Existence of detached chordless cycles are impossible for reference de Bruijn graphs by definition.\n"; + valid = false; + } } // Even `k` values are not consistent with the theory. // Also, `k` needs to be in the range `[1, MAX_K]`. - if((k_ & 1) == 0 || (k_ > cuttlefish::MAX_K)) + if((k_ & 1U) == 0 || (k_ > cuttlefish::MAX_K)) { std::cout << "The k-mer length (k) needs to be odd and within " << cuttlefish::MAX_K << ".\n"; valid = false; diff --git a/src/main.cpp b/src/main.cpp index 8a6251b5..b96a0aa6 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -36,6 +36,7 @@ void build(int argc, char** argv) ("rm", "remove the KMC database") ("mph", "minimal perfect hash (BBHash) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("buckets", "hash table buckets (cuttlefish) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) + ("cycles", "extract the detached chordless cycles of the graph") ("h,help", "print usage"); try @@ -61,8 +62,9 @@ void build(int argc, char** argv) const auto working_dir = result["work_dir"].as(); const auto mph_file = result["mph"].as(); const auto buckets_file = result["buckets"].as(); + const auto extract_cycles = result["cycles"].as(); - const Build_Params params(is_read_graph, refs, lists, dirs, k, kmer_database, edge_database, thread_count, output_file, format, working_dir, remove_kmc_db, mph_file, buckets_file); + const Build_Params params(is_read_graph, refs, lists, dirs, k, kmer_database, edge_database, thread_count, output_file, format, working_dir, remove_kmc_db, mph_file, buckets_file, extract_cycles); if(!params.is_valid()) { std::cerr << "Invalid input configuration. 
Aborting.\n"; From 84af1ab177919b0fdc4c2a80a297706b0ba6e8ac Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Wed, 9 Jun 2021 15:43:02 -0400 Subject: [PATCH 100/350] Fix latent bug --- src/Thread_Pool.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Thread_Pool.cpp b/src/Thread_Pool.cpp index a25c3cb7..f7ff2ae8 100644 --- a/src/Thread_Pool.cpp +++ b/src/Thread_Pool.cpp @@ -102,6 +102,7 @@ void Thread_Pool::task(const uint16_t thread_id) const Read_dBG_Compaction_Params& params = read_dBG_compaction_params[thread_id]; static_cast*>(dBG)->process_vertices(static_cast*>(params.parser), params.thread_id); } + break; } From bf48b01f74100c8512ae6e07a1ed4da502dbb7c1 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Wed, 9 Jun 2021 16:34:01 -0400 Subject: [PATCH 101/350] Better interface Avoid `hash_table[key] = state` from compiling --- include/Kmer_Hash_Table.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index 8cde5ee9..3ccae92f 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -93,7 +93,7 @@ class Kmer_Hash_Table Kmer_Hash_Entry_API operator[](const Kmer& kmer); // Returns the value (in the hash-table) for the key `kmer`. - State operator[](const Kmer& kmer) const; + const State operator[](const Kmer& kmer) const; // Attempts to update the entry (in the hash-table) for the API object according // to its wrapped state values, and returns `true` or `false` as per success @@ -148,7 +148,7 @@ inline Kmer_Hash_Entry_API Kmer_Hash_Table::opera template -inline State Kmer_Hash_Table::operator[](const Kmer& kmer) const +inline const State Kmer_Hash_Table::operator[](const Kmer& kmer) const { const uint64_t bucket = bucket_id(kmer); From 4def0c60f33bf72d4d01352d55e8a3ce4acbaf28 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Wed, 9 Jun 2021 19:37:48 -0400 Subject: [PATCH 102/350] Rephrase terminology process -> scan --- include/Read_CdBG_Extractor.hpp | 11 ++++++----- src/Read_CdBG_Extractor.cpp | 10 +++++----- src/Thread_Pool.cpp | 3 ++- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 935fd156..ac314726 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -41,7 +41,7 @@ class Read_CdBG_Extractor static constexpr std::size_t BUFF_SZ = 100 * 1024ULL; // 100 KB (soft limit) worth of maximal unitigs can be retained in memory, at most, before flushing. static constexpr std::size_t SEQ_SZ = 5 * 1024ULL * 1024ULL; // 5 MB (soft limit) sized maximal unitig, at most, is constructed at a time. - mutable uint64_t vertices_processed = 0; // Total number of vertices scanned from the database. + mutable uint64_t vertices_scanned = 0; // Total number of vertices scanned from the database. mutable Spin_Lock lock; // Mutual exclusion lock to access various unique resources by threads spawned off this class' methods. Unipaths_Meta_info unipaths_meta_info; // Meta-information over the extracted maximal unitigs. @@ -52,10 +52,11 @@ class Read_CdBG_Extractor // for the unitpath-flanking vertices to be identified and the corresponding unipaths to be extracted. void distribute_unipaths_extraction(Kmer_SPMC_Iterator* vertex_parser, Thread_Pool& thread_pool); - // Processes the vertices provided to the thread with id `thread_id` from the parser `vertex_parser`, - // i.e. 
for each vertex `v` provided to that thread, identifies whether it is a unipath-flanking - // vertex, and if it is, then piece-wise constructs the corresponding unipath. - void process_vertices(Kmer_SPMC_Iterator* vertex_parser, uint16_t thread_id); + // Scans the vertices provided to the thread with id `thread_id` from the parser `vertex_parser` + // for potential unipath-flanking vertices, i.e. for each vertex `v` provided to that thread, + // identifies whether it is a unipath-flanking vertex, and if it is, then piece-wise constructs + // the corresponding unipath. + void scan_vertices(Kmer_SPMC_Iterator* vertex_parser, uint16_t thread_id); // Extracts the maximal unitig `p` that is flanked by the vertex `v_hat` and connects to `v_hat` // through its side `s_v_hat`. Returns `true` iff the extraction is successful, which happens when diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 19253dd5..eb8198e2 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -48,7 +48,7 @@ void Read_CdBG_Extractor::extract_maximal_unitigs() // Close the output sink. close_output_sink(); - std::cout << "Number of processed vertices: " << vertices_processed << ".\n"; + std::cout << "Number of scanned vertices: " << vertices_scanned << ".\n"; unipaths_meta_info.print(); // Check for the existence of cycle(s). @@ -77,10 +77,10 @@ void Read_CdBG_Extractor::distribute_unipaths_extraction(Kmer_SPMC_Iterator -void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const vertex_parser, const uint16_t thread_id) +void Read_CdBG_Extractor::scan_vertices(Kmer_SPMC_Iterator* const vertex_parser, const uint16_t thread_id) { - // Data structures to be reused per each vertex processed. - Kmer v; // The vertex copy to be processed one-by-one. + // Data structures to be reused per each vertex scanned. + Kmer v; // The vertex copy to be scanned one-by-one. cuttlefish::side_t s_v; // The side of the vertex `v` that connects it to the maximal unitig `u` containing it, if `v` is flanking. State_Read_Space state; // State of the vertex `v`. uint64_t id; // The unique ID of the maximal unitig `u`. @@ -117,7 +117,7 @@ void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const verte std::cout << "Thread " << thread_id << " processed " << vertex_count << " vertices.\n"; // TODO: remove. 
- vertices_processed += vertex_count; + vertices_scanned += vertex_count; unipaths_meta_info.aggregate(extracted_unipaths_info); lock.unlock(); diff --git a/src/Thread_Pool.cpp b/src/Thread_Pool.cpp index f7ff2ae8..63533aec 100644 --- a/src/Thread_Pool.cpp +++ b/src/Thread_Pool.cpp @@ -100,7 +100,8 @@ void Thread_Pool::task(const uint16_t thread_id) case Task_Type::extract_unipaths_read_space: { const Read_dBG_Compaction_Params& params = read_dBG_compaction_params[thread_id]; - static_cast*>(dBG)->process_vertices(static_cast*>(params.parser), params.thread_id); + static_cast*>(dBG)-> + scan_vertices(static_cast*>(params.parser), params.thread_id); } break; } From 4fd21fd6415d7c2db266ec52cfa44a717f373707 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Wed, 9 Jun 2021 19:41:18 -0400 Subject: [PATCH 103/350] Add DC-cycles extractor interface --- include/Read_CdBG_Extractor.hpp | 4 ++++ src/Read_CdBG.cpp | 7 +++++-- src/Read_CdBG_Extractor.cpp | 5 +++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index ac314726..01340d05 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -127,6 +127,10 @@ class Read_CdBG_Extractor // Extracts the maximal unitigs of the de Bruijn graph. void extract_maximal_unitigs(); + + // Extracts the chordless cycles from the de Bruijn graph that are completely disconnected from the + // rest of the graph. A precondition for the algorithm is the availability of the maximal unitigs. + void extract_detached_cycles(); }; diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index d9dbcfa6..66d87b8a 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -21,9 +21,12 @@ void Read_CdBG::construct() Read_CdBG_Constructor cdBg_constructor(params, hash_table); cdBg_constructor.compute_DFA_states(); - std::cout << "\nExtracting the maximal unitigs.\n"; + std::cout << (!params.extract_cycles() ? + "\nExtracting the maximal unitigs.\n": "\nExtracting the detached chordless cycles.\n"); Read_CdBG_Extractor cdBg_extractor(params, hash_table); - cdBg_extractor.extract_maximal_unitigs(); + !params.extract_cycles() ? + cdBg_extractor.extract_maximal_unitigs(): + cdBg_extractor.extract_detached_cycles(); hash_table.clear(); } diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index eb8198e2..1fb1bff0 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -174,6 +174,11 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const } +template +void Read_CdBG_Extractor::extract_detached_cycles() +{} + + template void Read_CdBG_Extractor::init_output_sink() { From f90f7043c1ec78a0f3be9fee01bb851d74dbc071 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Thu, 10 Jun 2021 11:02:06 -0400 Subject: [PATCH 104/350] Mark vertices in maximal unitigs --- include/Read_CdBG_Extractor.hpp | 20 ++++++ include/Thread_Pool.hpp | 1 + src/Read_CdBG_Extractor.cpp | 122 ++++++++++++++++++++++++++++++-- src/Thread_Pool.cpp | 12 +++- 4 files changed, 149 insertions(+), 6 deletions(-) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 01340d05..45c75d1f 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -43,6 +43,8 @@ class Read_CdBG_Extractor mutable uint64_t vertices_scanned = 0; // Total number of vertices scanned from the database. mutable Spin_Lock lock; // Mutual exclusion lock to access various unique resources by threads spawned off this class' methods. 
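+    // Detached chordless cycles are detected by elimination: every vertex lying on some extracted
+    // maximal unitig is marked first, so a vertex left unmarked afterwards can only belong to a
+    // chordless cycle forming a graph component of its own.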
+ + mutable uint64_t vertices_marked = 0; // Total number of vertices marked as present in maximal unitigs; used for the extraction of detached chordless cycle(s), if any. Unipaths_Meta_info unipaths_meta_info; // Meta-information over the extracted maximal unitigs. @@ -66,6 +68,24 @@ class Read_CdBG_Extractor // contain partial form of the unitig, and `id` is unaltered. bool extract_maximal_unitig(const Kmer& v_hat, cuttlefish::side_t s_v_hat, uint64_t& id, std::vector& unipath); + // Marks all the vertices that are present in the maximal unitigs of the graph. + void mark_maximal_unitig_vertices(); + + // Scans the vertices provided to the thread with id `thread_id` from the parser `vertex_parser` + // for potential unipath-flanking vertices. If a vertex `v` is found to be a flanking one, then + // piece-wise constructs the corresponding (partial) maximal unitig starting the traversal from + // `v`, and marks the vertices along the way. Premature halts before traversing the entire unitig + // `p` is possible, in cases when some other thread is concurrently constructing `p`, but from + // the opposite flank — the halt happens at the threads' meeting-point. + void mark_maximal_unitig_vertices(Kmer_SPMC_Iterator* vertex_parser, uint16_t thread_id); + + // Marks (partially) the vertices of the maximal unitig `p` that is flanked by the vertex `v_hat` + // from one side and connects to `v_hat` through its side `s_v_hat`. `p` might not be marked + // completely by this thread if some other thread is concurrently traversing `p`, but from the + // opposite flank. However, together these two threads mark `p` completely. Also returns the number + // of vertices marked in this execution. + std::size_t mark_maximal_unitig(const Kmer& v_hat, cuttlefish::side_t s_v_hat); + // Marks the vertex `v` as outputted. Returns `true` iff `v` has not been marked yet and the hash // table update is successful. bool mark_vertex(const Directed_Vertex& v); diff --git a/include/Thread_Pool.hpp b/include/Thread_Pool.hpp index 1ab7eeb8..06d52f6d 100644 --- a/include/Thread_Pool.hpp +++ b/include/Thread_Pool.hpp @@ -29,6 +29,7 @@ class Thread_Pool output_gfa_reduced, compute_states_read_space, extract_unipaths_read_space, + mark_unipath_vertices, }; diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 1fb1bff0..bd2d6431 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -36,7 +36,7 @@ void Read_CdBG_Extractor::extract_maximal_unitigs() clear_file(params.output_file_path()); init_output_sink(); - // Launch (multi-thread) extraction of the maximal unitigs. + // Launch (multi-threaded) extraction of the maximal unitigs. distribute_unipaths_extraction(&vertex_parser, thread_pool); // Wait for the vertices to be depleted from the database. @@ -81,10 +81,10 @@ void Read_CdBG_Extractor::scan_vertices(Kmer_SPMC_Iterator* const vertex_p { // Data structures to be reused per each vertex scanned. Kmer v; // The vertex copy to be scanned one-by-one. - cuttlefish::side_t s_v; // The side of the vertex `v` that connects it to the maximal unitig `u` containing it, if `v` is flanking. + cuttlefish::side_t s_v; // The side of the vertex `v` that connects it to the maximal unitig `p` containing it, if `v` is flanking. State_Read_Space state; // State of the vertex `v`. - uint64_t id; // The unique ID of the maximal unitig `u`. - std::vector unipath; // The extracted maximal unitig `u`. + uint64_t id; // The unique ID of the maximal unitig `p`. 
+ std::vector unipath; // The extracted maximal unitig `p`. uint64_t vertex_count = 0; // Number of vertices scanned by this thread. Unipaths_Meta_info extracted_unipaths_info; // Meta-information over the maximal unitigs extracted by this thread. @@ -176,7 +176,119 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const template void Read_CdBG_Extractor::extract_detached_cycles() -{} +{ + std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); + + std::cout << "Marking the vertices present in the extracted maximal unitigs.\n"; + mark_maximal_unitig_vertices(); + std::cout << "Done marking the vertices.\n"; + + + std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); + double elapsed_seconds = std::chrono::duration_cast>(t_end - t_start).count(); + std::cout << "Done extracting the detached chordless cycles. Time taken = " << elapsed_seconds << " seconds.\n"; +} + + +template +void Read_CdBG_Extractor::mark_maximal_unitig_vertices() +{ + // Construct a thread pool. + const uint16_t thread_count = params.thread_count(); + Thread_Pool thread_pool(thread_count, this, Thread_Pool::Task_Type::mark_unipath_vertices); + + // Launch the reading (and parsing per demand) of the vertices from disk. + const Kmer_Container vertex_container(params.vertex_db_path()); // Wrapper container for the vertex-database. + Kmer_SPMC_Iterator vertex_parser(&vertex_container, params.thread_count()); // Parser for the vertices from the vertex-database. + std::cout << "Number of distinct vertices: " << vertex_container.size() << ".\n"; + + vertex_parser.launch_production(); + + + // Launch (multi-threaded) marking of the vertices present in the maximal unitigs. + distribute_unipaths_extraction(&vertex_parser, thread_pool); + + // Wait for the vertices to be depleted from the database. + vertex_parser.seize_production(); + + // Wait for the consumer threads to finish parsing and processing edges. + thread_pool.close(); + + std::cout << "Number of scanned vertices: " << vertices_scanned << ".\n"; + std::cout << "Number of marked vertices: " << vertices_marked << ".\n"; +} + + + +template +void Read_CdBG_Extractor::mark_maximal_unitig_vertices(Kmer_SPMC_Iterator* const vertex_parser, const uint16_t thread_id) +{ + // Data structures to be reused per each vertex scanned. + Kmer v; // The vertex copy to be scanned one-by-one. + cuttlefish::side_t s_v; // The side of the vertex `v` that connects it to the maximal unitig containing it, if `v` is flanking. + State_Read_Space state; // State of the vertex `v`. + + uint64_t vertex_count = 0; // Number of vertices scanned by this thread. + uint64_t marked_count = 0; // Number of vertices marked as present in maximal unitigs by this thread. + + while(vertex_parser->tasks_expected(thread_id)) + if(vertex_parser->value_at(thread_id, v)) + { + state = hash_table[v].state(); + + if(!state.is_outputted() && is_flanking_state(state, s_v)) + marked_count += mark_maximal_unitig(v, s_v); + + vertex_count++; + } + + + // Aggregate the meta-information over the marked maximal unitigs and the thread-executions. + lock.lock(); + + std::cout << "Thread " << thread_id << " processed " << vertex_count << " vertices," // TODO: remove. 
+ " and marked " << marked_count << " vertices.\n"; + + vertices_scanned += vertex_count; + vertices_marked += marked_count; + + lock.unlock(); +} + + +template +std::size_t Read_CdBG_Extractor::mark_maximal_unitig(const Kmer& v_hat, const cuttlefish::side_t s_v_hat) +{ + // Data structures to be reused per each vertex extension of the maximal unitig. + cuttlefish::side_t s_v = s_v_hat; // The side of the current vertex `v` through which to extend the maximal unitig, i.e. exit `v`. + Directed_Vertex v(s_v == cuttlefish::side_t::back ? v_hat : v_hat.reverse_complement(), hash_table); // Current vertex being added to the maximal unitig. + State_Read_Space state = hash_table[v.hash()].state(); // State of the vertex `v`. + cuttlefish::edge_encoding_t e_v; // The next edge from `v` to include into the maximal unitig. + cuttlefish::base_t b_ext; // The nucleobase corresponding to the edge `e_v` and the exiting side `s_v` from `v` to add to the literal maximal unitig. + + std::size_t marked_count = 0; // Number of vertices successfully marked by this thread. + + while(true) + { + if(!mark_vertex(v)) + break; + + marked_count++; + + if(is_flanking_side(state, s_v)) + break; + + e_v = state.edge_at(s_v); + b_ext = (s_v == cuttlefish::side_t::back ? DNA_Utility::map_base(e_v) : DNA_Utility::complement(DNA_Utility::map_base(e_v))); + + v.roll_forward(b_ext, hash_table); + s_v = v.exit_side(); + state = hash_table[v.hash()].state(); + } + + + return marked_count; +} template diff --git a/src/Thread_Pool.cpp b/src/Thread_Pool.cpp index 63533aec..2fdb7f80 100644 --- a/src/Thread_Pool.cpp +++ b/src/Thread_Pool.cpp @@ -35,6 +35,7 @@ Thread_Pool::Thread_Pool(const uint16_t thread_count, void* const dBG, const case Task_Type::compute_states_read_space: case Task_Type::extract_unipaths_read_space: + case Task_Type::mark_unipath_vertices: read_dBG_compaction_params.resize(thread_count); break; @@ -93,7 +94,8 @@ void Thread_Pool::task(const uint16_t thread_id) case Task_Type::compute_states_read_space: { const Read_dBG_Compaction_Params& params = read_dBG_compaction_params[thread_id]; - static_cast*>(dBG)->process_edges(static_cast*>(params.parser), params.thread_id); + static_cast*>(dBG)-> + process_edges(static_cast*>(params.parser), params.thread_id); } break; @@ -104,6 +106,14 @@ void Thread_Pool::task(const uint16_t thread_id) scan_vertices(static_cast*>(params.parser), params.thread_id); } break; + + case Task_Type::mark_unipath_vertices: + { + const Read_dBG_Compaction_Params& params = read_dBG_compaction_params[thread_id]; + static_cast*>(dBG)-> + mark_maximal_unitig_vertices(static_cast*>(params.parser), params.thread_id); + } + break; } From 84c90eb9ca684140fdf7ceed7fbd754d47f29655 Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Fri, 11 Jun 2021 12:07:39 -0400 Subject: [PATCH 105/350] Extract detached chordless cycles --- include/Directed_Vertex.hpp | 28 ++++++- include/Kmer.hpp | 15 ++++ include/Read_CdBG_Extractor.hpp | 24 +++++- include/Thread_Pool.hpp | 1 + src/Read_CdBG_Extractor.cpp | 143 +++++++++++++++++++++++++++++++- src/Thread_Pool.cpp | 9 ++ 6 files changed, 216 insertions(+), 4 deletions(-) diff --git a/include/Directed_Vertex.hpp b/include/Directed_Vertex.hpp index d491bb2b..67b43dc7 100644 --- a/include/Directed_Vertex.hpp +++ b/include/Directed_Vertex.hpp @@ -43,6 +43,9 @@ class Directed_Vertex // Copy constructs the vertex from `rhs`. Directed_Vertex(const Directed_Vertex& rhs); + // Assigns the vertex `rhs` to this one, and returns a constant reference to this object. 
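+    // (A member-wise copy would leave `kmer_hat_ptr` pointing into `rhs`; this operator re-seats
+    //  it at this object's own `kmer_` or `kmer_bar_`, whichever corresponds.)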
+ const Directed_Vertex& operator=(const Directed_Vertex& rhs); + // Returns `true` iff the k-mer observed for the vertex is in its canonical form. bool in_canonical_form() const; @@ -76,6 +79,10 @@ class Directed_Vertex // edge instance if this vertex instance were to be the source vertex (i.e. prefix k-mer) // of that edge. cuttlefish::side_t exit_side() const; + + // Returns `true` iff this vertex and the vertex `v` are the same vertex, without the + // directionality. + bool is_same_vertex(const Directed_Vertex& v) const; }; @@ -101,11 +108,23 @@ template inline Directed_Vertex::Directed_Vertex(const Directed_Vertex& rhs): kmer_(rhs.kmer_), kmer_bar_(rhs.kmer_bar_), - kmer_hat_ptr(Kmer::canonical(kmer_, kmer_bar_)), + kmer_hat_ptr(Kmer::canonical(kmer_, kmer_bar_)), // TODO: replace with pointer-check based assignment (check `operator=`). h(rhs.h) {} +template +inline const Directed_Vertex& Directed_Vertex::operator=(const Directed_Vertex& rhs) +{ + kmer_ = rhs.kmer_; + kmer_bar_ = rhs.kmer_bar_; + kmer_hat_ptr = (rhs.kmer_hat_ptr == &rhs.kmer_ ? &kmer_ : &kmer_bar_); + h = rhs.h; + + return *this; +} + + template inline bool Directed_Vertex::in_canonical_form() const { @@ -174,5 +193,12 @@ inline cuttlefish::side_t Directed_Vertex::exit_side() const } +template +inline bool Directed_Vertex::is_same_vertex(const Directed_Vertex& v) const +{ + return hash() == v.hash(); +} + + #endif diff --git a/include/Kmer.hpp b/include/Kmer.hpp index d13297de..7d63114c 100644 --- a/include/Kmer.hpp +++ b/include/Kmer.hpp @@ -121,6 +121,10 @@ class Kmer: public DNA_Utility // encoding of the other k-mer `rhs`. bool operator<(const Kmer& rhs) const; + // Returns `true` iff the bitwise encoding of this k-mer is larger to + // the encoding of the other k-mer `rhs`. + bool operator>(const Kmer& rhs) const; + // Returns true iff this k-mer is identical to the other k-mer `rhs`. bool operator==(const Kmer& rhs) const; @@ -477,6 +481,17 @@ inline bool Kmer::operator<(const Kmer& rhs) const } +template +inline bool Kmer::operator>(const Kmer& rhs) const +{ + for(int16_t idx = NUM_INTS - 1; idx >= 0; --idx) + if(kmer_data[idx] != rhs.kmer_data[idx]) + return kmer_data[idx] > rhs.kmer_data[idx]; + + return false; +} + + template inline bool Kmer::operator==(const Kmer& rhs) const { diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 45c75d1f..de0c36ab 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -45,6 +45,8 @@ class Read_CdBG_Extractor mutable Spin_Lock lock; // Mutual exclusion lock to access various unique resources by threads spawned off this class' methods. mutable uint64_t vertices_marked = 0; // Total number of vertices marked as present in maximal unitigs; used for the extraction of detached chordless cycle(s), if any. + mutable uint64_t cycle_count = 0; // Total number of detached chordless cycles. + mutable uint64_t cycle_vertex_count = 0; // Total number of vertices present in the detached chordless cycles. Unipaths_Meta_info unipaths_meta_info; // Meta-information over the extracted maximal unitigs. @@ -82,10 +84,28 @@ class Read_CdBG_Extractor // Marks (partially) the vertices of the maximal unitig `p` that is flanked by the vertex `v_hat` // from one side and connects to `v_hat` through its side `s_v_hat`. `p` might not be marked // completely by this thread if some other thread is concurrently traversing `p`, but from the - // opposite flank. However, together these two threads mark `p` completely. 
Also returns the number - // of vertices marked in this execution. + // opposite flank. However, together these two threads mark `p` completely. Also returns the + // number of vertices marked in this execution. std::size_t mark_maximal_unitig(const Kmer& v_hat, cuttlefish::side_t s_v_hat); + // Extracts all the detached chordless cycles present in the graph. + void extract_detached_chordless_cycles(); + + // Scans the vertices provided to the thread with id `thread_id` from the parser `vertex_parser` + // for potential detached chordless cycles. If a vertex `v` is found to be not marked as present + // in the earlier extracted maximal unitigs, then it implies — by definition from the Cuttlefish + // algorithm — that `v` belongs to a chordless cycle that is detached completely from the rest of + // the graph. The method piece-wise constructs the cycle, starting the traversal from `v`. + void extract_detached_chordless_cycles(Kmer_SPMC_Iterator* vertex_parser, uint16_t thread_id); + + // Extracts the detached chordless cycle `p` that contains the vertex `v_hat`. Returns `true` iff + // the extraction is successful, which happens when the cycle is encountered and attempted for + // output-marking _first_ by this thread. If the attempt is successful, then the cycle is extracted + // in its literal form that starts with `v_hat`'s canonical representation, into `cycle` (it is + // overwritten); also, a unique ID for it is put in `id`. If not, `cycle` may contain partial form + // of the unitig, and `id` is unaltered. + bool extract_cycle(const Kmer& v_hat, uint64_t& id, std::vector& cycle); + // Marks the vertex `v` as outputted. Returns `true` iff `v` has not been marked yet and the hash // table update is successful. bool mark_vertex(const Directed_Vertex& v); diff --git a/include/Thread_Pool.hpp b/include/Thread_Pool.hpp index 06d52f6d..1b6fa8ac 100644 --- a/include/Thread_Pool.hpp +++ b/include/Thread_Pool.hpp @@ -30,6 +30,7 @@ class Thread_Pool compute_states_read_space, extract_unipaths_read_space, mark_unipath_vertices, + extract_cycles, }; diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index bd2d6431..45ca571c 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -179,14 +179,21 @@ void Read_CdBG_Extractor::extract_detached_cycles() { std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); + std::cout << "Marking the vertices present in the extracted maximal unitigs.\n"; mark_maximal_unitig_vertices(); std::cout << "Done marking the vertices.\n"; + std::cout << "Extracting the cycles.\n"; + extract_detached_chordless_cycles(); + + std::cout << "\nNumber of detached chordless cycles: " << cycle_count << ".\n" + "Number of vertices in the cycles: " << cycle_vertex_count << ".\n"; + std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); double elapsed_seconds = std::chrono::duration_cast>(t_end - t_start).count(); - std::cout << "Done extracting the detached chordless cycles. Time taken = " << elapsed_seconds << " seconds.\n"; + std::cout << "Done extracting the cycles. 
Time taken = " << elapsed_seconds << " seconds.\n"; } @@ -291,6 +298,140 @@ std::size_t Read_CdBG_Extractor::mark_maximal_unitig(const Kmer& v_hat, co } +template +void Read_CdBG_Extractor::extract_detached_chordless_cycles() +{ + if(vertices_marked == vertices_scanned) + { + std::cout << "\nNo detached chordless cycle exists in the de Bruijn graph.\n"; + return; + } + + // Construct a thread pool. + const uint16_t thread_count = params.thread_count(); + Thread_Pool thread_pool(thread_count, this, Thread_Pool::Task_Type::extract_cycles); + + // Launch the reading (and parsing per demand) of the vertices from disk. + const Kmer_Container vertex_container(params.vertex_db_path()); // Wrapper container for the vertex-database. + Kmer_SPMC_Iterator vertex_parser(&vertex_container, params.thread_count()); // Parser for the vertices from the vertex-database. + std::cout << "Number of distinct vertices: " << vertex_container.size() << ".\n"; + + vertex_parser.launch_production(); + + // Initialize the output sink. + init_output_sink(); + + // Launch (multi-threaded) marking of the vertices present in the maximal unitigs. + distribute_unipaths_extraction(&vertex_parser, thread_pool); + + // Wait for the vertices to be depleted from the database. + vertex_parser.seize_production(); + + // Wait for the consumer threads to finish parsing and processing edges. + thread_pool.close(); + + // Close the output sink. + close_output_sink(); +} + + +template +void Read_CdBG_Extractor::extract_detached_chordless_cycles(Kmer_SPMC_Iterator* vertex_parser, const uint16_t thread_id) +{ + // Data structures to be reused per each vertex scanned. + Kmer v; // The vertex copy to be scanned one-by-one. + State_Read_Space state; // State of the vertex `v`. + uint64_t id; // The unique ID of the cycle. + std::vector cycle; // The extracted cycle. + + uint64_t vertex_count = 0; // Number of vertices scanned by this thread. + uint64_t cycles_extracted = 0; // Number of detached chordless cycles extracted by this thread. + uint64_t cycle_vertices = 0; // Number of vertices found to be in detached chordless cycles by this thread. + + Character_Buffer output_buffer(output_sink.sink()); // The output buffer for the cycles. + cycle.reserve(SEQ_SZ); + + while(vertex_parser->tasks_expected(thread_id)) + if(vertex_parser->value_at(thread_id, v)) + { + state = hash_table[v].state(); + + if(!state.is_outputted()) + if(extract_cycle(v, id, cycle)) + { + cycles_extracted++; + cycle_vertices += cycle.size() - (k - 1); + + cycle.emplace_back('\n'); + output_buffer += FASTA_Record>(id, cycle); + } + + vertex_count++; + } + + + // Aggregate the meta-information over the marked maximal unitigs and the thread-executions. + lock.lock(); + + std::cout << "Thread " << thread_id << " processed " << vertex_count << " vertices," // TODO: remove. + " and extracted " << cycles_extracted << " cycles.\n"; + + vertices_scanned += vertex_count; + cycle_count += cycles_extracted; + cycle_vertex_count += cycle_vertices; + + lock.unlock(); +} + + +template +bool Read_CdBG_Extractor::extract_cycle(const Kmer& v_hat, uint64_t& id, std::vector& cycle) +{ + // Data structures to be reused per each vertex extension of the cycle. + cuttlefish::side_t s_v = cuttlefish::side_t::back; // The side of the current vertex `v` through which to extend the cycle, i.e. exit `v`. + Directed_Vertex v(v_hat, hash_table); // Current vertex being added to the cycle. + State_Read_Space state = hash_table[v.hash()].state(); // State of the vertex `v`. 
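+    // The traversal below starts at `v_hat` (the `anchor`), walks forward one edge at a time, and
+    // stops upon returning to the anchor; the lexicographically smallest canonical vertex seen,
+    // `sign_vertex`, identifies the cycle: only the thread that marks it first outputs the cycle,
+    // and its hash serves as the cycle's unique ID.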
+ cuttlefish::edge_encoding_t e_v; // The next edge from `v` to include into the cycle. + cuttlefish::base_t b_ext; // The nucleobase corresponding to the edge `e_v` and the exiting side `s_v` from `v` to add to the literal cycle. + + + const Directed_Vertex anchor(v); + Directed_Vertex sign_vertex(anchor); + + anchor.kmer().get_label(cycle); + + while(true) + { + if(state.is_outputted()) // The cycle is found to have already been outputted. + return false; + + + e_v = state.edge_at(s_v); + b_ext = (s_v == cuttlefish::side_t::back ? DNA_Utility::map_base(e_v) : DNA_Utility::complement(DNA_Utility::map_base(e_v))); + + v.roll_forward(b_ext, hash_table); + if(v.is_same_vertex(anchor)) + break; + + s_v = v.exit_side(); + state = hash_table[v.hash()].state(); + + if(sign_vertex.canonical() > v.canonical()) + sign_vertex = v; + + cycle.emplace_back(Kmer::map_char(b_ext)); + } + + + if(!mark_vertex(sign_vertex)) + return false; + + id = sign_vertex.hash(); + + return true; +} + + template void Read_CdBG_Extractor::init_output_sink() { diff --git a/src/Thread_Pool.cpp b/src/Thread_Pool.cpp index 2fdb7f80..e31d0035 100644 --- a/src/Thread_Pool.cpp +++ b/src/Thread_Pool.cpp @@ -36,6 +36,7 @@ Thread_Pool::Thread_Pool(const uint16_t thread_count, void* const dBG, const case Task_Type::compute_states_read_space: case Task_Type::extract_unipaths_read_space: case Task_Type::mark_unipath_vertices: + case Task_Type::extract_cycles: read_dBG_compaction_params.resize(thread_count); break; @@ -114,6 +115,14 @@ void Thread_Pool::task(const uint16_t thread_id) mark_maximal_unitig_vertices(static_cast*>(params.parser), params.thread_id); } break; + + case Task_Type::extract_cycles: + { + const Read_dBG_Compaction_Params& params = read_dBG_compaction_params[thread_id]; + static_cast*>(dBG)-> + extract_detached_chordless_cycles(static_cast*>(params.parser), params.thread_id); + } + break; } From 16ae6e6bbf99b03fbc8411bb369e9e9df310a7dd Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Fri, 11 Jun 2021 12:58:20 -0400 Subject: [PATCH 106/350] Separate cycles-extractor code --- include/Character_Buffer.hpp | 3 - src/CMakeLists.txt | 2 + src/Character_Buffer_Flusher.cpp | 5 + src/Detached_Cycles_Extractor.cpp | 283 ++++++++++++++++++++++++++++++ src/Read_CdBG_Extractor.cpp | 272 ---------------------------- 5 files changed, 290 insertions(+), 275 deletions(-) create mode 100644 src/Character_Buffer_Flusher.cpp create mode 100644 src/Detached_Cycles_Extractor.cpp diff --git a/include/Character_Buffer.hpp b/include/Character_Buffer.hpp index 92dd95b5..9bea0149 100644 --- a/include/Character_Buffer.hpp +++ b/include/Character_Buffer.hpp @@ -184,9 +184,6 @@ inline void Character_Buffer_Flusher::write(std::vector& bu } -Spin_Lock Character_Buffer_Flusher::lock; // Definition of the static lock of `Character_Buffer_Flusher`. 
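// (Moving this definition out of the header: a non-inline static member defined in a header is
//  redefined in every translation unit that includes it, breaking the one-definition rule; the
//  definition now lives in the new src/Character_Buffer_Flusher.cpp.)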
- - inline void Character_Buffer_Flusher::write(std::vector& buf, const Async_Logger_Wrapper& sink) { buf.emplace_back('\0'); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 38e3b6f3..add9990b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -27,6 +27,8 @@ set(PROJECT_SRC Read_CdBG_Constructor.cpp Read_CdBG_Extractor.cpp Unipaths_Meta_info.cpp + Detached_Cycles_Extractor.cpp + Character_Buffer_Flusher.cpp Validator.cpp Validator_Hash_Table.cpp Sequence_Validator.cpp diff --git a/src/Character_Buffer_Flusher.cpp b/src/Character_Buffer_Flusher.cpp new file mode 100644 index 00000000..1b42c31c --- /dev/null +++ b/src/Character_Buffer_Flusher.cpp @@ -0,0 +1,5 @@ + +#include "Character_Buffer.hpp" + + +Spin_Lock Character_Buffer_Flusher::lock; // Definition of the static lock of `Character_Buffer_Flusher`. diff --git a/src/Detached_Cycles_Extractor.cpp b/src/Detached_Cycles_Extractor.cpp new file mode 100644 index 00000000..dfd94bcd --- /dev/null +++ b/src/Detached_Cycles_Extractor.cpp @@ -0,0 +1,283 @@ + +#include "Read_CdBG_Extractor.hpp" +#include "Kmer_SPMC_Iterator.hpp" +#include "FASTA_Record.hpp" +#include "Character_Buffer.hpp" +#include "Thread_Pool.hpp" + + +template +void Read_CdBG_Extractor::extract_detached_cycles() +{ + std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); + + + std::cout << "Marking the vertices present in the extracted maximal unitigs.\n"; + mark_maximal_unitig_vertices(); + std::cout << "Done marking the vertices.\n"; + + std::cout << "Extracting the cycles.\n"; + extract_detached_chordless_cycles(); + + std::cout << "\nNumber of detached chordless cycles: " << cycle_count << ".\n" + "Number of vertices in the cycles: " << cycle_vertex_count << ".\n"; + + + std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); + double elapsed_seconds = std::chrono::duration_cast>(t_end - t_start).count(); + std::cout << "Done extracting the cycles. Time taken = " << elapsed_seconds << " seconds.\n"; +} + + +template +void Read_CdBG_Extractor::mark_maximal_unitig_vertices() +{ + // Construct a thread pool. + const uint16_t thread_count = params.thread_count(); + Thread_Pool thread_pool(thread_count, this, Thread_Pool::Task_Type::mark_unipath_vertices); + + // Launch the reading (and parsing per demand) of the vertices from disk. + const Kmer_Container vertex_container(params.vertex_db_path()); // Wrapper container for the vertex-database. + Kmer_SPMC_Iterator vertex_parser(&vertex_container, params.thread_count()); // Parser for the vertices from the vertex-database. + std::cout << "Number of distinct vertices: " << vertex_container.size() << ".\n"; + + vertex_parser.launch_production(); + + + // Launch (multi-threaded) marking of the vertices present in the maximal unitigs. + distribute_unipaths_extraction(&vertex_parser, thread_pool); + + // Wait for the vertices to be depleted from the database. + vertex_parser.seize_production(); + + // Wait for the consumer threads to finish parsing and processing edges. + thread_pool.close(); + + std::cout << "Number of scanned vertices: " << vertices_scanned << ".\n"; + std::cout << "Number of marked vertices: " << vertices_marked << ".\n"; +} + + + +template +void Read_CdBG_Extractor::mark_maximal_unitig_vertices(Kmer_SPMC_Iterator* const vertex_parser, const uint16_t thread_id) +{ + // Data structures to be reused per each vertex scanned. + Kmer v; // The vertex copy to be scanned one-by-one. 
+ cuttlefish::side_t s_v; // The side of the vertex `v` that connects it to the maximal unitig containing it, if `v` is flanking. + State_Read_Space state; // State of the vertex `v`. + + uint64_t vertex_count = 0; // Number of vertices scanned by this thread. + uint64_t marked_count = 0; // Number of vertices marked as present in maximal unitigs by this thread. + + while(vertex_parser->tasks_expected(thread_id)) + if(vertex_parser->value_at(thread_id, v)) + { + state = hash_table[v].state(); + + if(!state.is_outputted() && is_flanking_state(state, s_v)) + marked_count += mark_maximal_unitig(v, s_v); + + vertex_count++; + } + + + // Aggregate the meta-information over the marked maximal unitigs and the thread-executions. + lock.lock(); + + std::cout << "Thread " << thread_id << " processed " << vertex_count << " vertices," // TODO: remove. + " and marked " << marked_count << " vertices.\n"; + + vertices_scanned += vertex_count; + vertices_marked += marked_count; + + lock.unlock(); +} + + +template +std::size_t Read_CdBG_Extractor::mark_maximal_unitig(const Kmer& v_hat, const cuttlefish::side_t s_v_hat) +{ + // Data structures to be reused per each vertex extension of the maximal unitig. + cuttlefish::side_t s_v = s_v_hat; // The side of the current vertex `v` through which to extend the maximal unitig, i.e. exit `v`. + Directed_Vertex v(s_v == cuttlefish::side_t::back ? v_hat : v_hat.reverse_complement(), hash_table); // Current vertex being added to the maximal unitig. + State_Read_Space state = hash_table[v.hash()].state(); // State of the vertex `v`. + cuttlefish::edge_encoding_t e_v; // The next edge from `v` to include into the maximal unitig. + cuttlefish::base_t b_ext; // The nucleobase corresponding to the edge `e_v` and the exiting side `s_v` from `v` to add to the literal maximal unitig. + + std::size_t marked_count = 0; // Number of vertices successfully marked by this thread. + + while(true) + { + if(!mark_vertex(v)) + break; + + marked_count++; + + if(is_flanking_side(state, s_v)) + break; + + e_v = state.edge_at(s_v); + b_ext = (s_v == cuttlefish::side_t::back ? DNA_Utility::map_base(e_v) : DNA_Utility::complement(DNA_Utility::map_base(e_v))); + + v.roll_forward(b_ext, hash_table); + s_v = v.exit_side(); + state = hash_table[v.hash()].state(); + } + + + return marked_count; +} + + +template +void Read_CdBG_Extractor::extract_detached_chordless_cycles() +{ + if(vertices_marked == vertices_scanned) + { + std::cout << "\nNo detached chordless cycle exists in the de Bruijn graph.\n"; + return; + } + + // Construct a thread pool. + const uint16_t thread_count = params.thread_count(); + Thread_Pool thread_pool(thread_count, this, Thread_Pool::Task_Type::extract_cycles); + + // Launch the reading (and parsing per demand) of the vertices from disk. + const Kmer_Container vertex_container(params.vertex_db_path()); // Wrapper container for the vertex-database. + Kmer_SPMC_Iterator vertex_parser(&vertex_container, params.thread_count()); // Parser for the vertices from the vertex-database. + std::cout << "Number of distinct vertices: " << vertex_container.size() << ".\n"; + + vertex_parser.launch_production(); + + // Initialize the output sink. + init_output_sink(); + + // Launch (multi-threaded) marking of the vertices present in the maximal unitigs. + distribute_unipaths_extraction(&vertex_parser, thread_pool); + + // Wait for the vertices to be depleted from the database. + vertex_parser.seize_production(); + + // Wait for the consumer threads to finish parsing and processing edges. 
+ thread_pool.close(); + + // Close the output sink. + close_output_sink(); +} + + +template +void Read_CdBG_Extractor::extract_detached_chordless_cycles(Kmer_SPMC_Iterator* vertex_parser, const uint16_t thread_id) +{ + // Data structures to be reused per each vertex scanned. + Kmer v; // The vertex copy to be scanned one-by-one. + State_Read_Space state; // State of the vertex `v`. + uint64_t id; // The unique ID of the cycle. + std::vector cycle; // The extracted cycle. + + uint64_t vertex_count = 0; // Number of vertices scanned by this thread. + uint64_t cycles_extracted = 0; // Number of detached chordless cycles extracted by this thread. + uint64_t cycle_vertices = 0; // Number of vertices found to be in detached chordless cycles by this thread. + + Character_Buffer output_buffer(output_sink.sink()); // The output buffer for the cycles. + cycle.reserve(SEQ_SZ); + + while(vertex_parser->tasks_expected(thread_id)) + if(vertex_parser->value_at(thread_id, v)) + { + state = hash_table[v].state(); + + if(!state.is_outputted()) + if(extract_cycle(v, id, cycle)) + { + cycles_extracted++; + cycle_vertices += cycle.size() - (k - 1); + + cycle.emplace_back('\n'); + output_buffer += FASTA_Record>(id, cycle); + } + + vertex_count++; + } + + + // Aggregate the meta-information over the marked maximal unitigs and the thread-executions. + lock.lock(); + + std::cout << "Thread " << thread_id << " processed " << vertex_count << " vertices," // TODO: remove. + " and extracted " << cycles_extracted << " cycles.\n"; + + vertices_scanned += vertex_count; + cycle_count += cycles_extracted; + cycle_vertex_count += cycle_vertices; + + lock.unlock(); +} + + +template +bool Read_CdBG_Extractor::extract_cycle(const Kmer& v_hat, uint64_t& id, std::vector& cycle) +{ + // Data structures to be reused per each vertex extension of the cycle. + cuttlefish::side_t s_v = cuttlefish::side_t::back; // The side of the current vertex `v` through which to extend the cycle, i.e. exit `v`. + Directed_Vertex v(v_hat, hash_table); // Current vertex being added to the cycle. + State_Read_Space state = hash_table[v.hash()].state(); // State of the vertex `v`. + cuttlefish::edge_encoding_t e_v; // The next edge from `v` to include into the cycle. + cuttlefish::base_t b_ext; // The nucleobase corresponding to the edge `e_v` and the exiting side `s_v` from `v` to add to the literal cycle. + + + const Directed_Vertex anchor(v); + Directed_Vertex sign_vertex(anchor); + + anchor.kmer().get_label(cycle); + + while(true) + { + if(state.is_outputted()) // The cycle is found to have already been outputted. + return false; + + + e_v = state.edge_at(s_v); + b_ext = (s_v == cuttlefish::side_t::back ? DNA_Utility::map_base(e_v) : DNA_Utility::complement(DNA_Utility::map_base(e_v))); + + v.roll_forward(b_ext, hash_table); + if(v.is_same_vertex(anchor)) + break; + + s_v = v.exit_side(); + state = hash_table[v.hash()].state(); + + if(sign_vertex.canonical() > v.canonical()) + sign_vertex = v; + + cycle.emplace_back(Kmer::map_char(b_ext)); + } + + + if(!mark_vertex(sign_vertex)) + return false; + + id = sign_vertex.hash(); + + return true; +} + + +template +void Read_CdBG_Extractor::init_output_sink() +{ + output_sink.init_sink(params.output_file_path()); +} + + +template +void Read_CdBG_Extractor::close_output_sink() +{ + output_sink.close_sink(); +} + + + +// Template instantiations for the required instances. 
+ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Read_CdBG_Extractor) diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 45ca571c..694bd918 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -174,278 +174,6 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const } -template -void Read_CdBG_Extractor::extract_detached_cycles() -{ - std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); - - - std::cout << "Marking the vertices present in the extracted maximal unitigs.\n"; - mark_maximal_unitig_vertices(); - std::cout << "Done marking the vertices.\n"; - - std::cout << "Extracting the cycles.\n"; - extract_detached_chordless_cycles(); - - std::cout << "\nNumber of detached chordless cycles: " << cycle_count << ".\n" - "Number of vertices in the cycles: " << cycle_vertex_count << ".\n"; - - - std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); - double elapsed_seconds = std::chrono::duration_cast>(t_end - t_start).count(); - std::cout << "Done extracting the cycles. Time taken = " << elapsed_seconds << " seconds.\n"; -} - - -template -void Read_CdBG_Extractor::mark_maximal_unitig_vertices() -{ - // Construct a thread pool. - const uint16_t thread_count = params.thread_count(); - Thread_Pool thread_pool(thread_count, this, Thread_Pool::Task_Type::mark_unipath_vertices); - - // Launch the reading (and parsing per demand) of the vertices from disk. - const Kmer_Container vertex_container(params.vertex_db_path()); // Wrapper container for the vertex-database. - Kmer_SPMC_Iterator vertex_parser(&vertex_container, params.thread_count()); // Parser for the vertices from the vertex-database. - std::cout << "Number of distinct vertices: " << vertex_container.size() << ".\n"; - - vertex_parser.launch_production(); - - - // Launch (multi-threaded) marking of the vertices present in the maximal unitigs. - distribute_unipaths_extraction(&vertex_parser, thread_pool); - - // Wait for the vertices to be depleted from the database. - vertex_parser.seize_production(); - - // Wait for the consumer threads to finish parsing and processing edges. - thread_pool.close(); - - std::cout << "Number of scanned vertices: " << vertices_scanned << ".\n"; - std::cout << "Number of marked vertices: " << vertices_marked << ".\n"; -} - - - -template -void Read_CdBG_Extractor::mark_maximal_unitig_vertices(Kmer_SPMC_Iterator* const vertex_parser, const uint16_t thread_id) -{ - // Data structures to be reused per each vertex scanned. - Kmer v; // The vertex copy to be scanned one-by-one. - cuttlefish::side_t s_v; // The side of the vertex `v` that connects it to the maximal unitig containing it, if `v` is flanking. - State_Read_Space state; // State of the vertex `v`. - - uint64_t vertex_count = 0; // Number of vertices scanned by this thread. - uint64_t marked_count = 0; // Number of vertices marked as present in maximal unitigs by this thread. - - while(vertex_parser->tasks_expected(thread_id)) - if(vertex_parser->value_at(thread_id, v)) - { - state = hash_table[v].state(); - - if(!state.is_outputted() && is_flanking_state(state, s_v)) - marked_count += mark_maximal_unitig(v, s_v); - - vertex_count++; - } - - - // Aggregate the meta-information over the marked maximal unitigs and the thread-executions. - lock.lock(); - - std::cout << "Thread " << thread_id << " processed " << vertex_count << " vertices," // TODO: remove. 
- " and marked " << marked_count << " vertices.\n"; - - vertices_scanned += vertex_count; - vertices_marked += marked_count; - - lock.unlock(); -} - - -template -std::size_t Read_CdBG_Extractor::mark_maximal_unitig(const Kmer& v_hat, const cuttlefish::side_t s_v_hat) -{ - // Data structures to be reused per each vertex extension of the maximal unitig. - cuttlefish::side_t s_v = s_v_hat; // The side of the current vertex `v` through which to extend the maximal unitig, i.e. exit `v`. - Directed_Vertex v(s_v == cuttlefish::side_t::back ? v_hat : v_hat.reverse_complement(), hash_table); // Current vertex being added to the maximal unitig. - State_Read_Space state = hash_table[v.hash()].state(); // State of the vertex `v`. - cuttlefish::edge_encoding_t e_v; // The next edge from `v` to include into the maximal unitig. - cuttlefish::base_t b_ext; // The nucleobase corresponding to the edge `e_v` and the exiting side `s_v` from `v` to add to the literal maximal unitig. - - std::size_t marked_count = 0; // Number of vertices successfully marked by this thread. - - while(true) - { - if(!mark_vertex(v)) - break; - - marked_count++; - - if(is_flanking_side(state, s_v)) - break; - - e_v = state.edge_at(s_v); - b_ext = (s_v == cuttlefish::side_t::back ? DNA_Utility::map_base(e_v) : DNA_Utility::complement(DNA_Utility::map_base(e_v))); - - v.roll_forward(b_ext, hash_table); - s_v = v.exit_side(); - state = hash_table[v.hash()].state(); - } - - - return marked_count; -} - - -template -void Read_CdBG_Extractor::extract_detached_chordless_cycles() -{ - if(vertices_marked == vertices_scanned) - { - std::cout << "\nNo detached chordless cycle exists in the de Bruijn graph.\n"; - return; - } - - // Construct a thread pool. - const uint16_t thread_count = params.thread_count(); - Thread_Pool thread_pool(thread_count, this, Thread_Pool::Task_Type::extract_cycles); - - // Launch the reading (and parsing per demand) of the vertices from disk. - const Kmer_Container vertex_container(params.vertex_db_path()); // Wrapper container for the vertex-database. - Kmer_SPMC_Iterator vertex_parser(&vertex_container, params.thread_count()); // Parser for the vertices from the vertex-database. - std::cout << "Number of distinct vertices: " << vertex_container.size() << ".\n"; - - vertex_parser.launch_production(); - - // Initialize the output sink. - init_output_sink(); - - // Launch (multi-threaded) marking of the vertices present in the maximal unitigs. - distribute_unipaths_extraction(&vertex_parser, thread_pool); - - // Wait for the vertices to be depleted from the database. - vertex_parser.seize_production(); - - // Wait for the consumer threads to finish parsing and processing edges. - thread_pool.close(); - - // Close the output sink. - close_output_sink(); -} - - -template -void Read_CdBG_Extractor::extract_detached_chordless_cycles(Kmer_SPMC_Iterator* vertex_parser, const uint16_t thread_id) -{ - // Data structures to be reused per each vertex scanned. - Kmer v; // The vertex copy to be scanned one-by-one. - State_Read_Space state; // State of the vertex `v`. - uint64_t id; // The unique ID of the cycle. - std::vector cycle; // The extracted cycle. - - uint64_t vertex_count = 0; // Number of vertices scanned by this thread. - uint64_t cycles_extracted = 0; // Number of detached chordless cycles extracted by this thread. - uint64_t cycle_vertices = 0; // Number of vertices found to be in detached chordless cycles by this thread. 
- - Character_Buffer output_buffer(output_sink.sink()); // The output buffer for the cycles. - cycle.reserve(SEQ_SZ); - - while(vertex_parser->tasks_expected(thread_id)) - if(vertex_parser->value_at(thread_id, v)) - { - state = hash_table[v].state(); - - if(!state.is_outputted()) - if(extract_cycle(v, id, cycle)) - { - cycles_extracted++; - cycle_vertices += cycle.size() - (k - 1); - - cycle.emplace_back('\n'); - output_buffer += FASTA_Record>(id, cycle); - } - - vertex_count++; - } - - - // Aggregate the meta-information over the marked maximal unitigs and the thread-executions. - lock.lock(); - - std::cout << "Thread " << thread_id << " processed " << vertex_count << " vertices," // TODO: remove. - " and extracted " << cycles_extracted << " cycles.\n"; - - vertices_scanned += vertex_count; - cycle_count += cycles_extracted; - cycle_vertex_count += cycle_vertices; - - lock.unlock(); -} - - -template -bool Read_CdBG_Extractor::extract_cycle(const Kmer& v_hat, uint64_t& id, std::vector& cycle) -{ - // Data structures to be reused per each vertex extension of the cycle. - cuttlefish::side_t s_v = cuttlefish::side_t::back; // The side of the current vertex `v` through which to extend the cycle, i.e. exit `v`. - Directed_Vertex v(v_hat, hash_table); // Current vertex being added to the cycle. - State_Read_Space state = hash_table[v.hash()].state(); // State of the vertex `v`. - cuttlefish::edge_encoding_t e_v; // The next edge from `v` to include into the cycle. - cuttlefish::base_t b_ext; // The nucleobase corresponding to the edge `e_v` and the exiting side `s_v` from `v` to add to the literal cycle. - - - const Directed_Vertex anchor(v); - Directed_Vertex sign_vertex(anchor); - - anchor.kmer().get_label(cycle); - - while(true) - { - if(state.is_outputted()) // The cycle is found to have already been outputted. - return false; - - - e_v = state.edge_at(s_v); - b_ext = (s_v == cuttlefish::side_t::back ? DNA_Utility::map_base(e_v) : DNA_Utility::complement(DNA_Utility::map_base(e_v))); - - v.roll_forward(b_ext, hash_table); - if(v.is_same_vertex(anchor)) - break; - - s_v = v.exit_side(); - state = hash_table[v.hash()].state(); - - if(sign_vertex.canonical() > v.canonical()) - sign_vertex = v; - - cycle.emplace_back(Kmer::map_char(b_ext)); - } - - - if(!mark_vertex(sign_vertex)) - return false; - - id = sign_vertex.hash(); - - return true; -} - - -template -void Read_CdBG_Extractor::init_output_sink() -{ - output_sink.init_sink(params.output_file_path()); -} - - -template -void Read_CdBG_Extractor::close_output_sink() -{ - output_sink.close_sink(); -} - - // Template instantiations for the required instances. 
ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Read_CdBG_Extractor) From 92710c3ad0caa4489f2d31ae0dab2874fbf3969e Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Mon, 14 Jun 2021 12:04:56 -0400 Subject: [PATCH 107/350] Output DC cycles in consistent form Lowest canonical k-mer at front, overall canonical --- include/Character_Buffer.hpp | 27 ++++++++++++++++++++++++--- include/FASTA_Record.hpp | 13 +++++++++++++ include/Read_CdBG_Extractor.hpp | 5 +++-- src/Detached_Cycles_Extractor.cpp | 22 +++++++++++++++++----- 4 files changed, 57 insertions(+), 10 deletions(-) diff --git a/include/Character_Buffer.hpp b/include/Character_Buffer.hpp index 9bea0149..c373333f 100644 --- a/include/Character_Buffer.hpp +++ b/include/Character_Buffer.hpp @@ -49,6 +49,14 @@ class Character_Buffer template void operator+=(const FASTA_Record& fasta_rec); + // Appends the content of the FASTA record `fasta_rec` to the buffer. The FASTA + // added sequence is rotated around its index `pivot` — the entire sequence is + // right-rotated so that the `pivot`-index character is at index 0 finally. A + // line-break is added at the end of the sequence, since the user might not be + // able to provide it with the "to be rotated" sequence. + template + void rotate_append(const FASTA_Record& fasta_rec, std::size_t pivot); + // Destructs the buffer object, flushing it if content are present. ~Character_Buffer(); }; @@ -124,14 +132,27 @@ template template inline void Character_Buffer::operator+=(const FASTA_Record& fasta_rec) { - ensure_space(fasta_rec.header_size() + 1 + fasta_rec.seq_size()); + ensure_space(fasta_rec.header_size() + 1 + fasta_rec.seq_size()); // 1 extra byte for the line-break. - fasta_rec.append_header(buffer); // Append the header. - buffer.emplace_back('\n'); // Break-line. + fasta_rec.append_header(buffer); // Append the header. + buffer.emplace_back('\n'); // Break line. fasta_rec.append_seq(buffer); // Append the sequence. } +template +template +inline void Character_Buffer::rotate_append(const FASTA_Record& fasta_rec, const std::size_t pivot) +{ + ensure_space(fasta_rec.header_size() + 1 + fasta_rec.seq_size() + 1); // 2 extra bytes for two line-breaks. + + fasta_rec.append_header(buffer); // Append the header. + buffer.emplace_back('\n'); // Break line. + fasta_rec.append_rotated_seq(buffer, pivot); // Append the sequence right-rotated around index `pivot`. + buffer.emplace_back('\n'); // End the sequence. +} + + template inline void Character_Buffer::ensure_space(const std::size_t append_size) { diff --git a/include/FASTA_Record.hpp b/include/FASTA_Record.hpp index 20ddc71d..57a94eda 100644 --- a/include/FASTA_Record.hpp +++ b/include/FASTA_Record.hpp @@ -37,6 +37,11 @@ class FASTA_Record // Appends the FASTA sequence to the vector `buffer`. void append_seq(std::vector& buffer) const; + + // Appends the FASTA sequence to the vector `buffer` in a rotated form — the + // added sequence is right rotated so that the character at index `pivot` is + // at index 0 finally. 
+ void append_rotated_seq(std::vector& buffer, std::size_t pivot) const; }; @@ -78,5 +83,13 @@ inline void FASTA_Record::append_seq(std::vector& buffer) c } +template +inline void FASTA_Record::append_rotated_seq(std::vector& buffer, const std::size_t pivot) const +{ + buffer.insert(buffer.end(), seq_.begin() + pivot, seq_.end()); + buffer.insert(buffer.end(), seq_.begin(), seq_.begin() + pivot); +} + + #endif diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index de0c36ab..920a640a 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -103,8 +103,9 @@ class Read_CdBG_Extractor // output-marking _first_ by this thread. If the attempt is successful, then the cycle is extracted // in its literal form that starts with `v_hat`'s canonical representation, into `cycle` (it is // overwritten); also, a unique ID for it is put in `id`. If not, `cycle` may contain partial form - // of the unitig, and `id` is unaltered. - bool extract_cycle(const Kmer& v_hat, uint64_t& id, std::vector& cycle); + // of the unitig, and `id` is unaltered. The index of the lexicographically lowest (canonical) k-mer + // in `cycle` is recorded at `pivot`. + bool extract_cycle(const Kmer& v_hat, uint64_t& id, std::vector& cycle, std::size_t& pivot); // Marks the vertex `v` as outputted. Returns `true` iff `v` has not been marked yet and the hash // table update is successful. diff --git a/src/Detached_Cycles_Extractor.cpp b/src/Detached_Cycles_Extractor.cpp index dfd94bcd..1d549c34 100644 --- a/src/Detached_Cycles_Extractor.cpp +++ b/src/Detached_Cycles_Extractor.cpp @@ -175,6 +175,7 @@ void Read_CdBG_Extractor::extract_detached_chordless_cycles(Kmer_SPMC_Iterato State_Read_Space state; // State of the vertex `v`. uint64_t id; // The unique ID of the cycle. std::vector cycle; // The extracted cycle. + std::size_t pivot; // Index of the lexicographically lowest (canonical) k-mer in the cycle. uint64_t vertex_count = 0; // Number of vertices scanned by this thread. uint64_t cycles_extracted = 0; // Number of detached chordless cycles extracted by this thread. @@ -189,13 +190,14 @@ void Read_CdBG_Extractor::extract_detached_chordless_cycles(Kmer_SPMC_Iterato state = hash_table[v].state(); if(!state.is_outputted()) - if(extract_cycle(v, id, cycle)) + if(extract_cycle(v, id, cycle, pivot)) { cycles_extracted++; cycle_vertices += cycle.size() - (k - 1); - cycle.emplace_back('\n'); - output_buffer += FASTA_Record>(id, cycle); + // cycle.emplace_back('\n'); + // output_buffer += FASTA_Record>(id, cycle); + output_buffer.rotate_append(FASTA_Record>(id, cycle), pivot); } vertex_count++; @@ -217,7 +219,7 @@ void Read_CdBG_Extractor::extract_detached_chordless_cycles(Kmer_SPMC_Iterato template -bool Read_CdBG_Extractor::extract_cycle(const Kmer& v_hat, uint64_t& id, std::vector& cycle) +bool Read_CdBG_Extractor::extract_cycle(const Kmer& v_hat, uint64_t& id, std::vector& cycle, std::size_t& pivot) { // Data structures to be reused per each vertex extension of the cycle. cuttlefish::side_t s_v = cuttlefish::side_t::back; // The side of the current vertex `v` through which to extend the cycle, i.e. exit `v`. @@ -225,10 +227,12 @@ bool Read_CdBG_Extractor::extract_cycle(const Kmer& v_hat, uint64_t& id, s State_Read_Space state = hash_table[v.hash()].state(); // State of the vertex `v`. cuttlefish::edge_encoding_t e_v; // The next edge from `v` to include into the cycle. 
cuttlefish::base_t b_ext; // The nucleobase corresponding to the edge `e_v` and the exiting side `s_v` from `v` to add to the literal cycle. + std::size_t kmer_idx; // Index of the vertex in the path being traversed. const Directed_Vertex anchor(v); Directed_Vertex sign_vertex(anchor); + pivot = kmer_idx = 0; anchor.kmer().get_label(cycle); @@ -248,8 +252,10 @@ bool Read_CdBG_Extractor::extract_cycle(const Kmer& v_hat, uint64_t& id, s s_v = v.exit_side(); state = hash_table[v.hash()].state(); + kmer_idx++; if(sign_vertex.canonical() > v.canonical()) - sign_vertex = v; + sign_vertex = v, + pivot = kmer_idx; cycle.emplace_back(Kmer::map_char(b_ext)); } @@ -258,6 +264,12 @@ bool Read_CdBG_Extractor::extract_cycle(const Kmer& v_hat, uint64_t& id, s if(!mark_vertex(sign_vertex)) return false; + if(!sign_vertex.in_canonical_form()) + { + reverse_complement(cycle); + pivot = (cycle.size() - 1) - pivot - (k - 1); + } + id = sign_vertex.hash(); return true; From 1755384806c1ef1ee43cf18844aac3a9621d428b Mon Sep 17 00:00:00 2001 From: jamshed-k Date: Tue, 15 Jun 2021 12:44:53 -0400 Subject: [PATCH 108/350] Fix latent bug --- include/Kmer.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/Kmer.hpp b/include/Kmer.hpp index 7d63114c..913cad3e 100644 --- a/include/Kmer.hpp +++ b/include/Kmer.hpp @@ -676,8 +676,10 @@ inline void Kmer::get_label(T_container_& label) const { label.resize(k); + constexpr uint16_t PACKED_BYTE_COUNT = k / 32; + // Get the fully packed words' representations. - for(uint16_t data_idx = 0; data_idx < NUM_INTS - 1; ++data_idx) + for(uint16_t data_idx = 0; data_idx < PACKED_BYTE_COUNT; ++data_idx) for(uint16_t bit_pair_idx = 0; bit_pair_idx < 32; ++bit_pair_idx) label[(k - 1) - ((data_idx << 5) + bit_pair_idx)] = map_char(static_cast((kmer_data[data_idx] & (0b11ULL << (2 * bit_pair_idx))) >> (2 * bit_pair_idx))); From a390e2711081eb3f876e049ff88df69184076c65 Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 6 Jul 2021 14:24:06 -0400 Subject: [PATCH 109/350] Add post-construction DCC-optimization option CLI is a mess right now --- include/Build_Params.hpp | 10 ++++++++++ include/Kmer_Hash_Table.hpp | 13 +++++++++++++ include/Read_CdBG_Extractor.hpp | 18 +++++++++++++++--- include/State_Read_Space.hpp | 20 ++++++++++++++++---- src/Build_Params.cpp | 2 +- src/CMakeLists.txt | 1 + src/Detached_Cycles_Extractor.cpp | 20 ++++++++++++-------- src/Read_CdBG_Constructor.cpp | 2 +- src/Read_CdBG_Extractor.cpp | 30 ++++++++++++++++++++++++++++-- src/State_Read_Space.cpp | 5 +++++ src/main.cpp | 4 +++- 11 files changed, 105 insertions(+), 20 deletions(-) create mode 100644 src/State_Read_Space.cpp diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index ce037d29..33261d36 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -30,6 +30,7 @@ class Build_Params const bool remove_kmc_db_; // Option to remove the KMC database, once no longer required. const std::string& mph_file_path_; // Optional path to file storing an MPH over the k-mer set. const std::string& buckets_file_path_; // Optional path to file storing the hash table buckets for the k-mer set. + const bool dcc_opt_; // Option to optimize post-cdBG-construction extraction of DCCs (Detached Chordless Cycles). const bool extract_cycles_; // Option to extract detached chordless cycles from the de Bruijn graph after compaction. 
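
The pivot bookkeeping added to `extract_cycle` above (patch 107) is the subtle part of the consistent-form output: the cycle string is emitted starting at its lexicographically lowest canonical k-mer, and when that k-mer (the sign vertex) was traversed in non-canonical orientation, the whole string is reverse-complemented first and the pivot remapped with `pivot = (cycle.size() - 1) - pivot - (k - 1)`, which is exactly where that k-mer's reverse complement starts after the reversal; `rotate_append` then writes the sequence beginning at the remapped index. Below is a small, self-contained trace of that index arithmetic — the string, `k`, and the pivot index are made up for illustration, and `complement` is only a stand-in for `DNA_Utility`.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>

// Stand-in for DNA_Utility's complement mapping (illustration only; A/C/G/T assumed).
static char complement(const char b)
{
    switch(b)
    {
        case 'A': return 'T';
        case 'C': return 'G';
        case 'G': return 'C';
        default:  return 'A';
    }
}

int main()
{
    constexpr std::size_t k = 3;
    std::string cycle = "ACGTAC";   // A linearized cycle of 4 vertices (length = 4 + k - 1).
    std::size_t pivot = 2;          // An arbitrary k-mer index, used only to trace the remapping.

    std::cout << "k-mer at pivot:          " << cycle.substr(pivot, k) << '\n';   // GTA

    // When the sign vertex was not traversed in its canonical form, the patch
    // reverse-complements the whole string and remaps the pivot:
    std::reverse(cycle.begin(), cycle.end());
    std::transform(cycle.begin(), cycle.end(), cycle.begin(), complement);        // cycle is now "GTACGT"
    pivot = (cycle.size() - 1) - pivot - (k - 1);                                 // 5 - 2 - 2 = 1

    std::cout << "k-mer at remapped pivot: " << cycle.substr(pivot, k) << '\n';   // TAC — the reverse complement of GTA

    // `rotate_append` finally writes the sequence starting at index `pivot`:
    std::cout << cycle.substr(pivot) + cycle.substr(0, pivot) << '\n';            // TACGTG
    return 0;
}
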
@@ -50,6 +51,7 @@ class Build_Params const bool remove_kmc_db, const std::string& mph_file_path, const std::string& buckets_file_path, + const bool dcc_opt, const bool extract_cycles): is_read_graph_(is_read_graph), reference_input_(ref_paths, list_paths, dir_paths), @@ -63,6 +65,7 @@ class Build_Params remove_kmc_db_(remove_kmc_db), mph_file_path_(mph_file_path), buckets_file_path_(buckets_file_path), + dcc_opt_(dcc_opt), extract_cycles_(extract_cycles) {} @@ -151,6 +154,13 @@ class Build_Params } + // Returns whether the option of optimizing post-cdBG-construction extraction of DCCs is specified. + bool dcc_opt() const + { + return dcc_opt_; + } + + // Returns whether the option of extracting detached chordless cycles is specified. bool extract_cycles() const { diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index 3ccae92f..a4b08494 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -101,6 +101,10 @@ class Kmer_Hash_Table // state than the one that had been read earlier, then the update fails. bool update(Kmer_Hash_Entry_API& api); + // Updates the state-entry in the hash-table that's at the bucket with ID + // `bucket_id` with the state-value `state`. + void update(uint64_t bucket_id, const State_Read_Space& state); + // Clears the hash-table. Do not invoke on an unused object. void clear(); @@ -185,5 +189,14 @@ inline bool Kmer_Hash_Table::update(Kmer_Hash_Entry_API +inline void Kmer_Hash_Table::update(const uint64_t bucket_id, const State_Read_Space& state) +{ + sparse_lock.lock(bucket_id); + hash_table[bucket_id] = state.get_state(); + sparse_lock.unlock(bucket_id); +} + + #endif diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 920a640a..5eadd969 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -66,9 +66,13 @@ class Read_CdBG_Extractor // through its side `s_v_hat`. Returns `true` iff the extraction is successful, which happens when // the maximal unitig is encountered and attempted for output-marking _first_, by some thread. If // the attempt is successful, then the maximal unitig is extracted in its canonical form into - // `unipath` (it is overwritten); also, a unique ID for it is put in `id`. If not, `unipath` may - // contain partial form of the unitig, and `id` is unaltered. - bool extract_maximal_unitig(const Kmer& v_hat, cuttlefish::side_t s_v_hat, uint64_t& id, std::vector& unipath); + // `unipath` (it is overwritten), a unique ID for it is put in `id`, and the hashes of the vertices + // constituting the path overwrites `path_hashes` (when the user-option is specified). If not, + // `unipath` and `path_hashes` may contain partial form of the path, and `id` is unaltered. + bool extract_maximal_unitig(const Kmer& v_hat, cuttlefish::side_t s_v_hat, uint64_t& id, std::vector& unipath, std::vector& path_hashes); + + // Marks all the vertices which have their hashes present in `path_hashes` as outputted. + void mark_path(const std::vector& path_hashes); // Marks all the vertices that are present in the maximal unitigs of the graph. 
void mark_maximal_unitig_vertices(); @@ -246,5 +250,13 @@ inline void Read_CdBG_Extractor::reverse_complement(T_container_& seq) } +template +void Read_CdBG_Extractor::mark_path(const std::vector& path_hashes) +{ + for(const uint64_t hash: path_hashes) + hash_table.update(hash, State_Read_Space::get_outputted_state()); +} + + #endif diff --git a/include/State_Read_Space.hpp b/include/State_Read_Space.hpp index a686fa5e..c07eaddb 100644 --- a/include/State_Read_Space.hpp +++ b/include/State_Read_Space.hpp @@ -32,10 +32,13 @@ class State_Read_Space // Bitmask used to extract the 'Extended_Base`-encoding of the edge(s) incident to the back side of a vertex. static constexpr cuttlefish::state_code_t BACK_MASK = SIDE_MASK << BACK_IDX; - // State code for vertices that has been outputted. + // State code for vertices that have been outputted. // TODO: Use a well-thought-out value as the marker. static constexpr cuttlefish::state_code_t OUTPUTTED = static_cast((0b101 << FRONT_IDX) | 0b101 << BACK_IDX); + // State for the vertices that have been outputted. + static const State_Read_Space outputted_state; + // Constructs a state that wraps the provided numeric value `code`. State_Read_Space(cuttlefish::state_code_t code); @@ -48,15 +51,15 @@ class State_Read_Space // Requirement: except while for setting `Extended_Base::N`, the bits must be zero beforehand. void set_front_encoding(cuttlefish::edge_encoding_t edge); - // Returns the wrapped state-code value. - cuttlefish::state_code_t get_state() const; - public: // Constructs the state of a vertex having both its sides unvisited. constexpr State_Read_Space(); + // Returns the wrapped state-code value. + cuttlefish::state_code_t get_state() const; + // Returns `true` iff some vertex having this state has been outputted. bool is_outputted() const; @@ -74,6 +77,9 @@ class State_Read_Space // Returns `true` iff the underlying code is the same as that one of `rhs`. bool operator==(const State_Read_Space& rhs) const; + + // Returns the state for the vertices that have been marked as outputted. 
+ static const State_Read_Space& get_outputted_state(); }; @@ -135,5 +141,11 @@ inline bool State_Read_Space::operator==(const State_Read_Space& rhs) const } +inline const State_Read_Space& State_Read_Space::get_outputted_state() +{ + return outputted_state; +} + + #endif diff --git a/src/Build_Params.cpp b/src/Build_Params.cpp index 995f264d..10fe695e 100644 --- a/src/Build_Params.cpp +++ b/src/Build_Params.cpp @@ -60,7 +60,7 @@ bool Build_Params::is_valid() const valid = false; } - if(extract_cycles_) + if(dcc_opt_ || extract_cycles_) { std::cout << "Existence of detached chordless cycles are impossible for reference de Bruijn graphs by definition.\n"; valid = false; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index add9990b..1527870f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -23,6 +23,7 @@ set(PROJECT_SRC CdBG_Plain_Writer.cpp CdBG_GFA_Writer.cpp CdBG_GFA_Reduced_Writer.cpp + State_Read_Space.cpp Read_CdBG.cpp Read_CdBG_Constructor.cpp Read_CdBG_Extractor.cpp diff --git a/src/Detached_Cycles_Extractor.cpp b/src/Detached_Cycles_Extractor.cpp index 1d549c34..618c2d1d 100644 --- a/src/Detached_Cycles_Extractor.cpp +++ b/src/Detached_Cycles_Extractor.cpp @@ -12,9 +12,12 @@ void Read_CdBG_Extractor::extract_detached_cycles() std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); - std::cout << "Marking the vertices present in the extracted maximal unitigs.\n"; - mark_maximal_unitig_vertices(); - std::cout << "Done marking the vertices.\n"; + if(!params.dcc_opt()) + { + std::cout << "Marking the vertices present in the extracted maximal unitigs.\n"; + mark_maximal_unitig_vertices(); + std::cout << "Done marking the vertices.\n"; + } std::cout << "Extracting the cycles.\n"; extract_detached_chordless_cycles(); @@ -133,11 +136,12 @@ std::size_t Read_CdBG_Extractor::mark_maximal_unitig(const Kmer& v_hat, co template void Read_CdBG_Extractor::extract_detached_chordless_cycles() { - if(vertices_marked == vertices_scanned) - { - std::cout << "\nNo detached chordless cycle exists in the de Bruijn graph.\n"; - return; - } + // TODO: put the information for this utility check in a meta JSON file. + // if(vertices_marked == vertices_scanned) + // { + // std::cout << "\nNo detached chordless cycle exists in the de Bruijn graph.\n"; + // return; + // } // Construct a thread pool. const uint16_t thread_count = params.thread_count(); diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index dc6875ba..5a945f1b 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -52,7 +52,7 @@ void Read_CdBG_Constructor::compute_DFA_states() std::cout << "Number of processed edges: " << edges_processed << "\n"; - if(!buckets_file_path.empty()) // Save the hash table buckets, if a file path is provided. + if(!buckets_file_path.empty() && !params.dcc_opt()) // Save the hash table buckets, if a file path is provided. 
{ std::cout << "Saving the hash table buckets in file " << buckets_file_path << ".\n"; hash_table.save_hash_buckets(buckets_file_path); diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 694bd918..820ca16d 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -60,6 +60,16 @@ void Read_CdBG_Extractor::extract_maximal_unitigs() std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); double elapsed_seconds = std::chrono::duration_cast>(t_end - t_start).count(); std::cout << "Done extracting the maximal unitigs. Time taken = " << elapsed_seconds << " seconds.\n"; + + if(params.dcc_opt()) // Save the hash table buckets. + { + // TODO: `params.buckets_file_path()` might be empty. + // TODO: Rectify the CLI. + const std::string buckets_file_path = params.buckets_file_path(); + std::cout << "Saving the hash table buckets in file " << buckets_file_path << ".\n"; + hash_table.save_hash_buckets(buckets_file_path); + std::cout << "Saved the buckets in disk.\n"; + } } @@ -85,6 +95,7 @@ void Read_CdBG_Extractor::scan_vertices(Kmer_SPMC_Iterator* const vertex_p State_Read_Space state; // State of the vertex `v`. uint64_t id; // The unique ID of the maximal unitig `p`. std::vector unipath; // The extracted maximal unitig `p`. + std::vector path_hashes; // Hash values of the vertices constituting the maximal unitig `p`. uint64_t vertex_count = 0; // Number of vertices scanned by this thread. Unipaths_Meta_info extracted_unipaths_info; // Meta-information over the maximal unitigs extracted by this thread. @@ -92,13 +103,17 @@ void Read_CdBG_Extractor::scan_vertices(Kmer_SPMC_Iterator* const vertex_p Character_Buffer output_buffer(output_sink.sink()); // The output buffer for maximal unitigs. unipath.reserve(SEQ_SZ); + if(params.dcc_opt()) + path_hashes.reserve(BUFF_SZ); + + while(vertex_parser->tasks_expected(thread_id)) if(vertex_parser->value_at(thread_id, v)) { state = hash_table[v].state(); if(!state.is_outputted() && is_flanking_state(state, s_v)) - if(extract_maximal_unitig(v, s_v, id, unipath)) + if(extract_maximal_unitig(v, s_v, id, unipath, path_hashes)) { extracted_unipaths_info.add_maximal_unitig(unipath); @@ -106,6 +121,9 @@ void Read_CdBG_Extractor::scan_vertices(Kmer_SPMC_Iterator* const vertex_p // output_buffer += unipath; output_buffer += FASTA_Record>(id, unipath); // unipath.clear(); + + if(params.dcc_opt()) + mark_path(path_hashes); } vertex_count++; @@ -125,7 +143,7 @@ void Read_CdBG_Extractor::scan_vertices(Kmer_SPMC_Iterator* const vertex_p template -bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const cuttlefish::side_t s_v_hat, uint64_t& id, std::vector& unipath) +bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const cuttlefish::side_t s_v_hat, uint64_t& id, std::vector& unipath, std::vector& path_hashes) { // Data structures to be reused per each vertex extension of the maximal unitig. cuttlefish::side_t s_v = s_v_hat; // The side of the current vertex `v` through which to extend the maximal unitig, i.e. exit `v`. 
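
Taken together, the hunks above change where the marking work happens when `--dcc` is given: the hash-table buckets are no longer saved right after DFA-state computation, `scan_vertices` records the hash of every vertex it writes into a maximal unitig and bulk-marks them with `mark_path`, and the buckets are saved only once unitig extraction is done, so a later cycles-only run can skip the full `mark_maximal_unitig_vertices()` scan. A hypothetical driver sketch of that flow follows; the real orchestration lives in `Read_CdBG`, which this diff does not touch, so the wrapper function and the way the constructor and extractor objects are obtained are assumptions — only the member functions called here appear in the patches.

#include "Build_Params.hpp"
#include "Read_CdBG_Constructor.hpp"
#include "Read_CdBG_Extractor.hpp"
#include <cstdint>

// Hypothetical control-flow sketch (not patch code): how the --dcc option reshuffles
// the work between the construction and extraction passes.
template <uint16_t k>
void run_compaction(Read_CdBG_Constructor<k>& constructor, Read_CdBG_Extractor<k>& extractor, const Build_Params& params)
{
    // DFA-state computation; with --dcc, saving the buckets is deferred
    // (see the Read_CdBG_Constructor.cpp hunk above).
    constructor.compute_DFA_states();

    // Maximal-unitig extraction; with --dcc, every unitig vertex is marked on the fly
    // (`path_hashes` + `mark_path`) and the buckets are saved once extraction finishes.
    extractor.extract_maximal_unitigs();

    // Detached chordless cycles; with --dcc, the separate `mark_maximal_unitig_vertices()`
    // scan is skipped, since the unitig vertices are already marked.
    if(params.extract_cycles())
        extractor.extract_detached_cycles();
}
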
@@ -136,6 +154,12 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const const Directed_Vertex init_vertex(v); init_vertex.kmer().get_label(unipath); + if(params.dcc_opt()) + { + path_hashes.clear(); + path_hashes.emplace_back(init_vertex.hash()); + } + while(true) { @@ -154,6 +178,8 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const state = hash_table[v.hash()].state(); unipath.emplace_back(Kmer::map_char(b_ext)); + if(params.dcc_opt()) + path_hashes.emplace_back(v.hash()); } const Directed_Vertex& term_vertex = v; diff --git a/src/State_Read_Space.cpp b/src/State_Read_Space.cpp new file mode 100644 index 00000000..e4edc32a --- /dev/null +++ b/src/State_Read_Space.cpp @@ -0,0 +1,5 @@ + +#include "State_Read_Space.hpp" + + +const State_Read_Space State_Read_Space::outputted_state(State_Read_Space::OUTPUTTED); diff --git a/src/main.cpp b/src/main.cpp index b96a0aa6..e8f94ba0 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -36,6 +36,7 @@ void build(int argc, char** argv) ("rm", "remove the KMC database") ("mph", "minimal perfect hash (BBHash) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("buckets", "hash table buckets (cuttlefish) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) + ("dcc", "turn on optimization for post-construction extraction of DCCs (Detached Chordless Cycles)") ("cycles", "extract the detached chordless cycles of the graph") ("h,help", "print usage"); @@ -62,9 +63,10 @@ void build(int argc, char** argv) const auto working_dir = result["work_dir"].as(); const auto mph_file = result["mph"].as(); const auto buckets_file = result["buckets"].as(); + const auto dcc_opt = result["dcc"].as(); const auto extract_cycles = result["cycles"].as(); - const Build_Params params(is_read_graph, refs, lists, dirs, k, kmer_database, edge_database, thread_count, output_file, format, working_dir, remove_kmc_db, mph_file, buckets_file, extract_cycles); + const Build_Params params(is_read_graph, refs, lists, dirs, k, kmer_database, edge_database, thread_count, output_file, format, working_dir, remove_kmc_db, mph_file, buckets_file, dcc_opt, extract_cycles); if(!params.is_valid()) { std::cerr << "Invalid input configuration. Aborting.\n"; From 1842e05a6a0904c0a1d3311d7f6a9c80b661a941 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 7 Jul 2021 12:02:26 -0400 Subject: [PATCH 110/350] Expose vertex and edge count --- include/Read_CdBG_Constructor.hpp | 3 +++ include/Read_CdBG_Extractor.hpp | 3 +++ src/Read_CdBG_Constructor.cpp | 7 +++++++ src/Read_CdBG_Extractor.cpp | 7 +++++++ 4 files changed, 20 insertions(+) diff --git a/include/Read_CdBG_Constructor.hpp b/include/Read_CdBG_Constructor.hpp index 9d59cdbe..ab155d26 100644 --- a/include/Read_CdBG_Constructor.hpp +++ b/include/Read_CdBG_Constructor.hpp @@ -89,6 +89,9 @@ class Read_CdBG_Constructor // Computes the states of the DFA in the de Bruijn graph. void compute_DFA_states(); + + // Returns the number of distinct edges in the underlying graph. + uint64_t edge_count() const; }; diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 5eadd969..af354ce9 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -176,6 +176,9 @@ class Read_CdBG_Extractor // Extracts the chordless cycles from the de Bruijn graph that are completely disconnected from the // rest of the graph. A precondition for the algorithm is the availability of the maximal unitigs. 
void extract_detached_cycles(); + + // Returns the number of vertices in the underlying graph. + uint64_t vertex_count() const; }; diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index 5a945f1b..4d9a3823 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -132,6 +132,13 @@ void Read_CdBG_Constructor::process_edges(Kmer_SPMC_Iterator* const ed } +template +uint64_t Read_CdBG_Constructor::edge_count() const +{ + return edges_processed; +} + + // Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Read_CdBG_Constructor) diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 820ca16d..fdbd6ae9 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -200,6 +200,13 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const } +template +uint64_t Read_CdBG_Extractor::vertex_count() const +{ + return vertices_scanned; +} + + // Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Read_CdBG_Extractor) From 5a87615f4df5cfd46142a4b1cdf9168f6aae9b3f Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 7 Jul 2021 17:56:46 -0400 Subject: [PATCH 111/350] Add hash table size interface --- include/Kmer_Hash_Table.hpp | 3 +++ src/Kmer_Hash_Table.cpp | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index a4b08494..c1de1fcf 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -105,6 +105,9 @@ class Kmer_Hash_Table // `bucket_id` with the state-value `state`. void update(uint64_t bucket_id, const State_Read_Space& state); + // Returns the number of keys in the hash table. + uint64_t size() const; + // Clears the hash-table. Do not invoke on an unused object. void clear(); diff --git a/src/Kmer_Hash_Table.cpp b/src/Kmer_Hash_Table.cpp index 9d7d4e89..574b1ac4 100644 --- a/src/Kmer_Hash_Table.cpp +++ b/src/Kmer_Hash_Table.cpp @@ -159,6 +159,13 @@ void Kmer_Hash_Table::construct(const uint16_t thread_count, co } +template +uint64_t Kmer_Hash_Table::size() const +{ + return kmer_count; +} + + template void Kmer_Hash_Table::clear() { From b3387233e0f563957122f46651010b7b6ac80d03 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 7 Jul 2021 17:59:32 -0400 Subject: [PATCH 112/350] Update edge count interface of cdBg constructor More robust --- include/Read_CdBG_Constructor.hpp | 2 ++ src/Read_CdBG_Constructor.cpp | 12 +++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/include/Read_CdBG_Constructor.hpp b/include/Read_CdBG_Constructor.hpp index ab155d26..6b0f87d3 100644 --- a/include/Read_CdBG_Constructor.hpp +++ b/include/Read_CdBG_Constructor.hpp @@ -24,6 +24,8 @@ class Read_CdBG_Constructor const Build_Params params; // Required parameters (wrapped inside). Kmer_Hash_Table& hash_table; // Hash table for the vertices (canonical k-mers) of the graph. + + uint64_t edge_count_; // Number of edges in the underlying graph. // Members required to keep track of the total number of edges processed across different threads. 
mutable Spin_Lock lock; diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index 4d9a3823..dac3a713 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -19,6 +19,12 @@ void Read_CdBG_Constructor::compute_DFA_states() std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); + const Kmer_Container edge_container(params.edge_db_path()); // Wrapper container for the edge-database. + Kmer_SPMC_Iterator edge_parser(&edge_container, params.thread_count()); // Parser for the edges from the edge-database. + edge_count_ = edge_container.size(); + std::cout << "Total number of distinct edges: " << edge_count_ << ".\n"; + + const std::string& buckets_file_path = params.buckets_file_path(); if(!buckets_file_path.empty() && file_exists(buckets_file_path)) // The serialized hash table buckets, saved from some earlier execution, exists. { @@ -34,10 +40,6 @@ void Read_CdBG_Constructor::compute_DFA_states() Thread_Pool thread_pool(thread_count, this, Thread_Pool::Task_Type::compute_states_read_space); // Launch the reading (and parsing per demand) of the edges from disk. - const Kmer_Container edge_container(params.edge_db_path()); // Wrapper container for the edge-database. - Kmer_SPMC_Iterator edge_parser(&edge_container, params.thread_count()); // Parser for the edges from the edge-database. - std::cout << "Total number of distinct edges: " << edge_container.size() << ".\n"; - edge_parser.launch_production(); // Launch (multi-threaded) computation of the states. @@ -135,7 +137,7 @@ void Read_CdBG_Constructor::process_edges(Kmer_SPMC_Iterator* const ed template uint64_t Read_CdBG_Constructor::edge_count() const { - return edges_processed; + return edge_count_; } From 766ef80dde8d9371142496eeb306f5ab36697ea7 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 7 Jul 2021 18:03:02 -0400 Subject: [PATCH 113/350] Add nlohmann json lib --- include/nlohmann/LICENSE.MIT | 21 + include/nlohmann/json.hpp | 26154 ++++++++++++++++++++++++++++++++ include/nlohmann/json_fwd.hpp | 78 + 3 files changed, 26253 insertions(+) create mode 100644 include/nlohmann/LICENSE.MIT create mode 100644 include/nlohmann/json.hpp create mode 100644 include/nlohmann/json_fwd.hpp diff --git a/include/nlohmann/LICENSE.MIT b/include/nlohmann/LICENSE.MIT new file mode 100644 index 00000000..f0622d6d --- /dev/null +++ b/include/nlohmann/LICENSE.MIT @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2013-2021 Niels Lohmann + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
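
The TODO left in `extract_detached_chordless_cycles()` above ("put the information for this utility check in a meta JSON file"), together with the `vertex_count()`, `edge_count()`, and hash-table `size()` interfaces exposed in the preceding patches, suggests that the vendored library is meant for a small structural-metadata file. A minimal sketch of writing such a file with nlohmann::json follows; the field names, the file layout, and the helper itself are assumptions for illustration — only the `nlohmann::json` calls are actual library API.

#include "nlohmann/json.hpp"
#include <cstdint>
#include <fstream>
#include <string>

// Illustrative helper (not patch code): dump a few graph statistics to a JSON file,
// e.g. so a later cycles-only run can tell whether any detached chordless cycle can
// exist at all. The field names and the helper's signature are assumptions.
void write_meta_info(const std::string& json_path, const uint64_t vertex_count, const uint64_t edge_count, const uint64_t unitig_vertex_count)
{
    nlohmann::json meta;
    meta["vertex-count"] = vertex_count;                // e.g. Read_CdBG_Extractor::vertex_count()
    meta["edge-count"] = edge_count;                    // e.g. Read_CdBG_Constructor::edge_count()
    meta["unitig-vertex-count"] = unitig_vertex_count;  // vertices covered by the maximal unitigs
    meta["has-detached-cycles"] = (unitig_vertex_count != vertex_count);

    std::ofstream output(json_path);
    output << meta.dump(4) << '\n';                     // pretty-printed with 4-space indentation
}
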
diff --git a/include/nlohmann/json.hpp b/include/nlohmann/json.hpp new file mode 100644 index 00000000..cbe69ef4 --- /dev/null +++ b/include/nlohmann/json.hpp @@ -0,0 +1,26154 @@ +/* + __ _____ _____ _____ + __| | __| | | | JSON for Modern C++ +| | |__ | | | | | | version 3.9.1 +|_____|_____|_____|_|___| https://github.com/nlohmann/json + +Licensed under the MIT License . +SPDX-License-Identifier: MIT +Copyright (c) 2013-2019 Niels Lohmann . + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#ifndef INCLUDE_NLOHMANN_JSON_HPP_ +#define INCLUDE_NLOHMANN_JSON_HPP_ + +#define NLOHMANN_JSON_VERSION_MAJOR 3 +#define NLOHMANN_JSON_VERSION_MINOR 9 +#define NLOHMANN_JSON_VERSION_PATCH 1 + +#include // all_of, find, for_each +#include // nullptr_t, ptrdiff_t, size_t +#include // hash, less +#include // initializer_list +#ifndef JSON_NO_IO + #include // istream, ostream +#endif // JSON_NO_IO +#include // random_access_iterator_tag +#include // unique_ptr +#include // accumulate +#include // string, stoi, to_string +#include // declval, forward, move, pair, swap +#include // vector + +// #include + + +#include +#include + +// #include + + +#include // transform +#include // array +#include // forward_list +#include // inserter, front_inserter, end +#include // map +#include // string +#include // tuple, make_tuple +#include // is_arithmetic, is_same, is_enum, underlying_type, is_convertible +#include // unordered_map +#include // pair, declval +#include // valarray + +// #include + + +#include // exception +#include // runtime_error +#include // to_string +#include // vector + +// #include + + +#include // array +#include // size_t +#include // uint8_t +#include // string + +namespace nlohmann +{ +namespace detail +{ +/////////////////////////// +// JSON type enumeration // +/////////////////////////// + +/*! +@brief the JSON type enumeration + +This enumeration collects the different JSON types. It is internally used to +distinguish the stored values, and the functions @ref basic_json::is_null(), +@ref basic_json::is_object(), @ref basic_json::is_array(), +@ref basic_json::is_string(), @ref basic_json::is_boolean(), +@ref basic_json::is_number() (with @ref basic_json::is_number_integer(), +@ref basic_json::is_number_unsigned(), and @ref basic_json::is_number_float()), +@ref basic_json::is_discarded(), @ref basic_json::is_primitive(), and +@ref basic_json::is_structured() rely on it. 
+ +@note There are three enumeration entries (number_integer, number_unsigned, and +number_float), because the library distinguishes these three types for numbers: +@ref basic_json::number_unsigned_t is used for unsigned integers, +@ref basic_json::number_integer_t is used for signed integers, and +@ref basic_json::number_float_t is used for floating-point numbers or to +approximate integers which do not fit in the limits of their respective type. + +@sa see @ref basic_json::basic_json(const value_t value_type) -- create a JSON +value with the default value for a given type + +@since version 1.0.0 +*/ +enum class value_t : std::uint8_t +{ + null, ///< null value + object, ///< object (unordered set of name/value pairs) + array, ///< array (ordered collection of values) + string, ///< string value + boolean, ///< boolean value + number_integer, ///< number value (signed integer) + number_unsigned, ///< number value (unsigned integer) + number_float, ///< number value (floating-point) + binary, ///< binary array (ordered collection of bytes) + discarded ///< discarded by the parser callback function +}; + +/*! +@brief comparison operator for JSON types + +Returns an ordering that is similar to Python: +- order: null < boolean < number < object < array < string < binary +- furthermore, each type is not smaller than itself +- discarded values are not comparable +- binary is represented as a b"" string in python and directly comparable to a + string; however, making a binary array directly comparable with a string would + be surprising behavior in a JSON file. + +@since version 1.0.0 +*/ +inline bool operator<(const value_t lhs, const value_t rhs) noexcept +{ + static constexpr std::array order = {{ + 0 /* null */, 3 /* object */, 4 /* array */, 5 /* string */, + 1 /* boolean */, 2 /* integer */, 2 /* unsigned */, 2 /* float */, + 6 /* binary */ + } + }; + + const auto l_index = static_cast(lhs); + const auto r_index = static_cast(rhs); + return l_index < order.size() && r_index < order.size() && order[l_index] < order[r_index]; +} +} // namespace detail +} // namespace nlohmann + +// #include + + +#include +// #include + + +#include // pair +// #include + + +/* Hedley - https://nemequ.github.io/hedley + * Created by Evan Nemerson + * + * To the extent possible under law, the author(s) have dedicated all + * copyright and related and neighboring rights to this software to + * the public domain worldwide. This software is distributed without + * any warranty. + * + * For details, see . 
+ * SPDX-License-Identifier: CC0-1.0 + */ + +#if !defined(JSON_HEDLEY_VERSION) || (JSON_HEDLEY_VERSION < 15) +#if defined(JSON_HEDLEY_VERSION) + #undef JSON_HEDLEY_VERSION +#endif +#define JSON_HEDLEY_VERSION 15 + +#if defined(JSON_HEDLEY_STRINGIFY_EX) + #undef JSON_HEDLEY_STRINGIFY_EX +#endif +#define JSON_HEDLEY_STRINGIFY_EX(x) #x + +#if defined(JSON_HEDLEY_STRINGIFY) + #undef JSON_HEDLEY_STRINGIFY +#endif +#define JSON_HEDLEY_STRINGIFY(x) JSON_HEDLEY_STRINGIFY_EX(x) + +#if defined(JSON_HEDLEY_CONCAT_EX) + #undef JSON_HEDLEY_CONCAT_EX +#endif +#define JSON_HEDLEY_CONCAT_EX(a,b) a##b + +#if defined(JSON_HEDLEY_CONCAT) + #undef JSON_HEDLEY_CONCAT +#endif +#define JSON_HEDLEY_CONCAT(a,b) JSON_HEDLEY_CONCAT_EX(a,b) + +#if defined(JSON_HEDLEY_CONCAT3_EX) + #undef JSON_HEDLEY_CONCAT3_EX +#endif +#define JSON_HEDLEY_CONCAT3_EX(a,b,c) a##b##c + +#if defined(JSON_HEDLEY_CONCAT3) + #undef JSON_HEDLEY_CONCAT3 +#endif +#define JSON_HEDLEY_CONCAT3(a,b,c) JSON_HEDLEY_CONCAT3_EX(a,b,c) + +#if defined(JSON_HEDLEY_VERSION_ENCODE) + #undef JSON_HEDLEY_VERSION_ENCODE +#endif +#define JSON_HEDLEY_VERSION_ENCODE(major,minor,revision) (((major) * 1000000) + ((minor) * 1000) + (revision)) + +#if defined(JSON_HEDLEY_VERSION_DECODE_MAJOR) + #undef JSON_HEDLEY_VERSION_DECODE_MAJOR +#endif +#define JSON_HEDLEY_VERSION_DECODE_MAJOR(version) ((version) / 1000000) + +#if defined(JSON_HEDLEY_VERSION_DECODE_MINOR) + #undef JSON_HEDLEY_VERSION_DECODE_MINOR +#endif +#define JSON_HEDLEY_VERSION_DECODE_MINOR(version) (((version) % 1000000) / 1000) + +#if defined(JSON_HEDLEY_VERSION_DECODE_REVISION) + #undef JSON_HEDLEY_VERSION_DECODE_REVISION +#endif +#define JSON_HEDLEY_VERSION_DECODE_REVISION(version) ((version) % 1000) + +#if defined(JSON_HEDLEY_GNUC_VERSION) + #undef JSON_HEDLEY_GNUC_VERSION +#endif +#if defined(__GNUC__) && defined(__GNUC_PATCHLEVEL__) + #define JSON_HEDLEY_GNUC_VERSION JSON_HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__) +#elif defined(__GNUC__) + #define JSON_HEDLEY_GNUC_VERSION JSON_HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, 0) +#endif + +#if defined(JSON_HEDLEY_GNUC_VERSION_CHECK) + #undef JSON_HEDLEY_GNUC_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_GNUC_VERSION) + #define JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_GNUC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_MSVC_VERSION) + #undef JSON_HEDLEY_MSVC_VERSION +#endif +#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000) && !defined(__ICL) + #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 10000000, (_MSC_FULL_VER % 10000000) / 100000, (_MSC_FULL_VER % 100000) / 100) +#elif defined(_MSC_FULL_VER) && !defined(__ICL) + #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 1000000, (_MSC_FULL_VER % 1000000) / 10000, (_MSC_FULL_VER % 10000) / 10) +#elif defined(_MSC_VER) && !defined(__ICL) + #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_VER / 100, _MSC_VER % 100, 0) +#endif + +#if defined(JSON_HEDLEY_MSVC_VERSION_CHECK) + #undef JSON_HEDLEY_MSVC_VERSION_CHECK +#endif +#if !defined(JSON_HEDLEY_MSVC_VERSION) + #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (0) +#elif defined(_MSC_VER) && (_MSC_VER >= 1400) + #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 10000000) + (minor * 100000) + (patch))) +#elif defined(_MSC_VER) && (_MSC_VER >= 1200) + #define 
JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 1000000) + (minor * 10000) + (patch))) +#else + #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_VER >= ((major * 100) + (minor))) +#endif + +#if defined(JSON_HEDLEY_INTEL_VERSION) + #undef JSON_HEDLEY_INTEL_VERSION +#endif +#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) && !defined(__ICL) + #define JSON_HEDLEY_INTEL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, __INTEL_COMPILER_UPDATE) +#elif defined(__INTEL_COMPILER) && !defined(__ICL) + #define JSON_HEDLEY_INTEL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, 0) +#endif + +#if defined(JSON_HEDLEY_INTEL_VERSION_CHECK) + #undef JSON_HEDLEY_INTEL_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_INTEL_VERSION) + #define JSON_HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_INTEL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_INTEL_CL_VERSION) + #undef JSON_HEDLEY_INTEL_CL_VERSION +#endif +#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) && defined(__ICL) + #define JSON_HEDLEY_INTEL_CL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER, __INTEL_COMPILER_UPDATE, 0) +#endif + +#if defined(JSON_HEDLEY_INTEL_CL_VERSION_CHECK) + #undef JSON_HEDLEY_INTEL_CL_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_INTEL_CL_VERSION) + #define JSON_HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_INTEL_CL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_PGI_VERSION) + #undef JSON_HEDLEY_PGI_VERSION +#endif +#if defined(__PGI) && defined(__PGIC__) && defined(__PGIC_MINOR__) && defined(__PGIC_PATCHLEVEL__) + #define JSON_HEDLEY_PGI_VERSION JSON_HEDLEY_VERSION_ENCODE(__PGIC__, __PGIC_MINOR__, __PGIC_PATCHLEVEL__) +#endif + +#if defined(JSON_HEDLEY_PGI_VERSION_CHECK) + #undef JSON_HEDLEY_PGI_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_PGI_VERSION) + #define JSON_HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_PGI_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_SUNPRO_VERSION) + #undef JSON_HEDLEY_SUNPRO_VERSION +#endif +#if defined(__SUNPRO_C) && (__SUNPRO_C > 0x1000) + #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((((__SUNPRO_C >> 16) & 0xf) * 10) + ((__SUNPRO_C >> 12) & 0xf), (((__SUNPRO_C >> 8) & 0xf) * 10) + ((__SUNPRO_C >> 4) & 0xf), (__SUNPRO_C & 0xf) * 10) +#elif defined(__SUNPRO_C) + #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((__SUNPRO_C >> 8) & 0xf, (__SUNPRO_C >> 4) & 0xf, (__SUNPRO_C) & 0xf) +#elif defined(__SUNPRO_CC) && (__SUNPRO_CC > 0x1000) + #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((((__SUNPRO_CC >> 16) & 0xf) * 10) + ((__SUNPRO_CC >> 12) & 0xf), (((__SUNPRO_CC >> 8) & 0xf) * 10) + ((__SUNPRO_CC >> 4) & 0xf), (__SUNPRO_CC & 0xf) * 10) +#elif defined(__SUNPRO_CC) + #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((__SUNPRO_CC >> 8) & 0xf, (__SUNPRO_CC >> 4) & 0xf, (__SUNPRO_CC) & 0xf) +#endif + +#if defined(JSON_HEDLEY_SUNPRO_VERSION_CHECK) + #undef JSON_HEDLEY_SUNPRO_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_SUNPRO_VERSION) + #define JSON_HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) 
(JSON_HEDLEY_SUNPRO_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION) + #undef JSON_HEDLEY_EMSCRIPTEN_VERSION +#endif +#if defined(__EMSCRIPTEN__) + #define JSON_HEDLEY_EMSCRIPTEN_VERSION JSON_HEDLEY_VERSION_ENCODE(__EMSCRIPTEN_major__, __EMSCRIPTEN_minor__, __EMSCRIPTEN_tiny__) +#endif + +#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK) + #undef JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION) + #define JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_EMSCRIPTEN_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_ARM_VERSION) + #undef JSON_HEDLEY_ARM_VERSION +#endif +#if defined(__CC_ARM) && defined(__ARMCOMPILER_VERSION) + #define JSON_HEDLEY_ARM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ARMCOMPILER_VERSION / 1000000, (__ARMCOMPILER_VERSION % 1000000) / 10000, (__ARMCOMPILER_VERSION % 10000) / 100) +#elif defined(__CC_ARM) && defined(__ARMCC_VERSION) + #define JSON_HEDLEY_ARM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ARMCC_VERSION / 1000000, (__ARMCC_VERSION % 1000000) / 10000, (__ARMCC_VERSION % 10000) / 100) +#endif + +#if defined(JSON_HEDLEY_ARM_VERSION_CHECK) + #undef JSON_HEDLEY_ARM_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_ARM_VERSION) + #define JSON_HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_ARM_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_IBM_VERSION) + #undef JSON_HEDLEY_IBM_VERSION +#endif +#if defined(__ibmxl__) + #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ibmxl_version__, __ibmxl_release__, __ibmxl_modification__) +#elif defined(__xlC__) && defined(__xlC_ver__) + #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, (__xlC_ver__ >> 8) & 0xff) +#elif defined(__xlC__) + #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, 0) +#endif + +#if defined(JSON_HEDLEY_IBM_VERSION_CHECK) + #undef JSON_HEDLEY_IBM_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_IBM_VERSION) + #define JSON_HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_IBM_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_VERSION) + #undef JSON_HEDLEY_TI_VERSION +#endif +#if \ + defined(__TI_COMPILER_VERSION__) && \ + ( \ + defined(__TMS470__) || defined(__TI_ARM__) || \ + defined(__MSP430__) || \ + defined(__TMS320C2000__) \ + ) +#if (__TI_COMPILER_VERSION__ >= 16000000) + #define JSON_HEDLEY_TI_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif +#endif + +#if defined(JSON_HEDLEY_TI_VERSION_CHECK) + #undef JSON_HEDLEY_TI_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_VERSION) + #define JSON_HEDLEY_TI_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_CL2000_VERSION) + #undef JSON_HEDLEY_TI_CL2000_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C2000__) + #define 
JSON_HEDLEY_TI_CL2000_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_CL2000_VERSION_CHECK) + #undef JSON_HEDLEY_TI_CL2000_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_CL2000_VERSION) + #define JSON_HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL2000_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_CL430_VERSION) + #undef JSON_HEDLEY_TI_CL430_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__MSP430__) + #define JSON_HEDLEY_TI_CL430_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_CL430_VERSION_CHECK) + #undef JSON_HEDLEY_TI_CL430_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_CL430_VERSION) + #define JSON_HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL430_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_ARMCL_VERSION) + #undef JSON_HEDLEY_TI_ARMCL_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && (defined(__TMS470__) || defined(__TI_ARM__)) + #define JSON_HEDLEY_TI_ARMCL_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_ARMCL_VERSION_CHECK) + #undef JSON_HEDLEY_TI_ARMCL_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_ARMCL_VERSION) + #define JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_ARMCL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_CL6X_VERSION) + #undef JSON_HEDLEY_TI_CL6X_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C6X__) + #define JSON_HEDLEY_TI_CL6X_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_CL6X_VERSION_CHECK) + #undef JSON_HEDLEY_TI_CL6X_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_CL6X_VERSION) + #define JSON_HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL6X_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_CL7X_VERSION) + #undef JSON_HEDLEY_TI_CL7X_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__C7000__) + #define JSON_HEDLEY_TI_CL7X_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_CL7X_VERSION_CHECK) + #undef JSON_HEDLEY_TI_CL7X_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_CL7X_VERSION) + #define JSON_HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL7X_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_CLPRU_VERSION) + #undef JSON_HEDLEY_TI_CLPRU_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) 
&& defined(__PRU__) + #define JSON_HEDLEY_TI_CLPRU_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_CLPRU_VERSION_CHECK) + #undef JSON_HEDLEY_TI_CLPRU_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_CLPRU_VERSION) + #define JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CLPRU_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_CRAY_VERSION) + #undef JSON_HEDLEY_CRAY_VERSION +#endif +#if defined(_CRAYC) + #if defined(_RELEASE_PATCHLEVEL) + #define JSON_HEDLEY_CRAY_VERSION JSON_HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, _RELEASE_PATCHLEVEL) + #else + #define JSON_HEDLEY_CRAY_VERSION JSON_HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, 0) + #endif +#endif + +#if defined(JSON_HEDLEY_CRAY_VERSION_CHECK) + #undef JSON_HEDLEY_CRAY_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_CRAY_VERSION) + #define JSON_HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_CRAY_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_IAR_VERSION) + #undef JSON_HEDLEY_IAR_VERSION +#endif +#if defined(__IAR_SYSTEMS_ICC__) + #if __VER__ > 1000 + #define JSON_HEDLEY_IAR_VERSION JSON_HEDLEY_VERSION_ENCODE((__VER__ / 1000000), ((__VER__ / 1000) % 1000), (__VER__ % 1000)) + #else + #define JSON_HEDLEY_IAR_VERSION JSON_HEDLEY_VERSION_ENCODE(__VER__ / 100, __VER__ % 100, 0) + #endif +#endif + +#if defined(JSON_HEDLEY_IAR_VERSION_CHECK) + #undef JSON_HEDLEY_IAR_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_IAR_VERSION) + #define JSON_HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_IAR_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TINYC_VERSION) + #undef JSON_HEDLEY_TINYC_VERSION +#endif +#if defined(__TINYC__) + #define JSON_HEDLEY_TINYC_VERSION JSON_HEDLEY_VERSION_ENCODE(__TINYC__ / 1000, (__TINYC__ / 100) % 10, __TINYC__ % 100) +#endif + +#if defined(JSON_HEDLEY_TINYC_VERSION_CHECK) + #undef JSON_HEDLEY_TINYC_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TINYC_VERSION) + #define JSON_HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TINYC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_DMC_VERSION) + #undef JSON_HEDLEY_DMC_VERSION +#endif +#if defined(__DMC__) + #define JSON_HEDLEY_DMC_VERSION JSON_HEDLEY_VERSION_ENCODE(__DMC__ >> 8, (__DMC__ >> 4) & 0xf, __DMC__ & 0xf) +#endif + +#if defined(JSON_HEDLEY_DMC_VERSION_CHECK) + #undef JSON_HEDLEY_DMC_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_DMC_VERSION) + #define JSON_HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_DMC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_COMPCERT_VERSION) + #undef JSON_HEDLEY_COMPCERT_VERSION +#endif +#if defined(__COMPCERT_VERSION__) + #define JSON_HEDLEY_COMPCERT_VERSION JSON_HEDLEY_VERSION_ENCODE(__COMPCERT_VERSION__ / 10000, (__COMPCERT_VERSION__ / 100) % 100, __COMPCERT_VERSION__ % 100) +#endif + +#if defined(JSON_HEDLEY_COMPCERT_VERSION_CHECK) + #undef 
JSON_HEDLEY_COMPCERT_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_COMPCERT_VERSION) + #define JSON_HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_COMPCERT_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_PELLES_VERSION) + #undef JSON_HEDLEY_PELLES_VERSION +#endif +#if defined(__POCC__) + #define JSON_HEDLEY_PELLES_VERSION JSON_HEDLEY_VERSION_ENCODE(__POCC__ / 100, __POCC__ % 100, 0) +#endif + +#if defined(JSON_HEDLEY_PELLES_VERSION_CHECK) + #undef JSON_HEDLEY_PELLES_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_PELLES_VERSION) + #define JSON_HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_PELLES_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_MCST_LCC_VERSION) + #undef JSON_HEDLEY_MCST_LCC_VERSION +#endif +#if defined(__LCC__) && defined(__LCC_MINOR__) + #define JSON_HEDLEY_MCST_LCC_VERSION JSON_HEDLEY_VERSION_ENCODE(__LCC__ / 100, __LCC__ % 100, __LCC_MINOR__) +#endif + +#if defined(JSON_HEDLEY_MCST_LCC_VERSION_CHECK) + #undef JSON_HEDLEY_MCST_LCC_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_MCST_LCC_VERSION) + #define JSON_HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_MCST_LCC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_GCC_VERSION) + #undef JSON_HEDLEY_GCC_VERSION +#endif +#if \ + defined(JSON_HEDLEY_GNUC_VERSION) && \ + !defined(__clang__) && \ + !defined(JSON_HEDLEY_INTEL_VERSION) && \ + !defined(JSON_HEDLEY_PGI_VERSION) && \ + !defined(JSON_HEDLEY_ARM_VERSION) && \ + !defined(JSON_HEDLEY_CRAY_VERSION) && \ + !defined(JSON_HEDLEY_TI_VERSION) && \ + !defined(JSON_HEDLEY_TI_ARMCL_VERSION) && \ + !defined(JSON_HEDLEY_TI_CL430_VERSION) && \ + !defined(JSON_HEDLEY_TI_CL2000_VERSION) && \ + !defined(JSON_HEDLEY_TI_CL6X_VERSION) && \ + !defined(JSON_HEDLEY_TI_CL7X_VERSION) && \ + !defined(JSON_HEDLEY_TI_CLPRU_VERSION) && \ + !defined(__COMPCERT__) && \ + !defined(JSON_HEDLEY_MCST_LCC_VERSION) + #define JSON_HEDLEY_GCC_VERSION JSON_HEDLEY_GNUC_VERSION +#endif + +#if defined(JSON_HEDLEY_GCC_VERSION_CHECK) + #undef JSON_HEDLEY_GCC_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_GCC_VERSION) + #define JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_GCC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_HAS_ATTRIBUTE) + #undef JSON_HEDLEY_HAS_ATTRIBUTE +#endif +#if \ + defined(__has_attribute) && \ + ( \ + (!defined(JSON_HEDLEY_IAR_VERSION) || JSON_HEDLEY_IAR_VERSION_CHECK(8,5,9)) \ + ) +# define JSON_HEDLEY_HAS_ATTRIBUTE(attribute) __has_attribute(attribute) +#else +# define JSON_HEDLEY_HAS_ATTRIBUTE(attribute) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_ATTRIBUTE) + #undef JSON_HEDLEY_GNUC_HAS_ATTRIBUTE +#endif +#if defined(__has_attribute) + #define JSON_HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_HAS_ATTRIBUTE(attribute) +#else + #define JSON_HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_ATTRIBUTE) + #undef JSON_HEDLEY_GCC_HAS_ATTRIBUTE +#endif +#if defined(__has_attribute) + #define 
JSON_HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_HAS_ATTRIBUTE(attribute) +#else + #define JSON_HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_CPP_ATTRIBUTE) + #undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE +#endif +#if \ + defined(__has_cpp_attribute) && \ + defined(__cplusplus) && \ + (!defined(JSON_HEDLEY_SUNPRO_VERSION) || JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) __has_cpp_attribute(attribute) +#else + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) (0) +#endif + +#if defined(JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS) + #undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS +#endif +#if !defined(__cplusplus) || !defined(__has_cpp_attribute) + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0) +#elif \ + !defined(JSON_HEDLEY_PGI_VERSION) && \ + !defined(JSON_HEDLEY_IAR_VERSION) && \ + (!defined(JSON_HEDLEY_SUNPRO_VERSION) || JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) && \ + (!defined(JSON_HEDLEY_MSVC_VERSION) || JSON_HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) JSON_HEDLEY_HAS_CPP_ATTRIBUTE(ns::attribute) +#else + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE) + #undef JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE +#endif +#if defined(__has_cpp_attribute) && defined(__cplusplus) + #define JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute) +#else + #define JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE) + #undef JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE +#endif +#if defined(__has_cpp_attribute) && defined(__cplusplus) + #define JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute) +#else + #define JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_BUILTIN) + #undef JSON_HEDLEY_HAS_BUILTIN +#endif +#if defined(__has_builtin) + #define JSON_HEDLEY_HAS_BUILTIN(builtin) __has_builtin(builtin) +#else + #define JSON_HEDLEY_HAS_BUILTIN(builtin) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_BUILTIN) + #undef JSON_HEDLEY_GNUC_HAS_BUILTIN +#endif +#if defined(__has_builtin) + #define JSON_HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin) +#else + #define JSON_HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_BUILTIN) + #undef JSON_HEDLEY_GCC_HAS_BUILTIN +#endif +#if defined(__has_builtin) + #define JSON_HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin) +#else + #define JSON_HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_FEATURE) + #undef JSON_HEDLEY_HAS_FEATURE +#endif +#if defined(__has_feature) + #define JSON_HEDLEY_HAS_FEATURE(feature) __has_feature(feature) +#else + #define JSON_HEDLEY_HAS_FEATURE(feature) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_FEATURE) + #undef JSON_HEDLEY_GNUC_HAS_FEATURE +#endif +#if defined(__has_feature) + #define JSON_HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature) +#else + #define JSON_HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) 
JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_FEATURE) + #undef JSON_HEDLEY_GCC_HAS_FEATURE +#endif +#if defined(__has_feature) + #define JSON_HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature) +#else + #define JSON_HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_EXTENSION) + #undef JSON_HEDLEY_HAS_EXTENSION +#endif +#if defined(__has_extension) + #define JSON_HEDLEY_HAS_EXTENSION(extension) __has_extension(extension) +#else + #define JSON_HEDLEY_HAS_EXTENSION(extension) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_EXTENSION) + #undef JSON_HEDLEY_GNUC_HAS_EXTENSION +#endif +#if defined(__has_extension) + #define JSON_HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension) +#else + #define JSON_HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_EXTENSION) + #undef JSON_HEDLEY_GCC_HAS_EXTENSION +#endif +#if defined(__has_extension) + #define JSON_HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension) +#else + #define JSON_HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE) + #undef JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE +#endif +#if defined(__has_declspec_attribute) + #define JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) __has_declspec_attribute(attribute) +#else + #define JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE) + #undef JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE +#endif +#if defined(__has_declspec_attribute) + #define JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute) +#else + #define JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE) + #undef JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE +#endif +#if defined(__has_declspec_attribute) + #define JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute) +#else + #define JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_WARNING) + #undef JSON_HEDLEY_HAS_WARNING +#endif +#if defined(__has_warning) + #define JSON_HEDLEY_HAS_WARNING(warning) __has_warning(warning) +#else + #define JSON_HEDLEY_HAS_WARNING(warning) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_WARNING) + #undef JSON_HEDLEY_GNUC_HAS_WARNING +#endif +#if defined(__has_warning) + #define JSON_HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning) +#else + #define JSON_HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_WARNING) + #undef JSON_HEDLEY_GCC_HAS_WARNING +#endif +#if defined(__has_warning) + #define JSON_HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning) +#else + #define JSON_HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \ + defined(__clang__) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) || \ + 
JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(18,4,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,0,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_CRAY_VERSION_CHECK(5,0,0) || \ + JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,17) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(8,0,0) || \ + (JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) && defined(__C99_PRAGMA_OPERATOR)) + #define JSON_HEDLEY_PRAGMA(value) _Pragma(#value) +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) + #define JSON_HEDLEY_PRAGMA(value) __pragma(value) +#else + #define JSON_HEDLEY_PRAGMA(value) +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_PUSH) + #undef JSON_HEDLEY_DIAGNOSTIC_PUSH +#endif +#if defined(JSON_HEDLEY_DIAGNOSTIC_POP) + #undef JSON_HEDLEY_DIAGNOSTIC_POP +#endif +#if defined(__clang__) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("clang diagnostic push") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("clang diagnostic pop") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop") +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(push)) + #define JSON_HEDLEY_DIAGNOSTIC_POP __pragma(warning(pop)) +#elif JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("push") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("pop") +#elif \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,4,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("diag_push") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("diag_pop") +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,90,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)") +#else + #define JSON_HEDLEY_DIAGNOSTIC_PUSH + #define JSON_HEDLEY_DIAGNOSTIC_POP +#endif + +/* JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ is for + HEDLEY INTERNAL USE ONLY. API subject to change without notice. 
*/ +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ +#endif +#if defined(__cplusplus) +# if JSON_HEDLEY_HAS_WARNING("-Wc++98-compat") +# if JSON_HEDLEY_HAS_WARNING("-Wc++17-extensions") +# if JSON_HEDLEY_HAS_WARNING("-Wc++1z-extensions") +# define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ + _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \ + _Pragma("clang diagnostic ignored \"-Wc++1z-extensions\"") \ + xpr \ + JSON_HEDLEY_DIAGNOSTIC_POP +# else +# define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ + _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \ + xpr \ + JSON_HEDLEY_DIAGNOSTIC_POP +# endif +# else +# define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ + xpr \ + JSON_HEDLEY_DIAGNOSTIC_POP +# endif +# endif +#endif +#if !defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(x) x +#endif + +#if defined(JSON_HEDLEY_CONST_CAST) + #undef JSON_HEDLEY_CONST_CAST +#endif +#if defined(__cplusplus) +# define JSON_HEDLEY_CONST_CAST(T, expr) (const_cast(expr)) +#elif \ + JSON_HEDLEY_HAS_WARNING("-Wcast-qual") || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define JSON_HEDLEY_CONST_CAST(T, expr) (__extension__ ({ \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \ + ((T) (expr)); \ + JSON_HEDLEY_DIAGNOSTIC_POP \ + })) +#else +# define JSON_HEDLEY_CONST_CAST(T, expr) ((T) (expr)) +#endif + +#if defined(JSON_HEDLEY_REINTERPRET_CAST) + #undef JSON_HEDLEY_REINTERPRET_CAST +#endif +#if defined(__cplusplus) + #define JSON_HEDLEY_REINTERPRET_CAST(T, expr) (reinterpret_cast(expr)) +#else + #define JSON_HEDLEY_REINTERPRET_CAST(T, expr) ((T) (expr)) +#endif + +#if defined(JSON_HEDLEY_STATIC_CAST) + #undef JSON_HEDLEY_STATIC_CAST +#endif +#if defined(__cplusplus) + #define JSON_HEDLEY_STATIC_CAST(T, expr) (static_cast(expr)) +#else + #define JSON_HEDLEY_STATIC_CAST(T, expr) ((T) (expr)) +#endif + +#if defined(JSON_HEDLEY_CPP_CAST) + #undef JSON_HEDLEY_CPP_CAST +#endif +#if defined(__cplusplus) +# if JSON_HEDLEY_HAS_WARNING("-Wold-style-cast") +# define JSON_HEDLEY_CPP_CAST(T, expr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wold-style-cast\"") \ + ((T) (expr)) \ + JSON_HEDLEY_DIAGNOSTIC_POP +# elif JSON_HEDLEY_IAR_VERSION_CHECK(8,3,0) +# define JSON_HEDLEY_CPP_CAST(T, expr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("diag_suppress=Pe137") \ + JSON_HEDLEY_DIAGNOSTIC_POP +# else +# define JSON_HEDLEY_CPP_CAST(T, expr) ((T) (expr)) +# endif +#else +# define JSON_HEDLEY_CPP_CAST(T, expr) (expr) +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wdeprecated-declarations") + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warning(disable:1478 1786)") +#elif JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED 
__pragma(warning(disable:1478 1786)) +#elif JSON_HEDLEY_PGI_VERSION_CHECK(20,7,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1216,1444,1445") +#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:4996)) +#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444") +#elif \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1291,1718") +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && !defined(__cplusplus) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,E_DEPRECATED_ATT,E_DEPRECATED_ATT_MESS)") +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && defined(__cplusplus) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,symdeprecated,symdeprecated2)") +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress=Pe1444,Pe1215") +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,90,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warn(disable:2241)") +#else + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas") + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("clang diagnostic ignored \"-Wunknown-pragmas\"") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("warning(disable:161)") +#elif JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:161)) +#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 1675") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"") +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:4068)) +#elif \ + JSON_HEDLEY_TI_VERSION_CHECK(16,9,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163") +#elif 
JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163") +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress=Pe161") +#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 161") +#else + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wunknown-attributes") + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("clang diagnostic ignored \"-Wunknown-attributes\"") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(17,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("warning(disable:1292)") +#elif JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:1292)) +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(19,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:5030)) +#elif JSON_HEDLEY_PGI_VERSION_CHECK(20,7,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097,1098") +#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097") +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("error_messages(off,attrskipunsup)") +#elif \ + JSON_HEDLEY_TI_VERSION_CHECK(18,1,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1173") +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress=Pe1097") +#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097") +#else + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wcast-qual") + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("clang diagnostic ignored \"-Wcast-qual\"") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("warning(disable:2203 2331)") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("GCC diagnostic ignored \"-Wcast-qual\"") +#else + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wunused-function") + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("clang diagnostic ignored \"-Wunused-function\"") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("GCC diagnostic ignored \"-Wunused-function\"") +#elif 
JSON_HEDLEY_MSVC_VERSION_CHECK(1,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION __pragma(warning(disable:4505)) +#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("diag_suppress 3142") +#else + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION +#endif + +#if defined(JSON_HEDLEY_DEPRECATED) + #undef JSON_HEDLEY_DEPRECATED +#endif +#if defined(JSON_HEDLEY_DEPRECATED_FOR) + #undef JSON_HEDLEY_DEPRECATED_FOR +#endif +#if \ + JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_DEPRECATED(since) __declspec(deprecated("Since " # since)) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated("Since " #since "; use " #replacement)) +#elif \ + (JSON_HEDLEY_HAS_EXTENSION(attribute_deprecated_with_message) && !defined(JSON_HEDLEY_IAR_VERSION)) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(18,1,0) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(18,1,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_DEPRECATED(since) __attribute__((__deprecated__("Since " #since))) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__("Since " #since "; use " #replacement))) +#elif defined(__cplusplus) && (__cplusplus >= 201402L) + #define JSON_HEDLEY_DEPRECATED(since) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since)]]) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since "; use " #replacement)]]) +#elif \ + JSON_HEDLEY_HAS_ATTRIBUTE(deprecated) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ + JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0) + #define JSON_HEDLEY_DEPRECATED(since) __attribute__((__deprecated__)) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__)) +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ + JSON_HEDLEY_PELLES_VERSION_CHECK(6,50,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_DEPRECATED(since) __declspec(deprecated) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated) +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DEPRECATED(since) _Pragma("deprecated") + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) 
_Pragma("deprecated") +#else + #define JSON_HEDLEY_DEPRECATED(since) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) +#endif + +#if defined(JSON_HEDLEY_UNAVAILABLE) + #undef JSON_HEDLEY_UNAVAILABLE +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(warning) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_UNAVAILABLE(available_since) __attribute__((__warning__("Not available until " #available_since))) +#else + #define JSON_HEDLEY_UNAVAILABLE(available_since) +#endif + +#if defined(JSON_HEDLEY_WARN_UNUSED_RESULT) + #undef JSON_HEDLEY_WARN_UNUSED_RESULT +#endif +#if defined(JSON_HEDLEY_WARN_UNUSED_RESULT_MSG) + #undef JSON_HEDLEY_WARN_UNUSED_RESULT_MSG +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(warn_unused_result) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__)) + #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) __attribute__((__warn_unused_result__)) +#elif (JSON_HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) >= 201907L) + #define JSON_HEDLEY_WARN_UNUSED_RESULT JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) + #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard(msg)]]) +#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) + #define JSON_HEDLEY_WARN_UNUSED_RESULT JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) + #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) +#elif defined(_Check_return_) /* SAL */ + #define JSON_HEDLEY_WARN_UNUSED_RESULT _Check_return_ + #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) _Check_return_ +#else + #define JSON_HEDLEY_WARN_UNUSED_RESULT + #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) +#endif + +#if defined(JSON_HEDLEY_SENTINEL) + #undef JSON_HEDLEY_SENTINEL +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(sentinel) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(5,4,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_SENTINEL(position) __attribute__((__sentinel__(position))) +#else + #define JSON_HEDLEY_SENTINEL(position) +#endif + +#if defined(JSON_HEDLEY_NO_RETURN) + #undef JSON_HEDLEY_NO_RETURN +#endif +#if JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_NO_RETURN __noreturn +#elif \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_NO_RETURN 
__attribute__((__noreturn__)) +#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L + #define JSON_HEDLEY_NO_RETURN _Noreturn +#elif defined(__cplusplus) && (__cplusplus >= 201103L) + #define JSON_HEDLEY_NO_RETURN JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[noreturn]]) +#elif \ + JSON_HEDLEY_HAS_ATTRIBUTE(noreturn) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,2,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0) + #define JSON_HEDLEY_NO_RETURN __attribute__((__noreturn__)) +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) + #define JSON_HEDLEY_NO_RETURN _Pragma("does_not_return") +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_NO_RETURN __declspec(noreturn) +#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus) + #define JSON_HEDLEY_NO_RETURN _Pragma("FUNC_NEVER_RETURNS;") +#elif JSON_HEDLEY_COMPCERT_VERSION_CHECK(3,2,0) + #define JSON_HEDLEY_NO_RETURN __attribute((noreturn)) +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(9,0,0) + #define JSON_HEDLEY_NO_RETURN __declspec(noreturn) +#else + #define JSON_HEDLEY_NO_RETURN +#endif + +#if defined(JSON_HEDLEY_NO_ESCAPE) + #undef JSON_HEDLEY_NO_ESCAPE +#endif +#if JSON_HEDLEY_HAS_ATTRIBUTE(noescape) + #define JSON_HEDLEY_NO_ESCAPE __attribute__((__noescape__)) +#else + #define JSON_HEDLEY_NO_ESCAPE +#endif + +#if defined(JSON_HEDLEY_UNREACHABLE) + #undef JSON_HEDLEY_UNREACHABLE +#endif +#if defined(JSON_HEDLEY_UNREACHABLE_RETURN) + #undef JSON_HEDLEY_UNREACHABLE_RETURN +#endif +#if defined(JSON_HEDLEY_ASSUME) + #undef JSON_HEDLEY_ASSUME +#endif +#if \ + JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_ASSUME(expr) __assume(expr) +#elif JSON_HEDLEY_HAS_BUILTIN(__builtin_assume) + #define JSON_HEDLEY_ASSUME(expr) __builtin_assume(expr) +#elif \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) + #if defined(__cplusplus) + #define JSON_HEDLEY_ASSUME(expr) std::_nassert(expr) + #else + #define JSON_HEDLEY_ASSUME(expr) _nassert(expr) + #endif +#endif +#if \ + (JSON_HEDLEY_HAS_BUILTIN(__builtin_unreachable) && (!defined(JSON_HEDLEY_ARM_VERSION))) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(18,10,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(13,1,5) || \ + JSON_HEDLEY_CRAY_VERSION_CHECK(10,0,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_UNREACHABLE() __builtin_unreachable() +#elif defined(JSON_HEDLEY_ASSUME) + #define JSON_HEDLEY_UNREACHABLE() JSON_HEDLEY_ASSUME(0) +#endif +#if 
!defined(JSON_HEDLEY_ASSUME) + #if defined(JSON_HEDLEY_UNREACHABLE) + #define JSON_HEDLEY_ASSUME(expr) JSON_HEDLEY_STATIC_CAST(void, ((expr) ? 1 : (JSON_HEDLEY_UNREACHABLE(), 1))) + #else + #define JSON_HEDLEY_ASSUME(expr) JSON_HEDLEY_STATIC_CAST(void, expr) + #endif +#endif +#if defined(JSON_HEDLEY_UNREACHABLE) + #if \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) + #define JSON_HEDLEY_UNREACHABLE_RETURN(value) return (JSON_HEDLEY_STATIC_CAST(void, JSON_HEDLEY_ASSUME(0)), (value)) + #else + #define JSON_HEDLEY_UNREACHABLE_RETURN(value) JSON_HEDLEY_UNREACHABLE() + #endif +#else + #define JSON_HEDLEY_UNREACHABLE_RETURN(value) return (value) +#endif +#if !defined(JSON_HEDLEY_UNREACHABLE) + #define JSON_HEDLEY_UNREACHABLE() JSON_HEDLEY_ASSUME(0) +#endif + +JSON_HEDLEY_DIAGNOSTIC_PUSH +#if JSON_HEDLEY_HAS_WARNING("-Wpedantic") + #pragma clang diagnostic ignored "-Wpedantic" +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") && defined(__cplusplus) + #pragma clang diagnostic ignored "-Wc++98-compat-pedantic" +#endif +#if JSON_HEDLEY_GCC_HAS_WARNING("-Wvariadic-macros",4,0,0) + #if defined(__clang__) + #pragma clang diagnostic ignored "-Wvariadic-macros" + #elif defined(JSON_HEDLEY_GCC_VERSION) + #pragma GCC diagnostic ignored "-Wvariadic-macros" + #endif +#endif +#if defined(JSON_HEDLEY_NON_NULL) + #undef JSON_HEDLEY_NON_NULL +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(nonnull) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) + #define JSON_HEDLEY_NON_NULL(...) __attribute__((__nonnull__(__VA_ARGS__))) +#else + #define JSON_HEDLEY_NON_NULL(...) +#endif +JSON_HEDLEY_DIAGNOSTIC_POP + +#if defined(JSON_HEDLEY_PRINTF_FORMAT) + #undef JSON_HEDLEY_PRINTF_FORMAT +#endif +#if defined(__MINGW32__) && JSON_HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && !defined(__USE_MINGW_ANSI_STDIO) + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(ms_printf, string_idx, first_to_check))) +#elif defined(__MINGW32__) && JSON_HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && defined(__USE_MINGW_ANSI_STDIO) + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(gnu_printf, string_idx, first_to_check))) +#elif \ + JSON_HEDLEY_HAS_ATTRIBUTE(format) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(__printf__, string_idx, first_to_check))) +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(6,0,0) + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) 
__declspec(vaformat(printf,string_idx,first_to_check)) +#else + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) +#endif + +#if defined(JSON_HEDLEY_CONSTEXPR) + #undef JSON_HEDLEY_CONSTEXPR +#endif +#if defined(__cplusplus) + #if __cplusplus >= 201103L + #define JSON_HEDLEY_CONSTEXPR JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(constexpr) + #endif +#endif +#if !defined(JSON_HEDLEY_CONSTEXPR) + #define JSON_HEDLEY_CONSTEXPR +#endif + +#if defined(JSON_HEDLEY_PREDICT) + #undef JSON_HEDLEY_PREDICT +#endif +#if defined(JSON_HEDLEY_LIKELY) + #undef JSON_HEDLEY_LIKELY +#endif +#if defined(JSON_HEDLEY_UNLIKELY) + #undef JSON_HEDLEY_UNLIKELY +#endif +#if defined(JSON_HEDLEY_UNPREDICTABLE) + #undef JSON_HEDLEY_UNPREDICTABLE +#endif +#if JSON_HEDLEY_HAS_BUILTIN(__builtin_unpredictable) + #define JSON_HEDLEY_UNPREDICTABLE(expr) __builtin_unpredictable((expr)) +#endif +#if \ + (JSON_HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) && !defined(JSON_HEDLEY_PGI_VERSION)) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(9,0,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define JSON_HEDLEY_PREDICT(expr, value, probability) __builtin_expect_with_probability( (expr), (value), (probability)) +# define JSON_HEDLEY_PREDICT_TRUE(expr, probability) __builtin_expect_with_probability(!!(expr), 1 , (probability)) +# define JSON_HEDLEY_PREDICT_FALSE(expr, probability) __builtin_expect_with_probability(!!(expr), 0 , (probability)) +# define JSON_HEDLEY_LIKELY(expr) __builtin_expect (!!(expr), 1 ) +# define JSON_HEDLEY_UNLIKELY(expr) __builtin_expect (!!(expr), 0 ) +#elif \ + (JSON_HEDLEY_HAS_BUILTIN(__builtin_expect) && !defined(JSON_HEDLEY_INTEL_CL_VERSION)) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,27) || \ + JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define JSON_HEDLEY_PREDICT(expr, expected, probability) \ + (((probability) >= 0.9) ? __builtin_expect((expr), (expected)) : (JSON_HEDLEY_STATIC_CAST(void, expected), (expr))) +# define JSON_HEDLEY_PREDICT_TRUE(expr, probability) \ + (__extension__ ({ \ + double hedley_probability_ = (probability); \ + ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 1) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 0) : !!(expr))); \ + })) +# define JSON_HEDLEY_PREDICT_FALSE(expr, probability) \ + (__extension__ ({ \ + double hedley_probability_ = (probability); \ + ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 0) : ((hedley_probability_ <= 0.1) ? 
__builtin_expect(!!(expr), 1) : !!(expr))); \ + })) +# define JSON_HEDLEY_LIKELY(expr) __builtin_expect(!!(expr), 1) +# define JSON_HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0) +#else +# define JSON_HEDLEY_PREDICT(expr, expected, probability) (JSON_HEDLEY_STATIC_CAST(void, expected), (expr)) +# define JSON_HEDLEY_PREDICT_TRUE(expr, probability) (!!(expr)) +# define JSON_HEDLEY_PREDICT_FALSE(expr, probability) (!!(expr)) +# define JSON_HEDLEY_LIKELY(expr) (!!(expr)) +# define JSON_HEDLEY_UNLIKELY(expr) (!!(expr)) +#endif +#if !defined(JSON_HEDLEY_UNPREDICTABLE) + #define JSON_HEDLEY_UNPREDICTABLE(expr) JSON_HEDLEY_PREDICT(expr, 1, 0.5) +#endif + +#if defined(JSON_HEDLEY_MALLOC) + #undef JSON_HEDLEY_MALLOC +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(malloc) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(12,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_MALLOC __attribute__((__malloc__)) +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) + #define JSON_HEDLEY_MALLOC _Pragma("returns_new_memory") +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_MALLOC __declspec(restrict) +#else + #define JSON_HEDLEY_MALLOC +#endif + +#if defined(JSON_HEDLEY_PURE) + #undef JSON_HEDLEY_PURE +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(pure) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(2,96,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define JSON_HEDLEY_PURE __attribute__((__pure__)) +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) +# define JSON_HEDLEY_PURE _Pragma("does_not_write_global_data") +#elif defined(__cplusplus) && \ + ( \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) 
|| \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) \ + ) +# define JSON_HEDLEY_PURE _Pragma("FUNC_IS_PURE;") +#else +# define JSON_HEDLEY_PURE +#endif + +#if defined(JSON_HEDLEY_CONST) + #undef JSON_HEDLEY_CONST +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(const) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(2,5,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_CONST __attribute__((__const__)) +#elif \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) + #define JSON_HEDLEY_CONST _Pragma("no_side_effect") +#else + #define JSON_HEDLEY_CONST JSON_HEDLEY_PURE +#endif + +#if defined(JSON_HEDLEY_RESTRICT) + #undef JSON_HEDLEY_RESTRICT +#endif +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && !defined(__cplusplus) + #define JSON_HEDLEY_RESTRICT restrict +#elif \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,4) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus)) || \ + JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) || \ + defined(__clang__) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_RESTRICT __restrict +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,3,0) && !defined(__cplusplus) + #define JSON_HEDLEY_RESTRICT _Restrict +#else + #define JSON_HEDLEY_RESTRICT +#endif + +#if defined(JSON_HEDLEY_INLINE) + #undef JSON_HEDLEY_INLINE +#endif +#if \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \ + (defined(__cplusplus) && (__cplusplus >= 199711L)) + #define JSON_HEDLEY_INLINE inline +#elif \ + defined(JSON_HEDLEY_GCC_VERSION) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(6,2,0) + #define JSON_HEDLEY_INLINE __inline__ +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,1,0) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_INLINE __inline +#else + #define 
JSON_HEDLEY_INLINE +#endif + +#if defined(JSON_HEDLEY_ALWAYS_INLINE) + #undef JSON_HEDLEY_ALWAYS_INLINE +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(always_inline) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ + JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0) +# define JSON_HEDLEY_ALWAYS_INLINE __attribute__((__always_inline__)) JSON_HEDLEY_INLINE +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) +# define JSON_HEDLEY_ALWAYS_INLINE __forceinline +#elif defined(__cplusplus) && \ + ( \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) \ + ) +# define JSON_HEDLEY_ALWAYS_INLINE _Pragma("FUNC_ALWAYS_INLINE;") +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) +# define JSON_HEDLEY_ALWAYS_INLINE _Pragma("inline=forced") +#else +# define JSON_HEDLEY_ALWAYS_INLINE JSON_HEDLEY_INLINE +#endif + +#if defined(JSON_HEDLEY_NEVER_INLINE) + #undef JSON_HEDLEY_NEVER_INLINE +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(noinline) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ + JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0) + #define JSON_HEDLEY_NEVER_INLINE __attribute__((__noinline__)) +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_NEVER_INLINE __declspec(noinline) +#elif JSON_HEDLEY_PGI_VERSION_CHECK(10,2,0) + #define JSON_HEDLEY_NEVER_INLINE _Pragma("noinline") +#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && 
defined(__cplusplus) + #define JSON_HEDLEY_NEVER_INLINE _Pragma("FUNC_CANNOT_INLINE;") +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_NEVER_INLINE _Pragma("inline=never") +#elif JSON_HEDLEY_COMPCERT_VERSION_CHECK(3,2,0) + #define JSON_HEDLEY_NEVER_INLINE __attribute((noinline)) +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(9,0,0) + #define JSON_HEDLEY_NEVER_INLINE __declspec(noinline) +#else + #define JSON_HEDLEY_NEVER_INLINE +#endif + +#if defined(JSON_HEDLEY_PRIVATE) + #undef JSON_HEDLEY_PRIVATE +#endif +#if defined(JSON_HEDLEY_PUBLIC) + #undef JSON_HEDLEY_PUBLIC +#endif +#if defined(JSON_HEDLEY_IMPORT) + #undef JSON_HEDLEY_IMPORT +#endif +#if defined(_WIN32) || defined(__CYGWIN__) +# define JSON_HEDLEY_PRIVATE +# define JSON_HEDLEY_PUBLIC __declspec(dllexport) +# define JSON_HEDLEY_IMPORT __declspec(dllimport) +#else +# if \ + JSON_HEDLEY_HAS_ATTRIBUTE(visibility) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ + ( \ + defined(__TI_EABI__) && \ + ( \ + (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) \ + ) \ + ) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define JSON_HEDLEY_PRIVATE __attribute__((__visibility__("hidden"))) +# define JSON_HEDLEY_PUBLIC __attribute__((__visibility__("default"))) +# else +# define JSON_HEDLEY_PRIVATE +# define JSON_HEDLEY_PUBLIC +# endif +# define JSON_HEDLEY_IMPORT extern +#endif + +#if defined(JSON_HEDLEY_NO_THROW) + #undef JSON_HEDLEY_NO_THROW +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(nothrow) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_NO_THROW __attribute__((__nothrow__)) +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(13,1,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) + #define JSON_HEDLEY_NO_THROW __declspec(nothrow) +#else + #define JSON_HEDLEY_NO_THROW +#endif + +#if defined(JSON_HEDLEY_FALL_THROUGH) + #undef JSON_HEDLEY_FALL_THROUGH +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(fallthrough) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(7,0,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_FALL_THROUGH __attribute__((__fallthrough__)) +#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(clang,fallthrough) + #define JSON_HEDLEY_FALL_THROUGH JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[clang::fallthrough]]) +#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE(fallthrough) + #define JSON_HEDLEY_FALL_THROUGH JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[fallthrough]]) +#elif defined(__fallthrough) /* SAL */ + #define JSON_HEDLEY_FALL_THROUGH __fallthrough +#else + #define JSON_HEDLEY_FALL_THROUGH +#endif + +#if defined(JSON_HEDLEY_RETURNS_NON_NULL) + #undef JSON_HEDLEY_RETURNS_NON_NULL +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(returns_nonnull) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) || \ + JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) + #define JSON_HEDLEY_RETURNS_NON_NULL __attribute__((__returns_nonnull__)) +#elif defined(_Ret_notnull_) /* SAL */ + #define JSON_HEDLEY_RETURNS_NON_NULL _Ret_notnull_ +#else + #define JSON_HEDLEY_RETURNS_NON_NULL +#endif + +#if defined(JSON_HEDLEY_ARRAY_PARAM) + #undef JSON_HEDLEY_ARRAY_PARAM +#endif +#if \ + defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) 
&& \
+    !defined(__STDC_NO_VLA__) && \
+    !defined(__cplusplus) && \
+    !defined(JSON_HEDLEY_PGI_VERSION) && \
+    !defined(JSON_HEDLEY_TINYC_VERSION)
+    #define JSON_HEDLEY_ARRAY_PARAM(name) (name)
+#else
+    #define JSON_HEDLEY_ARRAY_PARAM(name)
+#endif
+
+#if defined(JSON_HEDLEY_IS_CONSTANT)
+    #undef JSON_HEDLEY_IS_CONSTANT
+#endif
+#if defined(JSON_HEDLEY_REQUIRE_CONSTEXPR)
+    #undef JSON_HEDLEY_REQUIRE_CONSTEXPR
+#endif
+/* JSON_HEDLEY_IS_CONSTEXPR_ is for
+   HEDLEY INTERNAL USE ONLY.  API subject to change without notice. */
+#if defined(JSON_HEDLEY_IS_CONSTEXPR_)
+    #undef JSON_HEDLEY_IS_CONSTEXPR_
+#endif
+#if \
+    JSON_HEDLEY_HAS_BUILTIN(__builtin_constant_p) || \
+    JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \
+    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+    JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,19) || \
+    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+    JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \
+    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \
+    (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) && !defined(__cplusplus)) || \
+    JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \
+    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
+    #define JSON_HEDLEY_IS_CONSTANT(expr) __builtin_constant_p(expr)
+#endif
+#if !defined(__cplusplus)
+#  if \
+    JSON_HEDLEY_HAS_BUILTIN(__builtin_types_compatible_p) || \
+    JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \
+    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+    JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \
+    JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \
+    JSON_HEDLEY_ARM_VERSION_CHECK(5,4,0) || \
+    JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,24)
+#if defined(__INTPTR_TYPE__)
+    #define JSON_HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0)), int*)
+#else
+    #include <stdint.h>
+    #define JSON_HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((intptr_t) ((expr) * 0)) : (int*) 0)), int*)
+#endif
+#  elif \
+    ( \
+        defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
+        !defined(JSON_HEDLEY_SUNPRO_VERSION) && \
+        !defined(JSON_HEDLEY_PGI_VERSION) && \
+        !defined(JSON_HEDLEY_IAR_VERSION)) || \
+    (JSON_HEDLEY_HAS_EXTENSION(c_generic_selections) && !defined(JSON_HEDLEY_IAR_VERSION)) || \
+    JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) || \
+    JSON_HEDLEY_INTEL_VERSION_CHECK(17,0,0) || \
+    JSON_HEDLEY_IBM_VERSION_CHECK(12,1,0) || \
+    JSON_HEDLEY_ARM_VERSION_CHECK(5,3,0)
+#if defined(__INTPTR_TYPE__)
+    #define JSON_HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0), int*: 1, void*: 0)
+#else
+    #include <stdint.h>
+    #define JSON_HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((intptr_t) * 0) : (int*) 0), int*: 1, void*: 0)
+#endif
+#  elif \
+    defined(JSON_HEDLEY_GCC_VERSION) || \
+    defined(JSON_HEDLEY_INTEL_VERSION) || \
+    defined(JSON_HEDLEY_TINYC_VERSION) || \
+    defined(JSON_HEDLEY_TI_ARMCL_VERSION) || \
+    JSON_HEDLEY_TI_CL430_VERSION_CHECK(18,12,0) || \
+    defined(JSON_HEDLEY_TI_CL2000_VERSION) || \
+    defined(JSON_HEDLEY_TI_CL6X_VERSION) || \
+    defined(JSON_HEDLEY_TI_CL7X_VERSION) || \
+    defined(JSON_HEDLEY_TI_CLPRU_VERSION) || \
+    defined(__clang__)
+#  define JSON_HEDLEY_IS_CONSTEXPR_(expr) ( \
+        sizeof(void) != \
+        sizeof(*( \
+                  1 ? \
+                  ((void*) ((expr) * 0L) ) : \
+                  ((struct { char v[sizeof(void) * 2]; } *) 1) \
+                ) \
+              ) \
+    )
+#  endif
+#endif
+#if defined(JSON_HEDLEY_IS_CONSTEXPR_)
+    #if !defined(JSON_HEDLEY_IS_CONSTANT)
+        #define JSON_HEDLEY_IS_CONSTANT(expr) JSON_HEDLEY_IS_CONSTEXPR_(expr)
+    #endif
+    #define JSON_HEDLEY_REQUIRE_CONSTEXPR(expr) (JSON_HEDLEY_IS_CONSTEXPR_(expr) ?
(expr) : (-1)) +#else + #if !defined(JSON_HEDLEY_IS_CONSTANT) + #define JSON_HEDLEY_IS_CONSTANT(expr) (0) + #endif + #define JSON_HEDLEY_REQUIRE_CONSTEXPR(expr) (expr) +#endif + +#if defined(JSON_HEDLEY_BEGIN_C_DECLS) + #undef JSON_HEDLEY_BEGIN_C_DECLS +#endif +#if defined(JSON_HEDLEY_END_C_DECLS) + #undef JSON_HEDLEY_END_C_DECLS +#endif +#if defined(JSON_HEDLEY_C_DECL) + #undef JSON_HEDLEY_C_DECL +#endif +#if defined(__cplusplus) + #define JSON_HEDLEY_BEGIN_C_DECLS extern "C" { + #define JSON_HEDLEY_END_C_DECLS } + #define JSON_HEDLEY_C_DECL extern "C" +#else + #define JSON_HEDLEY_BEGIN_C_DECLS + #define JSON_HEDLEY_END_C_DECLS + #define JSON_HEDLEY_C_DECL +#endif + +#if defined(JSON_HEDLEY_STATIC_ASSERT) + #undef JSON_HEDLEY_STATIC_ASSERT +#endif +#if \ + !defined(__cplusplus) && ( \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \ + (JSON_HEDLEY_HAS_FEATURE(c_static_assert) && !defined(JSON_HEDLEY_INTEL_CL_VERSION)) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(6,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + defined(_Static_assert) \ + ) +# define JSON_HEDLEY_STATIC_ASSERT(expr, message) _Static_assert(expr, message) +#elif \ + (defined(__cplusplus) && (__cplusplus >= 201103L)) || \ + JSON_HEDLEY_MSVC_VERSION_CHECK(16,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) +# define JSON_HEDLEY_STATIC_ASSERT(expr, message) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(static_assert(expr, message)) +#else +# define JSON_HEDLEY_STATIC_ASSERT(expr, message) +#endif + +#if defined(JSON_HEDLEY_NULL) + #undef JSON_HEDLEY_NULL +#endif +#if defined(__cplusplus) + #if __cplusplus >= 201103L + #define JSON_HEDLEY_NULL JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(nullptr) + #elif defined(NULL) + #define JSON_HEDLEY_NULL NULL + #else + #define JSON_HEDLEY_NULL JSON_HEDLEY_STATIC_CAST(void*, 0) + #endif +#elif defined(NULL) + #define JSON_HEDLEY_NULL NULL +#else + #define JSON_HEDLEY_NULL ((void*) 0) +#endif + +#if defined(JSON_HEDLEY_MESSAGE) + #undef JSON_HEDLEY_MESSAGE +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas") +# define JSON_HEDLEY_MESSAGE(msg) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ + JSON_HEDLEY_PRAGMA(message msg) \ + JSON_HEDLEY_DIAGNOSTIC_POP +#elif \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,4,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message msg) +#elif JSON_HEDLEY_CRAY_VERSION_CHECK(5,0,0) +# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(_CRI message msg) +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) +# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message(msg)) +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,0,0) +# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message(msg)) +#else +# define JSON_HEDLEY_MESSAGE(msg) +#endif + +#if defined(JSON_HEDLEY_WARNING) + #undef JSON_HEDLEY_WARNING +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas") +# define JSON_HEDLEY_WARNING(msg) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ + JSON_HEDLEY_PRAGMA(clang warning msg) \ + JSON_HEDLEY_DIAGNOSTIC_POP +#elif \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,8,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(18,4,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_PRAGMA(GCC warning msg) +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) +# define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_PRAGMA(message(msg)) +#else +# define 
JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_MESSAGE(msg) +#endif + +#if defined(JSON_HEDLEY_REQUIRE) + #undef JSON_HEDLEY_REQUIRE +#endif +#if defined(JSON_HEDLEY_REQUIRE_MSG) + #undef JSON_HEDLEY_REQUIRE_MSG +#endif +#if JSON_HEDLEY_HAS_ATTRIBUTE(diagnose_if) +# if JSON_HEDLEY_HAS_WARNING("-Wgcc-compat") +# define JSON_HEDLEY_REQUIRE(expr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ + __attribute__((diagnose_if(!(expr), #expr, "error"))) \ + JSON_HEDLEY_DIAGNOSTIC_POP +# define JSON_HEDLEY_REQUIRE_MSG(expr,msg) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ + __attribute__((diagnose_if(!(expr), msg, "error"))) \ + JSON_HEDLEY_DIAGNOSTIC_POP +# else +# define JSON_HEDLEY_REQUIRE(expr) __attribute__((diagnose_if(!(expr), #expr, "error"))) +# define JSON_HEDLEY_REQUIRE_MSG(expr,msg) __attribute__((diagnose_if(!(expr), msg, "error"))) +# endif +#else +# define JSON_HEDLEY_REQUIRE(expr) +# define JSON_HEDLEY_REQUIRE_MSG(expr,msg) +#endif + +#if defined(JSON_HEDLEY_FLAGS) + #undef JSON_HEDLEY_FLAGS +#endif +#if JSON_HEDLEY_HAS_ATTRIBUTE(flag_enum) && (!defined(__cplusplus) || JSON_HEDLEY_HAS_WARNING("-Wbitfield-enum-conversion")) + #define JSON_HEDLEY_FLAGS __attribute__((__flag_enum__)) +#else + #define JSON_HEDLEY_FLAGS +#endif + +#if defined(JSON_HEDLEY_FLAGS_CAST) + #undef JSON_HEDLEY_FLAGS_CAST +#endif +#if JSON_HEDLEY_INTEL_VERSION_CHECK(19,0,0) +# define JSON_HEDLEY_FLAGS_CAST(T, expr) (__extension__ ({ \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("warning(disable:188)") \ + ((T) (expr)); \ + JSON_HEDLEY_DIAGNOSTIC_POP \ + })) +#else +# define JSON_HEDLEY_FLAGS_CAST(T, expr) JSON_HEDLEY_STATIC_CAST(T, expr) +#endif + +#if defined(JSON_HEDLEY_EMPTY_BASES) + #undef JSON_HEDLEY_EMPTY_BASES +#endif +#if \ + (JSON_HEDLEY_MSVC_VERSION_CHECK(19,0,23918) && !JSON_HEDLEY_MSVC_VERSION_CHECK(20,0,0)) || \ + JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) + #define JSON_HEDLEY_EMPTY_BASES __declspec(empty_bases) +#else + #define JSON_HEDLEY_EMPTY_BASES +#endif + +/* Remaining macros are deprecated. 
*/ + +#if defined(JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK) + #undef JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK +#endif +#if defined(__clang__) + #define JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) (0) +#else + #define JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_CLANG_HAS_ATTRIBUTE) + #undef JSON_HEDLEY_CLANG_HAS_ATTRIBUTE +#endif +#define JSON_HEDLEY_CLANG_HAS_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_ATTRIBUTE(attribute) + +#if defined(JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE) + #undef JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE +#endif +#define JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) + +#if defined(JSON_HEDLEY_CLANG_HAS_BUILTIN) + #undef JSON_HEDLEY_CLANG_HAS_BUILTIN +#endif +#define JSON_HEDLEY_CLANG_HAS_BUILTIN(builtin) JSON_HEDLEY_HAS_BUILTIN(builtin) + +#if defined(JSON_HEDLEY_CLANG_HAS_FEATURE) + #undef JSON_HEDLEY_CLANG_HAS_FEATURE +#endif +#define JSON_HEDLEY_CLANG_HAS_FEATURE(feature) JSON_HEDLEY_HAS_FEATURE(feature) + +#if defined(JSON_HEDLEY_CLANG_HAS_EXTENSION) + #undef JSON_HEDLEY_CLANG_HAS_EXTENSION +#endif +#define JSON_HEDLEY_CLANG_HAS_EXTENSION(extension) JSON_HEDLEY_HAS_EXTENSION(extension) + +#if defined(JSON_HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE) + #undef JSON_HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE +#endif +#define JSON_HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) + +#if defined(JSON_HEDLEY_CLANG_HAS_WARNING) + #undef JSON_HEDLEY_CLANG_HAS_WARNING +#endif +#define JSON_HEDLEY_CLANG_HAS_WARNING(warning) JSON_HEDLEY_HAS_WARNING(warning) + +#endif /* !defined(JSON_HEDLEY_VERSION) || (JSON_HEDLEY_VERSION < X) */ + + +// This file contains all internal macro definitions +// You MUST include macro_unscope.hpp at the end of json.hpp to undef all of them + +// exclude unsupported compilers +#if !defined(JSON_SKIP_UNSUPPORTED_COMPILER_CHECK) + #if defined(__clang__) + #if (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) < 30400 + #error "unsupported Clang version - see https://github.com/nlohmann/json#supported-compilers" + #endif + #elif defined(__GNUC__) && !(defined(__ICC) || defined(__INTEL_COMPILER)) + #if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) < 40800 + #error "unsupported GCC version - see https://github.com/nlohmann/json#supported-compilers" + #endif + #endif +#endif + +// C++ language standard detection +// if the user manually specified the used c++ version this is skipped +#if !defined(JSON_HAS_CPP_20) && !defined(JSON_HAS_CPP_17) && !defined(JSON_HAS_CPP_14) && !defined(JSON_HAS_CPP_11) + #if (defined(__cplusplus) && __cplusplus >= 202002L) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L) + #define JSON_HAS_CPP_20 + #define JSON_HAS_CPP_17 + #define JSON_HAS_CPP_14 + #elif (defined(__cplusplus) && __cplusplus >= 201703L) || (defined(_HAS_CXX17) && _HAS_CXX17 == 1) // fix for issue #464 + #define JSON_HAS_CPP_17 + #define JSON_HAS_CPP_14 + #elif (defined(__cplusplus) && __cplusplus >= 201402L) || (defined(_HAS_CXX14) && _HAS_CXX14 == 1) + #define JSON_HAS_CPP_14 + #endif + // the cpp 11 flag is always specified because it is the minimal required version + #define JSON_HAS_CPP_11 +#endif + +// disable documentation warnings on clang +#if defined(__clang__) + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wdocumentation" +#endif + +// allow to disable exceptions +#if (defined(__cpp_exceptions) || 
defined(__EXCEPTIONS) || defined(_CPPUNWIND)) && !defined(JSON_NOEXCEPTION)
+    #define JSON_THROW(exception) throw exception
+    #define JSON_TRY try
+    #define JSON_CATCH(exception) catch(exception)
+    #define JSON_INTERNAL_CATCH(exception) catch(exception)
+#else
+    #include <cstdlib>
+    #define JSON_THROW(exception) std::abort()
+    #define JSON_TRY if(true)
+    #define JSON_CATCH(exception) if(false)
+    #define JSON_INTERNAL_CATCH(exception) if(false)
+#endif
+
+// override exception macros
+#if defined(JSON_THROW_USER)
+    #undef JSON_THROW
+    #define JSON_THROW JSON_THROW_USER
+#endif
+#if defined(JSON_TRY_USER)
+    #undef JSON_TRY
+    #define JSON_TRY JSON_TRY_USER
+#endif
+#if defined(JSON_CATCH_USER)
+    #undef JSON_CATCH
+    #define JSON_CATCH JSON_CATCH_USER
+    #undef JSON_INTERNAL_CATCH
+    #define JSON_INTERNAL_CATCH JSON_CATCH_USER
+#endif
+#if defined(JSON_INTERNAL_CATCH_USER)
+    #undef JSON_INTERNAL_CATCH
+    #define JSON_INTERNAL_CATCH JSON_INTERNAL_CATCH_USER
+#endif
+
+// allow to override assert
+#if !defined(JSON_ASSERT)
+    #include <cassert> // assert
+    #define JSON_ASSERT(x) assert(x)
+#endif
+
+// allow to access some private functions (needed by the test suite)
+#if defined(JSON_TESTS_PRIVATE)
+    #define JSON_PRIVATE_UNLESS_TESTED public
+#else
+    #define JSON_PRIVATE_UNLESS_TESTED private
+#endif
+
+/*!
+@brief macro to briefly define a mapping between an enum and JSON
+@def NLOHMANN_JSON_SERIALIZE_ENUM
+@since version 3.4.0
+*/
+#define NLOHMANN_JSON_SERIALIZE_ENUM(ENUM_TYPE, ...)                                            \
+    template<typename BasicJsonType>                                                            \
+    inline void to_json(BasicJsonType& j, const ENUM_TYPE& e)                                   \
+    {                                                                                           \
+        static_assert(std::is_enum<ENUM_TYPE>::value, #ENUM_TYPE " must be an enum!");          \
+        static const std::pair<ENUM_TYPE, BasicJsonType> m[] = __VA_ARGS__;                     \
+        auto it = std::find_if(std::begin(m), std::end(m),                                      \
+                               [e](const std::pair<ENUM_TYPE, BasicJsonType>& ej_pair) -> bool  \
+        {                                                                                       \
+            return ej_pair.first == e;                                                          \
+        });                                                                                     \
+        j = ((it != std::end(m)) ? it : std::begin(m))->second;                                 \
+    }                                                                                           \
+    template<typename BasicJsonType>                                                            \
+    inline void from_json(const BasicJsonType& j, ENUM_TYPE& e)                                 \
+    {                                                                                           \
+        static_assert(std::is_enum<ENUM_TYPE>::value, #ENUM_TYPE " must be an enum!");          \
+        static const std::pair<ENUM_TYPE, BasicJsonType> m[] = __VA_ARGS__;                     \
+        auto it = std::find_if(std::begin(m), std::end(m),                                      \
+                               [&j](const std::pair<ENUM_TYPE, BasicJsonType>& ej_pair) -> bool \
+        {                                                                                       \
+            return ej_pair.second == j;                                                         \
+        });                                                                                     \
+        e = ((it != std::end(m)) ? it : std::begin(m))->first;                                  \
+    }
+
+// Ugly macros to avoid uglier copy-paste when specializing basic_json. They
+// may be removed in the future once the class is split.
+
+#define NLOHMANN_BASIC_JSON_TPL_DECLARATION                                 \
+    template<template<typename, typename, typename...> class ObjectType,   \
+             template<typename, typename...> class ArrayType,              \
+             class StringType, class BooleanType, class NumberIntegerType, \
+             class NumberUnsignedType, class NumberFloatType,              \
+             template<typename> class AllocatorType,                       \
+             template<typename, typename = void> class JSONSerializer,     \
+             class BinaryType>
+
+#define NLOHMANN_BASIC_JSON_TPL                                 \
+    basic_json<ObjectType, ArrayType, StringType, BooleanType,  \
+    NumberIntegerType, NumberUnsignedType, NumberFloatType,     \
+    AllocatorType, JSONSerializer, BinaryType>
+
+// Macros to simplify conversion from/to types
+
+#define NLOHMANN_JSON_EXPAND( x ) x
+#define NLOHMANN_JSON_GET_MACRO(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54, _55, _56, _57, _58, _59, _60, _61, _62, _63, _64, NAME,...) NAME
+#define NLOHMANN_JSON_PASTE(...)
NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_GET_MACRO(__VA_ARGS__, \ + NLOHMANN_JSON_PASTE64, \ + NLOHMANN_JSON_PASTE63, \ + NLOHMANN_JSON_PASTE62, \ + NLOHMANN_JSON_PASTE61, \ + NLOHMANN_JSON_PASTE60, \ + NLOHMANN_JSON_PASTE59, \ + NLOHMANN_JSON_PASTE58, \ + NLOHMANN_JSON_PASTE57, \ + NLOHMANN_JSON_PASTE56, \ + NLOHMANN_JSON_PASTE55, \ + NLOHMANN_JSON_PASTE54, \ + NLOHMANN_JSON_PASTE53, \ + NLOHMANN_JSON_PASTE52, \ + NLOHMANN_JSON_PASTE51, \ + NLOHMANN_JSON_PASTE50, \ + NLOHMANN_JSON_PASTE49, \ + NLOHMANN_JSON_PASTE48, \ + NLOHMANN_JSON_PASTE47, \ + NLOHMANN_JSON_PASTE46, \ + NLOHMANN_JSON_PASTE45, \ + NLOHMANN_JSON_PASTE44, \ + NLOHMANN_JSON_PASTE43, \ + NLOHMANN_JSON_PASTE42, \ + NLOHMANN_JSON_PASTE41, \ + NLOHMANN_JSON_PASTE40, \ + NLOHMANN_JSON_PASTE39, \ + NLOHMANN_JSON_PASTE38, \ + NLOHMANN_JSON_PASTE37, \ + NLOHMANN_JSON_PASTE36, \ + NLOHMANN_JSON_PASTE35, \ + NLOHMANN_JSON_PASTE34, \ + NLOHMANN_JSON_PASTE33, \ + NLOHMANN_JSON_PASTE32, \ + NLOHMANN_JSON_PASTE31, \ + NLOHMANN_JSON_PASTE30, \ + NLOHMANN_JSON_PASTE29, \ + NLOHMANN_JSON_PASTE28, \ + NLOHMANN_JSON_PASTE27, \ + NLOHMANN_JSON_PASTE26, \ + NLOHMANN_JSON_PASTE25, \ + NLOHMANN_JSON_PASTE24, \ + NLOHMANN_JSON_PASTE23, \ + NLOHMANN_JSON_PASTE22, \ + NLOHMANN_JSON_PASTE21, \ + NLOHMANN_JSON_PASTE20, \ + NLOHMANN_JSON_PASTE19, \ + NLOHMANN_JSON_PASTE18, \ + NLOHMANN_JSON_PASTE17, \ + NLOHMANN_JSON_PASTE16, \ + NLOHMANN_JSON_PASTE15, \ + NLOHMANN_JSON_PASTE14, \ + NLOHMANN_JSON_PASTE13, \ + NLOHMANN_JSON_PASTE12, \ + NLOHMANN_JSON_PASTE11, \ + NLOHMANN_JSON_PASTE10, \ + NLOHMANN_JSON_PASTE9, \ + NLOHMANN_JSON_PASTE8, \ + NLOHMANN_JSON_PASTE7, \ + NLOHMANN_JSON_PASTE6, \ + NLOHMANN_JSON_PASTE5, \ + NLOHMANN_JSON_PASTE4, \ + NLOHMANN_JSON_PASTE3, \ + NLOHMANN_JSON_PASTE2, \ + NLOHMANN_JSON_PASTE1)(__VA_ARGS__)) +#define NLOHMANN_JSON_PASTE2(func, v1) func(v1) +#define NLOHMANN_JSON_PASTE3(func, v1, v2) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE2(func, v2) +#define NLOHMANN_JSON_PASTE4(func, v1, v2, v3) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE3(func, v2, v3) +#define NLOHMANN_JSON_PASTE5(func, v1, v2, v3, v4) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE4(func, v2, v3, v4) +#define NLOHMANN_JSON_PASTE6(func, v1, v2, v3, v4, v5) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE5(func, v2, v3, v4, v5) +#define NLOHMANN_JSON_PASTE7(func, v1, v2, v3, v4, v5, v6) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE6(func, v2, v3, v4, v5, v6) +#define NLOHMANN_JSON_PASTE8(func, v1, v2, v3, v4, v5, v6, v7) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE7(func, v2, v3, v4, v5, v6, v7) +#define NLOHMANN_JSON_PASTE9(func, v1, v2, v3, v4, v5, v6, v7, v8) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE8(func, v2, v3, v4, v5, v6, v7, v8) +#define NLOHMANN_JSON_PASTE10(func, v1, v2, v3, v4, v5, v6, v7, v8, v9) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE9(func, v2, v3, v4, v5, v6, v7, v8, v9) +#define NLOHMANN_JSON_PASTE11(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE10(func, v2, v3, v4, v5, v6, v7, v8, v9, v10) +#define NLOHMANN_JSON_PASTE12(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE11(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11) +#define NLOHMANN_JSON_PASTE13(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE12(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12) +#define NLOHMANN_JSON_PASTE14(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, 
v13) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE13(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13) +#define NLOHMANN_JSON_PASTE15(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE14(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14) +#define NLOHMANN_JSON_PASTE16(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE15(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15) +#define NLOHMANN_JSON_PASTE17(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE16(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) +#define NLOHMANN_JSON_PASTE18(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE17(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17) +#define NLOHMANN_JSON_PASTE19(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE18(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18) +#define NLOHMANN_JSON_PASTE20(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE19(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19) +#define NLOHMANN_JSON_PASTE21(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE20(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20) +#define NLOHMANN_JSON_PASTE22(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE21(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21) +#define NLOHMANN_JSON_PASTE23(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE22(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22) +#define NLOHMANN_JSON_PASTE24(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE23(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23) +#define NLOHMANN_JSON_PASTE25(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE24(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24) +#define NLOHMANN_JSON_PASTE26(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE25(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25) +#define NLOHMANN_JSON_PASTE27(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26) NLOHMANN_JSON_PASTE2(func, v1) 
NLOHMANN_JSON_PASTE26(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26) +#define NLOHMANN_JSON_PASTE28(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE27(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27) +#define NLOHMANN_JSON_PASTE29(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE28(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28) +#define NLOHMANN_JSON_PASTE30(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE29(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29) +#define NLOHMANN_JSON_PASTE31(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE30(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30) +#define NLOHMANN_JSON_PASTE32(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE31(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31) +#define NLOHMANN_JSON_PASTE33(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE32(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32) +#define NLOHMANN_JSON_PASTE34(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE33(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33) +#define NLOHMANN_JSON_PASTE35(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE34(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34) +#define NLOHMANN_JSON_PASTE36(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE35(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, 
v26, v27, v28, v29, v30, v31, v32, v33, v34, v35) +#define NLOHMANN_JSON_PASTE37(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE36(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36) +#define NLOHMANN_JSON_PASTE38(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE37(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37) +#define NLOHMANN_JSON_PASTE39(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE38(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38) +#define NLOHMANN_JSON_PASTE40(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE39(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39) +#define NLOHMANN_JSON_PASTE41(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE40(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40) +#define NLOHMANN_JSON_PASTE42(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE41(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41) +#define NLOHMANN_JSON_PASTE43(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE42(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42) +#define NLOHMANN_JSON_PASTE44(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43) 
NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE43(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43) +#define NLOHMANN_JSON_PASTE45(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE44(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44) +#define NLOHMANN_JSON_PASTE46(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE45(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45) +#define NLOHMANN_JSON_PASTE47(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE46(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46) +#define NLOHMANN_JSON_PASTE48(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE47(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47) +#define NLOHMANN_JSON_PASTE49(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE48(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48) +#define NLOHMANN_JSON_PASTE50(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE49(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49) +#define NLOHMANN_JSON_PASTE51(func, v1, v2, v3, v4, v5, 
v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE50(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50) +#define NLOHMANN_JSON_PASTE52(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE51(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51) +#define NLOHMANN_JSON_PASTE53(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE52(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52) +#define NLOHMANN_JSON_PASTE54(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE53(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53) +#define NLOHMANN_JSON_PASTE55(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE54(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54) +#define NLOHMANN_JSON_PASTE56(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE55(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55) +#define 
NLOHMANN_JSON_PASTE57(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE56(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56) +#define NLOHMANN_JSON_PASTE58(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE57(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57) +#define NLOHMANN_JSON_PASTE59(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE58(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58) +#define NLOHMANN_JSON_PASTE60(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE59(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59) +#define NLOHMANN_JSON_PASTE61(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE60(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60) +#define NLOHMANN_JSON_PASTE62(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, 
v54, v55, v56, v57, v58, v59, v60, v61) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE61(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61) +#define NLOHMANN_JSON_PASTE63(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE62(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62) +#define NLOHMANN_JSON_PASTE64(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE63(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63) + +#define NLOHMANN_JSON_TO(v1) nlohmann_json_j[#v1] = nlohmann_json_t.v1; +#define NLOHMANN_JSON_FROM(v1) nlohmann_json_j.at(#v1).get_to(nlohmann_json_t.v1); + +/*! +@brief macro +@def NLOHMANN_DEFINE_TYPE_INTRUSIVE +@since version 3.9.0 +*/ +#define NLOHMANN_DEFINE_TYPE_INTRUSIVE(Type, ...) \ + friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ + friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) } + +/*! +@brief macro +@def NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE +@since version 3.9.0 +*/ +#define NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Type, ...) \ + inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ + inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) } + +#ifndef JSON_USE_IMPLICIT_CONVERSIONS + #define JSON_USE_IMPLICIT_CONVERSIONS 1 +#endif + +#if JSON_USE_IMPLICIT_CONVERSIONS + #define JSON_EXPLICIT +#else + #define JSON_EXPLICIT explicit +#endif + + +namespace nlohmann +{ +namespace detail +{ + +/*! +@brief replace all occurrences of a substring by another string + +@param[in,out] s the string to manipulate; changed so that all + occurrences of @a f are replaced with @a t +@param[in] f the substring to replace with @a t +@param[in] t the string to replace @a f + +@pre The search string @a f must not be empty. 
**This precondition is +enforced with an assertion.** + +@since version 2.0.0 +*/ +inline void replace_substring(std::string& s, const std::string& f, + const std::string& t) +{ + JSON_ASSERT(!f.empty()); + for (auto pos = s.find(f); // find first occurrence of f + pos != std::string::npos; // make sure f was found + s.replace(pos, f.size(), t), // replace with t, and + pos = s.find(f, pos + t.size())) // find next occurrence of f + {} +} + +/*! + * @brief string escaping as described in RFC 6901 (Sect. 4) + * @param[in] s string to escape + * @return escaped string + * + * Note the order of escaping "~" to "~0" and "/" to "~1" is important. + */ +inline std::string escape(std::string s) +{ + replace_substring(s, "~", "~0"); + replace_substring(s, "/", "~1"); + return s; +} + +/*! + * @brief string unescaping as described in RFC 6901 (Sect. 4) + * @param[in] s string to unescape + * @return unescaped string + * + * Note the order of escaping "~1" to "/" and "~0" to "~" is important. + */ +static void unescape(std::string& s) +{ + replace_substring(s, "~1", "/"); + replace_substring(s, "~0", "~"); +} + +} // namespace detail +} // namespace nlohmann + +// #include + + +#include // size_t + +namespace nlohmann +{ +namespace detail +{ +/// struct to capture the start position of the current token +struct position_t +{ + /// the total number of characters read + std::size_t chars_read_total = 0; + /// the number of characters read in the current line + std::size_t chars_read_current_line = 0; + /// the number of lines read + std::size_t lines_read = 0; + + /// conversion to size_t to preserve SAX interface + constexpr operator size_t() const + { + return chars_read_total; + } +}; + +} // namespace detail +} // namespace nlohmann + +// #include + + +namespace nlohmann +{ +namespace detail +{ +//////////////// +// exceptions // +//////////////// + +/*! +@brief general exception of the @ref basic_json class + +This class is an extension of `std::exception` objects with a member @a id for +exception ids. It is used as the base class for all exceptions thrown by the +@ref basic_json class. This class can hence be used as "wildcard" to catch +exceptions. + +Subclasses: +- @ref parse_error for exceptions indicating a parse error +- @ref invalid_iterator for exceptions indicating errors with iterators +- @ref type_error for exceptions indicating executing a member function with + a wrong type +- @ref out_of_range for exceptions indicating access out of the defined range +- @ref other_error for exceptions indicating other library errors + +@internal +@note To have nothrow-copy-constructible exceptions, we internally use + `std::runtime_error` which can cope with arbitrary-length error messages. + Intermediate strings are built with static functions and then passed to + the actual constructor. +@endinternal + +@liveexample{The following code shows how arbitrary library exceptions can be +caught.,exception} + +@since version 3.0.0 +*/ +class exception : public std::exception +{ + public: + /// returns the explanatory string + const char* what() const noexcept override + { + return m.what(); + } + + /// the id of the exception + const int id; // NOLINT(cppcoreguidelines-non-private-member-variables-in-classes) + + protected: + JSON_HEDLEY_NON_NULL(3) + exception(int id_, const char* what_arg) : id(id_), m(what_arg) {} + + static std::string name(const std::string& ename, int id_) + { + return "[json.exception." + ename + "." 
+ std::to_string(id_) + "] "; + } + + template + static std::string diagnostics(const BasicJsonType& leaf_element) + { +#if JSON_DIAGNOSTICS + std::vector tokens; + for (const auto* current = &leaf_element; current->m_parent != nullptr; current = current->m_parent) + { + switch (current->m_parent->type()) + { + case value_t::array: + { + for (std::size_t i = 0; i < current->m_parent->m_value.array->size(); ++i) + { + if (¤t->m_parent->m_value.array->operator[](i) == current) + { + tokens.emplace_back(std::to_string(i)); + break; + } + } + break; + } + + case value_t::object: + { + for (const auto& element : *current->m_parent->m_value.object) + { + if (&element.second == current) + { + tokens.emplace_back(element.first.c_str()); + break; + } + } + break; + } + + default: // LCOV_EXCL_LINE + break; // LCOV_EXCL_LINE + } + } + + if (tokens.empty()) + { + return ""; + } + + return "(" + std::accumulate(tokens.rbegin(), tokens.rend(), std::string{}, + [](const std::string & a, const std::string & b) + { + return a + "/" + detail::escape(b); + }) + ") "; +#else + static_cast(leaf_element); + return ""; +#endif + } + + private: + /// an exception object as storage for error messages + std::runtime_error m; +}; + +/*! +@brief exception indicating a parse error + +This exception is thrown by the library when a parse error occurs. Parse errors +can occur during the deserialization of JSON text, CBOR, MessagePack, as well +as when using JSON Patch. + +Member @a byte holds the byte index of the last read character in the input +file. + +Exceptions have ids 1xx. + +name / id | example message | description +------------------------------ | --------------- | ------------------------- +json.exception.parse_error.101 | parse error at 2: unexpected end of input; expected string literal | This error indicates a syntax error while deserializing a JSON text. The error message describes that an unexpected token (character) was encountered, and the member @a byte indicates the error position. +json.exception.parse_error.102 | parse error at 14: missing or wrong low surrogate | JSON uses the `\uxxxx` format to describe Unicode characters. Code points above above 0xFFFF are split into two `\uxxxx` entries ("surrogate pairs"). This error indicates that the surrogate pair is incomplete or contains an invalid code point. +json.exception.parse_error.103 | parse error: code points above 0x10FFFF are invalid | Unicode supports code points up to 0x10FFFF. Code points above 0x10FFFF are invalid. +json.exception.parse_error.104 | parse error: JSON patch must be an array of objects | [RFC 6902](https://tools.ietf.org/html/rfc6902) requires a JSON Patch document to be a JSON document that represents an array of objects. +json.exception.parse_error.105 | parse error: operation must have string member 'op' | An operation of a JSON Patch document must contain exactly one "op" member, whose value indicates the operation to perform. Its value must be one of "add", "remove", "replace", "move", "copy", or "test"; other values are errors. +json.exception.parse_error.106 | parse error: array index '01' must not begin with '0' | An array index in a JSON Pointer ([RFC 6901](https://tools.ietf.org/html/rfc6901)) may be `0` or any number without a leading `0`. +json.exception.parse_error.107 | parse error: JSON pointer must be empty or begin with '/' - was: 'foo' | A JSON Pointer must be a Unicode string containing a sequence of zero or more reference tokens, each prefixed by a `/` character. 
+json.exception.parse_error.108 | parse error: escape character '~' must be followed with '0' or '1' | In a JSON Pointer, only `~0` and `~1` are valid escape sequences. +json.exception.parse_error.109 | parse error: array index 'one' is not a number | A JSON Pointer array index must be a number. +json.exception.parse_error.110 | parse error at 1: cannot read 2 bytes from vector | When parsing CBOR or MessagePack, the byte vector ends before the complete value has been read. +json.exception.parse_error.112 | parse error at 1: error reading CBOR; last byte: 0xF8 | Not all types of CBOR or MessagePack are supported. This exception occurs if an unsupported byte was read. +json.exception.parse_error.113 | parse error at 2: expected a CBOR string; last byte: 0x98 | While parsing a map key, a value that is not a string has been read. +json.exception.parse_error.114 | parse error: Unsupported BSON record type 0x0F | The parsing of the corresponding BSON record type is not implemented (yet). +json.exception.parse_error.115 | parse error at byte 5: syntax error while parsing UBJSON high-precision number: invalid number text: 1A | A UBJSON high-precision number could not be parsed. + +@note For an input with n bytes, 1 is the index of the first character and n+1 + is the index of the terminating null byte or the end of file. This also + holds true when reading a byte vector (CBOR or MessagePack). + +@liveexample{The following code shows how a `parse_error` exception can be +caught.,parse_error} + +@sa - @ref exception for the base class of the library exceptions +@sa - @ref invalid_iterator for exceptions indicating errors with iterators +@sa - @ref type_error for exceptions indicating executing a member function with + a wrong type +@sa - @ref out_of_range for exceptions indicating access out of the defined range +@sa - @ref other_error for exceptions indicating other library errors + +@since version 3.0.0 +*/ +class parse_error : public exception +{ + public: + /*! + @brief create a parse error exception + @param[in] id_ the id of the exception + @param[in] pos the position where the error occurred (or with + chars_read_total=0 if the position cannot be + determined) + @param[in] what_arg the explanatory string + @return parse_error object + */ + template + static parse_error create(int id_, const position_t& pos, const std::string& what_arg, const BasicJsonType& context) + { + std::string w = exception::name("parse_error", id_) + "parse error" + + position_string(pos) + ": " + exception::diagnostics(context) + what_arg; + return parse_error(id_, pos.chars_read_total, w.c_str()); + } + + template + static parse_error create(int id_, std::size_t byte_, const std::string& what_arg, const BasicJsonType& context) + { + std::string w = exception::name("parse_error", id_) + "parse error" + + (byte_ != 0 ? (" at byte " + std::to_string(byte_)) : "") + + ": " + exception::diagnostics(context) + what_arg; + return parse_error(id_, byte_, w.c_str()); + } + + /*! + @brief byte index of the parse error + + The byte index of the last read character in the input file. + + @note For an input with n bytes, 1 is the index of the first character and + n+1 is the index of the terminating null byte or the end of file. + This also holds true when reading a byte vector (CBOR or MessagePack). 
+ */ + const std::size_t byte; + + private: + parse_error(int id_, std::size_t byte_, const char* what_arg) + : exception(id_, what_arg), byte(byte_) {} + + static std::string position_string(const position_t& pos) + { + return " at line " + std::to_string(pos.lines_read + 1) + + ", column " + std::to_string(pos.chars_read_current_line); + } +}; + +/*! +@brief exception indicating errors with iterators + +This exception is thrown if iterators passed to a library function do not match +the expected semantics. + +Exceptions have ids 2xx. + +name / id | example message | description +----------------------------------- | --------------- | ------------------------- +json.exception.invalid_iterator.201 | iterators are not compatible | The iterators passed to constructor @ref basic_json(InputIT first, InputIT last) are not compatible, meaning they do not belong to the same container. Therefore, the range (@a first, @a last) is invalid. +json.exception.invalid_iterator.202 | iterator does not fit current value | In an erase or insert function, the passed iterator @a pos does not belong to the JSON value for which the function was called. It hence does not define a valid position for the deletion/insertion. +json.exception.invalid_iterator.203 | iterators do not fit current value | Either iterator passed to function @ref erase(IteratorType first, IteratorType last) does not belong to the JSON value from which values shall be erased. It hence does not define a valid range to delete values from. +json.exception.invalid_iterator.204 | iterators out of range | When an iterator range for a primitive type (number, boolean, or string) is passed to a constructor or an erase function, this range has to be exactly (@ref begin(), @ref end()), because this is the only way the single stored value is expressed. All other ranges are invalid. +json.exception.invalid_iterator.205 | iterator out of range | When an iterator for a primitive type (number, boolean, or string) is passed to an erase function, the iterator has to be the @ref begin() iterator, because it is the only way to address the stored value. All other iterators are invalid. +json.exception.invalid_iterator.206 | cannot construct with iterators from null | The iterators passed to constructor @ref basic_json(InputIT first, InputIT last) belong to a JSON null value and hence to not define a valid range. +json.exception.invalid_iterator.207 | cannot use key() for non-object iterators | The key() member function can only be used on iterators belonging to a JSON object, because other types do not have a concept of a key. +json.exception.invalid_iterator.208 | cannot use operator[] for object iterators | The operator[] to specify a concrete offset cannot be used on iterators belonging to a JSON object, because JSON objects are unordered. +json.exception.invalid_iterator.209 | cannot use offsets with object iterators | The offset operators (+, -, +=, -=) cannot be used on iterators belonging to a JSON object, because JSON objects are unordered. +json.exception.invalid_iterator.210 | iterators do not fit | The iterator range passed to the insert function are not compatible, meaning they do not belong to the same container. Therefore, the range (@a first, @a last) is invalid. +json.exception.invalid_iterator.211 | passed iterators may not belong to container | The iterator range passed to the insert function must not be a subrange of the container to insert to. 
+json.exception.invalid_iterator.212 | cannot compare iterators of different containers | When two iterators are compared, they must belong to the same container. +json.exception.invalid_iterator.213 | cannot compare order of object iterators | The order of object iterators cannot be compared, because JSON objects are unordered. +json.exception.invalid_iterator.214 | cannot get value | Cannot get value for iterator: Either the iterator belongs to a null value or it is an iterator to a primitive type (number, boolean, or string), but the iterator is different to @ref begin(). + +@liveexample{The following code shows how an `invalid_iterator` exception can be +caught.,invalid_iterator} + +@sa - @ref exception for the base class of the library exceptions +@sa - @ref parse_error for exceptions indicating a parse error +@sa - @ref type_error for exceptions indicating executing a member function with + a wrong type +@sa - @ref out_of_range for exceptions indicating access out of the defined range +@sa - @ref other_error for exceptions indicating other library errors + +@since version 3.0.0 +*/ +class invalid_iterator : public exception +{ + public: + template + static invalid_iterator create(int id_, const std::string& what_arg, const BasicJsonType& context) + { + std::string w = exception::name("invalid_iterator", id_) + exception::diagnostics(context) + what_arg; + return invalid_iterator(id_, w.c_str()); + } + + private: + JSON_HEDLEY_NON_NULL(3) + invalid_iterator(int id_, const char* what_arg) + : exception(id_, what_arg) {} +}; + +/*! +@brief exception indicating executing a member function with a wrong type + +This exception is thrown in case of a type error; that is, a library function is +executed on a JSON value whose type does not match the expected semantics. + +Exceptions have ids 3xx. + +name / id | example message | description +----------------------------- | --------------- | ------------------------- +json.exception.type_error.301 | cannot create object from initializer list | To create an object from an initializer list, the initializer list must consist only of a list of pairs whose first element is a string. When this constraint is violated, an array is created instead. +json.exception.type_error.302 | type must be object, but is array | During implicit or explicit value conversion, the JSON type must be compatible to the target type. For instance, a JSON string can only be converted into string types, but not into numbers or boolean types. +json.exception.type_error.303 | incompatible ReferenceType for get_ref, actual type is object | To retrieve a reference to a value stored in a @ref basic_json object with @ref get_ref, the type of the reference must match the value type. For instance, for a JSON array, the @a ReferenceType must be @ref array_t &. +json.exception.type_error.304 | cannot use at() with string | The @ref at() member functions can only be executed for certain JSON types. +json.exception.type_error.305 | cannot use operator[] with string | The @ref operator[] member functions can only be executed for certain JSON types. +json.exception.type_error.306 | cannot use value() with string | The @ref value() member functions can only be executed for certain JSON types. +json.exception.type_error.307 | cannot use erase() with string | The @ref erase() member functions can only be executed for certain JSON types. 
+json.exception.type_error.308 | cannot use push_back() with string | The @ref push_back() and @ref operator+= member functions can only be executed for certain JSON types. +json.exception.type_error.309 | cannot use insert() with | The @ref insert() member functions can only be executed for certain JSON types. +json.exception.type_error.310 | cannot use swap() with number | The @ref swap() member functions can only be executed for certain JSON types. +json.exception.type_error.311 | cannot use emplace_back() with string | The @ref emplace_back() member function can only be executed for certain JSON types. +json.exception.type_error.312 | cannot use update() with string | The @ref update() member functions can only be executed for certain JSON types. +json.exception.type_error.313 | invalid value to unflatten | The @ref unflatten function converts an object whose keys are JSON Pointers back into an arbitrary nested JSON value. The JSON Pointers must not overlap, because then the resulting value would not be well defined. +json.exception.type_error.314 | only objects can be unflattened | The @ref unflatten function only works for an object whose keys are JSON Pointers. +json.exception.type_error.315 | values in object must be primitive | The @ref unflatten function only works for an object whose keys are JSON Pointers and whose values are primitive. +json.exception.type_error.316 | invalid UTF-8 byte at index 10: 0x7E | The @ref dump function only works with UTF-8 encoded strings; that is, if you assign a `std::string` to a JSON value, make sure it is UTF-8 encoded. | +json.exception.type_error.317 | JSON value cannot be serialized to requested format | The dynamic type of the object cannot be represented in the requested serialization format (e.g. a raw `true` or `null` JSON object cannot be serialized to BSON) | + +@liveexample{The following code shows how a `type_error` exception can be +caught.,type_error} + +@sa - @ref exception for the base class of the library exceptions +@sa - @ref parse_error for exceptions indicating a parse error +@sa - @ref invalid_iterator for exceptions indicating errors with iterators +@sa - @ref out_of_range for exceptions indicating access out of the defined range +@sa - @ref other_error for exceptions indicating other library errors + +@since version 3.0.0 +*/ +class type_error : public exception +{ + public: + template + static type_error create(int id_, const std::string& what_arg, const BasicJsonType& context) + { + std::string w = exception::name("type_error", id_) + exception::diagnostics(context) + what_arg; + return type_error(id_, w.c_str()); + } + + private: + JSON_HEDLEY_NON_NULL(3) + type_error(int id_, const char* what_arg) : exception(id_, what_arg) {} +}; + +/*! +@brief exception indicating access out of the defined range + +This exception is thrown in case a library function is called on an input +parameter that exceeds the expected range, for instance in case of array +indices or nonexisting object keys. + +Exceptions have ids 4xx. + +name / id | example message | description +------------------------------- | --------------- | ------------------------- +json.exception.out_of_range.401 | array index 3 is out of range | The provided array index @a i is larger than @a size-1. +json.exception.out_of_range.402 | array index '-' (3) is out of range | The special array index `-` in a JSON Pointer never describes a valid element of the array, but the index past the end. 
That is, it can only be used to add elements at this position, but not to read it. +json.exception.out_of_range.403 | key 'foo' not found | The provided key was not found in the JSON object. +json.exception.out_of_range.404 | unresolved reference token 'foo' | A reference token in a JSON Pointer could not be resolved. +json.exception.out_of_range.405 | JSON pointer has no parent | The JSON Patch operations 'remove' and 'add' can not be applied to the root element of the JSON value. +json.exception.out_of_range.406 | number overflow parsing '10E1000' | A parsed number could not be stored as without changing it to NaN or INF. +json.exception.out_of_range.407 | number overflow serializing '9223372036854775808' | UBJSON and BSON only support integer numbers up to 9223372036854775807. (until version 3.8.0) | +json.exception.out_of_range.408 | excessive array size: 8658170730974374167 | The size (following `#`) of an UBJSON array or object exceeds the maximal capacity. | +json.exception.out_of_range.409 | BSON key cannot contain code point U+0000 (at byte 2) | Key identifiers to be serialized to BSON cannot contain code point U+0000, since the key is stored as zero-terminated c-string | + +@liveexample{The following code shows how an `out_of_range` exception can be +caught.,out_of_range} + +@sa - @ref exception for the base class of the library exceptions +@sa - @ref parse_error for exceptions indicating a parse error +@sa - @ref invalid_iterator for exceptions indicating errors with iterators +@sa - @ref type_error for exceptions indicating executing a member function with + a wrong type +@sa - @ref other_error for exceptions indicating other library errors + +@since version 3.0.0 +*/ +class out_of_range : public exception +{ + public: + template + static out_of_range create(int id_, const std::string& what_arg, const BasicJsonType& context) + { + std::string w = exception::name("out_of_range", id_) + exception::diagnostics(context) + what_arg; + return out_of_range(id_, w.c_str()); + } + + private: + JSON_HEDLEY_NON_NULL(3) + out_of_range(int id_, const char* what_arg) : exception(id_, what_arg) {} +}; + +/*! +@brief exception indicating other library errors + +This exception is thrown in case of errors that cannot be classified with the +other exception types. + +Exceptions have ids 5xx. + +name / id | example message | description +------------------------------ | --------------- | ------------------------- +json.exception.other_error.501 | unsuccessful: {"op":"test","path":"/baz", "value":"bar"} | A JSON Patch operation 'test' failed. The unsuccessful operation is also printed. 
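Illustrative sketch (not part of the upstream documentation): every exception
class above derives from the common base @ref exception, so callers can catch
a specific error type or fall back to the base. `parse_error::byte` and
`exception::id` are the members documented above; the malformed input string
is made up for the example.

    try
    {
        auto j = nlohmann::json::parse("{ invalid");       // malformed on purpose
    }
    catch (const nlohmann::json::parse_error& e)
    {
        std::cerr << e.what() << " (at byte " << e.byte << ")\n";
    }
    catch (const nlohmann::json::exception& e)             // any other library error
    {
        std::cerr << e.what() << " (id " << e.id << ")\n";
    }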
+ +@sa - @ref exception for the base class of the library exceptions +@sa - @ref parse_error for exceptions indicating a parse error +@sa - @ref invalid_iterator for exceptions indicating errors with iterators +@sa - @ref type_error for exceptions indicating executing a member function with + a wrong type +@sa - @ref out_of_range for exceptions indicating access out of the defined range + +@liveexample{The following code shows how an `other_error` exception can be +caught.,other_error} + +@since version 3.0.0 +*/ +class other_error : public exception +{ + public: + template + static other_error create(int id_, const std::string& what_arg, const BasicJsonType& context) + { + std::string w = exception::name("other_error", id_) + exception::diagnostics(context) + what_arg; + return other_error(id_, w.c_str()); + } + + private: + JSON_HEDLEY_NON_NULL(3) + other_error(int id_, const char* what_arg) : exception(id_, what_arg) {} +}; +} // namespace detail +} // namespace nlohmann + +// #include + +// #include + + +#include // size_t +#include // conditional, enable_if, false_type, integral_constant, is_constructible, is_integral, is_same, remove_cv, remove_reference, true_type +#include // index_sequence, make_index_sequence, index_sequence_for + +// #include + + +namespace nlohmann +{ +namespace detail +{ + +template +using uncvref_t = typename std::remove_cv::type>::type; + +#ifdef JSON_HAS_CPP_14 + +// the following utilities are natively available in C++14 +using std::enable_if_t; +using std::index_sequence; +using std::make_index_sequence; +using std::index_sequence_for; + +#else + +// alias templates to reduce boilerplate +template +using enable_if_t = typename std::enable_if::type; + +// The following code is taken from https://github.com/abseil/abseil-cpp/blob/10cb35e459f5ecca5b2ff107635da0bfa41011b4/absl/utility/utility.h +// which is part of Google Abseil (https://github.com/abseil/abseil-cpp), licensed under the Apache License 2.0. + +//// START OF CODE FROM GOOGLE ABSEIL + +// integer_sequence +// +// Class template representing a compile-time integer sequence. An instantiation +// of `integer_sequence` has a sequence of integers encoded in its +// type through its template arguments (which is a common need when +// working with C++11 variadic templates). `absl::integer_sequence` is designed +// to be a drop-in replacement for C++14's `std::integer_sequence`. +// +// Example: +// +// template< class T, T... Ints > +// void user_function(integer_sequence); +// +// int main() +// { +// // user_function's `T` will be deduced to `int` and `Ints...` +// // will be deduced to `0, 1, 2, 3, 4`. +// user_function(make_integer_sequence()); +// } +template +struct integer_sequence +{ + using value_type = T; + static constexpr std::size_t size() noexcept + { + return sizeof...(Ints); + } +}; + +// index_sequence +// +// A helper template for an `integer_sequence` of `size_t`, +// `absl::index_sequence` is designed to be a drop-in replacement for C++14's +// `std::index_sequence`. +template +using index_sequence = integer_sequence; + +namespace utility_internal +{ + +template +struct Extend; + +// Note that SeqSize == sizeof...(Ints). It's passed explicitly for efficiency. +template +struct Extend, SeqSize, 0> +{ + using type = integer_sequence < T, Ints..., (Ints + SeqSize)... >; +}; + +template +struct Extend, SeqSize, 1> +{ + using type = integer_sequence < T, Ints..., (Ints + SeqSize)..., 2 * SeqSize >; +}; + +// Recursion helper for 'make_integer_sequence'. 
+// 'Gen::type' is an alias for 'integer_sequence'. +template +struct Gen +{ + using type = + typename Extend < typename Gen < T, N / 2 >::type, N / 2, N % 2 >::type; +}; + +template +struct Gen +{ + using type = integer_sequence; +}; + +} // namespace utility_internal + +// Compile-time sequences of integers + +// make_integer_sequence +// +// This template alias is equivalent to +// `integer_sequence`, and is designed to be a drop-in +// replacement for C++14's `std::make_integer_sequence`. +template +using make_integer_sequence = typename utility_internal::Gen::type; + +// make_index_sequence +// +// This template alias is equivalent to `index_sequence<0, 1, ..., N-1>`, +// and is designed to be a drop-in replacement for C++14's +// `std::make_index_sequence`. +template +using make_index_sequence = make_integer_sequence; + +// index_sequence_for +// +// Converts a typename pack into an index sequence of the same length, and +// is designed to be a drop-in replacement for C++14's +// `std::index_sequence_for()` +template +using index_sequence_for = make_index_sequence; + +//// END OF CODE FROM GOOGLE ABSEIL + +#endif + +// dispatch utility (taken from ranges-v3) +template struct priority_tag : priority_tag < N - 1 > {}; +template<> struct priority_tag<0> {}; + +// taken from ranges-v3 +template +struct static_const +{ + static constexpr T value{}; +}; + +template +constexpr T static_const::value; + +} // namespace detail +} // namespace nlohmann + +// #include + + +namespace nlohmann +{ +namespace detail +{ +// dispatching helper struct +template struct identity_tag {}; +} // namespace detail +} // namespace nlohmann + +// #include + + +#include // numeric_limits +#include // false_type, is_constructible, is_integral, is_same, true_type +#include // declval +#include // tuple + +// #include + + +#include // random_access_iterator_tag + +// #include + + +namespace nlohmann +{ +namespace detail +{ +template struct make_void +{ + using type = void; +}; +template using void_t = typename make_void::type; +} // namespace detail +} // namespace nlohmann + +// #include + + +namespace nlohmann +{ +namespace detail +{ +template +struct iterator_types {}; + +template +struct iterator_types < + It, + void_t> +{ + using difference_type = typename It::difference_type; + using value_type = typename It::value_type; + using pointer = typename It::pointer; + using reference = typename It::reference; + using iterator_category = typename It::iterator_category; +}; + +// This is required as some compilers implement std::iterator_traits in a way that +// doesn't work with SFINAE. See https://github.com/nlohmann/json/issues/1341. +template +struct iterator_traits +{ +}; + +template +struct iterator_traits < T, enable_if_t < !std::is_pointer::value >> + : iterator_types +{ +}; + +template +struct iterator_traits::value>> +{ + using iterator_category = std::random_access_iterator_tag; + using value_type = T; + using difference_type = ptrdiff_t; + using pointer = T*; + using reference = T&; +}; +} // namespace detail +} // namespace nlohmann + +// #include + +// #include + +// #include + + +#include + +// #include + + +// https://en.cppreference.com/w/cpp/experimental/is_detected +namespace nlohmann +{ +namespace detail +{ +struct nonesuch +{ + nonesuch() = delete; + ~nonesuch() = delete; + nonesuch(nonesuch const&) = delete; + nonesuch(nonesuch const&&) = delete; + void operator=(nonesuch const&) = delete; + void operator=(nonesuch&&) = delete; +}; + +template class Op, + class... 
Args> +struct detector +{ + using value_t = std::false_type; + using type = Default; +}; + +template class Op, class... Args> +struct detector>, Op, Args...> +{ + using value_t = std::true_type; + using type = Op; +}; + +template class Op, class... Args> +using is_detected = typename detector::value_t; + +template class Op, class... Args> +using detected_t = typename detector::type; + +template class Op, class... Args> +using detected_or = detector; + +template class Op, class... Args> +using detected_or_t = typename detected_or::type; + +template class Op, class... Args> +using is_detected_exact = std::is_same>; + +template class Op, class... Args> +using is_detected_convertible = + std::is_convertible, To>; +} // namespace detail +} // namespace nlohmann + +// #include +#ifndef INCLUDE_NLOHMANN_JSON_FWD_HPP_ +#define INCLUDE_NLOHMANN_JSON_FWD_HPP_ + +#include // int64_t, uint64_t +#include // map +#include // allocator +#include // string +#include // vector + +/*! +@brief namespace for Niels Lohmann +@see https://github.com/nlohmann +@since version 1.0.0 +*/ +namespace nlohmann +{ +/*! +@brief default JSONSerializer template argument + +This serializer ignores the template arguments and uses ADL +([argument-dependent lookup](https://en.cppreference.com/w/cpp/language/adl)) +for serialization. +*/ +template +struct adl_serializer; + +template class ObjectType = + std::map, + template class ArrayType = std::vector, + class StringType = std::string, class BooleanType = bool, + class NumberIntegerType = std::int64_t, + class NumberUnsignedType = std::uint64_t, + class NumberFloatType = double, + template class AllocatorType = std::allocator, + template class JSONSerializer = + adl_serializer, + class BinaryType = std::vector> +class basic_json; + +/*! +@brief JSON Pointer + +A JSON pointer defines a string syntax for identifying a specific value +within a JSON document. It can be used with functions `at` and +`operator[]`. Furthermore, JSON pointers are the base for JSON patches. + +@sa [RFC 6901](https://tools.ietf.org/html/rfc6901) + +@since version 2.0.0 +*/ +template +class json_pointer; + +/*! +@brief default JSON class + +This type is the default specialization of the @ref basic_json class which +uses the standard template types. + +@since version 1.0.0 +*/ +using json = basic_json<>; + +template +struct ordered_map; + +/*! +@brief ordered JSON class + +This type preserves the insertion order of object keys. + +@since version 3.9.0 +*/ +using ordered_json = basic_json; + +} // namespace nlohmann + +#endif // INCLUDE_NLOHMANN_JSON_FWD_HPP_ + + +namespace nlohmann +{ +/*! +@brief detail namespace with internal helper functions + +This namespace collects functions that should not be exposed, +implementations of some @ref basic_json methods, and meta-programming helpers. + +@since version 2.1.0 +*/ +namespace detail +{ +///////////// +// helpers // +///////////// + +// Note to maintainers: +// +// Every trait in this file expects a non CV-qualified type. +// The only exceptions are in the 'aliases for detected' section +// (i.e. those of the form: decltype(T::member_function(std::declval()))) +// +// In this case, T has to be properly CV-qualified to constraint the function arguments +// (e.g. 
to_json(BasicJsonType&, const T&)) + +template struct is_basic_json : std::false_type {}; + +NLOHMANN_BASIC_JSON_TPL_DECLARATION +struct is_basic_json : std::true_type {}; + +////////////////////// +// json_ref helpers // +////////////////////// + +template +class json_ref; + +template +struct is_json_ref : std::false_type {}; + +template +struct is_json_ref> : std::true_type {}; + +////////////////////////// +// aliases for detected // +////////////////////////// + +template +using mapped_type_t = typename T::mapped_type; + +template +using key_type_t = typename T::key_type; + +template +using value_type_t = typename T::value_type; + +template +using difference_type_t = typename T::difference_type; + +template +using pointer_t = typename T::pointer; + +template +using reference_t = typename T::reference; + +template +using iterator_category_t = typename T::iterator_category; + +template +using iterator_t = typename T::iterator; + +template +using to_json_function = decltype(T::to_json(std::declval()...)); + +template +using from_json_function = decltype(T::from_json(std::declval()...)); + +template +using get_template_function = decltype(std::declval().template get()); + +// trait checking if JSONSerializer::from_json(json const&, udt&) exists +template +struct has_from_json : std::false_type {}; + +// trait checking if j.get is valid +// use this trait instead of std::is_constructible or std::is_convertible, +// both rely on, or make use of implicit conversions, and thus fail when T +// has several constructors/operator= (see https://github.com/nlohmann/json/issues/958) +template +struct is_getable +{ + static constexpr bool value = is_detected::value; +}; + +template +struct has_from_json < BasicJsonType, T, enable_if_t < !is_basic_json::value >> +{ + using serializer = typename BasicJsonType::template json_serializer; + + static constexpr bool value = + is_detected_exact::value; +}; + +// This trait checks if JSONSerializer::from_json(json const&) exists +// this overload is used for non-default-constructible user-defined-types +template +struct has_non_default_from_json : std::false_type {}; + +template +struct has_non_default_from_json < BasicJsonType, T, enable_if_t < !is_basic_json::value >> +{ + using serializer = typename BasicJsonType::template json_serializer; + + static constexpr bool value = + is_detected_exact::value; +}; + +// This trait checks if BasicJsonType::json_serializer::to_json exists +// Do not evaluate the trait when T is a basic_json type, to avoid template instantiation infinite recursion. +template +struct has_to_json : std::false_type {}; + +template +struct has_to_json < BasicJsonType, T, enable_if_t < !is_basic_json::value >> +{ + using serializer = typename BasicJsonType::template json_serializer; + + static constexpr bool value = + is_detected_exact::value; +}; + + +/////////////////// +// is_ functions // +/////////////////// + +// https://en.cppreference.com/w/cpp/types/conjunction +template struct conjunction : std::true_type { }; +template struct conjunction : B1 { }; +template +struct conjunction +: std::conditional, B1>::type {}; + +// Reimplementation of is_constructible and is_default_constructible, due to them being broken for +// std::pair and std::tuple until LWG 2367 fix (see https://cplusplus.github.io/LWG/lwg-defects.html#2367). +// This causes compile errors in e.g. clang 3.5 or gcc 4.9. 
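// Illustrative sketch, not upstream code: the aliases above follow the
// detection idiom. An alias template names the expression to probe (for
// instance `mapped_type_t<T>` is `T::mapped_type`), and `is_detected`
// reports whether that alias is well-formed for a given type. The two
// compile-time checks below only demonstrate the mechanism using standard
// containers; they add no functionality to the library.
static_assert(is_detected<mapped_type_t, std::map<std::string, int>>::value,
              "std::map has a mapped_type, so the trait is detected");
static_assert(!is_detected<mapped_type_t, std::vector<int>>::value,
              "std::vector has no mapped_type, so detection yields false");
// (The is_default_constructible / is_constructible re-implementations
//  motivated by the LWG 2367 note above follow next.)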
+template +struct is_default_constructible : std::is_default_constructible {}; + +template +struct is_default_constructible> + : conjunction, is_default_constructible> {}; + +template +struct is_default_constructible> + : conjunction, is_default_constructible> {}; + +template +struct is_default_constructible> + : conjunction...> {}; + +template +struct is_default_constructible> + : conjunction...> {}; + + +template +struct is_constructible : std::is_constructible {}; + +template +struct is_constructible> : is_default_constructible> {}; + +template +struct is_constructible> : is_default_constructible> {}; + +template +struct is_constructible> : is_default_constructible> {}; + +template +struct is_constructible> : is_default_constructible> {}; + + +template +struct is_iterator_traits : std::false_type {}; + +template +struct is_iterator_traits> +{ + private: + using traits = iterator_traits; + + public: + static constexpr auto value = + is_detected::value && + is_detected::value && + is_detected::value && + is_detected::value && + is_detected::value; +}; + +// The following implementation of is_complete_type is taken from +// https://blogs.msdn.microsoft.com/vcblog/2015/12/02/partial-support-for-expression-sfinae-in-vs-2015-update-1/ +// and is written by Xiang Fan who agreed to using it in this library. + +template +struct is_complete_type : std::false_type {}; + +template +struct is_complete_type : std::true_type {}; + +template +struct is_compatible_object_type_impl : std::false_type {}; + +template +struct is_compatible_object_type_impl < + BasicJsonType, CompatibleObjectType, + enable_if_t < is_detected::value&& + is_detected::value >> +{ + using object_t = typename BasicJsonType::object_t; + + // macOS's is_constructible does not play well with nonesuch... 
+ static constexpr bool value = + is_constructible::value && + is_constructible::value; +}; + +template +struct is_compatible_object_type + : is_compatible_object_type_impl {}; + +template +struct is_constructible_object_type_impl : std::false_type {}; + +template +struct is_constructible_object_type_impl < + BasicJsonType, ConstructibleObjectType, + enable_if_t < is_detected::value&& + is_detected::value >> +{ + using object_t = typename BasicJsonType::object_t; + + static constexpr bool value = + (is_default_constructible::value && + (std::is_move_assignable::value || + std::is_copy_assignable::value) && + (is_constructible::value && + std::is_same < + typename object_t::mapped_type, + typename ConstructibleObjectType::mapped_type >::value)) || + (has_from_json::value || + has_non_default_from_json < + BasicJsonType, + typename ConstructibleObjectType::mapped_type >::value); +}; + +template +struct is_constructible_object_type + : is_constructible_object_type_impl {}; + +template +struct is_compatible_string_type_impl : std::false_type {}; + +template +struct is_compatible_string_type_impl < + BasicJsonType, CompatibleStringType, + enable_if_t::value >> +{ + static constexpr auto value = + is_constructible::value; +}; + +template +struct is_compatible_string_type + : is_compatible_string_type_impl {}; + +template +struct is_constructible_string_type_impl : std::false_type {}; + +template +struct is_constructible_string_type_impl < + BasicJsonType, ConstructibleStringType, + enable_if_t::value >> +{ + static constexpr auto value = + is_constructible::value; +}; + +template +struct is_constructible_string_type + : is_constructible_string_type_impl {}; + +template +struct is_compatible_array_type_impl : std::false_type {}; + +template +struct is_compatible_array_type_impl < + BasicJsonType, CompatibleArrayType, + enable_if_t < is_detected::value&& + is_detected::value&& +// This is needed because json_reverse_iterator has a ::iterator type... +// Therefore it is detected as a CompatibleArrayType. +// The real fix would be to have an Iterable concept. + !is_iterator_traits < + iterator_traits>::value >> +{ + static constexpr bool value = + is_constructible::value; +}; + +template +struct is_compatible_array_type + : is_compatible_array_type_impl {}; + +template +struct is_constructible_array_type_impl : std::false_type {}; + +template +struct is_constructible_array_type_impl < + BasicJsonType, ConstructibleArrayType, + enable_if_t::value >> + : std::true_type {}; + +template +struct is_constructible_array_type_impl < + BasicJsonType, ConstructibleArrayType, + enable_if_t < !std::is_same::value&& + is_default_constructible::value&& +(std::is_move_assignable::value || + std::is_copy_assignable::value)&& +is_detected::value&& +is_detected::value&& +is_complete_type < +detected_t>::value >> +{ + static constexpr bool value = + // This is needed because json_reverse_iterator has a ::iterator type, + // furthermore, std::back_insert_iterator (and other iterators) have a + // base class `iterator`... Therefore it is detected as a + // ConstructibleArrayType. The real fix would be to have an Iterable + // concept. 
+ !is_iterator_traits>::value && + + (std::is_same::value || + has_from_json::value || + has_non_default_from_json < + BasicJsonType, typename ConstructibleArrayType::value_type >::value); +}; + +template +struct is_constructible_array_type + : is_constructible_array_type_impl {}; + +template +struct is_compatible_integer_type_impl : std::false_type {}; + +template +struct is_compatible_integer_type_impl < + RealIntegerType, CompatibleNumberIntegerType, + enable_if_t < std::is_integral::value&& + std::is_integral::value&& + !std::is_same::value >> +{ + // is there an assert somewhere on overflows? + using RealLimits = std::numeric_limits; + using CompatibleLimits = std::numeric_limits; + + static constexpr auto value = + is_constructible::value && + CompatibleLimits::is_integer && + RealLimits::is_signed == CompatibleLimits::is_signed; +}; + +template +struct is_compatible_integer_type + : is_compatible_integer_type_impl {}; + +template +struct is_compatible_type_impl: std::false_type {}; + +template +struct is_compatible_type_impl < + BasicJsonType, CompatibleType, + enable_if_t::value >> +{ + static constexpr bool value = + has_to_json::value; +}; + +template +struct is_compatible_type + : is_compatible_type_impl {}; + +template +struct is_constructible_tuple : std::false_type {}; + +template +struct is_constructible_tuple> : conjunction...> {}; +} // namespace detail +} // namespace nlohmann + +// #include + + +namespace nlohmann +{ +namespace detail +{ +template +void from_json(const BasicJsonType& j, typename std::nullptr_t& n) +{ + if (JSON_HEDLEY_UNLIKELY(!j.is_null())) + { + JSON_THROW(type_error::create(302, "type must be null, but is " + std::string(j.type_name()), j)); + } + n = nullptr; +} + +// overloads for basic_json template parameters +template < typename BasicJsonType, typename ArithmeticType, + enable_if_t < std::is_arithmetic::value&& + !std::is_same::value, + int > = 0 > +void get_arithmetic_value(const BasicJsonType& j, ArithmeticType& val) +{ + switch (static_cast(j)) + { + case value_t::number_unsigned: + { + val = static_cast(*j.template get_ptr()); + break; + } + case value_t::number_integer: + { + val = static_cast(*j.template get_ptr()); + break; + } + case value_t::number_float: + { + val = static_cast(*j.template get_ptr()); + break; + } + + default: + JSON_THROW(type_error::create(302, "type must be number, but is " + std::string(j.type_name()), j)); + } +} + +template +void from_json(const BasicJsonType& j, typename BasicJsonType::boolean_t& b) +{ + if (JSON_HEDLEY_UNLIKELY(!j.is_boolean())) + { + JSON_THROW(type_error::create(302, "type must be boolean, but is " + std::string(j.type_name()), j)); + } + b = *j.template get_ptr(); +} + +template +void from_json(const BasicJsonType& j, typename BasicJsonType::string_t& s) +{ + if (JSON_HEDLEY_UNLIKELY(!j.is_string())) + { + JSON_THROW(type_error::create(302, "type must be string, but is " + std::string(j.type_name()), j)); + } + s = *j.template get_ptr(); +} + +template < + typename BasicJsonType, typename ConstructibleStringType, + enable_if_t < + is_constructible_string_type::value&& + !std::is_same::value, + int > = 0 > +void from_json(const BasicJsonType& j, ConstructibleStringType& s) +{ + if (JSON_HEDLEY_UNLIKELY(!j.is_string())) + { + JSON_THROW(type_error::create(302, "type must be string, but is " + std::string(j.type_name()), j)); + } + + s = *j.template get_ptr(); +} + +template +void from_json(const BasicJsonType& j, typename BasicJsonType::number_float_t& val) +{ + get_arithmetic_value(j, val); +} 
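// Illustrative sketch (kept as a comment, not upstream code): the overloads
// in this block are what a call such as `j.get<double>()` eventually reaches.
// Roughly, the default adl_serializer forwards to these from_json overloads
// via ADL, and get_arithmetic_value() above switches on the stored value_t to
// perform the numeric conversion. In a translation unit that includes the
// complete header, usage looks like:
//
//     nlohmann::json j = nlohmann::json::parse(R"({"pi": 3.14, "n": 42})");
//     double pi       = j.at("pi").get<double>();         // number_float_t    -> double
//     std::uint64_t n = j.at("n").get<std::uint64_t>();   // number_unsigned_t -> uint64_t
//
// parse(), at(), and get<T>() are the library's public API; the key names
// above are made up for the example.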
+ +template +void from_json(const BasicJsonType& j, typename BasicJsonType::number_unsigned_t& val) +{ + get_arithmetic_value(j, val); +} + +template +void from_json(const BasicJsonType& j, typename BasicJsonType::number_integer_t& val) +{ + get_arithmetic_value(j, val); +} + +template::value, int> = 0> +void from_json(const BasicJsonType& j, EnumType& e) +{ + typename std::underlying_type::type val; + get_arithmetic_value(j, val); + e = static_cast(val); +} + +// forward_list doesn't have an insert method +template::value, int> = 0> +void from_json(const BasicJsonType& j, std::forward_list& l) +{ + if (JSON_HEDLEY_UNLIKELY(!j.is_array())) + { + JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name()), j)); + } + l.clear(); + std::transform(j.rbegin(), j.rend(), + std::front_inserter(l), [](const BasicJsonType & i) + { + return i.template get(); + }); +} + +// valarray doesn't have an insert method +template::value, int> = 0> +void from_json(const BasicJsonType& j, std::valarray& l) +{ + if (JSON_HEDLEY_UNLIKELY(!j.is_array())) + { + JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name()), j)); + } + l.resize(j.size()); + std::transform(j.begin(), j.end(), std::begin(l), + [](const BasicJsonType & elem) + { + return elem.template get(); + }); +} + +template +auto from_json(const BasicJsonType& j, T (&arr)[N]) // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays) +-> decltype(j.template get(), void()) +{ + for (std::size_t i = 0; i < N; ++i) + { + arr[i] = j.at(i).template get(); + } +} + +template +void from_json_array_impl(const BasicJsonType& j, typename BasicJsonType::array_t& arr, priority_tag<3> /*unused*/) +{ + arr = *j.template get_ptr(); +} + +template +auto from_json_array_impl(const BasicJsonType& j, std::array& arr, + priority_tag<2> /*unused*/) +-> decltype(j.template get(), void()) +{ + for (std::size_t i = 0; i < N; ++i) + { + arr[i] = j.at(i).template get(); + } +} + +template::value, + int> = 0> +auto from_json_array_impl(const BasicJsonType& j, ConstructibleArrayType& arr, priority_tag<1> /*unused*/) +-> decltype( + arr.reserve(std::declval()), + j.template get(), + void()) +{ + using std::end; + + ConstructibleArrayType ret; + ret.reserve(j.size()); + std::transform(j.begin(), j.end(), + std::inserter(ret, end(ret)), [](const BasicJsonType & i) + { + // get() returns *this, this won't call a from_json + // method when value_type is BasicJsonType + return i.template get(); + }); + arr = std::move(ret); +} + +template::value, + int> = 0> +void from_json_array_impl(const BasicJsonType& j, ConstructibleArrayType& arr, + priority_tag<0> /*unused*/) +{ + using std::end; + + ConstructibleArrayType ret; + std::transform( + j.begin(), j.end(), std::inserter(ret, end(ret)), + [](const BasicJsonType & i) + { + // get() returns *this, this won't call a from_json + // method when value_type is BasicJsonType + return i.template get(); + }); + arr = std::move(ret); +} + +template < typename BasicJsonType, typename ConstructibleArrayType, + enable_if_t < + is_constructible_array_type::value&& + !is_constructible_object_type::value&& + !is_constructible_string_type::value&& + !std::is_same::value&& + !is_basic_json::value, + int > = 0 > +auto from_json(const BasicJsonType& j, ConstructibleArrayType& arr) +-> decltype(from_json_array_impl(j, arr, priority_tag<3> {}), +j.template get(), +void()) +{ + if (JSON_HEDLEY_UNLIKELY(!j.is_array())) + { + JSON_THROW(type_error::create(302, 
"type must be array, but is " + std::string(j.type_name()), j)); + } + + from_json_array_impl(j, arr, priority_tag<3> {}); +} + +template < typename BasicJsonType, typename T, std::size_t... Idx > +std::array from_json_inplace_array_impl(BasicJsonType&& j, + identity_tag> /*unused*/, index_sequence /*unused*/) +{ + return { { std::forward(j).at(Idx).template get()... } }; +} + +template < typename BasicJsonType, typename T, std::size_t N > +auto from_json(BasicJsonType&& j, identity_tag> tag) +-> decltype(from_json_inplace_array_impl(std::forward(j), tag, make_index_sequence {})) +{ + if (JSON_HEDLEY_UNLIKELY(!j.is_array())) + { + JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name()), j)); + } + + return from_json_inplace_array_impl(std::forward(j), tag, make_index_sequence {}); +} + +template +void from_json(const BasicJsonType& j, typename BasicJsonType::binary_t& bin) +{ + if (JSON_HEDLEY_UNLIKELY(!j.is_binary())) + { + JSON_THROW(type_error::create(302, "type must be binary, but is " + std::string(j.type_name()), j)); + } + + bin = *j.template get_ptr(); +} + +template::value, int> = 0> +void from_json(const BasicJsonType& j, ConstructibleObjectType& obj) +{ + if (JSON_HEDLEY_UNLIKELY(!j.is_object())) + { + JSON_THROW(type_error::create(302, "type must be object, but is " + std::string(j.type_name()), j)); + } + + ConstructibleObjectType ret; + const auto* inner_object = j.template get_ptr(); + using value_type = typename ConstructibleObjectType::value_type; + std::transform( + inner_object->begin(), inner_object->end(), + std::inserter(ret, ret.begin()), + [](typename BasicJsonType::object_t::value_type const & p) + { + return value_type(p.first, p.second.template get()); + }); + obj = std::move(ret); +} + +// overload for arithmetic types, not chosen for basic_json template arguments +// (BooleanType, etc..); note: Is it really necessary to provide explicit +// overloads for boolean_t etc. in case of a custom BooleanType which is not +// an arithmetic type? 
+template < typename BasicJsonType, typename ArithmeticType, + enable_if_t < + std::is_arithmetic::value&& + !std::is_same::value&& + !std::is_same::value&& + !std::is_same::value&& + !std::is_same::value, + int > = 0 > +void from_json(const BasicJsonType& j, ArithmeticType& val) +{ + switch (static_cast(j)) + { + case value_t::number_unsigned: + { + val = static_cast(*j.template get_ptr()); + break; + } + case value_t::number_integer: + { + val = static_cast(*j.template get_ptr()); + break; + } + case value_t::number_float: + { + val = static_cast(*j.template get_ptr()); + break; + } + case value_t::boolean: + { + val = static_cast(*j.template get_ptr()); + break; + } + + default: + JSON_THROW(type_error::create(302, "type must be number, but is " + std::string(j.type_name()), j)); + } +} + +template +std::tuple from_json_tuple_impl_base(BasicJsonType&& j, index_sequence /*unused*/) +{ + return std::make_tuple(std::forward(j).at(Idx).template get()...); +} + +template < typename BasicJsonType, class A1, class A2 > +std::pair from_json_tuple_impl(BasicJsonType&& j, identity_tag> /*unused*/, priority_tag<0> /*unused*/) +{ + return {std::forward(j).at(0).template get(), + std::forward(j).at(1).template get()}; +} + +template +void from_json_tuple_impl(BasicJsonType&& j, std::pair& p, priority_tag<1> /*unused*/) +{ + p = from_json_tuple_impl(std::forward(j), identity_tag> {}, priority_tag<0> {}); +} + +template +std::tuple from_json_tuple_impl(BasicJsonType&& j, identity_tag> /*unused*/, priority_tag<2> /*unused*/) +{ + return from_json_tuple_impl_base(std::forward(j), index_sequence_for {}); +} + +template +void from_json_tuple_impl(BasicJsonType&& j, std::tuple& t, priority_tag<3> /*unused*/) +{ + t = from_json_tuple_impl_base(std::forward(j), index_sequence_for {}); +} + +template +auto from_json(BasicJsonType&& j, TupleRelated&& t) +-> decltype(from_json_tuple_impl(std::forward(j), std::forward(t), priority_tag<3> {})) +{ + if (JSON_HEDLEY_UNLIKELY(!j.is_array())) + { + JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name()), j)); + } + + return from_json_tuple_impl(std::forward(j), std::forward(t), priority_tag<3> {}); +} + +template < typename BasicJsonType, typename Key, typename Value, typename Compare, typename Allocator, + typename = enable_if_t < !std::is_constructible < + typename BasicJsonType::string_t, Key >::value >> +void from_json(const BasicJsonType& j, std::map& m) +{ + if (JSON_HEDLEY_UNLIKELY(!j.is_array())) + { + JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name()), j)); + } + m.clear(); + for (const auto& p : j) + { + if (JSON_HEDLEY_UNLIKELY(!p.is_array())) + { + JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(p.type_name()), j)); + } + m.emplace(p.at(0).template get(), p.at(1).template get()); + } +} + +template < typename BasicJsonType, typename Key, typename Value, typename Hash, typename KeyEqual, typename Allocator, + typename = enable_if_t < !std::is_constructible < + typename BasicJsonType::string_t, Key >::value >> +void from_json(const BasicJsonType& j, std::unordered_map& m) +{ + if (JSON_HEDLEY_UNLIKELY(!j.is_array())) + { + JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name()), j)); + } + m.clear(); + for (const auto& p : j) + { + if (JSON_HEDLEY_UNLIKELY(!p.is_array())) + { + JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(p.type_name()), j)); + } + m.emplace(p.at(0).template 
get(), p.at(1).template get()); + } +} + +struct from_json_fn +{ + template + auto operator()(const BasicJsonType& j, T&& val) const + noexcept(noexcept(from_json(j, std::forward(val)))) + -> decltype(from_json(j, std::forward(val))) + { + return from_json(j, std::forward(val)); + } +}; +} // namespace detail + +/// namespace to hold default `from_json` function +/// to see why this is required: +/// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4381.html +namespace // NOLINT(cert-dcl59-cpp,fuchsia-header-anon-namespaces,google-build-namespaces) +{ +constexpr const auto& from_json = detail::static_const::value; // NOLINT(misc-definitions-in-headers) +} // namespace +} // namespace nlohmann + +// #include + + +#include // copy +#include // begin, end +#include // string +#include // tuple, get +#include // is_same, is_constructible, is_floating_point, is_enum, underlying_type +#include // move, forward, declval, pair +#include // valarray +#include // vector + +// #include + + +#include // size_t +#include // input_iterator_tag +#include // string, to_string +#include // tuple_size, get, tuple_element +#include // move + +// #include + +// #include + + +namespace nlohmann +{ +namespace detail +{ +template +void int_to_string( string_type& target, std::size_t value ) +{ + // For ADL + using std::to_string; + target = to_string(value); +} +template class iteration_proxy_value +{ + public: + using difference_type = std::ptrdiff_t; + using value_type = iteration_proxy_value; + using pointer = value_type * ; + using reference = value_type & ; + using iterator_category = std::input_iterator_tag; + using string_type = typename std::remove_cv< typename std::remove_reference().key() ) >::type >::type; + + private: + /// the iterator + IteratorType anchor; + /// an index for arrays (used to create key names) + std::size_t array_index = 0; + /// last stringified array index + mutable std::size_t array_index_last = 0; + /// a string representation of the array index + mutable string_type array_index_str = "0"; + /// an empty string (to return a reference for primitive values) + const string_type empty_str{}; + + public: + explicit iteration_proxy_value(IteratorType it) noexcept + : anchor(std::move(it)) + {} + + /// dereference operator (needed for range-based for) + iteration_proxy_value& operator*() + { + return *this; + } + + /// increment operator (needed for range-based for) + iteration_proxy_value& operator++() + { + ++anchor; + ++array_index; + + return *this; + } + + /// equality operator (needed for InputIterator) + bool operator==(const iteration_proxy_value& o) const + { + return anchor == o.anchor; + } + + /// inequality operator (needed for range-based for) + bool operator!=(const iteration_proxy_value& o) const + { + return anchor != o.anchor; + } + + /// return key of the iterator + const string_type& key() const + { + JSON_ASSERT(anchor.m_object != nullptr); + + switch (anchor.m_object->type()) + { + // use integer array index as key + case value_t::array: + { + if (array_index != array_index_last) + { + int_to_string( array_index_str, array_index ); + array_index_last = array_index; + } + return array_index_str; + } + + // use key from the object + case value_t::object: + return anchor.key(); + + // use an empty key for all primitive types + default: + return empty_str; + } + } + + /// return value of the iterator + typename IteratorType::reference value() const + { + return anchor.value(); + } +}; + +/// proxy class for the items() function +template class iteration_proxy 
+{ + private: + /// the container to iterate + typename IteratorType::reference container; + + public: + /// construct iteration proxy from a container + explicit iteration_proxy(typename IteratorType::reference cont) noexcept + : container(cont) {} + + /// return iterator begin (needed for range-based for) + iteration_proxy_value begin() noexcept + { + return iteration_proxy_value(container.begin()); + } + + /// return iterator end (needed for range-based for) + iteration_proxy_value end() noexcept + { + return iteration_proxy_value(container.end()); + } +}; +// Structured Bindings Support +// For further reference see https://blog.tartanllama.xyz/structured-bindings/ +// And see https://github.com/nlohmann/json/pull/1391 +template = 0> +auto get(const nlohmann::detail::iteration_proxy_value& i) -> decltype(i.key()) +{ + return i.key(); +} +// Structured Bindings Support +// For further reference see https://blog.tartanllama.xyz/structured-bindings/ +// And see https://github.com/nlohmann/json/pull/1391 +template = 0> +auto get(const nlohmann::detail::iteration_proxy_value& i) -> decltype(i.value()) +{ + return i.value(); +} +} // namespace detail +} // namespace nlohmann + +// The Addition to the STD Namespace is required to add +// Structured Bindings Support to the iteration_proxy_value class +// For further reference see https://blog.tartanllama.xyz/structured-bindings/ +// And see https://github.com/nlohmann/json/pull/1391 +namespace std +{ +#if defined(__clang__) + // Fix: https://github.com/nlohmann/json/issues/1401 + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wmismatched-tags" +#endif +template +class tuple_size<::nlohmann::detail::iteration_proxy_value> + : public std::integral_constant {}; + +template +class tuple_element> +{ + public: + using type = decltype( + get(std::declval < + ::nlohmann::detail::iteration_proxy_value> ())); +}; +#if defined(__clang__) + #pragma clang diagnostic pop +#endif +} // namespace std + +// #include + +// #include + +// #include + + +namespace nlohmann +{ +namespace detail +{ +////////////////// +// constructors // +////////////////// + +template struct external_constructor; + +template<> +struct external_constructor +{ + template + static void construct(BasicJsonType& j, typename BasicJsonType::boolean_t b) noexcept + { + j.m_type = value_t::boolean; + j.m_value = b; + j.assert_invariant(); + } +}; + +template<> +struct external_constructor +{ + template + static void construct(BasicJsonType& j, const typename BasicJsonType::string_t& s) + { + j.m_type = value_t::string; + j.m_value = s; + j.assert_invariant(); + } + + template + static void construct(BasicJsonType& j, typename BasicJsonType::string_t&& s) + { + j.m_type = value_t::string; + j.m_value = std::move(s); + j.assert_invariant(); + } + + template < typename BasicJsonType, typename CompatibleStringType, + enable_if_t < !std::is_same::value, + int > = 0 > + static void construct(BasicJsonType& j, const CompatibleStringType& str) + { + j.m_type = value_t::string; + j.m_value.string = j.template create(str); + j.assert_invariant(); + } +}; + +template<> +struct external_constructor +{ + template + static void construct(BasicJsonType& j, const typename BasicJsonType::binary_t& b) + { + j.m_type = value_t::binary; + j.m_value = typename BasicJsonType::binary_t(b); + j.assert_invariant(); + } + + template + static void construct(BasicJsonType& j, typename BasicJsonType::binary_t&& b) + { + j.m_type = value_t::binary; + j.m_value = typename 
BasicJsonType::binary_t(std::move(b));; + j.assert_invariant(); + } +}; + +template<> +struct external_constructor +{ + template + static void construct(BasicJsonType& j, typename BasicJsonType::number_float_t val) noexcept + { + j.m_type = value_t::number_float; + j.m_value = val; + j.assert_invariant(); + } +}; + +template<> +struct external_constructor +{ + template + static void construct(BasicJsonType& j, typename BasicJsonType::number_unsigned_t val) noexcept + { + j.m_type = value_t::number_unsigned; + j.m_value = val; + j.assert_invariant(); + } +}; + +template<> +struct external_constructor +{ + template + static void construct(BasicJsonType& j, typename BasicJsonType::number_integer_t val) noexcept + { + j.m_type = value_t::number_integer; + j.m_value = val; + j.assert_invariant(); + } +}; + +template<> +struct external_constructor +{ + template + static void construct(BasicJsonType& j, const typename BasicJsonType::array_t& arr) + { + j.m_type = value_t::array; + j.m_value = arr; + j.set_parents(); + j.assert_invariant(); + } + + template + static void construct(BasicJsonType& j, typename BasicJsonType::array_t&& arr) + { + j.m_type = value_t::array; + j.m_value = std::move(arr); + j.set_parents(); + j.assert_invariant(); + } + + template < typename BasicJsonType, typename CompatibleArrayType, + enable_if_t < !std::is_same::value, + int > = 0 > + static void construct(BasicJsonType& j, const CompatibleArrayType& arr) + { + using std::begin; + using std::end; + j.m_type = value_t::array; + j.m_value.array = j.template create(begin(arr), end(arr)); + j.set_parents(); + j.assert_invariant(); + } + + template + static void construct(BasicJsonType& j, const std::vector& arr) + { + j.m_type = value_t::array; + j.m_value = value_t::array; + j.m_value.array->reserve(arr.size()); + for (const bool x : arr) + { + j.m_value.array->push_back(x); + j.set_parent(j.m_value.array->back()); + } + j.assert_invariant(); + } + + template::value, int> = 0> + static void construct(BasicJsonType& j, const std::valarray& arr) + { + j.m_type = value_t::array; + j.m_value = value_t::array; + j.m_value.array->resize(arr.size()); + if (arr.size() > 0) + { + std::copy(std::begin(arr), std::end(arr), j.m_value.array->begin()); + } + j.set_parents(); + j.assert_invariant(); + } +}; + +template<> +struct external_constructor +{ + template + static void construct(BasicJsonType& j, const typename BasicJsonType::object_t& obj) + { + j.m_type = value_t::object; + j.m_value = obj; + j.set_parents(); + j.assert_invariant(); + } + + template + static void construct(BasicJsonType& j, typename BasicJsonType::object_t&& obj) + { + j.m_type = value_t::object; + j.m_value = std::move(obj); + j.set_parents(); + j.assert_invariant(); + } + + template < typename BasicJsonType, typename CompatibleObjectType, + enable_if_t < !std::is_same::value, int > = 0 > + static void construct(BasicJsonType& j, const CompatibleObjectType& obj) + { + using std::begin; + using std::end; + + j.m_type = value_t::object; + j.m_value.object = j.template create(begin(obj), end(obj)); + j.set_parents(); + j.assert_invariant(); + } +}; + +///////////// +// to_json // +///////////// + +template::value, int> = 0> +void to_json(BasicJsonType& j, T b) noexcept +{ + external_constructor::construct(j, b); +} + +template::value, int> = 0> +void to_json(BasicJsonType& j, const CompatibleString& s) +{ + external_constructor::construct(j, s); +} + +template +void to_json(BasicJsonType& j, typename BasicJsonType::string_t&& s) +{ + 
external_constructor::construct(j, std::move(s)); +} + +template::value, int> = 0> +void to_json(BasicJsonType& j, FloatType val) noexcept +{ + external_constructor::construct(j, static_cast(val)); +} + +template::value, int> = 0> +void to_json(BasicJsonType& j, CompatibleNumberUnsignedType val) noexcept +{ + external_constructor::construct(j, static_cast(val)); +} + +template::value, int> = 0> +void to_json(BasicJsonType& j, CompatibleNumberIntegerType val) noexcept +{ + external_constructor::construct(j, static_cast(val)); +} + +template::value, int> = 0> +void to_json(BasicJsonType& j, EnumType e) noexcept +{ + using underlying_type = typename std::underlying_type::type; + external_constructor::construct(j, static_cast(e)); +} + +template +void to_json(BasicJsonType& j, const std::vector& e) +{ + external_constructor::construct(j, e); +} + +template < typename BasicJsonType, typename CompatibleArrayType, + enable_if_t < is_compatible_array_type::value&& + !is_compatible_object_type::value&& + !is_compatible_string_type::value&& + !std::is_same::value&& + !is_basic_json::value, + int > = 0 > +void to_json(BasicJsonType& j, const CompatibleArrayType& arr) +{ + external_constructor::construct(j, arr); +} + +template +void to_json(BasicJsonType& j, const typename BasicJsonType::binary_t& bin) +{ + external_constructor::construct(j, bin); +} + +template::value, int> = 0> +void to_json(BasicJsonType& j, const std::valarray& arr) +{ + external_constructor::construct(j, std::move(arr)); +} + +template +void to_json(BasicJsonType& j, typename BasicJsonType::array_t&& arr) +{ + external_constructor::construct(j, std::move(arr)); +} + +template < typename BasicJsonType, typename CompatibleObjectType, + enable_if_t < is_compatible_object_type::value&& !is_basic_json::value, int > = 0 > +void to_json(BasicJsonType& j, const CompatibleObjectType& obj) +{ + external_constructor::construct(j, obj); +} + +template +void to_json(BasicJsonType& j, typename BasicJsonType::object_t&& obj) +{ + external_constructor::construct(j, std::move(obj)); +} + +template < + typename BasicJsonType, typename T, std::size_t N, + enable_if_t < !std::is_constructible::value, // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays) + int > = 0 > +void to_json(BasicJsonType& j, const T(&arr)[N]) // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays) +{ + external_constructor::construct(j, arr); +} + +template < typename BasicJsonType, typename T1, typename T2, enable_if_t < std::is_constructible::value&& std::is_constructible::value, int > = 0 > +void to_json(BasicJsonType& j, const std::pair& p) +{ + j = { p.first, p.second }; +} + +// for https://github.com/nlohmann/json/pull/1134 +template>::value, int> = 0> +void to_json(BasicJsonType& j, const T& b) +{ + j = { {b.key(), b.value()} }; +} + +template +void to_json_tuple_impl(BasicJsonType& j, const Tuple& t, index_sequence /*unused*/) +{ + j = { std::get(t)... 
}; +} + +template::value, int > = 0> +void to_json(BasicJsonType& j, const T& t) +{ + to_json_tuple_impl(j, t, make_index_sequence::value> {}); +} + +struct to_json_fn +{ + template + auto operator()(BasicJsonType& j, T&& val) const noexcept(noexcept(to_json(j, std::forward(val)))) + -> decltype(to_json(j, std::forward(val)), void()) + { + return to_json(j, std::forward(val)); + } +}; +} // namespace detail + +/// namespace to hold default `to_json` function +/// to see why this is required: +/// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4381.html +namespace // NOLINT(cert-dcl59-cpp,fuchsia-header-anon-namespaces,google-build-namespaces) +{ +constexpr const auto& to_json = detail::static_const::value; // NOLINT(misc-definitions-in-headers) +} // namespace +} // namespace nlohmann + +// #include + +// #include + + +namespace nlohmann +{ + +template +struct adl_serializer +{ + /*! + @brief convert a JSON value to any value type + + This function is usually called by the `get()` function of the + @ref basic_json class (either explicit or via conversion operators). + + @note This function is chosen for default-constructible value types. + + @param[in] j JSON value to read from + @param[in,out] val value to write to + */ + template + static auto from_json(BasicJsonType && j, TargetType& val) noexcept( + noexcept(::nlohmann::from_json(std::forward(j), val))) + -> decltype(::nlohmann::from_json(std::forward(j), val), void()) + { + ::nlohmann::from_json(std::forward(j), val); + } + + /*! + @brief convert a JSON value to any value type + + This function is usually called by the `get()` function of the + @ref basic_json class (either explicit or via conversion operators). + + @note This function is chosen for value types which are not default-constructible. + + @param[in] j JSON value to read from + + @return copy of the JSON value, converted to @a ValueType + */ + template + static auto from_json(BasicJsonType && j) noexcept( + noexcept(::nlohmann::from_json(std::forward(j), detail::identity_tag {}))) + -> decltype(::nlohmann::from_json(std::forward(j), detail::identity_tag {})) + { + return ::nlohmann::from_json(std::forward(j), detail::identity_tag {}); + } + + /*! + @brief convert any value type to a JSON value + + This function is usually called by the constructors of the @ref basic_json + class. + + @param[in,out] j JSON value to write to + @param[in] val value to read from + */ + template + static auto to_json(BasicJsonType& j, TargetType && val) noexcept( + noexcept(::nlohmann::to_json(j, std::forward(val)))) + -> decltype(::nlohmann::to_json(j, std::forward(val)), void()) + { + ::nlohmann::to_json(j, std::forward(val)); + } +}; +} // namespace nlohmann + +// #include + + +#include // uint8_t +#include // tie +#include // move + +namespace nlohmann +{ + +/*! +@brief an internal type for a backed binary type + +This type extends the template parameter @a BinaryType provided to `basic_json` +with a subtype used by BSON and MessagePack. This type exists so that the user +does not have to specify a type themselves with a specific naming scheme in +order to override the binary type. 
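+
+A minimal usage sketch (illustrative only; it relies on the `basic_json::binary`
+factory and the `get_binary()` accessor defined later in this header, not on
+this class alone):
+
+    // build a binary JSON value that carries subtype 0x10
+    std::vector<std::uint8_t> payload = {0xCA, 0xFE, 0xBA, 0xBE};
+    nlohmann::json j = nlohmann::json::binary(payload, 0x10);
+    assert(j.get_binary().has_subtype());
+    assert(j.get_binary().subtype() == 0x10);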
+ +@tparam BinaryType container to store bytes (`std::vector` by + default) + +@since version 3.8.0 +*/ +template +class byte_container_with_subtype : public BinaryType +{ + public: + /// the type of the underlying container + using container_type = BinaryType; + + byte_container_with_subtype() noexcept(noexcept(container_type())) + : container_type() + {} + + byte_container_with_subtype(const container_type& b) noexcept(noexcept(container_type(b))) + : container_type(b) + {} + + byte_container_with_subtype(container_type&& b) noexcept(noexcept(container_type(std::move(b)))) + : container_type(std::move(b)) + {} + + byte_container_with_subtype(const container_type& b, std::uint8_t subtype_) noexcept(noexcept(container_type(b))) + : container_type(b) + , m_subtype(subtype_) + , m_has_subtype(true) + {} + + byte_container_with_subtype(container_type&& b, std::uint8_t subtype_) noexcept(noexcept(container_type(std::move(b)))) + : container_type(std::move(b)) + , m_subtype(subtype_) + , m_has_subtype(true) + {} + + bool operator==(const byte_container_with_subtype& rhs) const + { + return std::tie(static_cast(*this), m_subtype, m_has_subtype) == + std::tie(static_cast(rhs), rhs.m_subtype, rhs.m_has_subtype); + } + + bool operator!=(const byte_container_with_subtype& rhs) const + { + return !(rhs == *this); + } + + /*! + @brief sets the binary subtype + + Sets the binary subtype of the value, also flags a binary JSON value as + having a subtype, which has implications for serialization. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @sa see @ref subtype() -- return the binary subtype + @sa see @ref clear_subtype() -- clears the binary subtype + @sa see @ref has_subtype() -- returns whether or not the binary value has a + subtype + + @since version 3.8.0 + */ + void set_subtype(std::uint8_t subtype_) noexcept + { + m_subtype = subtype_; + m_has_subtype = true; + } + + /*! + @brief return the binary subtype + + Returns the numerical subtype of the value if it has a subtype. If it does + not have a subtype, this function will return size_t(-1) as a sentinel + value. + + @return the numerical subtype of the binary value + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @sa see @ref set_subtype() -- sets the binary subtype + @sa see @ref clear_subtype() -- clears the binary subtype + @sa see @ref has_subtype() -- returns whether or not the binary value has a + subtype + + @since version 3.8.0 + */ + constexpr std::uint8_t subtype() const noexcept + { + return m_subtype; + } + + /*! + @brief return whether the value has a subtype + + @return whether the value has a subtype + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @sa see @ref subtype() -- return the binary subtype + @sa see @ref set_subtype() -- sets the binary subtype + @sa see @ref clear_subtype() -- clears the binary subtype + + @since version 3.8.0 + */ + constexpr bool has_subtype() const noexcept + { + return m_has_subtype; + } + + /*! + @brief clears the binary subtype + + Clears the binary subtype and flags the value as not having a subtype, which + has implications for serialization; for instance MessagePack will prefer the + bin family over the ext family. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. 
+ + @sa see @ref subtype() -- return the binary subtype + @sa see @ref set_subtype() -- sets the binary subtype + @sa see @ref has_subtype() -- returns whether or not the binary value has a + subtype + + @since version 3.8.0 + */ + void clear_subtype() noexcept + { + m_subtype = 0; + m_has_subtype = false; + } + + private: + std::uint8_t m_subtype = 0; + bool m_has_subtype = false; +}; + +} // namespace nlohmann + +// #include + +// #include + +// #include + +// #include + + +#include // uint8_t +#include // size_t +#include // hash + +// #include + + +namespace nlohmann +{ +namespace detail +{ + +// boost::hash_combine +inline std::size_t combine(std::size_t seed, std::size_t h) noexcept +{ + seed ^= h + 0x9e3779b9 + (seed << 6U) + (seed >> 2U); + return seed; +} + +/*! +@brief hash a JSON value + +The hash function tries to rely on std::hash where possible. Furthermore, the +type of the JSON value is taken into account to have different hash values for +null, 0, 0U, and false, etc. + +@tparam BasicJsonType basic_json specialization +@param j JSON value to hash +@return hash value of j +*/ +template +std::size_t hash(const BasicJsonType& j) +{ + using string_t = typename BasicJsonType::string_t; + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + + const auto type = static_cast(j.type()); + switch (j.type()) + { + case BasicJsonType::value_t::null: + case BasicJsonType::value_t::discarded: + { + return combine(type, 0); + } + + case BasicJsonType::value_t::object: + { + auto seed = combine(type, j.size()); + for (const auto& element : j.items()) + { + const auto h = std::hash {}(element.key()); + seed = combine(seed, h); + seed = combine(seed, hash(element.value())); + } + return seed; + } + + case BasicJsonType::value_t::array: + { + auto seed = combine(type, j.size()); + for (const auto& element : j) + { + seed = combine(seed, hash(element)); + } + return seed; + } + + case BasicJsonType::value_t::string: + { + const auto h = std::hash {}(j.template get_ref()); + return combine(type, h); + } + + case BasicJsonType::value_t::boolean: + { + const auto h = std::hash {}(j.template get()); + return combine(type, h); + } + + case BasicJsonType::value_t::number_integer: + { + const auto h = std::hash {}(j.template get()); + return combine(type, h); + } + + case BasicJsonType::value_t::number_unsigned: + { + const auto h = std::hash {}(j.template get()); + return combine(type, h); + } + + case BasicJsonType::value_t::number_float: + { + const auto h = std::hash {}(j.template get()); + return combine(type, h); + } + + case BasicJsonType::value_t::binary: + { + auto seed = combine(type, j.get_binary().size()); + const auto h = std::hash {}(j.get_binary().has_subtype()); + seed = combine(seed, h); + seed = combine(seed, j.get_binary().subtype()); + for (const auto byte : j.get_binary()) + { + seed = combine(seed, std::hash {}(byte)); + } + return seed; + } + + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE + return 0; // LCOV_EXCL_LINE + } +} + +} // namespace detail +} // namespace nlohmann + +// #include + + +#include // generate_n +#include // array +#include // ldexp +#include // size_t +#include // uint8_t, uint16_t, uint32_t, uint64_t +#include // snprintf +#include // memcpy +#include // back_inserter +#include // numeric_limits +#include // char_traits, 
string +#include // make_pair, move +#include // vector + +// #include + +// #include + + +#include // array +#include // size_t +#include // strlen +#include // begin, end, iterator_traits, random_access_iterator_tag, distance, next +#include // shared_ptr, make_shared, addressof +#include // accumulate +#include // string, char_traits +#include // enable_if, is_base_of, is_pointer, is_integral, remove_pointer +#include // pair, declval + +#ifndef JSON_NO_IO + #include //FILE * + #include // istream +#endif // JSON_NO_IO + +// #include + +// #include + + +namespace nlohmann +{ +namespace detail +{ +/// the supported input formats +enum class input_format_t { json, cbor, msgpack, ubjson, bson }; + +//////////////////// +// input adapters // +//////////////////// + +#ifndef JSON_NO_IO +/*! +Input adapter for stdio file access. This adapter read only 1 byte and do not use any + buffer. This adapter is a very low level adapter. +*/ +class file_input_adapter +{ + public: + using char_type = char; + + JSON_HEDLEY_NON_NULL(2) + explicit file_input_adapter(std::FILE* f) noexcept + : m_file(f) + {} + + // make class move-only + file_input_adapter(const file_input_adapter&) = delete; + file_input_adapter(file_input_adapter&&) noexcept = default; + file_input_adapter& operator=(const file_input_adapter&) = delete; + file_input_adapter& operator=(file_input_adapter&&) = delete; + ~file_input_adapter() = default; + + std::char_traits::int_type get_character() noexcept + { + return std::fgetc(m_file); + } + + private: + /// the file pointer to read from + std::FILE* m_file; +}; + + +/*! +Input adapter for a (caching) istream. Ignores a UFT Byte Order Mark at +beginning of input. Does not support changing the underlying std::streambuf +in mid-input. Maintains underlying std::istream and std::streambuf to support +subsequent use of standard std::istream operations to process any input +characters following those used in parsing the JSON input. Clears the +std::istream flags; any input errors (e.g., EOF) will be detected by the first +subsequent call for input from the std::istream. +*/ +class input_stream_adapter +{ + public: + using char_type = char; + + ~input_stream_adapter() + { + // clear stream flags; we use underlying streambuf I/O, do not + // maintain ifstream flags, except eof + if (is != nullptr) + { + is->clear(is->rdstate() & std::ios::eofbit); + } + } + + explicit input_stream_adapter(std::istream& i) + : is(&i), sb(i.rdbuf()) + {} + + // delete because of pointer members + input_stream_adapter(const input_stream_adapter&) = delete; + input_stream_adapter& operator=(input_stream_adapter&) = delete; + input_stream_adapter& operator=(input_stream_adapter&&) = delete; + + input_stream_adapter(input_stream_adapter&& rhs) noexcept + : is(rhs.is), sb(rhs.sb) + { + rhs.is = nullptr; + rhs.sb = nullptr; + } + + // std::istream/std::streambuf use std::char_traits::to_int_type, to + // ensure that std::char_traits::eof() and the character 0xFF do not + // end up as the same value, eg. 0xFFFFFFFF. + std::char_traits::int_type get_character() + { + auto res = sb->sbumpc(); + // set eof manually, as we don't use the istream interface. + if (JSON_HEDLEY_UNLIKELY(res == std::char_traits::eof())) + { + is->clear(is->rdstate() | std::ios::eofbit); + } + return res; + } + + private: + /// the associated input stream + std::istream* is = nullptr; + std::streambuf* sb = nullptr; +}; +#endif // JSON_NO_IO + +// General-purpose iterator-based adapter. 
It might not be as fast as +// theoretically possible for some containers, but it is extremely versatile. +template +class iterator_input_adapter +{ + public: + using char_type = typename std::iterator_traits::value_type; + + iterator_input_adapter(IteratorType first, IteratorType last) + : current(std::move(first)), end(std::move(last)) + {} + + typename std::char_traits::int_type get_character() + { + if (JSON_HEDLEY_LIKELY(current != end)) + { + auto result = std::char_traits::to_int_type(*current); + std::advance(current, 1); + return result; + } + + return std::char_traits::eof(); + } + + private: + IteratorType current; + IteratorType end; + + template + friend struct wide_string_input_helper; + + bool empty() const + { + return current == end; + } +}; + + +template +struct wide_string_input_helper; + +template +struct wide_string_input_helper +{ + // UTF-32 + static void fill_buffer(BaseInputAdapter& input, + std::array::int_type, 4>& utf8_bytes, + size_t& utf8_bytes_index, + size_t& utf8_bytes_filled) + { + utf8_bytes_index = 0; + + if (JSON_HEDLEY_UNLIKELY(input.empty())) + { + utf8_bytes[0] = std::char_traits::eof(); + utf8_bytes_filled = 1; + } + else + { + // get the current character + const auto wc = input.get_character(); + + // UTF-32 to UTF-8 encoding + if (wc < 0x80) + { + utf8_bytes[0] = static_cast::int_type>(wc); + utf8_bytes_filled = 1; + } + else if (wc <= 0x7FF) + { + utf8_bytes[0] = static_cast::int_type>(0xC0u | ((static_cast(wc) >> 6u) & 0x1Fu)); + utf8_bytes[1] = static_cast::int_type>(0x80u | (static_cast(wc) & 0x3Fu)); + utf8_bytes_filled = 2; + } + else if (wc <= 0xFFFF) + { + utf8_bytes[0] = static_cast::int_type>(0xE0u | ((static_cast(wc) >> 12u) & 0x0Fu)); + utf8_bytes[1] = static_cast::int_type>(0x80u | ((static_cast(wc) >> 6u) & 0x3Fu)); + utf8_bytes[2] = static_cast::int_type>(0x80u | (static_cast(wc) & 0x3Fu)); + utf8_bytes_filled = 3; + } + else if (wc <= 0x10FFFF) + { + utf8_bytes[0] = static_cast::int_type>(0xF0u | ((static_cast(wc) >> 18u) & 0x07u)); + utf8_bytes[1] = static_cast::int_type>(0x80u | ((static_cast(wc) >> 12u) & 0x3Fu)); + utf8_bytes[2] = static_cast::int_type>(0x80u | ((static_cast(wc) >> 6u) & 0x3Fu)); + utf8_bytes[3] = static_cast::int_type>(0x80u | (static_cast(wc) & 0x3Fu)); + utf8_bytes_filled = 4; + } + else + { + // unknown character + utf8_bytes[0] = static_cast::int_type>(wc); + utf8_bytes_filled = 1; + } + } + } +}; + +template +struct wide_string_input_helper +{ + // UTF-16 + static void fill_buffer(BaseInputAdapter& input, + std::array::int_type, 4>& utf8_bytes, + size_t& utf8_bytes_index, + size_t& utf8_bytes_filled) + { + utf8_bytes_index = 0; + + if (JSON_HEDLEY_UNLIKELY(input.empty())) + { + utf8_bytes[0] = std::char_traits::eof(); + utf8_bytes_filled = 1; + } + else + { + // get the current character + const auto wc = input.get_character(); + + // UTF-16 to UTF-8 encoding + if (wc < 0x80) + { + utf8_bytes[0] = static_cast::int_type>(wc); + utf8_bytes_filled = 1; + } + else if (wc <= 0x7FF) + { + utf8_bytes[0] = static_cast::int_type>(0xC0u | ((static_cast(wc) >> 6u))); + utf8_bytes[1] = static_cast::int_type>(0x80u | (static_cast(wc) & 0x3Fu)); + utf8_bytes_filled = 2; + } + else if (0xD800 > wc || wc >= 0xE000) + { + utf8_bytes[0] = static_cast::int_type>(0xE0u | ((static_cast(wc) >> 12u))); + utf8_bytes[1] = static_cast::int_type>(0x80u | ((static_cast(wc) >> 6u) & 0x3Fu)); + utf8_bytes[2] = static_cast::int_type>(0x80u | (static_cast(wc) & 0x3Fu)); + utf8_bytes_filled = 3; + } + else + { + if 
(JSON_HEDLEY_UNLIKELY(!input.empty())) + { + const auto wc2 = static_cast(input.get_character()); + const auto charcode = 0x10000u + (((static_cast(wc) & 0x3FFu) << 10u) | (wc2 & 0x3FFu)); + utf8_bytes[0] = static_cast::int_type>(0xF0u | (charcode >> 18u)); + utf8_bytes[1] = static_cast::int_type>(0x80u | ((charcode >> 12u) & 0x3Fu)); + utf8_bytes[2] = static_cast::int_type>(0x80u | ((charcode >> 6u) & 0x3Fu)); + utf8_bytes[3] = static_cast::int_type>(0x80u | (charcode & 0x3Fu)); + utf8_bytes_filled = 4; + } + else + { + utf8_bytes[0] = static_cast::int_type>(wc); + utf8_bytes_filled = 1; + } + } + } + } +}; + +// Wraps another input apdater to convert wide character types into individual bytes. +template +class wide_string_input_adapter +{ + public: + using char_type = char; + + wide_string_input_adapter(BaseInputAdapter base) + : base_adapter(base) {} + + typename std::char_traits::int_type get_character() noexcept + { + // check if buffer needs to be filled + if (utf8_bytes_index == utf8_bytes_filled) + { + fill_buffer(); + + JSON_ASSERT(utf8_bytes_filled > 0); + JSON_ASSERT(utf8_bytes_index == 0); + } + + // use buffer + JSON_ASSERT(utf8_bytes_filled > 0); + JSON_ASSERT(utf8_bytes_index < utf8_bytes_filled); + return utf8_bytes[utf8_bytes_index++]; + } + + private: + BaseInputAdapter base_adapter; + + template + void fill_buffer() + { + wide_string_input_helper::fill_buffer(base_adapter, utf8_bytes, utf8_bytes_index, utf8_bytes_filled); + } + + /// a buffer for UTF-8 bytes + std::array::int_type, 4> utf8_bytes = {{0, 0, 0, 0}}; + + /// index to the utf8_codes array for the next valid byte + std::size_t utf8_bytes_index = 0; + /// number of valid bytes in the utf8_codes array + std::size_t utf8_bytes_filled = 0; +}; + + +template +struct iterator_input_adapter_factory +{ + using iterator_type = IteratorType; + using char_type = typename std::iterator_traits::value_type; + using adapter_type = iterator_input_adapter; + + static adapter_type create(IteratorType first, IteratorType last) + { + return adapter_type(std::move(first), std::move(last)); + } +}; + +template +struct is_iterator_of_multibyte +{ + using value_type = typename std::iterator_traits::value_type; + enum + { + value = sizeof(value_type) > 1 + }; +}; + +template +struct iterator_input_adapter_factory::value>> +{ + using iterator_type = IteratorType; + using char_type = typename std::iterator_traits::value_type; + using base_adapter_type = iterator_input_adapter; + using adapter_type = wide_string_input_adapter; + + static adapter_type create(IteratorType first, IteratorType last) + { + return adapter_type(base_adapter_type(std::move(first), std::move(last))); + } +}; + +// General purpose iterator-based input +template +typename iterator_input_adapter_factory::adapter_type input_adapter(IteratorType first, IteratorType last) +{ + using factory_type = iterator_input_adapter_factory; + return factory_type::create(first, last); +} + +// Convenience shorthand from container to iterator +// Enables ADL on begin(container) and end(container) +// Encloses the using declarations in namespace for not to leak them to outside scope + +namespace container_input_adapter_factory_impl +{ + +using std::begin; +using std::end; + +template +struct container_input_adapter_factory {}; + +template +struct container_input_adapter_factory< ContainerType, + void_t()), end(std::declval()))>> + { + using adapter_type = decltype(input_adapter(begin(std::declval()), end(std::declval()))); + + static adapter_type create(const ContainerType& 
container) +{ + return input_adapter(begin(container), end(container)); +} + }; + +} // namespace container_input_adapter_factory_impl + +template +typename container_input_adapter_factory_impl::container_input_adapter_factory::adapter_type input_adapter(const ContainerType& container) +{ + return container_input_adapter_factory_impl::container_input_adapter_factory::create(container); +} + +#ifndef JSON_NO_IO +// Special cases with fast paths +inline file_input_adapter input_adapter(std::FILE* file) +{ + return file_input_adapter(file); +} + +inline input_stream_adapter input_adapter(std::istream& stream) +{ + return input_stream_adapter(stream); +} + +inline input_stream_adapter input_adapter(std::istream&& stream) +{ + return input_stream_adapter(stream); +} +#endif // JSON_NO_IO + +using contiguous_bytes_input_adapter = decltype(input_adapter(std::declval(), std::declval())); + +// Null-delimited strings, and the like. +template < typename CharT, + typename std::enable_if < + std::is_pointer::value&& + !std::is_array::value&& + std::is_integral::type>::value&& + sizeof(typename std::remove_pointer::type) == 1, + int >::type = 0 > +contiguous_bytes_input_adapter input_adapter(CharT b) +{ + auto length = std::strlen(reinterpret_cast(b)); + const auto* ptr = reinterpret_cast(b); + return input_adapter(ptr, ptr + length); +} + +template +auto input_adapter(T (&array)[N]) -> decltype(input_adapter(array, array + N)) // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays) +{ + return input_adapter(array, array + N); +} + +// This class only handles inputs of input_buffer_adapter type. +// It's required so that expressions like {ptr, len} can be implicitely casted +// to the correct adapter. +class span_input_adapter +{ + public: + template < typename CharT, + typename std::enable_if < + std::is_pointer::value&& + std::is_integral::type>::value&& + sizeof(typename std::remove_pointer::type) == 1, + int >::type = 0 > + span_input_adapter(CharT b, std::size_t l) + : ia(reinterpret_cast(b), reinterpret_cast(b) + l) {} + + template::iterator_category, std::random_access_iterator_tag>::value, + int>::type = 0> + span_input_adapter(IteratorType first, IteratorType last) + : ia(input_adapter(first, last)) {} + + contiguous_bytes_input_adapter&& get() + { + return std::move(ia); // NOLINT(hicpp-move-const-arg,performance-move-const-arg) + } + + private: + contiguous_bytes_input_adapter ia; +}; +} // namespace detail +} // namespace nlohmann + +// #include + + +#include +#include // string +#include // move +#include // vector + +// #include + +// #include + + +namespace nlohmann +{ + +/*! +@brief SAX interface + +This class describes the SAX interface used by @ref nlohmann::json::sax_parse. +Each function is called in different situations while the input is parsed. The +boolean return value informs the parser whether to continue processing the +input. +*/ +template +struct json_sax +{ + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + using string_t = typename BasicJsonType::string_t; + using binary_t = typename BasicJsonType::binary_t; + + /*! + @brief a null value was read + @return whether parsing should proceed + */ + virtual bool null() = 0; + + /*! + @brief a boolean value was read + @param[in] val boolean value + @return whether parsing should proceed + */ + virtual bool boolean(bool val) = 0; + + /*! 
+ @brief an integer number was read + @param[in] val integer value + @return whether parsing should proceed + */ + virtual bool number_integer(number_integer_t val) = 0; + + /*! + @brief an unsigned integer number was read + @param[in] val unsigned integer value + @return whether parsing should proceed + */ + virtual bool number_unsigned(number_unsigned_t val) = 0; + + /*! + @brief an floating-point number was read + @param[in] val floating-point value + @param[in] s raw token value + @return whether parsing should proceed + */ + virtual bool number_float(number_float_t val, const string_t& s) = 0; + + /*! + @brief a string was read + @param[in] val string value + @return whether parsing should proceed + @note It is safe to move the passed string. + */ + virtual bool string(string_t& val) = 0; + + /*! + @brief a binary string was read + @param[in] val binary value + @return whether parsing should proceed + @note It is safe to move the passed binary. + */ + virtual bool binary(binary_t& val) = 0; + + /*! + @brief the beginning of an object was read + @param[in] elements number of object elements or -1 if unknown + @return whether parsing should proceed + @note binary formats may report the number of elements + */ + virtual bool start_object(std::size_t elements) = 0; + + /*! + @brief an object key was read + @param[in] val object key + @return whether parsing should proceed + @note It is safe to move the passed string. + */ + virtual bool key(string_t& val) = 0; + + /*! + @brief the end of an object was read + @return whether parsing should proceed + */ + virtual bool end_object() = 0; + + /*! + @brief the beginning of an array was read + @param[in] elements number of array elements or -1 if unknown + @return whether parsing should proceed + @note binary formats may report the number of elements + */ + virtual bool start_array(std::size_t elements) = 0; + + /*! + @brief the end of an array was read + @return whether parsing should proceed + */ + virtual bool end_array() = 0; + + /*! + @brief a parse error occurred + @param[in] position the position in the input where the error occurs + @param[in] last_token the last read token + @param[in] ex an exception object describing the error + @return whether parsing should proceed (must return false) + */ + virtual bool parse_error(std::size_t position, + const std::string& last_token, + const detail::exception& ex) = 0; + + json_sax() = default; + json_sax(const json_sax&) = default; + json_sax(json_sax&&) noexcept = default; + json_sax& operator=(const json_sax&) = default; + json_sax& operator=(json_sax&&) noexcept = default; + virtual ~json_sax() = default; +}; + + +namespace detail +{ +/*! +@brief SAX implementation to create a JSON value from SAX events + +This class implements the @ref json_sax interface and processes the SAX events +to create a JSON value which makes it basically a DOM parser. The structure or +hierarchy of the JSON value is managed by the stack `ref_stack` which contains +a pointer to the respective array or object for each recursion depth. + +After successful parsing, the value that is passed by reference to the +constructor contains the parsed value. 
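+
+As a rough orientation (an illustration, not part of the library documentation),
+driving this handler through the public `basic_json::sax_parse` entry point for
+a small document produces the following event sequence:
+
+    nlohmann::json j;
+    nlohmann::detail::json_sax_dom_parser<nlohmann::json> handler(j);
+    nlohmann::json::sax_parse(R"({"pi": 3.141, "ok": true})", &handler);
+    // events: start_object (size unknown), key("pi"), number_float(3.141, "3.141"),
+    //         key("ok"), boolean(true), end_object
+    // afterwards, j holds the parsed object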
+ +@tparam BasicJsonType the JSON type +*/ +template +class json_sax_dom_parser +{ + public: + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + using string_t = typename BasicJsonType::string_t; + using binary_t = typename BasicJsonType::binary_t; + + /*! + @param[in,out] r reference to a JSON value that is manipulated while + parsing + @param[in] allow_exceptions_ whether parse errors yield exceptions + */ + explicit json_sax_dom_parser(BasicJsonType& r, const bool allow_exceptions_ = true) + : root(r), allow_exceptions(allow_exceptions_) + {} + + // make class move-only + json_sax_dom_parser(const json_sax_dom_parser&) = delete; + json_sax_dom_parser(json_sax_dom_parser&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor) + json_sax_dom_parser& operator=(const json_sax_dom_parser&) = delete; + json_sax_dom_parser& operator=(json_sax_dom_parser&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor) + ~json_sax_dom_parser() = default; + + bool null() + { + handle_value(nullptr); + return true; + } + + bool boolean(bool val) + { + handle_value(val); + return true; + } + + bool number_integer(number_integer_t val) + { + handle_value(val); + return true; + } + + bool number_unsigned(number_unsigned_t val) + { + handle_value(val); + return true; + } + + bool number_float(number_float_t val, const string_t& /*unused*/) + { + handle_value(val); + return true; + } + + bool string(string_t& val) + { + handle_value(val); + return true; + } + + bool binary(binary_t& val) + { + handle_value(std::move(val)); + return true; + } + + bool start_object(std::size_t len) + { + ref_stack.push_back(handle_value(BasicJsonType::value_t::object)); + + if (JSON_HEDLEY_UNLIKELY(len != std::size_t(-1) && len > ref_stack.back()->max_size())) + { + JSON_THROW(out_of_range::create(408, "excessive object size: " + std::to_string(len), *ref_stack.back())); + } + + return true; + } + + bool key(string_t& val) + { + // add null at given key and store the reference for later + object_element = &(ref_stack.back()->m_value.object->operator[](val)); + return true; + } + + bool end_object() + { + ref_stack.back()->set_parents(); + ref_stack.pop_back(); + return true; + } + + bool start_array(std::size_t len) + { + ref_stack.push_back(handle_value(BasicJsonType::value_t::array)); + + if (JSON_HEDLEY_UNLIKELY(len != std::size_t(-1) && len > ref_stack.back()->max_size())) + { + JSON_THROW(out_of_range::create(408, "excessive array size: " + std::to_string(len), *ref_stack.back())); + } + + return true; + } + + bool end_array() + { + ref_stack.back()->set_parents(); + ref_stack.pop_back(); + return true; + } + + template + bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/, + const Exception& ex) + { + errored = true; + static_cast(ex); + if (allow_exceptions) + { + JSON_THROW(ex); + } + return false; + } + + constexpr bool is_errored() const + { + return errored; + } + + private: + /*! + @invariant If the ref stack is empty, then the passed value will be the new + root. 
+ @invariant If the ref stack contains a value, then it is an array or an + object to which we can add elements + */ + template + JSON_HEDLEY_RETURNS_NON_NULL + BasicJsonType* handle_value(Value&& v) + { + if (ref_stack.empty()) + { + root = BasicJsonType(std::forward(v)); + return &root; + } + + JSON_ASSERT(ref_stack.back()->is_array() || ref_stack.back()->is_object()); + + if (ref_stack.back()->is_array()) + { + ref_stack.back()->m_value.array->emplace_back(std::forward(v)); + return &(ref_stack.back()->m_value.array->back()); + } + + JSON_ASSERT(ref_stack.back()->is_object()); + JSON_ASSERT(object_element); + *object_element = BasicJsonType(std::forward(v)); + return object_element; + } + + /// the parsed JSON value + BasicJsonType& root; + /// stack to model hierarchy of values + std::vector ref_stack {}; + /// helper to hold the reference for the next object element + BasicJsonType* object_element = nullptr; + /// whether a syntax error occurred + bool errored = false; + /// whether to throw exceptions in case of errors + const bool allow_exceptions = true; +}; + +template +class json_sax_dom_callback_parser +{ + public: + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + using string_t = typename BasicJsonType::string_t; + using binary_t = typename BasicJsonType::binary_t; + using parser_callback_t = typename BasicJsonType::parser_callback_t; + using parse_event_t = typename BasicJsonType::parse_event_t; + + json_sax_dom_callback_parser(BasicJsonType& r, + const parser_callback_t cb, + const bool allow_exceptions_ = true) + : root(r), callback(cb), allow_exceptions(allow_exceptions_) + { + keep_stack.push_back(true); + } + + // make class move-only + json_sax_dom_callback_parser(const json_sax_dom_callback_parser&) = delete; + json_sax_dom_callback_parser(json_sax_dom_callback_parser&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor) + json_sax_dom_callback_parser& operator=(const json_sax_dom_callback_parser&) = delete; + json_sax_dom_callback_parser& operator=(json_sax_dom_callback_parser&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor) + ~json_sax_dom_callback_parser() = default; + + bool null() + { + handle_value(nullptr); + return true; + } + + bool boolean(bool val) + { + handle_value(val); + return true; + } + + bool number_integer(number_integer_t val) + { + handle_value(val); + return true; + } + + bool number_unsigned(number_unsigned_t val) + { + handle_value(val); + return true; + } + + bool number_float(number_float_t val, const string_t& /*unused*/) + { + handle_value(val); + return true; + } + + bool string(string_t& val) + { + handle_value(val); + return true; + } + + bool binary(binary_t& val) + { + handle_value(std::move(val)); + return true; + } + + bool start_object(std::size_t len) + { + // check callback for object start + const bool keep = callback(static_cast(ref_stack.size()), parse_event_t::object_start, discarded); + keep_stack.push_back(keep); + + auto val = handle_value(BasicJsonType::value_t::object, true); + ref_stack.push_back(val.second); + + // check object limit + if (ref_stack.back() && JSON_HEDLEY_UNLIKELY(len != std::size_t(-1) && len > ref_stack.back()->max_size())) + { + JSON_THROW(out_of_range::create(408, "excessive object size: " + std::to_string(len), *ref_stack.back())); + } + + return true; + } + + bool key(string_t& val) 
+ { + BasicJsonType k = BasicJsonType(val); + + // check callback for key + const bool keep = callback(static_cast(ref_stack.size()), parse_event_t::key, k); + key_keep_stack.push_back(keep); + + // add discarded value at given key and store the reference for later + if (keep && ref_stack.back()) + { + object_element = &(ref_stack.back()->m_value.object->operator[](val) = discarded); + } + + return true; + } + + bool end_object() + { + if (ref_stack.back()) + { + if (!callback(static_cast(ref_stack.size()) - 1, parse_event_t::object_end, *ref_stack.back())) + { + // discard object + *ref_stack.back() = discarded; + } + else + { + ref_stack.back()->set_parents(); + } + } + + JSON_ASSERT(!ref_stack.empty()); + JSON_ASSERT(!keep_stack.empty()); + ref_stack.pop_back(); + keep_stack.pop_back(); + + if (!ref_stack.empty() && ref_stack.back() && ref_stack.back()->is_structured()) + { + // remove discarded value + for (auto it = ref_stack.back()->begin(); it != ref_stack.back()->end(); ++it) + { + if (it->is_discarded()) + { + ref_stack.back()->erase(it); + break; + } + } + } + + return true; + } + + bool start_array(std::size_t len) + { + const bool keep = callback(static_cast(ref_stack.size()), parse_event_t::array_start, discarded); + keep_stack.push_back(keep); + + auto val = handle_value(BasicJsonType::value_t::array, true); + ref_stack.push_back(val.second); + + // check array limit + if (ref_stack.back() && JSON_HEDLEY_UNLIKELY(len != std::size_t(-1) && len > ref_stack.back()->max_size())) + { + JSON_THROW(out_of_range::create(408, "excessive array size: " + std::to_string(len), *ref_stack.back())); + } + + return true; + } + + bool end_array() + { + bool keep = true; + + if (ref_stack.back()) + { + keep = callback(static_cast(ref_stack.size()) - 1, parse_event_t::array_end, *ref_stack.back()); + if (keep) + { + ref_stack.back()->set_parents(); + } + else + { + // discard array + *ref_stack.back() = discarded; + } + } + + JSON_ASSERT(!ref_stack.empty()); + JSON_ASSERT(!keep_stack.empty()); + ref_stack.pop_back(); + keep_stack.pop_back(); + + // remove discarded value + if (!keep && !ref_stack.empty() && ref_stack.back()->is_array()) + { + ref_stack.back()->m_value.array->pop_back(); + } + + return true; + } + + template + bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/, + const Exception& ex) + { + errored = true; + static_cast(ex); + if (allow_exceptions) + { + JSON_THROW(ex); + } + return false; + } + + constexpr bool is_errored() const + { + return errored; + } + + private: + /*! + @param[in] v value to add to the JSON value we build during parsing + @param[in] skip_callback whether we should skip calling the callback + function; this is required after start_array() and + start_object() SAX events, because otherwise we would call the + callback function with an empty array or object, respectively. + + @invariant If the ref stack is empty, then the passed value will be the new + root. 
+ @invariant If the ref stack contains a value, then it is an array or an + object to which we can add elements + + @return pair of boolean (whether value should be kept) and pointer (to the + passed value in the ref_stack hierarchy; nullptr if not kept) + */ + template + std::pair handle_value(Value&& v, const bool skip_callback = false) + { + JSON_ASSERT(!keep_stack.empty()); + + // do not handle this value if we know it would be added to a discarded + // container + if (!keep_stack.back()) + { + return {false, nullptr}; + } + + // create value + auto value = BasicJsonType(std::forward(v)); + + // check callback + const bool keep = skip_callback || callback(static_cast(ref_stack.size()), parse_event_t::value, value); + + // do not handle this value if we just learnt it shall be discarded + if (!keep) + { + return {false, nullptr}; + } + + if (ref_stack.empty()) + { + root = std::move(value); + return {true, &root}; + } + + // skip this value if we already decided to skip the parent + // (https://github.com/nlohmann/json/issues/971#issuecomment-413678360) + if (!ref_stack.back()) + { + return {false, nullptr}; + } + + // we now only expect arrays and objects + JSON_ASSERT(ref_stack.back()->is_array() || ref_stack.back()->is_object()); + + // array + if (ref_stack.back()->is_array()) + { + ref_stack.back()->m_value.array->emplace_back(std::move(value)); + return {true, &(ref_stack.back()->m_value.array->back())}; + } + + // object + JSON_ASSERT(ref_stack.back()->is_object()); + // check if we should store an element for the current key + JSON_ASSERT(!key_keep_stack.empty()); + const bool store_element = key_keep_stack.back(); + key_keep_stack.pop_back(); + + if (!store_element) + { + return {false, nullptr}; + } + + JSON_ASSERT(object_element); + *object_element = std::move(value); + return {true, object_element}; + } + + /// the parsed JSON value + BasicJsonType& root; + /// stack to model hierarchy of values + std::vector ref_stack {}; + /// stack to manage which values to keep + std::vector keep_stack {}; + /// stack to manage which object keys to keep + std::vector key_keep_stack {}; + /// helper to hold the reference for the next object element + BasicJsonType* object_element = nullptr; + /// whether a syntax error occurred + bool errored = false; + /// callback function + const parser_callback_t callback = nullptr; + /// whether to throw exceptions in case of errors + const bool allow_exceptions = true; + /// a discarded value for the callback + BasicJsonType discarded = BasicJsonType::value_t::discarded; +}; + +template +class json_sax_acceptor +{ + public: + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + using string_t = typename BasicJsonType::string_t; + using binary_t = typename BasicJsonType::binary_t; + + bool null() + { + return true; + } + + bool boolean(bool /*unused*/) + { + return true; + } + + bool number_integer(number_integer_t /*unused*/) + { + return true; + } + + bool number_unsigned(number_unsigned_t /*unused*/) + { + return true; + } + + bool number_float(number_float_t /*unused*/, const string_t& /*unused*/) + { + return true; + } + + bool string(string_t& /*unused*/) + { + return true; + } + + bool binary(binary_t& /*unused*/) + { + return true; + } + + bool start_object(std::size_t /*unused*/ = std::size_t(-1)) + { + return true; + } + + bool key(string_t& /*unused*/) + { + return true; + } + + bool 
end_object() + { + return true; + } + + bool start_array(std::size_t /*unused*/ = std::size_t(-1)) + { + return true; + } + + bool end_array() + { + return true; + } + + bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/, const detail::exception& /*unused*/) + { + return false; + } +}; +} // namespace detail + +} // namespace nlohmann + +// #include + + +#include // array +#include // localeconv +#include // size_t +#include // snprintf +#include // strtof, strtod, strtold, strtoll, strtoull +#include // initializer_list +#include // char_traits, string +#include // move +#include // vector + +// #include + +// #include + +// #include + + +namespace nlohmann +{ +namespace detail +{ +/////////// +// lexer // +/////////// + +template +class lexer_base +{ + public: + /// token types for the parser + enum class token_type + { + uninitialized, ///< indicating the scanner is uninitialized + literal_true, ///< the `true` literal + literal_false, ///< the `false` literal + literal_null, ///< the `null` literal + value_string, ///< a string -- use get_string() for actual value + value_unsigned, ///< an unsigned integer -- use get_number_unsigned() for actual value + value_integer, ///< a signed integer -- use get_number_integer() for actual value + value_float, ///< an floating point number -- use get_number_float() for actual value + begin_array, ///< the character for array begin `[` + begin_object, ///< the character for object begin `{` + end_array, ///< the character for array end `]` + end_object, ///< the character for object end `}` + name_separator, ///< the name separator `:` + value_separator, ///< the value separator `,` + parse_error, ///< indicating a parse error + end_of_input, ///< indicating the end of the input buffer + literal_or_value ///< a literal or the begin of a value (only for diagnostics) + }; + + /// return name of values of type token_type (only used for errors) + JSON_HEDLEY_RETURNS_NON_NULL + JSON_HEDLEY_CONST + static const char* token_type_name(const token_type t) noexcept + { + switch (t) + { + case token_type::uninitialized: + return ""; + case token_type::literal_true: + return "true literal"; + case token_type::literal_false: + return "false literal"; + case token_type::literal_null: + return "null literal"; + case token_type::value_string: + return "string literal"; + case token_type::value_unsigned: + case token_type::value_integer: + case token_type::value_float: + return "number literal"; + case token_type::begin_array: + return "'['"; + case token_type::begin_object: + return "'{'"; + case token_type::end_array: + return "']'"; + case token_type::end_object: + return "'}'"; + case token_type::name_separator: + return "':'"; + case token_type::value_separator: + return "','"; + case token_type::parse_error: + return ""; + case token_type::end_of_input: + return "end of input"; + case token_type::literal_or_value: + return "'[', '{', or a literal"; + // LCOV_EXCL_START + default: // catch non-enum values + return "unknown token"; + // LCOV_EXCL_STOP + } + } +}; +/*! +@brief lexical analysis + +This class organizes the lexical analysis during JSON deserialization. 
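+
+For example (an illustration), scanning the input
+
+    {"size": 10, "pi": 3.141, "ok": true}
+
+yields the token sequence, in terms of the token_type values defined above:
+
+    begin_object, value_string, name_separator, value_unsigned,
+    value_separator, value_string, name_separator, value_float,
+    value_separator, value_string, name_separator, literal_true,
+    end_object, end_of_input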
+*/ +template +class lexer : public lexer_base +{ + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + using string_t = typename BasicJsonType::string_t; + using char_type = typename InputAdapterType::char_type; + using char_int_type = typename std::char_traits::int_type; + + public: + using token_type = typename lexer_base::token_type; + + explicit lexer(InputAdapterType&& adapter, bool ignore_comments_ = false) noexcept + : ia(std::move(adapter)) + , ignore_comments(ignore_comments_) + , decimal_point_char(static_cast(get_decimal_point())) + {} + + // delete because of pointer members + lexer(const lexer&) = delete; + lexer(lexer&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor) + lexer& operator=(lexer&) = delete; + lexer& operator=(lexer&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor) + ~lexer() = default; + + private: + ///////////////////// + // locales + ///////////////////// + + /// return the locale-dependent decimal point + JSON_HEDLEY_PURE + static char get_decimal_point() noexcept + { + const auto* loc = localeconv(); + JSON_ASSERT(loc != nullptr); + return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point); + } + + ///////////////////// + // scan functions + ///////////////////// + + /*! + @brief get codepoint from 4 hex characters following `\u` + + For input "\u c1 c2 c3 c4" the codepoint is: + (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4 + = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0) + + Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f' + must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The + conversion is done by subtracting the offset (0x30, 0x37, and 0x57) + between the ASCII value of the character and the desired integer value. + + @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or + non-hex character) + */ + int get_codepoint() + { + // this function only makes sense after reading `\u` + JSON_ASSERT(current == 'u'); + int codepoint = 0; + + const auto factors = { 12u, 8u, 4u, 0u }; + for (const auto factor : factors) + { + get(); + + if (current >= '0' && current <= '9') + { + codepoint += static_cast((static_cast(current) - 0x30u) << factor); + } + else if (current >= 'A' && current <= 'F') + { + codepoint += static_cast((static_cast(current) - 0x37u) << factor); + } + else if (current >= 'a' && current <= 'f') + { + codepoint += static_cast((static_cast(current) - 0x57u) << factor); + } + else + { + return -1; + } + } + + JSON_ASSERT(0x0000 <= codepoint && codepoint <= 0xFFFF); + return codepoint; + } + + /*! + @brief check if the next byte(s) are inside a given range + + Adds the current byte and, for each passed range, reads a new byte and + checks if it is inside the range. If a violation was detected, set up an + error message and return false. Otherwise, return true. + + @param[in] ranges list of integers; interpreted as list of pairs of + inclusive lower and upper bound, respectively + + @pre The passed list @a ranges must have 2, 4, or 6 elements; that is, + 1, 2, or 3 pairs. This precondition is enforced by an assertion. 
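+
+    For instance (an illustrative call, not taken from this file), after reading
+    the lead byte of a two-byte UTF-8 sequence, the string scanner can call
+    next_byte_in_range({0x80, 0xBF}) to consume and validate the single
+    continuation byte that must follow.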
+ + @return true if and only if no range violation was detected + */ + bool next_byte_in_range(std::initializer_list ranges) + { + JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6); + add(current); + + for (auto range = ranges.begin(); range != ranges.end(); ++range) + { + get(); + if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range))) + { + add(current); + } + else + { + error_message = "invalid string: ill-formed UTF-8 byte"; + return false; + } + } + + return true; + } + + /*! + @brief scan a string literal + + This function scans a string according to Sect. 7 of RFC 8259. While + scanning, bytes are escaped and copied into buffer token_buffer. Then the + function returns successfully, token_buffer is *not* null-terminated (as it + may contain \0 bytes), and token_buffer.size() is the number of bytes in the + string. + + @return token_type::value_string if string could be successfully scanned, + token_type::parse_error otherwise + + @note In case of errors, variable error_message contains a textual + description. + */ + token_type scan_string() + { + // reset token_buffer (ignore opening quote) + reset(); + + // we entered the function by reading an open quote + JSON_ASSERT(current == '\"'); + + while (true) + { + // get next character + switch (get()) + { + // end of file while parsing string + case std::char_traits::eof(): + { + error_message = "invalid string: missing closing quote"; + return token_type::parse_error; + } + + // closing quote + case '\"': + { + return token_type::value_string; + } + + // escapes + case '\\': + { + switch (get()) + { + // quotation mark + case '\"': + add('\"'); + break; + // reverse solidus + case '\\': + add('\\'); + break; + // solidus + case '/': + add('/'); + break; + // backspace + case 'b': + add('\b'); + break; + // form feed + case 'f': + add('\f'); + break; + // line feed + case 'n': + add('\n'); + break; + // carriage return + case 'r': + add('\r'); + break; + // tab + case 't': + add('\t'); + break; + + // unicode escapes + case 'u': + { + const int codepoint1 = get_codepoint(); + int codepoint = codepoint1; // start with codepoint1 + + if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1)) + { + error_message = "invalid string: '\\u' must be followed by 4 hex digits"; + return token_type::parse_error; + } + + // check if code point is a high surrogate + if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF) + { + // expect next \uxxxx entry + if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u')) + { + const int codepoint2 = get_codepoint(); + + if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1)) + { + error_message = "invalid string: '\\u' must be followed by 4 hex digits"; + return token_type::parse_error; + } + + // check if codepoint2 is a low surrogate + if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF)) + { + // overwrite codepoint + codepoint = static_cast( + // high surrogate occupies the most significant 22 bits + (static_cast(codepoint1) << 10u) + // low surrogate occupies the least significant 15 bits + + static_cast(codepoint2) + // there is still the 0xD800, 0xDC00 and 0x10000 noise + // in the result so we have to subtract with: + // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 + - 0x35FDC00u); + } + else + { + error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF"; + return token_type::parse_error; + } + } + else + { + error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF"; + return token_type::parse_error; + } + } 
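+                            // Worked example for the surrogate branch above: the escape
+                            // sequence "\uD83D\uDE00" yields
+                            //     (0xD83D << 10) + 0xDE00 - 0x35FDC00 = 0x1F600,
+                            // i.e. U+1F600, which the branches below emit as the 4-byte
+                            // UTF-8 sequence F0 9F 98 80.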
+ else + { + if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF)) + { + error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF"; + return token_type::parse_error; + } + } + + // result of the above calculation yields a proper codepoint + JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF); + + // translate codepoint into bytes + if (codepoint < 0x80) + { + // 1-byte characters: 0xxxxxxx (ASCII) + add(static_cast(codepoint)); + } + else if (codepoint <= 0x7FF) + { + // 2-byte characters: 110xxxxx 10xxxxxx + add(static_cast(0xC0u | (static_cast(codepoint) >> 6u))); + add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); + } + else if (codepoint <= 0xFFFF) + { + // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx + add(static_cast(0xE0u | (static_cast(codepoint) >> 12u))); + add(static_cast(0x80u | ((static_cast(codepoint) >> 6u) & 0x3Fu))); + add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); + } + else + { + // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + add(static_cast(0xF0u | (static_cast(codepoint) >> 18u))); + add(static_cast(0x80u | ((static_cast(codepoint) >> 12u) & 0x3Fu))); + add(static_cast(0x80u | ((static_cast(codepoint) >> 6u) & 0x3Fu))); + add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); + } + + break; + } + + // other characters after escape + default: + error_message = "invalid string: forbidden character after backslash"; + return token_type::parse_error; + } + + break; + } + + // invalid control characters + case 0x00: + { + error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000"; + return token_type::parse_error; + } + + case 0x01: + { + error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001"; + return token_type::parse_error; + } + + case 0x02: + { + error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002"; + return token_type::parse_error; + } + + case 0x03: + { + error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003"; + return token_type::parse_error; + } + + case 0x04: + { + error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004"; + return token_type::parse_error; + } + + case 0x05: + { + error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005"; + return token_type::parse_error; + } + + case 0x06: + { + error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006"; + return token_type::parse_error; + } + + case 0x07: + { + error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007"; + return token_type::parse_error; + } + + case 0x08: + { + error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b"; + return token_type::parse_error; + } + + case 0x09: + { + error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t"; + return token_type::parse_error; + } + + case 0x0A: + { + error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n"; + return token_type::parse_error; + } + + case 0x0B: + { + error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B"; + return token_type::parse_error; + } + + case 0x0C: + { + error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f"; + return token_type::parse_error; + } + + case 0x0D: + { + 
error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r"; + return token_type::parse_error; + } + + case 0x0E: + { + error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E"; + return token_type::parse_error; + } + + case 0x0F: + { + error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F"; + return token_type::parse_error; + } + + case 0x10: + { + error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010"; + return token_type::parse_error; + } + + case 0x11: + { + error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011"; + return token_type::parse_error; + } + + case 0x12: + { + error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012"; + return token_type::parse_error; + } + + case 0x13: + { + error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013"; + return token_type::parse_error; + } + + case 0x14: + { + error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014"; + return token_type::parse_error; + } + + case 0x15: + { + error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015"; + return token_type::parse_error; + } + + case 0x16: + { + error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016"; + return token_type::parse_error; + } + + case 0x17: + { + error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017"; + return token_type::parse_error; + } + + case 0x18: + { + error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018"; + return token_type::parse_error; + } + + case 0x19: + { + error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019"; + return token_type::parse_error; + } + + case 0x1A: + { + error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A"; + return token_type::parse_error; + } + + case 0x1B: + { + error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B"; + return token_type::parse_error; + } + + case 0x1C: + { + error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C"; + return token_type::parse_error; + } + + case 0x1D: + { + error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D"; + return token_type::parse_error; + } + + case 0x1E: + { + error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E"; + return token_type::parse_error; + } + + case 0x1F: + { + error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F"; + return token_type::parse_error; + } + + // U+0020..U+007F (except U+0022 (quote) and U+005C (backspace)) + case 0x20: + case 0x21: + case 0x23: + case 0x24: + case 0x25: + case 0x26: + case 0x27: + case 0x28: + case 0x29: + case 0x2A: + case 0x2B: + case 0x2C: + case 0x2D: + case 0x2E: + case 0x2F: + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + case 0x3A: + case 0x3B: + case 0x3C: + case 0x3D: + case 0x3E: + case 0x3F: + case 0x40: + case 0x41: + case 0x42: + case 0x43: + case 0x44: + case 0x45: + case 0x46: + case 0x47: + case 0x48: + case 0x49: + case 0x4A: + case 0x4B: + case 0x4C: + case 0x4D: + case 0x4E: + case 0x4F: + case 
0x50: + case 0x51: + case 0x52: + case 0x53: + case 0x54: + case 0x55: + case 0x56: + case 0x57: + case 0x58: + case 0x59: + case 0x5A: + case 0x5B: + case 0x5D: + case 0x5E: + case 0x5F: + case 0x60: + case 0x61: + case 0x62: + case 0x63: + case 0x64: + case 0x65: + case 0x66: + case 0x67: + case 0x68: + case 0x69: + case 0x6A: + case 0x6B: + case 0x6C: + case 0x6D: + case 0x6E: + case 0x6F: + case 0x70: + case 0x71: + case 0x72: + case 0x73: + case 0x74: + case 0x75: + case 0x76: + case 0x77: + case 0x78: + case 0x79: + case 0x7A: + case 0x7B: + case 0x7C: + case 0x7D: + case 0x7E: + case 0x7F: + { + add(current); + break; + } + + // U+0080..U+07FF: bytes C2..DF 80..BF + case 0xC2: + case 0xC3: + case 0xC4: + case 0xC5: + case 0xC6: + case 0xC7: + case 0xC8: + case 0xC9: + case 0xCA: + case 0xCB: + case 0xCC: + case 0xCD: + case 0xCE: + case 0xCF: + case 0xD0: + case 0xD1: + case 0xD2: + case 0xD3: + case 0xD4: + case 0xD5: + case 0xD6: + case 0xD7: + case 0xD8: + case 0xD9: + case 0xDA: + case 0xDB: + case 0xDC: + case 0xDD: + case 0xDE: + case 0xDF: + { + if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF}))) + { + return token_type::parse_error; + } + break; + } + + // U+0800..U+0FFF: bytes E0 A0..BF 80..BF + case 0xE0: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF + // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF + case 0xE1: + case 0xE2: + case 0xE3: + case 0xE4: + case 0xE5: + case 0xE6: + case 0xE7: + case 0xE8: + case 0xE9: + case 0xEA: + case 0xEB: + case 0xEC: + case 0xEE: + case 0xEF: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+D000..U+D7FF: bytes ED 80..9F 80..BF + case 0xED: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF + case 0xF0: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF + case 0xF1: + case 0xF2: + case 0xF3: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF + case 0xF4: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // remaining bytes (80..C1 and F5..FF) are ill-formed + default: + { + error_message = "invalid string: ill-formed UTF-8 byte"; + return token_type::parse_error; + } + } + } + } + + /*! 
+ * @brief scan a comment + * @return whether comment could be scanned successfully + */ + bool scan_comment() + { + switch (get()) + { + // single-line comments skip input until a newline or EOF is read + case '/': + { + while (true) + { + switch (get()) + { + case '\n': + case '\r': + case std::char_traits::eof(): + case '\0': + return true; + + default: + break; + } + } + } + + // multi-line comments skip input until */ is read + case '*': + { + while (true) + { + switch (get()) + { + case std::char_traits::eof(): + case '\0': + { + error_message = "invalid comment; missing closing '*/'"; + return false; + } + + case '*': + { + switch (get()) + { + case '/': + return true; + + default: + { + unget(); + continue; + } + } + } + + default: + continue; + } + } + } + + // unexpected character after reading '/' + default: + { + error_message = "invalid comment; expecting '/' or '*' after '/'"; + return false; + } + } + } + + JSON_HEDLEY_NON_NULL(2) + static void strtof(float& f, const char* str, char** endptr) noexcept + { + f = std::strtof(str, endptr); + } + + JSON_HEDLEY_NON_NULL(2) + static void strtof(double& f, const char* str, char** endptr) noexcept + { + f = std::strtod(str, endptr); + } + + JSON_HEDLEY_NON_NULL(2) + static void strtof(long double& f, const char* str, char** endptr) noexcept + { + f = std::strtold(str, endptr); + } + + /*! + @brief scan a number literal + + This function scans a string according to Sect. 6 of RFC 8259. + + The function is realized with a deterministic finite state machine derived + from the grammar described in RFC 8259. Starting in state "init", the + input is read and used to determined the next state. Only state "done" + accepts the number. State "error" is a trap state to model errors. In the + table below, "anything" means any character but the ones listed before. + + state | 0 | 1-9 | e E | + | - | . | anything + ---------|----------|----------|----------|---------|---------|----------|----------- + init | zero | any1 | [error] | [error] | minus | [error] | [error] + minus | zero | any1 | [error] | [error] | [error] | [error] | [error] + zero | done | done | exponent | done | done | decimal1 | done + any1 | any1 | any1 | exponent | done | done | decimal1 | done + decimal1 | decimal2 | decimal2 | [error] | [error] | [error] | [error] | [error] + decimal2 | decimal2 | decimal2 | exponent | done | done | done | done + exponent | any2 | any2 | [error] | sign | sign | [error] | [error] + sign | any2 | any2 | [error] | [error] | [error] | [error] | [error] + any2 | any2 | any2 | done | done | done | done | done + + The state machine is realized with one label per state (prefixed with + "scan_number_") and `goto` statements between them. The state machine + contains cycles, but any cycle can be left when EOF is read. Therefore, + the function is guaranteed to terminate. + + During scanning, the read bytes are stored in token_buffer. This string is + then converted to a signed integer, an unsigned integer, or a + floating-point number. + + @return token_type::value_unsigned, token_type::value_integer, or + token_type::value_float if number could be successfully scanned, + token_type::parse_error otherwise + + @note The scanner is independent of the current locale. Internally, the + locale's decimal point is used instead of `.` to work with the + locale-dependent converters. 
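+
+    For example, the input `-12.5e3` visits the states
+    init -> minus -> any1 -> decimal1 -> decimal2 -> exponent -> any2 -> done
+    (staying in any1 for the second digit) and is returned as
+    token_type::value_float.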
+ */ + token_type scan_number() // lgtm [cpp/use-of-goto] + { + // reset token_buffer to store the number's bytes + reset(); + + // the type of the parsed number; initially set to unsigned; will be + // changed if minus sign, decimal point or exponent is read + token_type number_type = token_type::value_unsigned; + + // state (init): we just found out we need to scan a number + switch (current) + { + case '-': + { + add(current); + goto scan_number_minus; + } + + case '0': + { + add(current); + goto scan_number_zero; + } + + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any1; + } + + // all other characters are rejected outside scan_number() + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE + } + +scan_number_minus: + // state: we just parsed a leading minus sign + number_type = token_type::value_integer; + switch (get()) + { + case '0': + { + add(current); + goto scan_number_zero; + } + + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any1; + } + + default: + { + error_message = "invalid number; expected digit after '-'"; + return token_type::parse_error; + } + } + +scan_number_zero: + // state: we just parse a zero (maybe with a leading minus sign) + switch (get()) + { + case '.': + { + add(decimal_point_char); + goto scan_number_decimal1; + } + + case 'e': + case 'E': + { + add(current); + goto scan_number_exponent; + } + + default: + goto scan_number_done; + } + +scan_number_any1: + // state: we just parsed a number 0-9 (maybe with a leading minus sign) + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any1; + } + + case '.': + { + add(decimal_point_char); + goto scan_number_decimal1; + } + + case 'e': + case 'E': + { + add(current); + goto scan_number_exponent; + } + + default: + goto scan_number_done; + } + +scan_number_decimal1: + // state: we just parsed a decimal point + number_type = token_type::value_float; + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_decimal2; + } + + default: + { + error_message = "invalid number; expected digit after '.'"; + return token_type::parse_error; + } + } + +scan_number_decimal2: + // we just parsed at least one number after a decimal point + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_decimal2; + } + + case 'e': + case 'E': + { + add(current); + goto scan_number_exponent; + } + + default: + goto scan_number_done; + } + +scan_number_exponent: + // we just parsed an exponent + number_type = token_type::value_float; + switch (get()) + { + case '+': + case '-': + { + add(current); + goto scan_number_sign; + } + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any2; + } + + default: + { + error_message = + "invalid number; expected '+', '-', or digit after exponent"; + return token_type::parse_error; + } + } + +scan_number_sign: + // we just parsed an exponent sign + switch (get()) + 
{ + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any2; + } + + default: + { + error_message = "invalid number; expected digit after exponent sign"; + return token_type::parse_error; + } + } + +scan_number_any2: + // we just parsed a number after the exponent or exponent sign + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any2; + } + + default: + goto scan_number_done; + } + +scan_number_done: + // unget the character after the number (we only read it to know that + // we are done scanning a number) + unget(); + + char* endptr = nullptr; // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg) + errno = 0; + + // try to parse integers first and fall back to floats + if (number_type == token_type::value_unsigned) + { + const auto x = std::strtoull(token_buffer.data(), &endptr, 10); + + // we checked the number format before + JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); + + if (errno == 0) + { + value_unsigned = static_cast(x); + if (value_unsigned == x) + { + return token_type::value_unsigned; + } + } + } + else if (number_type == token_type::value_integer) + { + const auto x = std::strtoll(token_buffer.data(), &endptr, 10); + + // we checked the number format before + JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); + + if (errno == 0) + { + value_integer = static_cast(x); + if (value_integer == x) + { + return token_type::value_integer; + } + } + } + + // this code is reached if we parse a floating-point number or if an + // integer conversion above failed + strtof(value_float, token_buffer.data(), &endptr); + + // we checked the number format before + JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); + + return token_type::value_float; + } + + /*! + @param[in] literal_text the literal text to expect + @param[in] length the length of the passed literal text + @param[in] return_type the token type to return on success + */ + JSON_HEDLEY_NON_NULL(2) + token_type scan_literal(const char_type* literal_text, const std::size_t length, + token_type return_type) + { + JSON_ASSERT(std::char_traits::to_char_type(current) == literal_text[0]); + for (std::size_t i = 1; i < length; ++i) + { + if (JSON_HEDLEY_UNLIKELY(std::char_traits::to_char_type(get()) != literal_text[i])) + { + error_message = "invalid literal"; + return token_type::parse_error; + } + } + return return_type; + } + + ///////////////////// + // input management + ///////////////////// + + /// reset token_buffer; current character is beginning of token + void reset() noexcept + { + token_buffer.clear(); + token_string.clear(); + token_string.push_back(std::char_traits::to_char_type(current)); + } + + /* + @brief get next character from the input + + This function provides the interface to the used input adapter. It does + not throw in case the input reached EOF, but returns a + `std::char_traits::eof()` in that case. Stores the scanned characters + for use in error messages. 
+ + @return character read from the input + */ + char_int_type get() + { + ++position.chars_read_total; + ++position.chars_read_current_line; + + if (next_unget) + { + // just reset the next_unget variable and work with current + next_unget = false; + } + else + { + current = ia.get_character(); + } + + if (JSON_HEDLEY_LIKELY(current != std::char_traits::eof())) + { + token_string.push_back(std::char_traits::to_char_type(current)); + } + + if (current == '\n') + { + ++position.lines_read; + position.chars_read_current_line = 0; + } + + return current; + } + + /*! + @brief unget current character (read it again on next get) + + We implement unget by setting variable next_unget to true. The input is not + changed - we just simulate ungetting by modifying chars_read_total, + chars_read_current_line, and token_string. The next call to get() will + behave as if the unget character is read again. + */ + void unget() + { + next_unget = true; + + --position.chars_read_total; + + // in case we "unget" a newline, we have to also decrement the lines_read + if (position.chars_read_current_line == 0) + { + if (position.lines_read > 0) + { + --position.lines_read; + } + } + else + { + --position.chars_read_current_line; + } + + if (JSON_HEDLEY_LIKELY(current != std::char_traits::eof())) + { + JSON_ASSERT(!token_string.empty()); + token_string.pop_back(); + } + } + + /// add a character to token_buffer + void add(char_int_type c) + { + token_buffer.push_back(static_cast(c)); + } + + public: + ///////////////////// + // value getters + ///////////////////// + + /// return integer value + constexpr number_integer_t get_number_integer() const noexcept + { + return value_integer; + } + + /// return unsigned integer value + constexpr number_unsigned_t get_number_unsigned() const noexcept + { + return value_unsigned; + } + + /// return floating-point value + constexpr number_float_t get_number_float() const noexcept + { + return value_float; + } + + /// return current string value (implicitly resets the token; useful only once) + string_t& get_string() + { + return token_buffer; + } + + ///////////////////// + // diagnostics + ///////////////////// + + /// return position of last read token + constexpr position_t get_position() const noexcept + { + return position; + } + + /// return the last read token (for errors only). Will never contain EOF + /// (an arbitrary value that is not a valid char value, often -1), because + /// 255 may legitimately occur. May contain NUL, which should be escaped. + std::string get_token_string() const + { + // escape control characters + std::string result; + for (const auto c : token_string) + { + if (static_cast(c) <= '\x1F') + { + // escape control characters + std::array cs{{}}; + (std::snprintf)(cs.data(), cs.size(), "", static_cast(c)); // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg) + result += cs.data(); + } + else + { + // add character as is + result.push_back(static_cast(c)); + } + } + + return result; + } + + /// return syntax error message + JSON_HEDLEY_RETURNS_NON_NULL + constexpr const char* get_error_message() const noexcept + { + return error_message; + } + + ///////////////////// + // actual scanner + ///////////////////// + + /*! 
+ @brief skip the UTF-8 byte order mark + @return true iff there is no BOM or the correct BOM has been skipped + */ + bool skip_bom() + { + if (get() == 0xEF) + { + // check if we completely parse the BOM + return get() == 0xBB && get() == 0xBF; + } + + // the first character is not the beginning of the BOM; unget it to + // process is later + unget(); + return true; + } + + void skip_whitespace() + { + do + { + get(); + } + while (current == ' ' || current == '\t' || current == '\n' || current == '\r'); + } + + token_type scan() + { + // initially, skip the BOM + if (position.chars_read_total == 0 && !skip_bom()) + { + error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given"; + return token_type::parse_error; + } + + // read next character and ignore whitespace + skip_whitespace(); + + // ignore comments + while (ignore_comments && current == '/') + { + if (!scan_comment()) + { + return token_type::parse_error; + } + + // skip following whitespace + skip_whitespace(); + } + + switch (current) + { + // structural characters + case '[': + return token_type::begin_array; + case ']': + return token_type::end_array; + case '{': + return token_type::begin_object; + case '}': + return token_type::end_object; + case ':': + return token_type::name_separator; + case ',': + return token_type::value_separator; + + // literals + case 't': + { + std::array true_literal = {{char_type('t'), char_type('r'), char_type('u'), char_type('e')}}; + return scan_literal(true_literal.data(), true_literal.size(), token_type::literal_true); + } + case 'f': + { + std::array false_literal = {{char_type('f'), char_type('a'), char_type('l'), char_type('s'), char_type('e')}}; + return scan_literal(false_literal.data(), false_literal.size(), token_type::literal_false); + } + case 'n': + { + std::array null_literal = {{char_type('n'), char_type('u'), char_type('l'), char_type('l')}}; + return scan_literal(null_literal.data(), null_literal.size(), token_type::literal_null); + } + + // string + case '\"': + return scan_string(); + + // number + case '-': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + return scan_number(); + + // end of input (the null byte is needed when parsing from + // string literals) + case '\0': + case std::char_traits::eof(): + return token_type::end_of_input; + + // error + default: + error_message = "invalid literal"; + return token_type::parse_error; + } + } + + private: + /// input adapter + InputAdapterType ia; + + /// whether comments should be ignored (true) or signaled as errors (false) + const bool ignore_comments = false; + + /// the current character + char_int_type current = std::char_traits::eof(); + + /// whether the next get() call should just return current + bool next_unget = false; + + /// the start position of the current token + position_t position {}; + + /// raw input token string (for error messages) + std::vector token_string {}; + + /// buffer for variable-length tokens (numbers, strings) + string_t token_buffer {}; + + /// a description of occurred lexer errors + const char* error_message = ""; + + // number values + number_integer_t value_integer = 0; + number_unsigned_t value_unsigned = 0; + number_float_t value_float = 0; + + /// the decimal point + const char_int_type decimal_point_char = '.'; +}; +} // namespace detail +} // namespace nlohmann + +// #include + +// #include + + +#include // size_t +#include // declval +#include // string + +// #include + +// #include + + +namespace nlohmann 
+{ +namespace detail +{ +template +using null_function_t = decltype(std::declval().null()); + +template +using boolean_function_t = + decltype(std::declval().boolean(std::declval())); + +template +using number_integer_function_t = + decltype(std::declval().number_integer(std::declval())); + +template +using number_unsigned_function_t = + decltype(std::declval().number_unsigned(std::declval())); + +template +using number_float_function_t = decltype(std::declval().number_float( + std::declval(), std::declval())); + +template +using string_function_t = + decltype(std::declval().string(std::declval())); + +template +using binary_function_t = + decltype(std::declval().binary(std::declval())); + +template +using start_object_function_t = + decltype(std::declval().start_object(std::declval())); + +template +using key_function_t = + decltype(std::declval().key(std::declval())); + +template +using end_object_function_t = decltype(std::declval().end_object()); + +template +using start_array_function_t = + decltype(std::declval().start_array(std::declval())); + +template +using end_array_function_t = decltype(std::declval().end_array()); + +template +using parse_error_function_t = decltype(std::declval().parse_error( + std::declval(), std::declval(), + std::declval())); + +template +struct is_sax +{ + private: + static_assert(is_basic_json::value, + "BasicJsonType must be of type basic_json<...>"); + + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + using string_t = typename BasicJsonType::string_t; + using binary_t = typename BasicJsonType::binary_t; + using exception_t = typename BasicJsonType::exception; + + public: + static constexpr bool value = + is_detected_exact::value && + is_detected_exact::value && + is_detected_exact::value && + is_detected_exact::value && + is_detected_exact::value && + is_detected_exact::value && + is_detected_exact::value && + is_detected_exact::value && + is_detected_exact::value && + is_detected_exact::value && + is_detected_exact::value && + is_detected_exact::value && + is_detected_exact::value; +}; + +template +struct is_sax_static_asserts +{ + private: + static_assert(is_basic_json::value, + "BasicJsonType must be of type basic_json<...>"); + + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + using string_t = typename BasicJsonType::string_t; + using binary_t = typename BasicJsonType::binary_t; + using exception_t = typename BasicJsonType::exception; + + public: + static_assert(is_detected_exact::value, + "Missing/invalid function: bool null()"); + static_assert(is_detected_exact::value, + "Missing/invalid function: bool boolean(bool)"); + static_assert(is_detected_exact::value, + "Missing/invalid function: bool boolean(bool)"); + static_assert( + is_detected_exact::value, + "Missing/invalid function: bool number_integer(number_integer_t)"); + static_assert( + is_detected_exact::value, + "Missing/invalid function: bool number_unsigned(number_unsigned_t)"); + static_assert(is_detected_exact::value, + "Missing/invalid function: bool number_float(number_float_t, const string_t&)"); + static_assert( + is_detected_exact::value, + "Missing/invalid function: bool string(string_t&)"); + static_assert( + is_detected_exact::value, + "Missing/invalid function: bool 
binary(binary_t&)"); + static_assert(is_detected_exact::value, + "Missing/invalid function: bool start_object(std::size_t)"); + static_assert(is_detected_exact::value, + "Missing/invalid function: bool key(string_t&)"); + static_assert(is_detected_exact::value, + "Missing/invalid function: bool end_object()"); + static_assert(is_detected_exact::value, + "Missing/invalid function: bool start_array(std::size_t)"); + static_assert(is_detected_exact::value, + "Missing/invalid function: bool end_array()"); + static_assert( + is_detected_exact::value, + "Missing/invalid function: bool parse_error(std::size_t, const " + "std::string&, const exception&)"); +}; +} // namespace detail +} // namespace nlohmann + +// #include + + +namespace nlohmann +{ +namespace detail +{ + +/// how to treat CBOR tags +enum class cbor_tag_handler_t +{ + error, ///< throw a parse_error exception in case of a tag + ignore ///< ignore tags +}; + +/*! +@brief determine system byte order + +@return true if and only if system's byte order is little endian + +@note from https://stackoverflow.com/a/1001328/266378 +*/ +static inline bool little_endianess(int num = 1) noexcept +{ + return *reinterpret_cast(&num) == 1; +} + + +/////////////////// +// binary reader // +/////////////////// + +/*! +@brief deserialization of CBOR, MessagePack, and UBJSON values +*/ +template> +class binary_reader +{ + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + using string_t = typename BasicJsonType::string_t; + using binary_t = typename BasicJsonType::binary_t; + using json_sax_t = SAX; + using char_type = typename InputAdapterType::char_type; + using char_int_type = typename std::char_traits::int_type; + + public: + /*! + @brief create a binary reader + + @param[in] adapter input adapter to read from + */ + explicit binary_reader(InputAdapterType&& adapter) noexcept : ia(std::move(adapter)) + { + (void)detail::is_sax_static_asserts {}; + } + + // make class move-only + binary_reader(const binary_reader&) = delete; + binary_reader(binary_reader&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor) + binary_reader& operator=(const binary_reader&) = delete; + binary_reader& operator=(binary_reader&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor) + ~binary_reader() = default; + + /*! 
+ @param[in] format the binary format to parse + @param[in] sax_ a SAX event processor + @param[in] strict whether to expect the input to be consumed completed + @param[in] tag_handler how to treat CBOR tags + + @return whether parsing was successful + */ + JSON_HEDLEY_NON_NULL(3) + bool sax_parse(const input_format_t format, + json_sax_t* sax_, + const bool strict = true, + const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error) + { + sax = sax_; + bool result = false; + + switch (format) + { + case input_format_t::bson: + result = parse_bson_internal(); + break; + + case input_format_t::cbor: + result = parse_cbor_internal(true, tag_handler); + break; + + case input_format_t::msgpack: + result = parse_msgpack_internal(); + break; + + case input_format_t::ubjson: + result = parse_ubjson_internal(); + break; + + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE + } + + // strict mode: next byte must be EOF + if (result && strict) + { + if (format == input_format_t::ubjson) + { + get_ignore_noop(); + } + else + { + get(); + } + + if (JSON_HEDLEY_UNLIKELY(current != std::char_traits::eof())) + { + return sax->parse_error(chars_read, get_token_string(), + parse_error::create(110, chars_read, exception_message(format, "expected end of input; last byte: 0x" + get_token_string(), "value"), BasicJsonType())); + } + } + + return result; + } + + private: + ////////// + // BSON // + ////////// + + /*! + @brief Reads in a BSON-object and passes it to the SAX-parser. + @return whether a valid BSON-value was passed to the SAX parser + */ + bool parse_bson_internal() + { + std::int32_t document_size{}; + get_number(input_format_t::bson, document_size); + + if (JSON_HEDLEY_UNLIKELY(!sax->start_object(std::size_t(-1)))) + { + return false; + } + + if (JSON_HEDLEY_UNLIKELY(!parse_bson_element_list(/*is_array*/false))) + { + return false; + } + + return sax->end_object(); + } + + /*! + @brief Parses a C-style string from the BSON input. + @param[in,out] result A reference to the string variable where the read + string is to be stored. + @return `true` if the \x00-byte indicating the end of the string was + encountered before the EOF; false` indicates an unexpected EOF. + */ + bool get_bson_cstr(string_t& result) + { + auto out = std::back_inserter(result); + while (true) + { + get(); + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::bson, "cstring"))) + { + return false; + } + if (current == 0x00) + { + return true; + } + *out++ = static_cast(current); + } + } + + /*! + @brief Parses a zero-terminated string of length @a len from the BSON + input. + @param[in] len The length (including the zero-byte at the end) of the + string to be read. + @param[in,out] result A reference to the string variable where the read + string is to be stored. + @tparam NumberType The type of the length @a len + @pre len >= 1 + @return `true` if the string was successfully parsed + */ + template + bool get_bson_string(const NumberType len, string_t& result) + { + if (JSON_HEDLEY_UNLIKELY(len < 1)) + { + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::bson, "string length must be at least 1, is " + std::to_string(len), "string"), BasicJsonType())); + } + + return get_string(input_format_t::bson, len - static_cast(1), result) && get() != std::char_traits::eof(); + } + + /*! 
+ @brief Parses a byte array input of length @a len from the BSON input. + @param[in] len The length of the byte array to be read. + @param[in,out] result A reference to the binary variable where the read + array is to be stored. + @tparam NumberType The type of the length @a len + @pre len >= 0 + @return `true` if the byte array was successfully parsed + */ + template + bool get_bson_binary(const NumberType len, binary_t& result) + { + if (JSON_HEDLEY_UNLIKELY(len < 0)) + { + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::bson, "byte array length cannot be negative, is " + std::to_string(len), "binary"), BasicJsonType())); + } + + // All BSON binary values have a subtype + std::uint8_t subtype{}; + get_number(input_format_t::bson, subtype); + result.set_subtype(subtype); + + return get_binary(input_format_t::bson, len, result); + } + + /*! + @brief Read a BSON document element of the given @a element_type. + @param[in] element_type The BSON element type, c.f. http://bsonspec.org/spec.html + @param[in] element_type_parse_position The position in the input stream, + where the `element_type` was read. + @warning Not all BSON element types are supported yet. An unsupported + @a element_type will give rise to a parse_error.114: + Unsupported BSON record type 0x... + @return whether a valid BSON-object/array was passed to the SAX parser + */ + bool parse_bson_element_internal(const char_int_type element_type, + const std::size_t element_type_parse_position) + { + switch (element_type) + { + case 0x01: // double + { + double number{}; + return get_number(input_format_t::bson, number) && sax->number_float(static_cast(number), ""); + } + + case 0x02: // string + { + std::int32_t len{}; + string_t value; + return get_number(input_format_t::bson, len) && get_bson_string(len, value) && sax->string(value); + } + + case 0x03: // object + { + return parse_bson_internal(); + } + + case 0x04: // array + { + return parse_bson_array(); + } + + case 0x05: // binary + { + std::int32_t len{}; + binary_t value; + return get_number(input_format_t::bson, len) && get_bson_binary(len, value) && sax->binary(value); + } + + case 0x08: // boolean + { + return sax->boolean(get() != 0); + } + + case 0x0A: // null + { + return sax->null(); + } + + case 0x10: // int32 + { + std::int32_t value{}; + return get_number(input_format_t::bson, value) && sax->number_integer(value); + } + + case 0x12: // int64 + { + std::int64_t value{}; + return get_number(input_format_t::bson, value) && sax->number_integer(value); + } + + default: // anything else not supported (yet) + { + std::array cr{{}}; + (std::snprintf)(cr.data(), cr.size(), "%.2hhX", static_cast(element_type)); // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg) + return sax->parse_error(element_type_parse_position, std::string(cr.data()), parse_error::create(114, element_type_parse_position, "Unsupported BSON record type 0x" + std::string(cr.data()), BasicJsonType())); + } + } + } + + /*! + @brief Read a BSON element list (as specified in the BSON-spec) + + The same binary layout is used for objects and arrays, hence it must be + indicated with the argument @a is_array which one is expected + (true --> array, false --> object). + + @param[in] is_array Determines if the element list being read is to be + treated as an object (@a is_array == false), or as an + array (@a is_array == true). 
+ @return whether a valid BSON-object/array was passed to the SAX parser + */ + bool parse_bson_element_list(const bool is_array) + { + string_t key; + + while (auto element_type = get()) + { + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::bson, "element list"))) + { + return false; + } + + const std::size_t element_type_parse_position = chars_read; + if (JSON_HEDLEY_UNLIKELY(!get_bson_cstr(key))) + { + return false; + } + + if (!is_array && !sax->key(key)) + { + return false; + } + + if (JSON_HEDLEY_UNLIKELY(!parse_bson_element_internal(element_type, element_type_parse_position))) + { + return false; + } + + // get_bson_cstr only appends + key.clear(); + } + + return true; + } + + /*! + @brief Reads an array from the BSON input and passes it to the SAX-parser. + @return whether a valid BSON-array was passed to the SAX parser + */ + bool parse_bson_array() + { + std::int32_t document_size{}; + get_number(input_format_t::bson, document_size); + + if (JSON_HEDLEY_UNLIKELY(!sax->start_array(std::size_t(-1)))) + { + return false; + } + + if (JSON_HEDLEY_UNLIKELY(!parse_bson_element_list(/*is_array*/true))) + { + return false; + } + + return sax->end_array(); + } + + ////////// + // CBOR // + ////////// + + /*! + @param[in] get_char whether a new character should be retrieved from the + input (true) or whether the last read character should + be considered instead (false) + @param[in] tag_handler how CBOR tags should be treated + + @return whether a valid CBOR value was passed to the SAX parser + */ + bool parse_cbor_internal(const bool get_char, + const cbor_tag_handler_t tag_handler) + { + switch (get_char ? get() : current) + { + // EOF + case std::char_traits::eof(): + return unexpect_eof(input_format_t::cbor, "value"); + + // Integer 0x00..0x17 (0..23) + case 0x00: + case 0x01: + case 0x02: + case 0x03: + case 0x04: + case 0x05: + case 0x06: + case 0x07: + case 0x08: + case 0x09: + case 0x0A: + case 0x0B: + case 0x0C: + case 0x0D: + case 0x0E: + case 0x0F: + case 0x10: + case 0x11: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + case 0x16: + case 0x17: + return sax->number_unsigned(static_cast(current)); + + case 0x18: // Unsigned integer (one-byte uint8_t follows) + { + std::uint8_t number{}; + return get_number(input_format_t::cbor, number) && sax->number_unsigned(number); + } + + case 0x19: // Unsigned integer (two-byte uint16_t follows) + { + std::uint16_t number{}; + return get_number(input_format_t::cbor, number) && sax->number_unsigned(number); + } + + case 0x1A: // Unsigned integer (four-byte uint32_t follows) + { + std::uint32_t number{}; + return get_number(input_format_t::cbor, number) && sax->number_unsigned(number); + } + + case 0x1B: // Unsigned integer (eight-byte uint64_t follows) + { + std::uint64_t number{}; + return get_number(input_format_t::cbor, number) && sax->number_unsigned(number); + } + + // Negative integer -1-0x00..-1-0x17 (-1..-24) + case 0x20: + case 0x21: + case 0x22: + case 0x23: + case 0x24: + case 0x25: + case 0x26: + case 0x27: + case 0x28: + case 0x29: + case 0x2A: + case 0x2B: + case 0x2C: + case 0x2D: + case 0x2E: + case 0x2F: + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + return sax->number_integer(static_cast(0x20 - 1 - current)); + + case 0x38: // Negative integer (one-byte uint8_t follows) + { + std::uint8_t number{}; + return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast(-1) - number); + } + + case 0x39: // Negative integer -1-n (two-byte 
uint16_t follows) + { + std::uint16_t number{}; + return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast(-1) - number); + } + + case 0x3A: // Negative integer -1-n (four-byte uint32_t follows) + { + std::uint32_t number{}; + return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast(-1) - number); + } + + case 0x3B: // Negative integer -1-n (eight-byte uint64_t follows) + { + std::uint64_t number{}; + return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast(-1) + - static_cast(number)); + } + + // Binary data (0x00..0x17 bytes follow) + case 0x40: + case 0x41: + case 0x42: + case 0x43: + case 0x44: + case 0x45: + case 0x46: + case 0x47: + case 0x48: + case 0x49: + case 0x4A: + case 0x4B: + case 0x4C: + case 0x4D: + case 0x4E: + case 0x4F: + case 0x50: + case 0x51: + case 0x52: + case 0x53: + case 0x54: + case 0x55: + case 0x56: + case 0x57: + case 0x58: // Binary data (one-byte uint8_t for n follows) + case 0x59: // Binary data (two-byte uint16_t for n follow) + case 0x5A: // Binary data (four-byte uint32_t for n follow) + case 0x5B: // Binary data (eight-byte uint64_t for n follow) + case 0x5F: // Binary data (indefinite length) + { + binary_t b; + return get_cbor_binary(b) && sax->binary(b); + } + + // UTF-8 string (0x00..0x17 bytes follow) + case 0x60: + case 0x61: + case 0x62: + case 0x63: + case 0x64: + case 0x65: + case 0x66: + case 0x67: + case 0x68: + case 0x69: + case 0x6A: + case 0x6B: + case 0x6C: + case 0x6D: + case 0x6E: + case 0x6F: + case 0x70: + case 0x71: + case 0x72: + case 0x73: + case 0x74: + case 0x75: + case 0x76: + case 0x77: + case 0x78: // UTF-8 string (one-byte uint8_t for n follows) + case 0x79: // UTF-8 string (two-byte uint16_t for n follow) + case 0x7A: // UTF-8 string (four-byte uint32_t for n follow) + case 0x7B: // UTF-8 string (eight-byte uint64_t for n follow) + case 0x7F: // UTF-8 string (indefinite length) + { + string_t s; + return get_cbor_string(s) && sax->string(s); + } + + // array (0x00..0x17 data items follow) + case 0x80: + case 0x81: + case 0x82: + case 0x83: + case 0x84: + case 0x85: + case 0x86: + case 0x87: + case 0x88: + case 0x89: + case 0x8A: + case 0x8B: + case 0x8C: + case 0x8D: + case 0x8E: + case 0x8F: + case 0x90: + case 0x91: + case 0x92: + case 0x93: + case 0x94: + case 0x95: + case 0x96: + case 0x97: + return get_cbor_array(static_cast(static_cast(current) & 0x1Fu), tag_handler); + + case 0x98: // array (one-byte uint8_t for n follows) + { + std::uint8_t len{}; + return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast(len), tag_handler); + } + + case 0x99: // array (two-byte uint16_t for n follow) + { + std::uint16_t len{}; + return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast(len), tag_handler); + } + + case 0x9A: // array (four-byte uint32_t for n follow) + { + std::uint32_t len{}; + return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast(len), tag_handler); + } + + case 0x9B: // array (eight-byte uint64_t for n follow) + { + std::uint64_t len{}; + return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast(len), tag_handler); + } + + case 0x9F: // array (indefinite length) + return get_cbor_array(std::size_t(-1), tag_handler); + + // map (0x00..0x17 pairs of data items follow) + case 0xA0: + case 0xA1: + case 0xA2: + case 0xA3: + case 0xA4: + case 0xA5: + case 0xA6: + case 0xA7: + case 0xA8: + case 0xA9: + case 0xAA: + case 0xAB: + case 0xAC: + case 0xAD: + case 0xAE: + case 
0xAF: + case 0xB0: + case 0xB1: + case 0xB2: + case 0xB3: + case 0xB4: + case 0xB5: + case 0xB6: + case 0xB7: + return get_cbor_object(static_cast(static_cast(current) & 0x1Fu), tag_handler); + + case 0xB8: // map (one-byte uint8_t for n follows) + { + std::uint8_t len{}; + return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast(len), tag_handler); + } + + case 0xB9: // map (two-byte uint16_t for n follow) + { + std::uint16_t len{}; + return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast(len), tag_handler); + } + + case 0xBA: // map (four-byte uint32_t for n follow) + { + std::uint32_t len{}; + return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast(len), tag_handler); + } + + case 0xBB: // map (eight-byte uint64_t for n follow) + { + std::uint64_t len{}; + return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast(len), tag_handler); + } + + case 0xBF: // map (indefinite length) + return get_cbor_object(std::size_t(-1), tag_handler); + + case 0xC6: // tagged item + case 0xC7: + case 0xC8: + case 0xC9: + case 0xCA: + case 0xCB: + case 0xCC: + case 0xCD: + case 0xCE: + case 0xCF: + case 0xD0: + case 0xD1: + case 0xD2: + case 0xD3: + case 0xD4: + case 0xD8: // tagged item (1 bytes follow) + case 0xD9: // tagged item (2 bytes follow) + case 0xDA: // tagged item (4 bytes follow) + case 0xDB: // tagged item (8 bytes follow) + { + switch (tag_handler) + { + case cbor_tag_handler_t::error: + { + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::cbor, "invalid byte: 0x" + last_token, "value"), BasicJsonType())); + } + + case cbor_tag_handler_t::ignore: + { + switch (current) + { + case 0xD8: + { + std::uint8_t len{}; + get_number(input_format_t::cbor, len); + break; + } + case 0xD9: + { + std::uint16_t len{}; + get_number(input_format_t::cbor, len); + break; + } + case 0xDA: + { + std::uint32_t len{}; + get_number(input_format_t::cbor, len); + break; + } + case 0xDB: + { + std::uint64_t len{}; + get_number(input_format_t::cbor, len); + break; + } + default: + break; + } + return parse_cbor_internal(true, tag_handler); + } + + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE + return false; // LCOV_EXCL_LINE + } + } + + case 0xF4: // false + return sax->boolean(false); + + case 0xF5: // true + return sax->boolean(true); + + case 0xF6: // null + return sax->null(); + + case 0xF9: // Half-Precision Float (two-byte IEEE 754) + { + const auto byte1_raw = get(); + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "number"))) + { + return false; + } + const auto byte2_raw = get(); + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "number"))) + { + return false; + } + + const auto byte1 = static_cast(byte1_raw); + const auto byte2 = static_cast(byte2_raw); + + // code from RFC 7049, Appendix D, Figure 3: + // As half-precision floating-point numbers were only added + // to IEEE 754 in 2008, today's programming platforms often + // still only have limited support for them. It is very + // easy to include at least decoding support for them even + // without such support. An example of a small decoder for + // half-precision floating-point numbers in the C language + // is shown in Fig. 3. 
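+                // Worked example: the bytes 0x3C 0x00 give half = 0x3C00, i.e.
+                // exp = 15 and mant = 0, so the decoder below computes
+                // ldexp(0 + 1024, 15 - 25) = 1024 * 2^-10 = 1.0; with the sign
+                // bit set (half = 0xBC00), the same magnitude is returned as -1.0.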
+ const auto half = static_cast((byte1 << 8u) + byte2); + const double val = [&half] + { + const int exp = (half >> 10u) & 0x1Fu; + const unsigned int mant = half & 0x3FFu; + JSON_ASSERT(0 <= exp&& exp <= 32); + JSON_ASSERT(mant <= 1024); + switch (exp) + { + case 0: + return std::ldexp(mant, -24); + case 31: + return (mant == 0) + ? std::numeric_limits::infinity() + : std::numeric_limits::quiet_NaN(); + default: + return std::ldexp(mant + 1024, exp - 25); + } + }(); + return sax->number_float((half & 0x8000u) != 0 + ? static_cast(-val) + : static_cast(val), ""); + } + + case 0xFA: // Single-Precision Float (four-byte IEEE 754) + { + float number{}; + return get_number(input_format_t::cbor, number) && sax->number_float(static_cast(number), ""); + } + + case 0xFB: // Double-Precision Float (eight-byte IEEE 754) + { + double number{}; + return get_number(input_format_t::cbor, number) && sax->number_float(static_cast(number), ""); + } + + default: // anything else (0xFF is handled inside the other types) + { + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::cbor, "invalid byte: 0x" + last_token, "value"), BasicJsonType())); + } + } + } + + /*! + @brief reads a CBOR string + + This function first reads starting bytes to determine the expected + string length and then copies this number of bytes into a string. + Additionally, CBOR's strings with indefinite lengths are supported. + + @param[out] result created string + + @return whether string creation completed + */ + bool get_cbor_string(string_t& result) + { + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "string"))) + { + return false; + } + + switch (current) + { + // UTF-8 string (0x00..0x17 bytes follow) + case 0x60: + case 0x61: + case 0x62: + case 0x63: + case 0x64: + case 0x65: + case 0x66: + case 0x67: + case 0x68: + case 0x69: + case 0x6A: + case 0x6B: + case 0x6C: + case 0x6D: + case 0x6E: + case 0x6F: + case 0x70: + case 0x71: + case 0x72: + case 0x73: + case 0x74: + case 0x75: + case 0x76: + case 0x77: + { + return get_string(input_format_t::cbor, static_cast(current) & 0x1Fu, result); + } + + case 0x78: // UTF-8 string (one-byte uint8_t for n follows) + { + std::uint8_t len{}; + return get_number(input_format_t::cbor, len) && get_string(input_format_t::cbor, len, result); + } + + case 0x79: // UTF-8 string (two-byte uint16_t for n follow) + { + std::uint16_t len{}; + return get_number(input_format_t::cbor, len) && get_string(input_format_t::cbor, len, result); + } + + case 0x7A: // UTF-8 string (four-byte uint32_t for n follow) + { + std::uint32_t len{}; + return get_number(input_format_t::cbor, len) && get_string(input_format_t::cbor, len, result); + } + + case 0x7B: // UTF-8 string (eight-byte uint64_t for n follow) + { + std::uint64_t len{}; + return get_number(input_format_t::cbor, len) && get_string(input_format_t::cbor, len, result); + } + + case 0x7F: // UTF-8 string (indefinite length) + { + while (get() != 0xFF) + { + string_t chunk; + if (!get_cbor_string(chunk)) + { + return false; + } + result.append(chunk); + } + return true; + } + + default: + { + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::cbor, "expected length specification (0x60-0x7B) or indefinite string type (0x7F); last byte: 0x" + last_token, "string"), BasicJsonType())); + } + } + } + + /*! 
+ @brief reads a CBOR byte array + + This function first reads starting bytes to determine the expected + byte array length and then copies this number of bytes into the byte array. + Additionally, CBOR's byte arrays with indefinite lengths are supported. + + @param[out] result created byte array + + @return whether byte array creation completed + */ + bool get_cbor_binary(binary_t& result) + { + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "binary"))) + { + return false; + } + + switch (current) + { + // Binary data (0x00..0x17 bytes follow) + case 0x40: + case 0x41: + case 0x42: + case 0x43: + case 0x44: + case 0x45: + case 0x46: + case 0x47: + case 0x48: + case 0x49: + case 0x4A: + case 0x4B: + case 0x4C: + case 0x4D: + case 0x4E: + case 0x4F: + case 0x50: + case 0x51: + case 0x52: + case 0x53: + case 0x54: + case 0x55: + case 0x56: + case 0x57: + { + return get_binary(input_format_t::cbor, static_cast(current) & 0x1Fu, result); + } + + case 0x58: // Binary data (one-byte uint8_t for n follows) + { + std::uint8_t len{}; + return get_number(input_format_t::cbor, len) && + get_binary(input_format_t::cbor, len, result); + } + + case 0x59: // Binary data (two-byte uint16_t for n follow) + { + std::uint16_t len{}; + return get_number(input_format_t::cbor, len) && + get_binary(input_format_t::cbor, len, result); + } + + case 0x5A: // Binary data (four-byte uint32_t for n follow) + { + std::uint32_t len{}; + return get_number(input_format_t::cbor, len) && + get_binary(input_format_t::cbor, len, result); + } + + case 0x5B: // Binary data (eight-byte uint64_t for n follow) + { + std::uint64_t len{}; + return get_number(input_format_t::cbor, len) && + get_binary(input_format_t::cbor, len, result); + } + + case 0x5F: // Binary data (indefinite length) + { + while (get() != 0xFF) + { + binary_t chunk; + if (!get_cbor_binary(chunk)) + { + return false; + } + result.insert(result.end(), chunk.begin(), chunk.end()); + } + return true; + } + + default: + { + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::cbor, "expected length specification (0x40-0x5B) or indefinite binary array type (0x5F); last byte: 0x" + last_token, "binary"), BasicJsonType())); + } + } + } + + /*! + @param[in] len the length of the array or std::size_t(-1) for an + array of indefinite size + @param[in] tag_handler how CBOR tags should be treated + @return whether array creation completed + */ + bool get_cbor_array(const std::size_t len, + const cbor_tag_handler_t tag_handler) + { + if (JSON_HEDLEY_UNLIKELY(!sax->start_array(len))) + { + return false; + } + + if (len != std::size_t(-1)) + { + for (std::size_t i = 0; i < len; ++i) + { + if (JSON_HEDLEY_UNLIKELY(!parse_cbor_internal(true, tag_handler))) + { + return false; + } + } + } + else + { + while (get() != 0xFF) + { + if (JSON_HEDLEY_UNLIKELY(!parse_cbor_internal(false, tag_handler))) + { + return false; + } + } + } + + return sax->end_array(); + } + + /*! 
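The indefinite-length branches above (strings, byte arrays, and arrays alike) share one pattern: keep consuming definite-length chunks or elements until the CBOR "break" byte 0xFF appears. A minimal standalone sketch of that pattern for text strings, with hypothetical names and no error handling, is shown below; the input encodes the chunks "foo" and "bar" and decodes to "foobar".

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <string>
    #include <vector>

    // Hypothetical toy decoder: handles only short definite-length text strings
    // (header 0x60..0x77, length in the low 5 bits) and the indefinite form 0x7F ... 0xFF.
    std::string decode_cbor_text(const std::vector<std::uint8_t>& in, std::size_t& pos)
    {
        const std::uint8_t head = in[pos++];
        if (head >= 0x60 && head <= 0x77)
        {
            const std::size_t len = head & 0x1Fu;
            std::string chunk(in.begin() + static_cast<std::ptrdiff_t>(pos),
                              in.begin() + static_cast<std::ptrdiff_t>(pos + len));
            pos += len;
            return chunk;
        }
        if (head == 0x7F)                            // indefinite length
        {
            std::string result;
            while (in[pos] != 0xFF)
                result += decode_cbor_text(in, pos); // each chunk is a definite string
            ++pos;                                   // consume the "break" byte
            return result;
        }
        return {};                                   // other headers not handled here
    }

    int main()
    {
        const std::vector<std::uint8_t> bytes =
            {0x7F, 0x63, 'f', 'o', 'o', 0x63, 'b', 'a', 'r', 0xFF};
        std::size_t pos = 0;
        assert(decode_cbor_text(bytes, pos) == "foobar");
    }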
+ @param[in] len the length of the object or std::size_t(-1) for an + object of indefinite size + @param[in] tag_handler how CBOR tags should be treated + @return whether object creation completed + */ + bool get_cbor_object(const std::size_t len, + const cbor_tag_handler_t tag_handler) + { + if (JSON_HEDLEY_UNLIKELY(!sax->start_object(len))) + { + return false; + } + + string_t key; + if (len != std::size_t(-1)) + { + for (std::size_t i = 0; i < len; ++i) + { + get(); + if (JSON_HEDLEY_UNLIKELY(!get_cbor_string(key) || !sax->key(key))) + { + return false; + } + + if (JSON_HEDLEY_UNLIKELY(!parse_cbor_internal(true, tag_handler))) + { + return false; + } + key.clear(); + } + } + else + { + while (get() != 0xFF) + { + if (JSON_HEDLEY_UNLIKELY(!get_cbor_string(key) || !sax->key(key))) + { + return false; + } + + if (JSON_HEDLEY_UNLIKELY(!parse_cbor_internal(true, tag_handler))) + { + return false; + } + key.clear(); + } + } + + return sax->end_object(); + } + + ///////////// + // MsgPack // + ///////////// + + /*! + @return whether a valid MessagePack value was passed to the SAX parser + */ + bool parse_msgpack_internal() + { + switch (get()) + { + // EOF + case std::char_traits::eof(): + return unexpect_eof(input_format_t::msgpack, "value"); + + // positive fixint + case 0x00: + case 0x01: + case 0x02: + case 0x03: + case 0x04: + case 0x05: + case 0x06: + case 0x07: + case 0x08: + case 0x09: + case 0x0A: + case 0x0B: + case 0x0C: + case 0x0D: + case 0x0E: + case 0x0F: + case 0x10: + case 0x11: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + case 0x16: + case 0x17: + case 0x18: + case 0x19: + case 0x1A: + case 0x1B: + case 0x1C: + case 0x1D: + case 0x1E: + case 0x1F: + case 0x20: + case 0x21: + case 0x22: + case 0x23: + case 0x24: + case 0x25: + case 0x26: + case 0x27: + case 0x28: + case 0x29: + case 0x2A: + case 0x2B: + case 0x2C: + case 0x2D: + case 0x2E: + case 0x2F: + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + case 0x3A: + case 0x3B: + case 0x3C: + case 0x3D: + case 0x3E: + case 0x3F: + case 0x40: + case 0x41: + case 0x42: + case 0x43: + case 0x44: + case 0x45: + case 0x46: + case 0x47: + case 0x48: + case 0x49: + case 0x4A: + case 0x4B: + case 0x4C: + case 0x4D: + case 0x4E: + case 0x4F: + case 0x50: + case 0x51: + case 0x52: + case 0x53: + case 0x54: + case 0x55: + case 0x56: + case 0x57: + case 0x58: + case 0x59: + case 0x5A: + case 0x5B: + case 0x5C: + case 0x5D: + case 0x5E: + case 0x5F: + case 0x60: + case 0x61: + case 0x62: + case 0x63: + case 0x64: + case 0x65: + case 0x66: + case 0x67: + case 0x68: + case 0x69: + case 0x6A: + case 0x6B: + case 0x6C: + case 0x6D: + case 0x6E: + case 0x6F: + case 0x70: + case 0x71: + case 0x72: + case 0x73: + case 0x74: + case 0x75: + case 0x76: + case 0x77: + case 0x78: + case 0x79: + case 0x7A: + case 0x7B: + case 0x7C: + case 0x7D: + case 0x7E: + case 0x7F: + return sax->number_unsigned(static_cast(current)); + + // fixmap + case 0x80: + case 0x81: + case 0x82: + case 0x83: + case 0x84: + case 0x85: + case 0x86: + case 0x87: + case 0x88: + case 0x89: + case 0x8A: + case 0x8B: + case 0x8C: + case 0x8D: + case 0x8E: + case 0x8F: + return get_msgpack_object(static_cast(static_cast(current) & 0x0Fu)); + + // fixarray + case 0x90: + case 0x91: + case 0x92: + case 0x93: + case 0x94: + case 0x95: + case 0x96: + case 0x97: + case 0x98: + case 0x99: + case 0x9A: + case 0x9B: + case 0x9C: + case 0x9D: + case 0x9E: + case 0x9F: + return 
get_msgpack_array(static_cast(static_cast(current) & 0x0Fu)); + + // fixstr + case 0xA0: + case 0xA1: + case 0xA2: + case 0xA3: + case 0xA4: + case 0xA5: + case 0xA6: + case 0xA7: + case 0xA8: + case 0xA9: + case 0xAA: + case 0xAB: + case 0xAC: + case 0xAD: + case 0xAE: + case 0xAF: + case 0xB0: + case 0xB1: + case 0xB2: + case 0xB3: + case 0xB4: + case 0xB5: + case 0xB6: + case 0xB7: + case 0xB8: + case 0xB9: + case 0xBA: + case 0xBB: + case 0xBC: + case 0xBD: + case 0xBE: + case 0xBF: + case 0xD9: // str 8 + case 0xDA: // str 16 + case 0xDB: // str 32 + { + string_t s; + return get_msgpack_string(s) && sax->string(s); + } + + case 0xC0: // nil + return sax->null(); + + case 0xC2: // false + return sax->boolean(false); + + case 0xC3: // true + return sax->boolean(true); + + case 0xC4: // bin 8 + case 0xC5: // bin 16 + case 0xC6: // bin 32 + case 0xC7: // ext 8 + case 0xC8: // ext 16 + case 0xC9: // ext 32 + case 0xD4: // fixext 1 + case 0xD5: // fixext 2 + case 0xD6: // fixext 4 + case 0xD7: // fixext 8 + case 0xD8: // fixext 16 + { + binary_t b; + return get_msgpack_binary(b) && sax->binary(b); + } + + case 0xCA: // float 32 + { + float number{}; + return get_number(input_format_t::msgpack, number) && sax->number_float(static_cast(number), ""); + } + + case 0xCB: // float 64 + { + double number{}; + return get_number(input_format_t::msgpack, number) && sax->number_float(static_cast(number), ""); + } + + case 0xCC: // uint 8 + { + std::uint8_t number{}; + return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number); + } + + case 0xCD: // uint 16 + { + std::uint16_t number{}; + return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number); + } + + case 0xCE: // uint 32 + { + std::uint32_t number{}; + return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number); + } + + case 0xCF: // uint 64 + { + std::uint64_t number{}; + return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number); + } + + case 0xD0: // int 8 + { + std::int8_t number{}; + return get_number(input_format_t::msgpack, number) && sax->number_integer(number); + } + + case 0xD1: // int 16 + { + std::int16_t number{}; + return get_number(input_format_t::msgpack, number) && sax->number_integer(number); + } + + case 0xD2: // int 32 + { + std::int32_t number{}; + return get_number(input_format_t::msgpack, number) && sax->number_integer(number); + } + + case 0xD3: // int 64 + { + std::int64_t number{}; + return get_number(input_format_t::msgpack, number) && sax->number_integer(number); + } + + case 0xDC: // array 16 + { + std::uint16_t len{}; + return get_number(input_format_t::msgpack, len) && get_msgpack_array(static_cast(len)); + } + + case 0xDD: // array 32 + { + std::uint32_t len{}; + return get_number(input_format_t::msgpack, len) && get_msgpack_array(static_cast(len)); + } + + case 0xDE: // map 16 + { + std::uint16_t len{}; + return get_number(input_format_t::msgpack, len) && get_msgpack_object(static_cast(len)); + } + + case 0xDF: // map 32 + { + std::uint32_t len{}; + return get_number(input_format_t::msgpack, len) && get_msgpack_object(static_cast(len)); + } + + // negative fixint + case 0xE0: + case 0xE1: + case 0xE2: + case 0xE3: + case 0xE4: + case 0xE5: + case 0xE6: + case 0xE7: + case 0xE8: + case 0xE9: + case 0xEA: + case 0xEB: + case 0xEC: + case 0xED: + case 0xEE: + case 0xEF: + case 0xF0: + case 0xF1: + case 0xF2: + case 0xF3: + case 0xF4: + case 0xF5: + case 0xF6: + case 0xF7: + case 0xF8: + case 0xF9: + case 0xFA: + case 0xFB: 
+ case 0xFC: + case 0xFD: + case 0xFE: + case 0xFF: + return sax->number_integer(static_cast(current)); + + default: // anything else + { + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::msgpack, "invalid byte: 0x" + last_token, "value"), BasicJsonType())); + } + } + } + + /*! + @brief reads a MessagePack string + + This function first reads starting bytes to determine the expected + string length and then copies this number of bytes into a string. + + @param[out] result created string + + @return whether string creation completed + */ + bool get_msgpack_string(string_t& result) + { + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::msgpack, "string"))) + { + return false; + } + + switch (current) + { + // fixstr + case 0xA0: + case 0xA1: + case 0xA2: + case 0xA3: + case 0xA4: + case 0xA5: + case 0xA6: + case 0xA7: + case 0xA8: + case 0xA9: + case 0xAA: + case 0xAB: + case 0xAC: + case 0xAD: + case 0xAE: + case 0xAF: + case 0xB0: + case 0xB1: + case 0xB2: + case 0xB3: + case 0xB4: + case 0xB5: + case 0xB6: + case 0xB7: + case 0xB8: + case 0xB9: + case 0xBA: + case 0xBB: + case 0xBC: + case 0xBD: + case 0xBE: + case 0xBF: + { + return get_string(input_format_t::msgpack, static_cast(current) & 0x1Fu, result); + } + + case 0xD9: // str 8 + { + std::uint8_t len{}; + return get_number(input_format_t::msgpack, len) && get_string(input_format_t::msgpack, len, result); + } + + case 0xDA: // str 16 + { + std::uint16_t len{}; + return get_number(input_format_t::msgpack, len) && get_string(input_format_t::msgpack, len, result); + } + + case 0xDB: // str 32 + { + std::uint32_t len{}; + return get_number(input_format_t::msgpack, len) && get_string(input_format_t::msgpack, len, result); + } + + default: + { + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::msgpack, "expected length specification (0xA0-0xBF, 0xD9-0xDB); last byte: 0x" + last_token, "string"), BasicJsonType())); + } + } + } + + /*! + @brief reads a MessagePack byte array + + This function first reads starting bytes to determine the expected + byte array length and then copies this number of bytes into a byte array. 
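In the fixstr range handled above (0xA0..0xBF), the type byte itself carries the string length in its low five bits, so no separate size field is read. A short sketch with a hypothetical encode_fixstr helper shows both directions:

    #include <cassert>
    #include <cstdint>
    #include <string>
    #include <vector>

    // Hypothetical helper: encode a short string (at most 31 bytes) as a MessagePack fixstr.
    std::vector<std::uint8_t> encode_fixstr(const std::string& s)
    {
        assert(s.size() <= 31);
        std::vector<std::uint8_t> out;
        out.push_back(static_cast<std::uint8_t>(0xA0u | s.size()));  // length in the low 5 bits
        out.insert(out.end(), s.begin(), s.end());
        return out;
    }

    int main()
    {
        const auto bytes = encode_fixstr("abc");
        assert(bytes[0] == 0xA3);                    // 0xA0 | 3
        assert((bytes[0] & 0x1Fu) == 3);             // the reader recovers the length this way
        assert(std::string(bytes.begin() + 1, bytes.end()) == "abc");
    }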
+ + @param[out] result created byte array + + @return whether byte array creation completed + */ + bool get_msgpack_binary(binary_t& result) + { + // helper function to set the subtype + auto assign_and_return_true = [&result](std::int8_t subtype) + { + result.set_subtype(static_cast(subtype)); + return true; + }; + + switch (current) + { + case 0xC4: // bin 8 + { + std::uint8_t len{}; + return get_number(input_format_t::msgpack, len) && + get_binary(input_format_t::msgpack, len, result); + } + + case 0xC5: // bin 16 + { + std::uint16_t len{}; + return get_number(input_format_t::msgpack, len) && + get_binary(input_format_t::msgpack, len, result); + } + + case 0xC6: // bin 32 + { + std::uint32_t len{}; + return get_number(input_format_t::msgpack, len) && + get_binary(input_format_t::msgpack, len, result); + } + + case 0xC7: // ext 8 + { + std::uint8_t len{}; + std::int8_t subtype{}; + return get_number(input_format_t::msgpack, len) && + get_number(input_format_t::msgpack, subtype) && + get_binary(input_format_t::msgpack, len, result) && + assign_and_return_true(subtype); + } + + case 0xC8: // ext 16 + { + std::uint16_t len{}; + std::int8_t subtype{}; + return get_number(input_format_t::msgpack, len) && + get_number(input_format_t::msgpack, subtype) && + get_binary(input_format_t::msgpack, len, result) && + assign_and_return_true(subtype); + } + + case 0xC9: // ext 32 + { + std::uint32_t len{}; + std::int8_t subtype{}; + return get_number(input_format_t::msgpack, len) && + get_number(input_format_t::msgpack, subtype) && + get_binary(input_format_t::msgpack, len, result) && + assign_and_return_true(subtype); + } + + case 0xD4: // fixext 1 + { + std::int8_t subtype{}; + return get_number(input_format_t::msgpack, subtype) && + get_binary(input_format_t::msgpack, 1, result) && + assign_and_return_true(subtype); + } + + case 0xD5: // fixext 2 + { + std::int8_t subtype{}; + return get_number(input_format_t::msgpack, subtype) && + get_binary(input_format_t::msgpack, 2, result) && + assign_and_return_true(subtype); + } + + case 0xD6: // fixext 4 + { + std::int8_t subtype{}; + return get_number(input_format_t::msgpack, subtype) && + get_binary(input_format_t::msgpack, 4, result) && + assign_and_return_true(subtype); + } + + case 0xD7: // fixext 8 + { + std::int8_t subtype{}; + return get_number(input_format_t::msgpack, subtype) && + get_binary(input_format_t::msgpack, 8, result) && + assign_and_return_true(subtype); + } + + case 0xD8: // fixext 16 + { + std::int8_t subtype{}; + return get_number(input_format_t::msgpack, subtype) && + get_binary(input_format_t::msgpack, 16, result) && + assign_and_return_true(subtype); + } + + default: // LCOV_EXCL_LINE + return false; // LCOV_EXCL_LINE + } + } + + /*! + @param[in] len the length of the array + @return whether array creation completed + */ + bool get_msgpack_array(const std::size_t len) + { + if (JSON_HEDLEY_UNLIKELY(!sax->start_array(len))) + { + return false; + } + + for (std::size_t i = 0; i < len; ++i) + { + if (JSON_HEDLEY_UNLIKELY(!parse_msgpack_internal())) + { + return false; + } + } + + return sax->end_array(); + } + + /*! 
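The ext and fixext branches above surface their payload as a binary value whose subtype is the ext type byte. Assuming the bundled single header is reachable as <nlohmann/json.hpp> (the include path is only an assumption for this sketch), the effect is visible through the public API:

    #include <cassert>
    #include <cstdint>
    #include <vector>
    #include <nlohmann/json.hpp>

    int main()
    {
        // fixext 1: marker 0xD4, ext type byte 0x01, one payload byte 0x2A
        const std::vector<std::uint8_t> bytes = {0xD4, 0x01, 0x2A};
        const auto j = nlohmann::json::from_msgpack(bytes);

        assert(j.is_binary());                   // ext data arrives as a binary value ...
        assert(j.get_binary().has_subtype());    // ... carrying the ext type as its subtype
        assert(j.get_binary().subtype() == 1);
        assert(j.get_binary().size() == 1 && j.get_binary()[0] == 0x2A);
    }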
+ @param[in] len the length of the object + @return whether object creation completed + */ + bool get_msgpack_object(const std::size_t len) + { + if (JSON_HEDLEY_UNLIKELY(!sax->start_object(len))) + { + return false; + } + + string_t key; + for (std::size_t i = 0; i < len; ++i) + { + get(); + if (JSON_HEDLEY_UNLIKELY(!get_msgpack_string(key) || !sax->key(key))) + { + return false; + } + + if (JSON_HEDLEY_UNLIKELY(!parse_msgpack_internal())) + { + return false; + } + key.clear(); + } + + return sax->end_object(); + } + + //////////// + // UBJSON // + //////////// + + /*! + @param[in] get_char whether a new character should be retrieved from the + input (true, default) or whether the last read + character should be considered instead + + @return whether a valid UBJSON value was passed to the SAX parser + */ + bool parse_ubjson_internal(const bool get_char = true) + { + return get_ubjson_value(get_char ? get_ignore_noop() : current); + } + + /*! + @brief reads a UBJSON string + + This function is either called after reading the 'S' byte explicitly + indicating a string, or in case of an object key where the 'S' byte can be + left out. + + @param[out] result created string + @param[in] get_char whether a new character should be retrieved from the + input (true, default) or whether the last read + character should be considered instead + + @return whether string creation completed + */ + bool get_ubjson_string(string_t& result, const bool get_char = true) + { + if (get_char) + { + get(); // TODO(niels): may we ignore N here? + } + + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "value"))) + { + return false; + } + + switch (current) + { + case 'U': + { + std::uint8_t len{}; + return get_number(input_format_t::ubjson, len) && get_string(input_format_t::ubjson, len, result); + } + + case 'i': + { + std::int8_t len{}; + return get_number(input_format_t::ubjson, len) && get_string(input_format_t::ubjson, len, result); + } + + case 'I': + { + std::int16_t len{}; + return get_number(input_format_t::ubjson, len) && get_string(input_format_t::ubjson, len, result); + } + + case 'l': + { + std::int32_t len{}; + return get_number(input_format_t::ubjson, len) && get_string(input_format_t::ubjson, len, result); + } + + case 'L': + { + std::int64_t len{}; + return get_number(input_format_t::ubjson, len) && get_string(input_format_t::ubjson, len, result); + } + + default: + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::ubjson, "expected length type specification (U, i, I, l, L); last byte: 0x" + last_token, "string"), BasicJsonType())); + } + } + + /*! 
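Unlike CBOR and MessagePack, the string length read above is itself prefixed with a numeric type marker (U, i, I, l, or L), so the standalone string "hi" is encoded as S i 2 'h' 'i'. A toy sketch of the corresponding decode step, with a hypothetical helper that handles only the int8 length marker:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <string>
    #include <vector>

    // Hypothetical toy decoder for a UBJSON string value: 'S', length marker, length, payload.
    // Only the int8 marker 'i' is handled; the real reader also accepts U, I, l, and L.
    std::string decode_ubjson_string(const std::vector<std::uint8_t>& in)
    {
        std::size_t pos = 0;
        assert(in[pos++] == 'S');               // string value marker
        assert(in[pos++] == 'i');               // length stored as int8
        const std::size_t len = in[pos++];
        return std::string(in.begin() + static_cast<std::ptrdiff_t>(pos),
                           in.begin() + static_cast<std::ptrdiff_t>(pos + len));
    }

    int main()
    {
        const std::vector<std::uint8_t> bytes = {'S', 'i', 2, 'h', 'i'};
        assert(decode_ubjson_string(bytes) == "hi");
    }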
+ @param[out] result determined size + @return whether size determination completed + */ + bool get_ubjson_size_value(std::size_t& result) + { + switch (get_ignore_noop()) + { + case 'U': + { + std::uint8_t number{}; + if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number))) + { + return false; + } + result = static_cast(number); + return true; + } + + case 'i': + { + std::int8_t number{}; + if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number))) + { + return false; + } + result = static_cast(number); // NOLINT(bugprone-signed-char-misuse,cert-str34-c): number is not a char + return true; + } + + case 'I': + { + std::int16_t number{}; + if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number))) + { + return false; + } + result = static_cast(number); + return true; + } + + case 'l': + { + std::int32_t number{}; + if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number))) + { + return false; + } + result = static_cast(number); + return true; + } + + case 'L': + { + std::int64_t number{}; + if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number))) + { + return false; + } + result = static_cast(number); + return true; + } + + default: + { + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::ubjson, "expected length type specification (U, i, I, l, L) after '#'; last byte: 0x" + last_token, "size"), BasicJsonType())); + } + } + } + + /*! + @brief determine the type and size for a container + + In the optimized UBJSON format, a type and a size can be provided to allow + for a more compact representation. + + @param[out] result pair of the size and the type + + @return whether pair creation completed + */ + bool get_ubjson_size_type(std::pair& result) + { + result.first = string_t::npos; // size + result.second = 0; // type + + get_ignore_noop(); + + if (current == '$') + { + result.second = get(); // must not ignore 'N', because 'N' maybe the type + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "type"))) + { + return false; + } + + get_ignore_noop(); + if (JSON_HEDLEY_UNLIKELY(current != '#')) + { + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "value"))) + { + return false; + } + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::ubjson, "expected '#' after type information; last byte: 0x" + last_token, "size"), BasicJsonType())); + } + + return get_ubjson_size_value(result.first); + } + + if (current == '#') + { + return get_ubjson_size_value(result.first); + } + + return true; + } + + /*! 
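As a byte-level illustration of the '$'/'#' handshake parsed above: a strongly typed UBJSON array of three int8 values announces its element type and count once in the header and then contains only raw payload bytes, with no per-element markers and no closing ']'. Assuming the single header is reachable as <nlohmann/json.hpp>, a round trip through from_ubjson shows the layout:

    #include <cassert>
    #include <cstdint>
    #include <vector>
    #include <nlohmann/json.hpp>

    int main()
    {
        // '[' '$' 'i' '#' 'i' 3  -> array, element type int8, count 3, then 3 payload bytes.
        const std::vector<std::uint8_t> bytes = {'[', '$', 'i', '#', 'i', 3, 1, 2, 3};
        const auto j = nlohmann::json::from_ubjson(bytes);
        assert(j == nlohmann::json({1, 2, 3}));
    }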
+ @param prefix the previously read or set type prefix + @return whether value creation completed + */ + bool get_ubjson_value(const char_int_type prefix) + { + switch (prefix) + { + case std::char_traits::eof(): // EOF + return unexpect_eof(input_format_t::ubjson, "value"); + + case 'T': // true + return sax->boolean(true); + case 'F': // false + return sax->boolean(false); + + case 'Z': // null + return sax->null(); + + case 'U': + { + std::uint8_t number{}; + return get_number(input_format_t::ubjson, number) && sax->number_unsigned(number); + } + + case 'i': + { + std::int8_t number{}; + return get_number(input_format_t::ubjson, number) && sax->number_integer(number); + } + + case 'I': + { + std::int16_t number{}; + return get_number(input_format_t::ubjson, number) && sax->number_integer(number); + } + + case 'l': + { + std::int32_t number{}; + return get_number(input_format_t::ubjson, number) && sax->number_integer(number); + } + + case 'L': + { + std::int64_t number{}; + return get_number(input_format_t::ubjson, number) && sax->number_integer(number); + } + + case 'd': + { + float number{}; + return get_number(input_format_t::ubjson, number) && sax->number_float(static_cast(number), ""); + } + + case 'D': + { + double number{}; + return get_number(input_format_t::ubjson, number) && sax->number_float(static_cast(number), ""); + } + + case 'H': + { + return get_ubjson_high_precision_number(); + } + + case 'C': // char + { + get(); + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "char"))) + { + return false; + } + if (JSON_HEDLEY_UNLIKELY(current > 127)) + { + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::ubjson, "byte after 'C' must be in range 0x00..0x7F; last byte: 0x" + last_token, "char"), BasicJsonType())); + } + string_t s(1, static_cast(current)); + return sax->string(s); + } + + case 'S': // string + { + string_t s; + return get_ubjson_string(s) && sax->string(s); + } + + case '[': // array + return get_ubjson_array(); + + case '{': // object + return get_ubjson_object(); + + default: // anything else + { + auto last_token = get_token_string(); + return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::ubjson, "invalid byte: 0x" + last_token, "value"), BasicJsonType())); + } + } + } + + /*! + @return whether array creation completed + */ + bool get_ubjson_array() + { + std::pair size_and_type; + if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_type(size_and_type))) + { + return false; + } + + if (size_and_type.first != string_t::npos) + { + if (JSON_HEDLEY_UNLIKELY(!sax->start_array(size_and_type.first))) + { + return false; + } + + if (size_and_type.second != 0) + { + if (size_and_type.second != 'N') + { + for (std::size_t i = 0; i < size_and_type.first; ++i) + { + if (JSON_HEDLEY_UNLIKELY(!get_ubjson_value(size_and_type.second))) + { + return false; + } + } + } + } + else + { + for (std::size_t i = 0; i < size_and_type.first; ++i) + { + if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal())) + { + return false; + } + } + } + } + else + { + if (JSON_HEDLEY_UNLIKELY(!sax->start_array(std::size_t(-1)))) + { + return false; + } + + while (current != ']') + { + if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal(false))) + { + return false; + } + get_ignore_noop(); + } + } + + return sax->end_array(); + } + + /*! 
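The writer side of the same optimization is exposed through the use_size and use_type flags of to_ubjson, so a round trip through the optimized form exercises exactly the size_and_type branches above (include path again assumed to be <nlohmann/json.hpp> for this sketch):

    #include <cassert>
    #include <nlohmann/json.hpp>

    int main()
    {
        const nlohmann::json j = {1, 2, 3, 4};

        // Plain form: each element carries its own type marker and the array ends with ']'.
        const auto plain = nlohmann::json::to_ubjson(j);
        // Optimized form: '$' element type and '#' count in the header, raw payload after.
        const auto optimized = nlohmann::json::to_ubjson(j, /*use_size=*/true, /*use_type=*/true);

        assert(nlohmann::json::from_ubjson(plain) == j);
        assert(nlohmann::json::from_ubjson(optimized) == j);
    }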
+ @return whether object creation completed + */ + bool get_ubjson_object() + { + std::pair size_and_type; + if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_type(size_and_type))) + { + return false; + } + + string_t key; + if (size_and_type.first != string_t::npos) + { + if (JSON_HEDLEY_UNLIKELY(!sax->start_object(size_and_type.first))) + { + return false; + } + + if (size_and_type.second != 0) + { + for (std::size_t i = 0; i < size_and_type.first; ++i) + { + if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key) || !sax->key(key))) + { + return false; + } + if (JSON_HEDLEY_UNLIKELY(!get_ubjson_value(size_and_type.second))) + { + return false; + } + key.clear(); + } + } + else + { + for (std::size_t i = 0; i < size_and_type.first; ++i) + { + if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key) || !sax->key(key))) + { + return false; + } + if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal())) + { + return false; + } + key.clear(); + } + } + } + else + { + if (JSON_HEDLEY_UNLIKELY(!sax->start_object(std::size_t(-1)))) + { + return false; + } + + while (current != '}') + { + if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key, false) || !sax->key(key))) + { + return false; + } + if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal())) + { + return false; + } + get_ignore_noop(); + key.clear(); + } + } + + return sax->end_object(); + } + + // Note, no reader for UBJSON binary types is implemented because they do + // not exist + + bool get_ubjson_high_precision_number() + { + // get size of following number string + std::size_t size{}; + auto res = get_ubjson_size_value(size); + if (JSON_HEDLEY_UNLIKELY(!res)) + { + return res; + } + + // get number string + std::vector number_vector; + for (std::size_t i = 0; i < size; ++i) + { + get(); + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "number"))) + { + return false; + } + number_vector.push_back(static_cast(current)); + } + + // parse number string + using ia_type = decltype(detail::input_adapter(number_vector)); + auto number_lexer = detail::lexer(detail::input_adapter(number_vector), false); + const auto result_number = number_lexer.scan(); + const auto number_string = number_lexer.get_token_string(); + const auto result_remainder = number_lexer.scan(); + + using token_type = typename detail::lexer_base::token_type; + + if (JSON_HEDLEY_UNLIKELY(result_remainder != token_type::end_of_input)) + { + return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"), BasicJsonType())); + } + + switch (result_number) + { + case token_type::value_integer: + return sax->number_integer(number_lexer.get_number_integer()); + case token_type::value_unsigned: + return sax->number_unsigned(number_lexer.get_number_unsigned()); + case token_type::value_float: + return sax->number_float(number_lexer.get_number_float(), std::move(number_string)); + default: + return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"), BasicJsonType())); + } + } + + /////////////////////// + // Utility functions // + /////////////////////// + + /*! + @brief get next character from the input + + This function provides the interface to the used input adapter. It does + not throw in case the input reached EOF, but returns a -'ve valued + `std::char_traits::eof()` in that case. 
+ + @return character read from the input + */ + char_int_type get() + { + ++chars_read; + return current = ia.get_character(); + } + + /*! + @return character read from the input after ignoring all 'N' entries + */ + char_int_type get_ignore_noop() + { + do + { + get(); + } + while (current == 'N'); + + return current; + } + + /* + @brief read a number from the input + + @tparam NumberType the type of the number + @param[in] format the current format (for diagnostics) + @param[out] result number of type @a NumberType + + @return whether conversion completed + + @note This function needs to respect the system's endianess, because + bytes in CBOR, MessagePack, and UBJSON are stored in network order + (big endian) and therefore need reordering on little endian systems. + */ + template + bool get_number(const input_format_t format, NumberType& result) + { + // step 1: read input into array with system's byte order + std::array vec{}; + for (std::size_t i = 0; i < sizeof(NumberType); ++i) + { + get(); + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(format, "number"))) + { + return false; + } + + // reverse byte order prior to conversion if necessary + if (is_little_endian != InputIsLittleEndian) + { + vec[sizeof(NumberType) - i - 1] = static_cast(current); + } + else + { + vec[i] = static_cast(current); // LCOV_EXCL_LINE + } + } + + // step 2: convert array into number of type T and return + std::memcpy(&result, vec.data(), sizeof(NumberType)); + return true; + } + + /*! + @brief create a string by reading characters from the input + + @tparam NumberType the type of the number + @param[in] format the current format (for diagnostics) + @param[in] len number of characters to read + @param[out] result string created by reading @a len bytes + + @return whether string creation completed + + @note We can not reserve @a len bytes for the result, because @a len + may be too large. Usually, @ref unexpect_eof() detects the end of + the input before we run out of string memory. + */ + template + bool get_string(const input_format_t format, + const NumberType len, + string_t& result) + { + bool success = true; + for (NumberType i = 0; i < len; i++) + { + get(); + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(format, "string"))) + { + success = false; + break; + } + result.push_back(static_cast(current)); + } + return success; + } + + /*! + @brief create a byte array by reading bytes from the input + + @tparam NumberType the type of the number + @param[in] format the current format (for diagnostics) + @param[in] len number of bytes to read + @param[out] result byte array created by reading @a len bytes + + @return whether byte array creation completed + + @note We can not reserve @a len bytes for the result, because @a len + may be too large. Usually, @ref unexpect_eof() detects the end of + the input before we run out of memory. + */ + template + bool get_binary(const input_format_t format, + const NumberType len, + binary_t& result) + { + bool success = true; + for (NumberType i = 0; i < len; i++) + { + get(); + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(format, "binary"))) + { + success = false; + break; + } + result.push_back(static_cast(current)); + } + return success; + } + + /*! 
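The byte reversal in get_number above is what reading a network-order (big-endian) multi-byte value requires on a little-endian host. The same idea for a 32-bit unsigned value, as a standalone sketch with hypothetical helper names:

    #include <algorithm>
    #include <array>
    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Does this host store the least significant byte first?
    bool host_is_little_endian()
    {
        const std::uint32_t probe = 0x01u;
        std::uint8_t first_byte{};
        std::memcpy(&first_byte, &probe, 1);
        return first_byte == 0x01u;
    }

    // Hypothetical helper: interpret four big-endian (network order) bytes as a uint32.
    std::uint32_t read_be_uint32(const std::array<std::uint8_t, 4>& bytes)
    {
        std::array<std::uint8_t, 4> vec = bytes;
        if (host_is_little_endian())
            std::reverse(vec.begin(), vec.end());   // reorder so memcpy yields the right value
        std::uint32_t result{};
        std::memcpy(&result, vec.data(), sizeof(result));
        return result;
    }

    int main()
    {
        // CBOR, MessagePack, and UBJSON all store 0x12345678 as 12 34 56 78 on the wire.
        assert(read_be_uint32({0x12, 0x34, 0x56, 0x78}) == 0x12345678u);
    }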
+ @param[in] format the current format (for diagnostics) + @param[in] context further context information (for diagnostics) + @return whether the last read character is not EOF + */ + JSON_HEDLEY_NON_NULL(3) + bool unexpect_eof(const input_format_t format, const char* context) const + { + if (JSON_HEDLEY_UNLIKELY(current == std::char_traits::eof())) + { + return sax->parse_error(chars_read, "", + parse_error::create(110, chars_read, exception_message(format, "unexpected end of input", context), BasicJsonType())); + } + return true; + } + + /*! + @return a string representation of the last read byte + */ + std::string get_token_string() const + { + std::array cr{{}}; + (std::snprintf)(cr.data(), cr.size(), "%.2hhX", static_cast(current)); // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg) + return std::string{cr.data()}; + } + + /*! + @param[in] format the current format + @param[in] detail a detailed error message + @param[in] context further context information + @return a message string to use in the parse_error exceptions + */ + std::string exception_message(const input_format_t format, + const std::string& detail, + const std::string& context) const + { + std::string error_msg = "syntax error while parsing "; + + switch (format) + { + case input_format_t::cbor: + error_msg += "CBOR"; + break; + + case input_format_t::msgpack: + error_msg += "MessagePack"; + break; + + case input_format_t::ubjson: + error_msg += "UBJSON"; + break; + + case input_format_t::bson: + error_msg += "BSON"; + break; + + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE + } + + return error_msg + " " + context + ": " + detail; + } + + private: + /// input adapter + InputAdapterType ia; + + /// the current character + char_int_type current = std::char_traits::eof(); + + /// the number of characters read + std::size_t chars_read = 0; + + /// whether we can assume little endianess + const bool is_little_endian = little_endianess(); + + /// the SAX parser + json_sax_t* sax = nullptr; +}; +} // namespace detail +} // namespace nlohmann + +// #include + +// #include + +// #include + + +#include // isfinite +#include // uint8_t +#include // function +#include // string +#include // move +#include // vector + +// #include + +// #include + +// #include + +// #include + +// #include + +// #include + +// #include + + +namespace nlohmann +{ +namespace detail +{ +//////////// +// parser // +//////////// + +enum class parse_event_t : uint8_t +{ + /// the parser read `{` and started to process a JSON object + object_start, + /// the parser read `}` and finished processing a JSON object + object_end, + /// the parser read `[` and started to process a JSON array + array_start, + /// the parser read `]` and finished processing a JSON array + array_end, + /// the parser read a key of a value in an object + key, + /// the parser finished reading a JSON value + value +}; + +template +using parser_callback_t = + std::function; + +/*! +@brief syntax analysis + +This class implements a recursive descent parser. 
+*/ +template +class parser +{ + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + using string_t = typename BasicJsonType::string_t; + using lexer_t = lexer; + using token_type = typename lexer_t::token_type; + + public: + /// a parser reading from an input adapter + explicit parser(InputAdapterType&& adapter, + const parser_callback_t cb = nullptr, + const bool allow_exceptions_ = true, + const bool skip_comments = false) + : callback(cb) + , m_lexer(std::move(adapter), skip_comments) + , allow_exceptions(allow_exceptions_) + { + // read first token + get_token(); + } + + /*! + @brief public parser interface + + @param[in] strict whether to expect the last token to be EOF + @param[in,out] result parsed JSON value + + @throw parse_error.101 in case of an unexpected token + @throw parse_error.102 if to_unicode fails or surrogate error + @throw parse_error.103 if to_unicode fails + */ + void parse(const bool strict, BasicJsonType& result) + { + if (callback) + { + json_sax_dom_callback_parser sdp(result, callback, allow_exceptions); + sax_parse_internal(&sdp); + + // in strict mode, input must be completely read + if (strict && (get_token() != token_type::end_of_input)) + { + sdp.parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), + exception_message(token_type::end_of_input, "value"), BasicJsonType())); + } + + // in case of an error, return discarded value + if (sdp.is_errored()) + { + result = value_t::discarded; + return; + } + + // set top-level value to null if it was discarded by the callback + // function + if (result.is_discarded()) + { + result = nullptr; + } + } + else + { + json_sax_dom_parser sdp(result, allow_exceptions); + sax_parse_internal(&sdp); + + // in strict mode, input must be completely read + if (strict && (get_token() != token_type::end_of_input)) + { + sdp.parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_of_input, "value"), BasicJsonType())); + } + + // in case of an error, return discarded value + if (sdp.is_errored()) + { + result = value_t::discarded; + return; + } + } + + result.assert_invariant(); + } + + /*! 
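The callback plumbing wired up in parse() above is reachable from user code through the cb parameter of json::parse; the callback sees every parse_event_t and can veto individual values or keys. A small sketch (assuming the single header is reachable as <nlohmann/json.hpp>) that drops one key while parsing:

    #include <cassert>
    #include <nlohmann/json.hpp>

    using nlohmann::json;

    int main()
    {
        const auto drop_password = [](int /*depth*/, json::parse_event_t event, json& parsed)
        {
            // returning false for a key event discards the key together with its value
            return !(event == json::parse_event_t::key && parsed == json("password"));
        };

        const json j = json::parse(R"({"user":"alice","password":"hunter2"})", drop_password);

        assert(j.contains("user"));
        assert(!j.contains("password"));
    }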
+ @brief public accept interface + + @param[in] strict whether to expect the last token to be EOF + @return whether the input is a proper JSON text + */ + bool accept(const bool strict = true) + { + json_sax_acceptor sax_acceptor; + return sax_parse(&sax_acceptor, strict); + } + + template + JSON_HEDLEY_NON_NULL(2) + bool sax_parse(SAX* sax, const bool strict = true) + { + (void)detail::is_sax_static_asserts {}; + const bool result = sax_parse_internal(sax); + + // strict mode: next byte must be EOF + if (result && strict && (get_token() != token_type::end_of_input)) + { + return sax->parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_of_input, "value"), BasicJsonType())); + } + + return result; + } + + private: + template + JSON_HEDLEY_NON_NULL(2) + bool sax_parse_internal(SAX* sax) + { + // stack to remember the hierarchy of structured values we are parsing + // true = array; false = object + std::vector states; + // value to avoid a goto (see comment where set to true) + bool skip_to_state_evaluation = false; + + while (true) + { + if (!skip_to_state_evaluation) + { + // invariant: get_token() was called before each iteration + switch (last_token) + { + case token_type::begin_object: + { + if (JSON_HEDLEY_UNLIKELY(!sax->start_object(std::size_t(-1)))) + { + return false; + } + + // closing } -> we are done + if (get_token() == token_type::end_object) + { + if (JSON_HEDLEY_UNLIKELY(!sax->end_object())) + { + return false; + } + break; + } + + // parse key + if (JSON_HEDLEY_UNLIKELY(last_token != token_type::value_string)) + { + return sax->parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), exception_message(token_type::value_string, "object key"), BasicJsonType())); + } + if (JSON_HEDLEY_UNLIKELY(!sax->key(m_lexer.get_string()))) + { + return false; + } + + // parse separator (:) + if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator)) + { + return sax->parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), exception_message(token_type::name_separator, "object separator"), BasicJsonType())); + } + + // remember we are now inside an object + states.push_back(false); + + // parse values + get_token(); + continue; + } + + case token_type::begin_array: + { + if (JSON_HEDLEY_UNLIKELY(!sax->start_array(std::size_t(-1)))) + { + return false; + } + + // closing ] -> we are done + if (get_token() == token_type::end_array) + { + if (JSON_HEDLEY_UNLIKELY(!sax->end_array())) + { + return false; + } + break; + } + + // remember we are now inside an array + states.push_back(true); + + // parse values (no need to call get_token) + continue; + } + + case token_type::value_float: + { + const auto res = m_lexer.get_number_float(); + + if (JSON_HEDLEY_UNLIKELY(!std::isfinite(res))) + { + return sax->parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + out_of_range::create(406, "number overflow parsing '" + m_lexer.get_token_string() + "'", BasicJsonType())); + } + + if (JSON_HEDLEY_UNLIKELY(!sax->number_float(res, m_lexer.get_string()))) + { + return false; + } + + break; + } + + case token_type::literal_false: + { + if (JSON_HEDLEY_UNLIKELY(!sax->boolean(false))) + { + return false; + } + break; + } + + case token_type::literal_null: + { + if (JSON_HEDLEY_UNLIKELY(!sax->null())) + { + return false; + } + break; + } + + case token_type::literal_true: + { 
+ if (JSON_HEDLEY_UNLIKELY(!sax->boolean(true))) + { + return false; + } + break; + } + + case token_type::value_integer: + { + if (JSON_HEDLEY_UNLIKELY(!sax->number_integer(m_lexer.get_number_integer()))) + { + return false; + } + break; + } + + case token_type::value_string: + { + if (JSON_HEDLEY_UNLIKELY(!sax->string(m_lexer.get_string()))) + { + return false; + } + break; + } + + case token_type::value_unsigned: + { + if (JSON_HEDLEY_UNLIKELY(!sax->number_unsigned(m_lexer.get_number_unsigned()))) + { + return false; + } + break; + } + + case token_type::parse_error: + { + // using "uninitialized" to avoid "expected" message + return sax->parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), exception_message(token_type::uninitialized, "value"), BasicJsonType())); + } + + default: // the last token was unexpected + { + return sax->parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), exception_message(token_type::literal_or_value, "value"), BasicJsonType())); + } + } + } + else + { + skip_to_state_evaluation = false; + } + + // we reached this line after we successfully parsed a value + if (states.empty()) + { + // empty stack: we reached the end of the hierarchy: done + return true; + } + + if (states.back()) // array + { + // comma -> next value + if (get_token() == token_type::value_separator) + { + // parse a new value + get_token(); + continue; + } + + // closing ] + if (JSON_HEDLEY_LIKELY(last_token == token_type::end_array)) + { + if (JSON_HEDLEY_UNLIKELY(!sax->end_array())) + { + return false; + } + + // We are done with this array. Before we can parse a + // new value, we need to evaluate the new state first. + // By setting skip_to_state_evaluation to false, we + // are effectively jumping to the beginning of this if. + JSON_ASSERT(!states.empty()); + states.pop_back(); + skip_to_state_evaluation = true; + continue; + } + + return sax->parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_array, "array"), BasicJsonType())); + } + + // states.back() is false -> object + + // comma -> next value + if (get_token() == token_type::value_separator) + { + // parse key + if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::value_string)) + { + return sax->parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), exception_message(token_type::value_string, "object key"), BasicJsonType())); + } + + if (JSON_HEDLEY_UNLIKELY(!sax->key(m_lexer.get_string()))) + { + return false; + } + + // parse separator (:) + if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator)) + { + return sax->parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), exception_message(token_type::name_separator, "object separator"), BasicJsonType())); + } + + // parse values + get_token(); + continue; + } + + // closing } + if (JSON_HEDLEY_LIKELY(last_token == token_type::end_object)) + { + if (JSON_HEDLEY_UNLIKELY(!sax->end_object())) + { + return false; + } + + // We are done with this object. Before we can parse a + // new value, we need to evaluate the new state first. + // By setting skip_to_state_evaluation to false, we + // are effectively jumping to the beginning of this if. 
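Every sax->... call issued by this loop (and by the binary readers earlier) targets the same SAX interface, so a consumer only has to implement that interface to observe events without building a DOM. A minimal event-counting handler, driven through the public json::sax_parse entry point and assuming the json_sax_t typedef and the <nlohmann/json.hpp> include path:

    #include <cassert>
    #include <cstddef>
    #include <string>
    #include <nlohmann/json.hpp>

    using nlohmann::json;

    // Minimal SAX consumer: counts every event and never aborts parsing.
    struct event_counter : json::json_sax_t
    {
        std::size_t events = 0;

        bool null() override { ++events; return true; }
        bool boolean(bool) override { ++events; return true; }
        bool number_integer(number_integer_t) override { ++events; return true; }
        bool number_unsigned(number_unsigned_t) override { ++events; return true; }
        bool number_float(number_float_t, const string_t&) override { ++events; return true; }
        bool string(string_t&) override { ++events; return true; }
        bool binary(binary_t&) override { ++events; return true; }
        bool start_object(std::size_t) override { ++events; return true; }
        bool key(string_t&) override { ++events; return true; }
        bool end_object() override { ++events; return true; }
        bool start_array(std::size_t) override { ++events; return true; }
        bool end_array() override { ++events; return true; }
        bool parse_error(std::size_t, const std::string&, const json::exception&) override { return false; }
    };

    int main()
    {
        event_counter counter;
        const bool ok = json::sax_parse(R"({"a":[1,2,3]})", &counter);
        assert(ok);
        // start_object, key, start_array, three numbers, end_array, end_object
        assert(counter.events == 8);
    }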
+ JSON_ASSERT(!states.empty()); + states.pop_back(); + skip_to_state_evaluation = true; + continue; + } + + return sax->parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_object, "object"), BasicJsonType())); + } + } + + /// get next token from lexer + token_type get_token() + { + return last_token = m_lexer.scan(); + } + + std::string exception_message(const token_type expected, const std::string& context) + { + std::string error_msg = "syntax error "; + + if (!context.empty()) + { + error_msg += "while parsing " + context + " "; + } + + error_msg += "- "; + + if (last_token == token_type::parse_error) + { + error_msg += std::string(m_lexer.get_error_message()) + "; last read: '" + + m_lexer.get_token_string() + "'"; + } + else + { + error_msg += "unexpected " + std::string(lexer_t::token_type_name(last_token)); + } + + if (expected != token_type::uninitialized) + { + error_msg += "; expected " + std::string(lexer_t::token_type_name(expected)); + } + + return error_msg; + } + + private: + /// callback function + const parser_callback_t callback = nullptr; + /// the type of the last read token + token_type last_token = token_type::uninitialized; + /// the lexer + lexer_t m_lexer; + /// whether to throw exceptions in case of errors + const bool allow_exceptions = true; +}; + +} // namespace detail +} // namespace nlohmann + +// #include + + +// #include + + +#include // ptrdiff_t +#include // numeric_limits + +// #include + + +namespace nlohmann +{ +namespace detail +{ +/* +@brief an iterator for primitive JSON types + +This class models an iterator for primitive JSON types (boolean, number, +string). It's only purpose is to allow the iterator/const_iterator classes +to "iterate" over primitive values. Internally, the iterator is modeled by +a `difference_type` variable. Value begin_value (`0`) models the begin, +end_value (`1`) models past the end. 
+*/ +class primitive_iterator_t +{ + private: + using difference_type = std::ptrdiff_t; + static constexpr difference_type begin_value = 0; + static constexpr difference_type end_value = begin_value + 1; + + JSON_PRIVATE_UNLESS_TESTED: + /// iterator as signed integer type + difference_type m_it = (std::numeric_limits::min)(); + + public: + constexpr difference_type get_value() const noexcept + { + return m_it; + } + + /// set iterator to a defined beginning + void set_begin() noexcept + { + m_it = begin_value; + } + + /// set iterator to a defined past the end + void set_end() noexcept + { + m_it = end_value; + } + + /// return whether the iterator can be dereferenced + constexpr bool is_begin() const noexcept + { + return m_it == begin_value; + } + + /// return whether the iterator is at end + constexpr bool is_end() const noexcept + { + return m_it == end_value; + } + + friend constexpr bool operator==(primitive_iterator_t lhs, primitive_iterator_t rhs) noexcept + { + return lhs.m_it == rhs.m_it; + } + + friend constexpr bool operator<(primitive_iterator_t lhs, primitive_iterator_t rhs) noexcept + { + return lhs.m_it < rhs.m_it; + } + + primitive_iterator_t operator+(difference_type n) noexcept + { + auto result = *this; + result += n; + return result; + } + + friend constexpr difference_type operator-(primitive_iterator_t lhs, primitive_iterator_t rhs) noexcept + { + return lhs.m_it - rhs.m_it; + } + + primitive_iterator_t& operator++() noexcept + { + ++m_it; + return *this; + } + + primitive_iterator_t const operator++(int) noexcept // NOLINT(readability-const-return-type) + { + auto result = *this; + ++m_it; + return result; + } + + primitive_iterator_t& operator--() noexcept + { + --m_it; + return *this; + } + + primitive_iterator_t const operator--(int) noexcept // NOLINT(readability-const-return-type) + { + auto result = *this; + --m_it; + return result; + } + + primitive_iterator_t& operator+=(difference_type n) noexcept + { + m_it += n; + return *this; + } + + primitive_iterator_t& operator-=(difference_type n) noexcept + { + m_it -= n; + return *this; + } +}; +} // namespace detail +} // namespace nlohmann + + +namespace nlohmann +{ +namespace detail +{ +/*! +@brief an iterator value + +@note This structure could easily be a union, but MSVC currently does not allow +unions members with complex constructors, see https://github.com/nlohmann/json/pull/105. +*/ +template struct internal_iterator +{ + /// iterator for JSON objects + typename BasicJsonType::object_t::iterator object_iterator {}; + /// iterator for JSON arrays + typename BasicJsonType::array_t::iterator array_iterator {}; + /// generic iterator for all other types + primitive_iterator_t primitive_iterator {}; +}; +} // namespace detail +} // namespace nlohmann + +// #include + + +#include // iterator, random_access_iterator_tag, bidirectional_iterator_tag, advance, next +#include // conditional, is_const, remove_const + +// #include + +// #include + +// #include + +// #include + +// #include + +// #include + +// #include + + +namespace nlohmann +{ +namespace detail +{ +// forward declare, to be able to friend it later on +template class iteration_proxy; +template class iteration_proxy_value; + +/*! +@brief a template for a bidirectional iterator for the @ref basic_json class +This class implements a both iterators (iterator and const_iterator) for the +@ref basic_json class. +@note An iterator is called *initialized* when a pointer to a JSON value has + been set (e.g., by a constructor or a copy assignment). 
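Seen through the public container interface, the begin_value/end_value bookkeeping above is what makes a primitive JSON value behave like a range of exactly one element and null like an empty range (single-header include path assumed as in the earlier sketches):

    #include <cassert>
    #include <iterator>
    #include <nlohmann/json.hpp>

    using nlohmann::json;

    int main()
    {
        json number = 42;
        // A primitive value iterates exactly once: begin() is "position 0", end() is "position 1".
        assert(std::distance(number.begin(), number.end()) == 1);
        assert(*number.begin() == 42);

        json null_value;                    // null: begin() == end(), i.e. an empty range
        assert(null_value.begin() == null_value.end());
    }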
If the iterator is + default-constructed, it is *uninitialized* and most methods are undefined. + **The library uses assertions to detect calls on uninitialized iterators.** +@requirement The class satisfies the following concept requirements: +- +[BidirectionalIterator](https://en.cppreference.com/w/cpp/named_req/BidirectionalIterator): + The iterator that can be moved can be moved in both directions (i.e. + incremented and decremented). +@since version 1.0.0, simplified in version 2.0.9, change to bidirectional + iterators in version 3.0.0 (see https://github.com/nlohmann/json/issues/593) +*/ +template +class iter_impl +{ + /// the iterator with BasicJsonType of different const-ness + using other_iter_impl = iter_impl::value, typename std::remove_const::type, const BasicJsonType>::type>; + /// allow basic_json to access private members + friend other_iter_impl; + friend BasicJsonType; + friend iteration_proxy; + friend iteration_proxy_value; + + using object_t = typename BasicJsonType::object_t; + using array_t = typename BasicJsonType::array_t; + // make sure BasicJsonType is basic_json or const basic_json + static_assert(is_basic_json::type>::value, + "iter_impl only accepts (const) basic_json"); + + public: + + /// The std::iterator class template (used as a base class to provide typedefs) is deprecated in C++17. + /// The C++ Standard has never required user-defined iterators to derive from std::iterator. + /// A user-defined iterator should provide publicly accessible typedefs named + /// iterator_category, value_type, difference_type, pointer, and reference. + /// Note that value_type is required to be non-const, even for constant iterators. + using iterator_category = std::bidirectional_iterator_tag; + + /// the type of the values when the iterator is dereferenced + using value_type = typename BasicJsonType::value_type; + /// a type to represent differences between iterators + using difference_type = typename BasicJsonType::difference_type; + /// defines a pointer to the type iterated over (value_type) + using pointer = typename std::conditional::value, + typename BasicJsonType::const_pointer, + typename BasicJsonType::pointer>::type; + /// defines a reference to the type iterated over (value_type) + using reference = + typename std::conditional::value, + typename BasicJsonType::const_reference, + typename BasicJsonType::reference>::type; + + iter_impl() = default; + ~iter_impl() = default; + iter_impl(iter_impl&&) noexcept = default; + iter_impl& operator=(iter_impl&&) noexcept = default; + + /*! + @brief constructor for a given JSON instance + @param[in] object pointer to a JSON object for this iterator + @pre object != nullptr + @post The iterator is initialized; i.e. `m_object != nullptr`. + */ + explicit iter_impl(pointer object) noexcept : m_object(object) + { + JSON_ASSERT(m_object != nullptr); + + switch (m_object->m_type) + { + case value_t::object: + { + m_it.object_iterator = typename object_t::iterator(); + break; + } + + case value_t::array: + { + m_it.array_iterator = typename array_t::iterator(); + break; + } + + default: + { + m_it.primitive_iterator = primitive_iterator_t(); + break; + } + } + } + + /*! + @note The conventional copy constructor and copy assignment are implicitly + defined. Combined with the following converting constructor and + assignment, they support: (1) copy from iterator to iterator, (2) + copy from const iterator to const iterator, and (3) conversion from + iterator to const iterator. 
However conversion from const iterator + to iterator is not defined. + */ + + /*! + @brief const copy constructor + @param[in] other const iterator to copy from + @note This copy constructor had to be defined explicitly to circumvent a bug + occurring on msvc v19.0 compiler (VS 2015) debug build. For more + information refer to: https://github.com/nlohmann/json/issues/1608 + */ + iter_impl(const iter_impl& other) noexcept + : m_object(other.m_object), m_it(other.m_it) + {} + + /*! + @brief converting assignment + @param[in] other const iterator to copy from + @return const/non-const iterator + @note It is not checked whether @a other is initialized. + */ + iter_impl& operator=(const iter_impl& other) noexcept + { + if (&other != this) + { + m_object = other.m_object; + m_it = other.m_it; + } + return *this; + } + + /*! + @brief converting constructor + @param[in] other non-const iterator to copy from + @note It is not checked whether @a other is initialized. + */ + iter_impl(const iter_impl::type>& other) noexcept + : m_object(other.m_object), m_it(other.m_it) + {} + + /*! + @brief converting assignment + @param[in] other non-const iterator to copy from + @return const/non-const iterator + @note It is not checked whether @a other is initialized. + */ + iter_impl& operator=(const iter_impl::type>& other) noexcept // NOLINT(cert-oop54-cpp) + { + m_object = other.m_object; + m_it = other.m_it; + return *this; + } + + JSON_PRIVATE_UNLESS_TESTED: + /*! + @brief set the iterator to the first value + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + void set_begin() noexcept + { + JSON_ASSERT(m_object != nullptr); + + switch (m_object->m_type) + { + case value_t::object: + { + m_it.object_iterator = m_object->m_value.object->begin(); + break; + } + + case value_t::array: + { + m_it.array_iterator = m_object->m_value.array->begin(); + break; + } + + case value_t::null: + { + // set to end so begin()==end() is true: null is empty + m_it.primitive_iterator.set_end(); + break; + } + + default: + { + m_it.primitive_iterator.set_begin(); + break; + } + } + } + + /*! + @brief set the iterator past the last value + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + void set_end() noexcept + { + JSON_ASSERT(m_object != nullptr); + + switch (m_object->m_type) + { + case value_t::object: + { + m_it.object_iterator = m_object->m_value.object->end(); + break; + } + + case value_t::array: + { + m_it.array_iterator = m_object->m_value.array->end(); + break; + } + + default: + { + m_it.primitive_iterator.set_end(); + break; + } + } + } + + public: + /*! + @brief return a reference to the value pointed to by the iterator + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + reference operator*() const + { + JSON_ASSERT(m_object != nullptr); + + switch (m_object->m_type) + { + case value_t::object: + { + JSON_ASSERT(m_it.object_iterator != m_object->m_value.object->end()); + return m_it.object_iterator->second; + } + + case value_t::array: + { + JSON_ASSERT(m_it.array_iterator != m_object->m_value.array->end()); + return *m_it.array_iterator; + } + + case value_t::null: + JSON_THROW(invalid_iterator::create(214, "cannot get value", *m_object)); + + default: + { + if (JSON_HEDLEY_LIKELY(m_it.primitive_iterator.is_begin())) + { + return *m_object; + } + + JSON_THROW(invalid_iterator::create(214, "cannot get value", *m_object)); + } + } + } + + /*! + @brief dereference the iterator + @pre The iterator is initialized; i.e. `m_object != nullptr`. 
+ */ + pointer operator->() const + { + JSON_ASSERT(m_object != nullptr); + + switch (m_object->m_type) + { + case value_t::object: + { + JSON_ASSERT(m_it.object_iterator != m_object->m_value.object->end()); + return &(m_it.object_iterator->second); + } + + case value_t::array: + { + JSON_ASSERT(m_it.array_iterator != m_object->m_value.array->end()); + return &*m_it.array_iterator; + } + + default: + { + if (JSON_HEDLEY_LIKELY(m_it.primitive_iterator.is_begin())) + { + return m_object; + } + + JSON_THROW(invalid_iterator::create(214, "cannot get value", *m_object)); + } + } + } + + /*! + @brief post-increment (it++) + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + iter_impl const operator++(int) // NOLINT(readability-const-return-type) + { + auto result = *this; + ++(*this); + return result; + } + + /*! + @brief pre-increment (++it) + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + iter_impl& operator++() + { + JSON_ASSERT(m_object != nullptr); + + switch (m_object->m_type) + { + case value_t::object: + { + std::advance(m_it.object_iterator, 1); + break; + } + + case value_t::array: + { + std::advance(m_it.array_iterator, 1); + break; + } + + default: + { + ++m_it.primitive_iterator; + break; + } + } + + return *this; + } + + /*! + @brief post-decrement (it--) + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + iter_impl const operator--(int) // NOLINT(readability-const-return-type) + { + auto result = *this; + --(*this); + return result; + } + + /*! + @brief pre-decrement (--it) + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + iter_impl& operator--() + { + JSON_ASSERT(m_object != nullptr); + + switch (m_object->m_type) + { + case value_t::object: + { + std::advance(m_it.object_iterator, -1); + break; + } + + case value_t::array: + { + std::advance(m_it.array_iterator, -1); + break; + } + + default: + { + --m_it.primitive_iterator; + break; + } + } + + return *this; + } + + /*! + @brief comparison: equal + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + template < typename IterImpl, detail::enable_if_t < (std::is_same::value || std::is_same::value), std::nullptr_t > = nullptr > + bool operator==(const IterImpl& other) const + { + // if objects are not the same, the comparison is undefined + if (JSON_HEDLEY_UNLIKELY(m_object != other.m_object)) + { + JSON_THROW(invalid_iterator::create(212, "cannot compare iterators of different containers", *m_object)); + } + + JSON_ASSERT(m_object != nullptr); + + switch (m_object->m_type) + { + case value_t::object: + return (m_it.object_iterator == other.m_it.object_iterator); + + case value_t::array: + return (m_it.array_iterator == other.m_it.array_iterator); + + default: + return (m_it.primitive_iterator == other.m_it.primitive_iterator); + } + } + + /*! + @brief comparison: not equal + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + template < typename IterImpl, detail::enable_if_t < (std::is_same::value || std::is_same::value), std::nullptr_t > = nullptr > + bool operator!=(const IterImpl& other) const + { + return !operator==(other); + } + + /*! + @brief comparison: smaller + @pre The iterator is initialized; i.e. `m_object != nullptr`. 
+ */ + bool operator<(const iter_impl& other) const + { + // if objects are not the same, the comparison is undefined + if (JSON_HEDLEY_UNLIKELY(m_object != other.m_object)) + { + JSON_THROW(invalid_iterator::create(212, "cannot compare iterators of different containers", *m_object)); + } + + JSON_ASSERT(m_object != nullptr); + + switch (m_object->m_type) + { + case value_t::object: + JSON_THROW(invalid_iterator::create(213, "cannot compare order of object iterators", *m_object)); + + case value_t::array: + return (m_it.array_iterator < other.m_it.array_iterator); + + default: + return (m_it.primitive_iterator < other.m_it.primitive_iterator); + } + } + + /*! + @brief comparison: less than or equal + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + bool operator<=(const iter_impl& other) const + { + return !other.operator < (*this); + } + + /*! + @brief comparison: greater than + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + bool operator>(const iter_impl& other) const + { + return !operator<=(other); + } + + /*! + @brief comparison: greater than or equal + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + bool operator>=(const iter_impl& other) const + { + return !operator<(other); + } + + /*! + @brief add to iterator + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + iter_impl& operator+=(difference_type i) + { + JSON_ASSERT(m_object != nullptr); + + switch (m_object->m_type) + { + case value_t::object: + JSON_THROW(invalid_iterator::create(209, "cannot use offsets with object iterators", *m_object)); + + case value_t::array: + { + std::advance(m_it.array_iterator, i); + break; + } + + default: + { + m_it.primitive_iterator += i; + break; + } + } + + return *this; + } + + /*! + @brief subtract from iterator + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + iter_impl& operator-=(difference_type i) + { + return operator+=(-i); + } + + /*! + @brief add to iterator + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + iter_impl operator+(difference_type i) const + { + auto result = *this; + result += i; + return result; + } + + /*! + @brief addition of distance and iterator + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + friend iter_impl operator+(difference_type i, const iter_impl& it) + { + auto result = it; + result += i; + return result; + } + + /*! + @brief subtract from iterator + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + iter_impl operator-(difference_type i) const + { + auto result = *this; + result -= i; + return result; + } + + /*! + @brief return difference + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + difference_type operator-(const iter_impl& other) const + { + JSON_ASSERT(m_object != nullptr); + + switch (m_object->m_type) + { + case value_t::object: + JSON_THROW(invalid_iterator::create(209, "cannot use offsets with object iterators", *m_object)); + + case value_t::array: + return m_it.array_iterator - other.m_it.array_iterator; + + default: + return m_it.primitive_iterator - other.m_it.primitive_iterator; + } + } + + /*! + @brief access to successor + @pre The iterator is initialized; i.e. `m_object != nullptr`. 
+ */ + reference operator[](difference_type n) const + { + JSON_ASSERT(m_object != nullptr); + + switch (m_object->m_type) + { + case value_t::object: + JSON_THROW(invalid_iterator::create(208, "cannot use operator[] for object iterators", *m_object)); + + case value_t::array: + return *std::next(m_it.array_iterator, n); + + case value_t::null: + JSON_THROW(invalid_iterator::create(214, "cannot get value", *m_object)); + + default: + { + if (JSON_HEDLEY_LIKELY(m_it.primitive_iterator.get_value() == -n)) + { + return *m_object; + } + + JSON_THROW(invalid_iterator::create(214, "cannot get value", *m_object)); + } + } + } + + /*! + @brief return the key of an object iterator + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + const typename object_t::key_type& key() const + { + JSON_ASSERT(m_object != nullptr); + + if (JSON_HEDLEY_LIKELY(m_object->is_object())) + { + return m_it.object_iterator->first; + } + + JSON_THROW(invalid_iterator::create(207, "cannot use key() for non-object iterators", *m_object)); + } + + /*! + @brief return the value of an iterator + @pre The iterator is initialized; i.e. `m_object != nullptr`. + */ + reference value() const + { + return operator*(); + } + + JSON_PRIVATE_UNLESS_TESTED: + /// associated JSON instance + pointer m_object = nullptr; + /// the actual iterator of the associated instance + internal_iterator::type> m_it {}; +}; +} // namespace detail +} // namespace nlohmann + +// #include + +// #include + + +#include // ptrdiff_t +#include // reverse_iterator +#include // declval + +namespace nlohmann +{ +namespace detail +{ +////////////////////// +// reverse_iterator // +////////////////////// + +/*! +@brief a template for a reverse iterator class + +@tparam Base the base iterator type to reverse. Valid types are @ref +iterator (to create @ref reverse_iterator) and @ref const_iterator (to +create @ref const_reverse_iterator). + +@requirement The class satisfies the following concept requirements: +- +[BidirectionalIterator](https://en.cppreference.com/w/cpp/named_req/BidirectionalIterator): + The iterator that can be moved can be moved in both directions (i.e. + incremented and decremented). +- [OutputIterator](https://en.cppreference.com/w/cpp/named_req/OutputIterator): + It is possible to write to the pointed-to element (only if @a Base is + @ref iterator). 
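+
+// Illustrative sketch (not part of this header): how iter_impl and
+// json_reverse_iterator surface through the public basic_json interface.
+// Assumes the usual `nlohmann::json` alias and <iostream>; a usage example
+// only, not library code.
+//
+//     nlohmann::json j = {{"pi", 3.141}, {"happy", true}};
+//
+//     // forward iteration: iter_impl provides key()/value() for objects
+//     for (auto it = j.begin(); it != j.end(); ++it)
+//     {
+//         std::cout << it.key() << " : " << it.value() << '\n';
+//     }
+//
+//     // reverse iteration: json_reverse_iterator wraps std::reverse_iterator
+//     nlohmann::json a = {1, 2, 3};
+//     for (auto rit = a.rbegin(); rit != a.rend(); ++rit)
+//     {
+//         std::cout << *rit << '\n';   // prints 3, 2, 1
+//     }
+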
+ +@since version 1.0.0 +*/ +template +class json_reverse_iterator : public std::reverse_iterator +{ + public: + using difference_type = std::ptrdiff_t; + /// shortcut to the reverse iterator adapter + using base_iterator = std::reverse_iterator; + /// the reference type for the pointed-to element + using reference = typename Base::reference; + + /// create reverse iterator from iterator + explicit json_reverse_iterator(const typename base_iterator::iterator_type& it) noexcept + : base_iterator(it) {} + + /// create reverse iterator from base class + explicit json_reverse_iterator(const base_iterator& it) noexcept : base_iterator(it) {} + + /// post-increment (it++) + json_reverse_iterator const operator++(int) // NOLINT(readability-const-return-type) + { + return static_cast(base_iterator::operator++(1)); + } + + /// pre-increment (++it) + json_reverse_iterator& operator++() + { + return static_cast(base_iterator::operator++()); + } + + /// post-decrement (it--) + json_reverse_iterator const operator--(int) // NOLINT(readability-const-return-type) + { + return static_cast(base_iterator::operator--(1)); + } + + /// pre-decrement (--it) + json_reverse_iterator& operator--() + { + return static_cast(base_iterator::operator--()); + } + + /// add to iterator + json_reverse_iterator& operator+=(difference_type i) + { + return static_cast(base_iterator::operator+=(i)); + } + + /// add to iterator + json_reverse_iterator operator+(difference_type i) const + { + return static_cast(base_iterator::operator+(i)); + } + + /// subtract from iterator + json_reverse_iterator operator-(difference_type i) const + { + return static_cast(base_iterator::operator-(i)); + } + + /// return difference + difference_type operator-(const json_reverse_iterator& other) const + { + return base_iterator(*this) - base_iterator(other); + } + + /// access to successor + reference operator[](difference_type n) const + { + return *(this->operator+(n)); + } + + /// return the key of an object iterator + auto key() const -> decltype(std::declval().key()) + { + auto it = --this->base(); + return it.key(); + } + + /// return the value of an iterator + reference value() const + { + auto it = --this->base(); + return it.operator * (); + } +}; +} // namespace detail +} // namespace nlohmann + +// #include + +// #include + + +#include // all_of +#include // isdigit +#include // max +#include // accumulate +#include // string +#include // move +#include // vector + +// #include + +// #include + +// #include + +// #include + + +namespace nlohmann +{ +template +class json_pointer +{ + // allow basic_json to access private members + NLOHMANN_BASIC_JSON_TPL_DECLARATION + friend class basic_json; + + public: + /*! + @brief create JSON pointer + + Create a JSON pointer according to the syntax described in + [Section 3 of RFC6901](https://tools.ietf.org/html/rfc6901#section-3). 
+ + @param[in] s string representing the JSON pointer; if omitted, the empty + string is assumed which references the whole JSON value + + @throw parse_error.107 if the given JSON pointer @a s is nonempty and does + not begin with a slash (`/`); see example below + + @throw parse_error.108 if a tilde (`~`) in the given JSON pointer @a s is + not followed by `0` (representing `~`) or `1` (representing `/`); see + example below + + @liveexample{The example shows the construction several valid JSON pointers + as well as the exceptional behavior.,json_pointer} + + @since version 2.0.0 + */ + explicit json_pointer(const std::string& s = "") + : reference_tokens(split(s)) + {} + + /*! + @brief return a string representation of the JSON pointer + + @invariant For each JSON pointer `ptr`, it holds: + @code {.cpp} + ptr == json_pointer(ptr.to_string()); + @endcode + + @return a string representation of the JSON pointer + + @liveexample{The example shows the result of `to_string`.,json_pointer__to_string} + + @since version 2.0.0 + */ + std::string to_string() const + { + return std::accumulate(reference_tokens.begin(), reference_tokens.end(), + std::string{}, + [](const std::string & a, const std::string & b) + { + return a + "/" + detail::escape(b); + }); + } + + /// @copydoc to_string() + operator std::string() const + { + return to_string(); + } + + /*! + @brief append another JSON pointer at the end of this JSON pointer + + @param[in] ptr JSON pointer to append + @return JSON pointer with @a ptr appended + + @liveexample{The example shows the usage of `operator/=`.,json_pointer__operator_add} + + @complexity Linear in the length of @a ptr. + + @sa see @ref operator/=(std::string) to append a reference token + @sa see @ref operator/=(std::size_t) to append an array index + @sa see @ref operator/(const json_pointer&, const json_pointer&) for a binary operator + + @since version 3.6.0 + */ + json_pointer& operator/=(const json_pointer& ptr) + { + reference_tokens.insert(reference_tokens.end(), + ptr.reference_tokens.begin(), + ptr.reference_tokens.end()); + return *this; + } + + /*! + @brief append an unescaped reference token at the end of this JSON pointer + + @param[in] token reference token to append + @return JSON pointer with @a token appended without escaping @a token + + @liveexample{The example shows the usage of `operator/=`.,json_pointer__operator_add} + + @complexity Amortized constant. + + @sa see @ref operator/=(const json_pointer&) to append a JSON pointer + @sa see @ref operator/=(std::size_t) to append an array index + @sa see @ref operator/(const json_pointer&, std::size_t) for a binary operator + + @since version 3.6.0 + */ + json_pointer& operator/=(std::string token) + { + push_back(std::move(token)); + return *this; + } + + /*! + @brief append an array index at the end of this JSON pointer + + @param[in] array_idx array index to append + @return JSON pointer with @a array_idx appended + + @liveexample{The example shows the usage of `operator/=`.,json_pointer__operator_add} + + @complexity Amortized constant. + + @sa see @ref operator/=(const json_pointer&) to append a JSON pointer + @sa see @ref operator/=(std::string) to append a reference token + @sa see @ref operator/(const json_pointer&, std::string) for a binary operator + + @since version 3.6.0 + */ + json_pointer& operator/=(std::size_t array_idx) + { + return *this /= std::to_string(array_idx); + } + + /*! 
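+
+// Illustrative sketch (not part of this header): building JSON pointers with
+// the constructor and the append operators documented above. The results in
+// the comments follow RFC 6901 escaping ("~0" encodes '~', "~1" encodes '/').
+//
+//     using json = nlohmann::json;
+//
+//     json::json_pointer p("/foo/0");   // parsed from a string
+//     p /= "bar";                       // append a reference token
+//     p /= 2;                           // append an array index
+//     // p.to_string() == "/foo/0/bar/2"
+//
+//     json::json_pointer esc("/a~1b/m~0n");
+//     // refers to the key "a/b" and, below it, the key "m~n"
+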
+ @brief create a new JSON pointer by appending the right JSON pointer at the end of the left JSON pointer + + @param[in] lhs JSON pointer + @param[in] rhs JSON pointer + @return a new JSON pointer with @a rhs appended to @a lhs + + @liveexample{The example shows the usage of `operator/`.,json_pointer__operator_add_binary} + + @complexity Linear in the length of @a lhs and @a rhs. + + @sa see @ref operator/=(const json_pointer&) to append a JSON pointer + + @since version 3.6.0 + */ + friend json_pointer operator/(const json_pointer& lhs, + const json_pointer& rhs) + { + return json_pointer(lhs) /= rhs; + } + + /*! + @brief create a new JSON pointer by appending the unescaped token at the end of the JSON pointer + + @param[in] ptr JSON pointer + @param[in] token reference token + @return a new JSON pointer with unescaped @a token appended to @a ptr + + @liveexample{The example shows the usage of `operator/`.,json_pointer__operator_add_binary} + + @complexity Linear in the length of @a ptr. + + @sa see @ref operator/=(std::string) to append a reference token + + @since version 3.6.0 + */ + friend json_pointer operator/(const json_pointer& ptr, std::string token) // NOLINT(performance-unnecessary-value-param) + { + return json_pointer(ptr) /= std::move(token); + } + + /*! + @brief create a new JSON pointer by appending the array-index-token at the end of the JSON pointer + + @param[in] ptr JSON pointer + @param[in] array_idx array index + @return a new JSON pointer with @a array_idx appended to @a ptr + + @liveexample{The example shows the usage of `operator/`.,json_pointer__operator_add_binary} + + @complexity Linear in the length of @a ptr. + + @sa see @ref operator/=(std::size_t) to append an array index + + @since version 3.6.0 + */ + friend json_pointer operator/(const json_pointer& ptr, std::size_t array_idx) + { + return json_pointer(ptr) /= array_idx; + } + + /*! + @brief returns the parent of this JSON pointer + + @return parent of this JSON pointer; in case this JSON pointer is the root, + the root itself is returned + + @complexity Linear in the length of the JSON pointer. + + @liveexample{The example shows the result of `parent_pointer` for different + JSON Pointers.,json_pointer__parent_pointer} + + @since version 3.6.0 + */ + json_pointer parent_pointer() const + { + if (empty()) + { + return *this; + } + + json_pointer res = *this; + res.pop_back(); + return res; + } + + /*! + @brief remove last reference token + + @pre not `empty()` + + @liveexample{The example shows the usage of `pop_back`.,json_pointer__pop_back} + + @complexity Constant. + + @throw out_of_range.405 if JSON pointer has no parent + + @since version 3.6.0 + */ + void pop_back() + { + if (JSON_HEDLEY_UNLIKELY(empty())) + { + JSON_THROW(detail::out_of_range::create(405, "JSON pointer has no parent", BasicJsonType())); + } + + reference_tokens.pop_back(); + } + + /*! + @brief return last reference token + + @pre not `empty()` + @return last reference token + + @liveexample{The example shows the usage of `back`.,json_pointer__back} + + @complexity Constant. + + @throw out_of_range.405 if JSON pointer has no parent + + @since version 3.6.0 + */ + const std::string& back() const + { + if (JSON_HEDLEY_UNLIKELY(empty())) + { + JSON_THROW(detail::out_of_range::create(405, "JSON pointer has no parent", BasicJsonType())); + } + + return reference_tokens.back(); + } + + /*! + @brief append an unescaped token at the end of the reference pointer + + @param[in] token token to add + + @complexity Amortized constant. 
+ + @liveexample{The example shows the result of `push_back` for different + JSON Pointers.,json_pointer__push_back} + + @since version 3.6.0 + */ + void push_back(const std::string& token) + { + reference_tokens.push_back(token); + } + + /// @copydoc push_back(const std::string&) + void push_back(std::string&& token) + { + reference_tokens.push_back(std::move(token)); + } + + /*! + @brief return whether pointer points to the root document + + @return true iff the JSON pointer points to the root document + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + + @liveexample{The example shows the result of `empty` for different JSON + Pointers.,json_pointer__empty} + + @since version 3.6.0 + */ + bool empty() const noexcept + { + return reference_tokens.empty(); + } + + private: + /*! + @param[in] s reference token to be converted into an array index + + @return integer representation of @a s + + @throw parse_error.106 if an array index begins with '0' + @throw parse_error.109 if an array index begins not with a digit + @throw out_of_range.404 if string @a s could not be converted to an integer + @throw out_of_range.410 if an array index exceeds size_type + */ + static typename BasicJsonType::size_type array_index(const std::string& s) + { + using size_type = typename BasicJsonType::size_type; + + // error condition (cf. RFC 6901, Sect. 4) + if (JSON_HEDLEY_UNLIKELY(s.size() > 1 && s[0] == '0')) + { + JSON_THROW(detail::parse_error::create(106, 0, "array index '" + s + "' must not begin with '0'", BasicJsonType())); + } + + // error condition (cf. RFC 6901, Sect. 4) + if (JSON_HEDLEY_UNLIKELY(s.size() > 1 && !(s[0] >= '1' && s[0] <= '9'))) + { + JSON_THROW(detail::parse_error::create(109, 0, "array index '" + s + "' is not a number", BasicJsonType())); + } + + std::size_t processed_chars = 0; + unsigned long long res = 0; // NOLINT(runtime/int) + JSON_TRY + { + res = std::stoull(s, &processed_chars); + } + JSON_CATCH(std::out_of_range&) + { + JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + s + "'", BasicJsonType())); + } + + // check if the string was completely read + if (JSON_HEDLEY_UNLIKELY(processed_chars != s.size())) + { + JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + s + "'", BasicJsonType())); + } + + // only triggered on special platforms (like 32bit), see also + // https://github.com/nlohmann/json/pull/2203 + if (res >= static_cast((std::numeric_limits::max)())) // NOLINT(runtime/int) + { + JSON_THROW(detail::out_of_range::create(410, "array index " + s + " exceeds size_type", BasicJsonType())); // LCOV_EXCL_LINE + } + + return static_cast(res); + } + + JSON_PRIVATE_UNLESS_TESTED: + json_pointer top() const + { + if (JSON_HEDLEY_UNLIKELY(empty())) + { + JSON_THROW(detail::out_of_range::create(405, "JSON pointer has no parent", BasicJsonType())); + } + + json_pointer result = *this; + result.reference_tokens = {reference_tokens[0]}; + return result; + } + + private: + /*! + @brief create and return a reference to the pointed to value + + @complexity Linear in the number of reference tokens. 
+ + @throw parse_error.109 if array index is not a number + @throw type_error.313 if value cannot be unflattened + */ + BasicJsonType& get_and_create(BasicJsonType& j) const + { + auto* result = &j; + + // in case no reference tokens exist, return a reference to the JSON value + // j which will be overwritten by a primitive value + for (const auto& reference_token : reference_tokens) + { + switch (result->type()) + { + case detail::value_t::null: + { + if (reference_token == "0") + { + // start a new array if reference token is 0 + result = &result->operator[](0); + } + else + { + // start a new object otherwise + result = &result->operator[](reference_token); + } + break; + } + + case detail::value_t::object: + { + // create an entry in the object + result = &result->operator[](reference_token); + break; + } + + case detail::value_t::array: + { + // create an entry in the array + result = &result->operator[](array_index(reference_token)); + break; + } + + /* + The following code is only reached if there exists a reference + token _and_ the current value is primitive. In this case, we have + an error situation, because primitive values may only occur as + single value; that is, with an empty list of reference tokens. + */ + default: + JSON_THROW(detail::type_error::create(313, "invalid value to unflatten", j)); + } + } + + return *result; + } + + /*! + @brief return a reference to the pointed to value + + @note This version does not throw if a value is not present, but tries to + create nested values instead. For instance, calling this function + with pointer `"/this/that"` on a null value is equivalent to calling + `operator[]("this").operator[]("that")` on that value, effectively + changing the null value to an object. + + @param[in] ptr a JSON value + + @return reference to the JSON value pointed to by the JSON pointer + + @complexity Linear in the length of the JSON pointer. + + @throw parse_error.106 if an array index begins with '0' + @throw parse_error.109 if an array index was not a number + @throw out_of_range.404 if the JSON pointer can not be resolved + */ + BasicJsonType& get_unchecked(BasicJsonType* ptr) const + { + for (const auto& reference_token : reference_tokens) + { + // convert null values to arrays or objects before continuing + if (ptr->is_null()) + { + // check if reference token is a number + const bool nums = + std::all_of(reference_token.begin(), reference_token.end(), + [](const unsigned char x) + { + return std::isdigit(x); + }); + + // change value to array for numbers or "-" or to object otherwise + *ptr = (nums || reference_token == "-") + ? detail::value_t::array + : detail::value_t::object; + } + + switch (ptr->type()) + { + case detail::value_t::object: + { + // use unchecked object access + ptr = &ptr->operator[](reference_token); + break; + } + + case detail::value_t::array: + { + if (reference_token == "-") + { + // explicitly treat "-" as index beyond the end + ptr = &ptr->operator[](ptr->m_value.array->size()); + } + else + { + // convert array index to number; unchecked access + ptr = &ptr->operator[](array_index(reference_token)); + } + break; + } + + default: + JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + reference_token + "'", *ptr)); + } + } + + return *ptr; + } + + /*! 
+ @throw parse_error.106 if an array index begins with '0' + @throw parse_error.109 if an array index was not a number + @throw out_of_range.402 if the array index '-' is used + @throw out_of_range.404 if the JSON pointer can not be resolved + */ + BasicJsonType& get_checked(BasicJsonType* ptr) const + { + for (const auto& reference_token : reference_tokens) + { + switch (ptr->type()) + { + case detail::value_t::object: + { + // note: at performs range check + ptr = &ptr->at(reference_token); + break; + } + + case detail::value_t::array: + { + if (JSON_HEDLEY_UNLIKELY(reference_token == "-")) + { + // "-" always fails the range check + JSON_THROW(detail::out_of_range::create(402, + "array index '-' (" + std::to_string(ptr->m_value.array->size()) + + ") is out of range", *ptr)); + } + + // note: at performs range check + ptr = &ptr->at(array_index(reference_token)); + break; + } + + default: + JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + reference_token + "'", *ptr)); + } + } + + return *ptr; + } + + /*! + @brief return a const reference to the pointed to value + + @param[in] ptr a JSON value + + @return const reference to the JSON value pointed to by the JSON + pointer + + @throw parse_error.106 if an array index begins with '0' + @throw parse_error.109 if an array index was not a number + @throw out_of_range.402 if the array index '-' is used + @throw out_of_range.404 if the JSON pointer can not be resolved + */ + const BasicJsonType& get_unchecked(const BasicJsonType* ptr) const + { + for (const auto& reference_token : reference_tokens) + { + switch (ptr->type()) + { + case detail::value_t::object: + { + // use unchecked object access + ptr = &ptr->operator[](reference_token); + break; + } + + case detail::value_t::array: + { + if (JSON_HEDLEY_UNLIKELY(reference_token == "-")) + { + // "-" cannot be used for const access + JSON_THROW(detail::out_of_range::create(402, "array index '-' (" + std::to_string(ptr->m_value.array->size()) + ") is out of range", *ptr)); + } + + // use unchecked array access + ptr = &ptr->operator[](array_index(reference_token)); + break; + } + + default: + JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + reference_token + "'", *ptr)); + } + } + + return *ptr; + } + + /*! + @throw parse_error.106 if an array index begins with '0' + @throw parse_error.109 if an array index was not a number + @throw out_of_range.402 if the array index '-' is used + @throw out_of_range.404 if the JSON pointer can not be resolved + */ + const BasicJsonType& get_checked(const BasicJsonType* ptr) const + { + for (const auto& reference_token : reference_tokens) + { + switch (ptr->type()) + { + case detail::value_t::object: + { + // note: at performs range check + ptr = &ptr->at(reference_token); + break; + } + + case detail::value_t::array: + { + if (JSON_HEDLEY_UNLIKELY(reference_token == "-")) + { + // "-" always fails the range check + JSON_THROW(detail::out_of_range::create(402, + "array index '-' (" + std::to_string(ptr->m_value.array->size()) + + ") is out of range", *ptr)); + } + + // note: at performs range check + ptr = &ptr->at(array_index(reference_token)); + break; + } + + default: + JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + reference_token + "'", *ptr)); + } + } + + return *ptr; + } + + /*! 
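+
+// Illustrative sketch (not part of this header): the difference between the
+// unchecked accessors (operator[] / get_unchecked) and the checked ones
+// (at() / get_checked), in particular for the past-the-end token "-".
+// Error codes are the ones documented above; treat the snippet as a usage
+// example under the public nlohmann::json API.
+//
+//     using json = nlohmann::json;
+//     json j = {{"list", {1, 2, 3}}};
+//
+//     j["/list/3"_json_pointer] = 4;     // unchecked: extends the array
+//     j["/list/-"_json_pointer] = 5;     // unchecked: "-" appends past the end
+//
+//     j.at("/list/0"_json_pointer);      // checked: ok, returns 1
+//     // j.at("/list/-"_json_pointer);   // checked: throws out_of_range.402
+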
+ @throw parse_error.106 if an array index begins with '0' + @throw parse_error.109 if an array index was not a number + */ + bool contains(const BasicJsonType* ptr) const + { + for (const auto& reference_token : reference_tokens) + { + switch (ptr->type()) + { + case detail::value_t::object: + { + if (!ptr->contains(reference_token)) + { + // we did not find the key in the object + return false; + } + + ptr = &ptr->operator[](reference_token); + break; + } + + case detail::value_t::array: + { + if (JSON_HEDLEY_UNLIKELY(reference_token == "-")) + { + // "-" always fails the range check + return false; + } + if (JSON_HEDLEY_UNLIKELY(reference_token.size() == 1 && !("0" <= reference_token && reference_token <= "9"))) + { + // invalid char + return false; + } + if (JSON_HEDLEY_UNLIKELY(reference_token.size() > 1)) + { + if (JSON_HEDLEY_UNLIKELY(!('1' <= reference_token[0] && reference_token[0] <= '9'))) + { + // first char should be between '1' and '9' + return false; + } + for (std::size_t i = 1; i < reference_token.size(); i++) + { + if (JSON_HEDLEY_UNLIKELY(!('0' <= reference_token[i] && reference_token[i] <= '9'))) + { + // other char should be between '0' and '9' + return false; + } + } + } + + const auto idx = array_index(reference_token); + if (idx >= ptr->size()) + { + // index out of range + return false; + } + + ptr = &ptr->operator[](idx); + break; + } + + default: + { + // we do not expect primitive values if there is still a + // reference token to process + return false; + } + } + } + + // no reference token left means we found a primitive value + return true; + } + + /*! + @brief split the string input to reference tokens + + @note This function is only called by the json_pointer constructor. + All exceptions below are documented there. + + @throw parse_error.107 if the pointer is not empty or begins with '/' + @throw parse_error.108 if character '~' is not followed by '0' or '1' + */ + static std::vector split(const std::string& reference_string) + { + std::vector result; + + // special case: empty reference string -> no reference tokens + if (reference_string.empty()) + { + return result; + } + + // check if nonempty reference string begins with slash + if (JSON_HEDLEY_UNLIKELY(reference_string[0] != '/')) + { + JSON_THROW(detail::parse_error::create(107, 1, "JSON pointer must be empty or begin with '/' - was: '" + reference_string + "'", BasicJsonType())); + } + + // extract the reference tokens: + // - slash: position of the last read slash (or end of string) + // - start: position after the previous slash + for ( + // search for the first slash after the first character + std::size_t slash = reference_string.find_first_of('/', 1), + // set the beginning of the first reference token + start = 1; + // we can stop if start == 0 (if slash == std::string::npos) + start != 0; + // set the beginning of the next reference token + // (will eventually be 0 if slash == std::string::npos) + start = (slash == std::string::npos) ? 0 : slash + 1, + // find next slash + slash = reference_string.find_first_of('/', start)) + { + // use the text between the beginning of the reference token + // (start) and the last slash (slash). 
+ auto reference_token = reference_string.substr(start, slash - start); + + // check reference tokens are properly escaped + for (std::size_t pos = reference_token.find_first_of('~'); + pos != std::string::npos; + pos = reference_token.find_first_of('~', pos + 1)) + { + JSON_ASSERT(reference_token[pos] == '~'); + + // ~ must be followed by 0 or 1 + if (JSON_HEDLEY_UNLIKELY(pos == reference_token.size() - 1 || + (reference_token[pos + 1] != '0' && + reference_token[pos + 1] != '1'))) + { + JSON_THROW(detail::parse_error::create(108, 0, "escape character '~' must be followed with '0' or '1'", BasicJsonType())); + } + } + + // finally, store the reference token + detail::unescape(reference_token); + result.push_back(reference_token); + } + + return result; + } + + private: + /*! + @param[in] reference_string the reference string to the current value + @param[in] value the value to consider + @param[in,out] result the result object to insert values to + + @note Empty objects or arrays are flattened to `null`. + */ + static void flatten(const std::string& reference_string, + const BasicJsonType& value, + BasicJsonType& result) + { + switch (value.type()) + { + case detail::value_t::array: + { + if (value.m_value.array->empty()) + { + // flatten empty array as null + result[reference_string] = nullptr; + } + else + { + // iterate array and use index as reference string + for (std::size_t i = 0; i < value.m_value.array->size(); ++i) + { + flatten(reference_string + "/" + std::to_string(i), + value.m_value.array->operator[](i), result); + } + } + break; + } + + case detail::value_t::object: + { + if (value.m_value.object->empty()) + { + // flatten empty object as null + result[reference_string] = nullptr; + } + else + { + // iterate object and use keys as reference string + for (const auto& element : *value.m_value.object) + { + flatten(reference_string + "/" + detail::escape(element.first), element.second, result); + } + } + break; + } + + default: + { + // add primitive value with its reference string + result[reference_string] = value; + break; + } + } + } + + /*! + @param[in] value flattened JSON + + @return unflattened JSON + + @throw parse_error.109 if array index is not a number + @throw type_error.314 if value is not an object + @throw type_error.315 if object values are not primitive + @throw type_error.313 if value cannot be unflattened + */ + static BasicJsonType + unflatten(const BasicJsonType& value) + { + if (JSON_HEDLEY_UNLIKELY(!value.is_object())) + { + JSON_THROW(detail::type_error::create(314, "only objects can be unflattened", value)); + } + + BasicJsonType result; + + // iterate the JSON object values + for (const auto& element : *value.m_value.object) + { + if (JSON_HEDLEY_UNLIKELY(!element.second.is_primitive())) + { + JSON_THROW(detail::type_error::create(315, "values in object must be primitive", element.second)); + } + + // assign value to reference pointed to by JSON pointer; Note that if + // the JSON pointer is "" (i.e., points to the whole value), function + // get_and_create returns a reference to result itself. An assignment + // will then create a primitive value. + json_pointer(element.first).get_and_create(result) = element.second; + } + + return result; + } + + /*! 
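+
+// Illustrative sketch (not part of this header): flatten()/unflatten() on
+// basic_json build directly on the two private helpers above. The expected
+// output shown is what the documented algorithm produces.
+//
+//     using json = nlohmann::json;
+//
+//     json j = {{"a", {{"b", 1}}}, {"c", json::array({true, nullptr})}};
+//     json flat = j.flatten();
+//     // flat == {{"/a/b", 1}, {"/c/0", true}, {"/c/1", nullptr}}
+//
+//     json back = flat.unflatten();   // round-trips to the original value
+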
+ @brief compares two JSON pointers for equality + + @param[in] lhs JSON pointer to compare + @param[in] rhs JSON pointer to compare + @return whether @a lhs is equal to @a rhs + + @complexity Linear in the length of the JSON pointer + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + */ + friend bool operator==(json_pointer const& lhs, + json_pointer const& rhs) noexcept + { + return lhs.reference_tokens == rhs.reference_tokens; + } + + /*! + @brief compares two JSON pointers for inequality + + @param[in] lhs JSON pointer to compare + @param[in] rhs JSON pointer to compare + @return whether @a lhs is not equal @a rhs + + @complexity Linear in the length of the JSON pointer + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + */ + friend bool operator!=(json_pointer const& lhs, + json_pointer const& rhs) noexcept + { + return !(lhs == rhs); + } + + /// the reference tokens + std::vector reference_tokens; +}; +} // namespace nlohmann + +// #include + + +#include +#include + +// #include + + +namespace nlohmann +{ +namespace detail +{ +template +class json_ref +{ + public: + using value_type = BasicJsonType; + + json_ref(value_type&& value) + : owned_value(std::move(value)) + {} + + json_ref(const value_type& value) + : value_ref(&value) + {} + + json_ref(std::initializer_list init) + : owned_value(init) + {} + + template < + class... Args, + enable_if_t::value, int> = 0 > + json_ref(Args && ... args) + : owned_value(std::forward(args)...) + {} + + // class should be movable only + json_ref(json_ref&&) noexcept = default; + json_ref(const json_ref&) = delete; + json_ref& operator=(const json_ref&) = delete; + json_ref& operator=(json_ref&&) = delete; + ~json_ref() = default; + + value_type moved_or_copied() const + { + if (value_ref == nullptr) + { + return std::move(owned_value); + } + return *value_ref; + } + + value_type const& operator*() const + { + return value_ref ? 
*value_ref : owned_value; + } + + value_type const* operator->() const + { + return &** this; + } + + private: + mutable value_type owned_value = nullptr; + value_type const* value_ref = nullptr; +}; +} // namespace detail +} // namespace nlohmann + +// #include + +// #include + +// #include + +// #include + +// #include + + +#include // reverse +#include // array +#include // isnan, isinf +#include // uint8_t, uint16_t, uint32_t, uint64_t +#include // memcpy +#include // numeric_limits +#include // string +#include // move + +// #include + +// #include + +// #include + + +#include // copy +#include // size_t +#include // back_inserter +#include // shared_ptr, make_shared +#include // basic_string +#include // vector + +#ifndef JSON_NO_IO + #include // streamsize + #include // basic_ostream +#endif // JSON_NO_IO + +// #include + + +namespace nlohmann +{ +namespace detail +{ +/// abstract output adapter interface +template struct output_adapter_protocol +{ + virtual void write_character(CharType c) = 0; + virtual void write_characters(const CharType* s, std::size_t length) = 0; + virtual ~output_adapter_protocol() = default; + + output_adapter_protocol() = default; + output_adapter_protocol(const output_adapter_protocol&) = default; + output_adapter_protocol(output_adapter_protocol&&) noexcept = default; + output_adapter_protocol& operator=(const output_adapter_protocol&) = default; + output_adapter_protocol& operator=(output_adapter_protocol&&) noexcept = default; +}; + +/// a type to simplify interfaces +template +using output_adapter_t = std::shared_ptr>; + +/// output adapter for byte vectors +template +class output_vector_adapter : public output_adapter_protocol +{ + public: + explicit output_vector_adapter(std::vector& vec) noexcept + : v(vec) + {} + + void write_character(CharType c) override + { + v.push_back(c); + } + + JSON_HEDLEY_NON_NULL(2) + void write_characters(const CharType* s, std::size_t length) override + { + std::copy(s, s + length, std::back_inserter(v)); + } + + private: + std::vector& v; +}; + +#ifndef JSON_NO_IO +/// output adapter for output streams +template +class output_stream_adapter : public output_adapter_protocol +{ + public: + explicit output_stream_adapter(std::basic_ostream& s) noexcept + : stream(s) + {} + + void write_character(CharType c) override + { + stream.put(c); + } + + JSON_HEDLEY_NON_NULL(2) + void write_characters(const CharType* s, std::size_t length) override + { + stream.write(s, static_cast(length)); + } + + private: + std::basic_ostream& stream; +}; +#endif // JSON_NO_IO + +/// output adapter for basic_string +template> +class output_string_adapter : public output_adapter_protocol +{ + public: + explicit output_string_adapter(StringType& s) noexcept + : str(s) + {} + + void write_character(CharType c) override + { + str.push_back(c); + } + + JSON_HEDLEY_NON_NULL(2) + void write_characters(const CharType* s, std::size_t length) override + { + str.append(s, length); + } + + private: + StringType& str; +}; + +template> +class output_adapter +{ + public: + output_adapter(std::vector& vec) + : oa(std::make_shared>(vec)) {} + +#ifndef JSON_NO_IO + output_adapter(std::basic_ostream& s) + : oa(std::make_shared>(s)) {} +#endif // JSON_NO_IO + + output_adapter(StringType& s) + : oa(std::make_shared>(s)) {} + + operator output_adapter_t() + { + return oa; + } + + private: + output_adapter_t oa = nullptr; +}; +} // namespace detail +} // namespace nlohmann + + +namespace nlohmann +{ +namespace detail +{ +/////////////////// +// binary writer // 
+/////////////////// + +/*! +@brief serialization to CBOR and MessagePack values +*/ +template +class binary_writer +{ + using string_t = typename BasicJsonType::string_t; + using binary_t = typename BasicJsonType::binary_t; + using number_float_t = typename BasicJsonType::number_float_t; + + public: + /*! + @brief create a binary writer + + @param[in] adapter output adapter to write to + */ + explicit binary_writer(output_adapter_t adapter) : oa(std::move(adapter)) + { + JSON_ASSERT(oa); + } + + /*! + @param[in] j JSON value to serialize + @pre j.type() == value_t::object + */ + void write_bson(const BasicJsonType& j) + { + switch (j.type()) + { + case value_t::object: + { + write_bson_object(*j.m_value.object); + break; + } + + default: + { + JSON_THROW(type_error::create(317, "to serialize to BSON, top-level type must be object, but is " + std::string(j.type_name()), j));; + } + } + } + + /*! + @param[in] j JSON value to serialize + */ + void write_cbor(const BasicJsonType& j) + { + switch (j.type()) + { + case value_t::null: + { + oa->write_character(to_char_type(0xF6)); + break; + } + + case value_t::boolean: + { + oa->write_character(j.m_value.boolean + ? to_char_type(0xF5) + : to_char_type(0xF4)); + break; + } + + case value_t::number_integer: + { + if (j.m_value.number_integer >= 0) + { + // CBOR does not differentiate between positive signed + // integers and unsigned integers. Therefore, we used the + // code from the value_t::number_unsigned case here. + if (j.m_value.number_integer <= 0x17) + { + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_integer <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x18)); + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_integer <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x19)); + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_integer <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x1A)); + write_number(static_cast(j.m_value.number_integer)); + } + else + { + oa->write_character(to_char_type(0x1B)); + write_number(static_cast(j.m_value.number_integer)); + } + } + else + { + // The conversions below encode the sign in the first + // byte, and the value is converted to a positive number. 
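+                    //
+                    // Worked example (CBOR major type 1, RFC 8949): for n = -500
+                    // the encoded argument is -1 - n = 499 = 0x01F3, which needs
+                    // a 16-bit argument, so the bytes written are 0x39 0x01 0xF3.
+                    // For n = -24 the argument 23 still fits the initial byte,
+                    // giving the single byte 0x20 + 23 = 0x37.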
+ const auto positive_number = -1 - j.m_value.number_integer; + if (j.m_value.number_integer >= -24) + { + write_number(static_cast(0x20 + positive_number)); + } + else if (positive_number <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x38)); + write_number(static_cast(positive_number)); + } + else if (positive_number <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x39)); + write_number(static_cast(positive_number)); + } + else if (positive_number <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x3A)); + write_number(static_cast(positive_number)); + } + else + { + oa->write_character(to_char_type(0x3B)); + write_number(static_cast(positive_number)); + } + } + break; + } + + case value_t::number_unsigned: + { + if (j.m_value.number_unsigned <= 0x17) + { + write_number(static_cast(j.m_value.number_unsigned)); + } + else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x18)); + write_number(static_cast(j.m_value.number_unsigned)); + } + else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x19)); + write_number(static_cast(j.m_value.number_unsigned)); + } + else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x1A)); + write_number(static_cast(j.m_value.number_unsigned)); + } + else + { + oa->write_character(to_char_type(0x1B)); + write_number(static_cast(j.m_value.number_unsigned)); + } + break; + } + + case value_t::number_float: + { + if (std::isnan(j.m_value.number_float)) + { + // NaN is 0xf97e00 in CBOR + oa->write_character(to_char_type(0xF9)); + oa->write_character(to_char_type(0x7E)); + oa->write_character(to_char_type(0x00)); + } + else if (std::isinf(j.m_value.number_float)) + { + // Infinity is 0xf97c00, -Infinity is 0xf9fc00 + oa->write_character(to_char_type(0xf9)); + oa->write_character(j.m_value.number_float > 0 ? 
to_char_type(0x7C) : to_char_type(0xFC)); + oa->write_character(to_char_type(0x00)); + } + else + { + write_compact_float(j.m_value.number_float, detail::input_format_t::cbor); + } + break; + } + + case value_t::string: + { + // step 1: write control byte and the string length + const auto N = j.m_value.string->size(); + if (N <= 0x17) + { + write_number(static_cast(0x60 + N)); + } + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x78)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x79)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x7A)); + write_number(static_cast(N)); + } + // LCOV_EXCL_START + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x7B)); + write_number(static_cast(N)); + } + // LCOV_EXCL_STOP + + // step 2: write the string + oa->write_characters( + reinterpret_cast(j.m_value.string->c_str()), + j.m_value.string->size()); + break; + } + + case value_t::array: + { + // step 1: write control byte and the array size + const auto N = j.m_value.array->size(); + if (N <= 0x17) + { + write_number(static_cast(0x80 + N)); + } + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x98)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x99)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x9A)); + write_number(static_cast(N)); + } + // LCOV_EXCL_START + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x9B)); + write_number(static_cast(N)); + } + // LCOV_EXCL_STOP + + // step 2: write each element + for (const auto& el : *j.m_value.array) + { + write_cbor(el); + } + break; + } + + case value_t::binary: + { + if (j.m_value.binary->has_subtype()) + { + write_number(static_cast(0xd8)); + write_number(j.m_value.binary->subtype()); + } + + // step 1: write control byte and the binary array size + const auto N = j.m_value.binary->size(); + if (N <= 0x17) + { + write_number(static_cast(0x40 + N)); + } + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x58)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x59)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x5A)); + write_number(static_cast(N)); + } + // LCOV_EXCL_START + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0x5B)); + write_number(static_cast(N)); + } + // LCOV_EXCL_STOP + + // step 2: write each element + oa->write_characters( + reinterpret_cast(j.m_value.binary->data()), + N); + + break; + } + + case value_t::object: + { + // step 1: write control byte and the object size + const auto N = j.m_value.object->size(); + if (N <= 0x17) + { + write_number(static_cast(0xA0 + N)); + } + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0xB8)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0xB9)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0xBA)); + write_number(static_cast(N)); + } + // LCOV_EXCL_START + else 
if (N <= (std::numeric_limits::max)()) + { + oa->write_character(to_char_type(0xBB)); + write_number(static_cast(N)); + } + // LCOV_EXCL_STOP + + // step 2: write each element + for (const auto& el : *j.m_value.object) + { + write_cbor(el.first); + write_cbor(el.second); + } + break; + } + + default: + break; + } + } + + /*! + @param[in] j JSON value to serialize + */ + void write_msgpack(const BasicJsonType& j) + { + switch (j.type()) + { + case value_t::null: // nil + { + oa->write_character(to_char_type(0xC0)); + break; + } + + case value_t::boolean: // true and false + { + oa->write_character(j.m_value.boolean + ? to_char_type(0xC3) + : to_char_type(0xC2)); + break; + } + + case value_t::number_integer: + { + if (j.m_value.number_integer >= 0) + { + // MessagePack does not differentiate between positive + // signed integers and unsigned integers. Therefore, we used + // the code from the value_t::number_unsigned case here. + if (j.m_value.number_unsigned < 128) + { + // positive fixnum + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + { + // uint 8 + oa->write_character(to_char_type(0xCC)); + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + { + // uint 16 + oa->write_character(to_char_type(0xCD)); + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + { + // uint 32 + oa->write_character(to_char_type(0xCE)); + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + { + // uint 64 + oa->write_character(to_char_type(0xCF)); + write_number(static_cast(j.m_value.number_integer)); + } + } + else + { + if (j.m_value.number_integer >= -32) + { + // negative fixnum + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_integer >= (std::numeric_limits::min)() && + j.m_value.number_integer <= (std::numeric_limits::max)()) + { + // int 8 + oa->write_character(to_char_type(0xD0)); + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_integer >= (std::numeric_limits::min)() && + j.m_value.number_integer <= (std::numeric_limits::max)()) + { + // int 16 + oa->write_character(to_char_type(0xD1)); + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_integer >= (std::numeric_limits::min)() && + j.m_value.number_integer <= (std::numeric_limits::max)()) + { + // int 32 + oa->write_character(to_char_type(0xD2)); + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_integer >= (std::numeric_limits::min)() && + j.m_value.number_integer <= (std::numeric_limits::max)()) + { + // int 64 + oa->write_character(to_char_type(0xD3)); + write_number(static_cast(j.m_value.number_integer)); + } + } + break; + } + + case value_t::number_unsigned: + { + if (j.m_value.number_unsigned < 128) + { + // positive fixnum + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + { + // uint 8 + oa->write_character(to_char_type(0xCC)); + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + { + // uint 16 + oa->write_character(to_char_type(0xCD)); + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_unsigned <= 
(std::numeric_limits::max)()) + { + // uint 32 + oa->write_character(to_char_type(0xCE)); + write_number(static_cast(j.m_value.number_integer)); + } + else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + { + // uint 64 + oa->write_character(to_char_type(0xCF)); + write_number(static_cast(j.m_value.number_integer)); + } + break; + } + + case value_t::number_float: + { + write_compact_float(j.m_value.number_float, detail::input_format_t::msgpack); + break; + } + + case value_t::string: + { + // step 1: write control byte and the string length + const auto N = j.m_value.string->size(); + if (N <= 31) + { + // fixstr + write_number(static_cast(0xA0 | N)); + } + else if (N <= (std::numeric_limits::max)()) + { + // str 8 + oa->write_character(to_char_type(0xD9)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + // str 16 + oa->write_character(to_char_type(0xDA)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + // str 32 + oa->write_character(to_char_type(0xDB)); + write_number(static_cast(N)); + } + + // step 2: write the string + oa->write_characters( + reinterpret_cast(j.m_value.string->c_str()), + j.m_value.string->size()); + break; + } + + case value_t::array: + { + // step 1: write control byte and the array size + const auto N = j.m_value.array->size(); + if (N <= 15) + { + // fixarray + write_number(static_cast(0x90 | N)); + } + else if (N <= (std::numeric_limits::max)()) + { + // array 16 + oa->write_character(to_char_type(0xDC)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + // array 32 + oa->write_character(to_char_type(0xDD)); + write_number(static_cast(N)); + } + + // step 2: write each element + for (const auto& el : *j.m_value.array) + { + write_msgpack(el); + } + break; + } + + case value_t::binary: + { + // step 0: determine if the binary type has a set subtype to + // determine whether or not to use the ext or fixext types + const bool use_ext = j.m_value.binary->has_subtype(); + + // step 1: write control byte and the byte string length + const auto N = j.m_value.binary->size(); + if (N <= (std::numeric_limits::max)()) + { + std::uint8_t output_type{}; + bool fixed = true; + if (use_ext) + { + switch (N) + { + case 1: + output_type = 0xD4; // fixext 1 + break; + case 2: + output_type = 0xD5; // fixext 2 + break; + case 4: + output_type = 0xD6; // fixext 4 + break; + case 8: + output_type = 0xD7; // fixext 8 + break; + case 16: + output_type = 0xD8; // fixext 16 + break; + default: + output_type = 0xC7; // ext 8 + fixed = false; + break; + } + + } + else + { + output_type = 0xC4; // bin 8 + fixed = false; + } + + oa->write_character(to_char_type(output_type)); + if (!fixed) + { + write_number(static_cast(N)); + } + } + else if (N <= (std::numeric_limits::max)()) + { + std::uint8_t output_type = use_ext + ? 0xC8 // ext 16 + : 0xC5; // bin 16 + + oa->write_character(to_char_type(output_type)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + std::uint8_t output_type = use_ext + ? 
0xC9 // ext 32 + : 0xC6; // bin 32 + + oa->write_character(to_char_type(output_type)); + write_number(static_cast(N)); + } + + // step 1.5: if this is an ext type, write the subtype + if (use_ext) + { + write_number(static_cast(j.m_value.binary->subtype())); + } + + // step 2: write the byte string + oa->write_characters( + reinterpret_cast(j.m_value.binary->data()), + N); + + break; + } + + case value_t::object: + { + // step 1: write control byte and the object size + const auto N = j.m_value.object->size(); + if (N <= 15) + { + // fixmap + write_number(static_cast(0x80 | (N & 0xF))); + } + else if (N <= (std::numeric_limits::max)()) + { + // map 16 + oa->write_character(to_char_type(0xDE)); + write_number(static_cast(N)); + } + else if (N <= (std::numeric_limits::max)()) + { + // map 32 + oa->write_character(to_char_type(0xDF)); + write_number(static_cast(N)); + } + + // step 2: write each element + for (const auto& el : *j.m_value.object) + { + write_msgpack(el.first); + write_msgpack(el.second); + } + break; + } + + default: + break; + } + } + + /*! + @param[in] j JSON value to serialize + @param[in] use_count whether to use '#' prefixes (optimized format) + @param[in] use_type whether to use '$' prefixes (optimized format) + @param[in] add_prefix whether prefixes need to be used for this value + */ + void write_ubjson(const BasicJsonType& j, const bool use_count, + const bool use_type, const bool add_prefix = true) + { + switch (j.type()) + { + case value_t::null: + { + if (add_prefix) + { + oa->write_character(to_char_type('Z')); + } + break; + } + + case value_t::boolean: + { + if (add_prefix) + { + oa->write_character(j.m_value.boolean + ? to_char_type('T') + : to_char_type('F')); + } + break; + } + + case value_t::number_integer: + { + write_number_with_ubjson_prefix(j.m_value.number_integer, add_prefix); + break; + } + + case value_t::number_unsigned: + { + write_number_with_ubjson_prefix(j.m_value.number_unsigned, add_prefix); + break; + } + + case value_t::number_float: + { + write_number_with_ubjson_prefix(j.m_value.number_float, add_prefix); + break; + } + + case value_t::string: + { + if (add_prefix) + { + oa->write_character(to_char_type('S')); + } + write_number_with_ubjson_prefix(j.m_value.string->size(), true); + oa->write_characters( + reinterpret_cast(j.m_value.string->c_str()), + j.m_value.string->size()); + break; + } + + case value_t::array: + { + if (add_prefix) + { + oa->write_character(to_char_type('[')); + } + + bool prefix_required = true; + if (use_type && !j.m_value.array->empty()) + { + JSON_ASSERT(use_count); + const CharType first_prefix = ubjson_prefix(j.front()); + const bool same_prefix = std::all_of(j.begin() + 1, j.end(), + [this, first_prefix](const BasicJsonType & v) + { + return ubjson_prefix(v) == first_prefix; + }); + + if (same_prefix) + { + prefix_required = false; + oa->write_character(to_char_type('$')); + oa->write_character(first_prefix); + } + } + + if (use_count) + { + oa->write_character(to_char_type('#')); + write_number_with_ubjson_prefix(j.m_value.array->size(), true); + } + + for (const auto& el : *j.m_value.array) + { + write_ubjson(el, use_count, use_type, prefix_required); + } + + if (!use_count) + { + oa->write_character(to_char_type(']')); + } + + break; + } + + case value_t::binary: + { + if (add_prefix) + { + oa->write_character(to_char_type('[')); + } + + if (use_type && !j.m_value.binary->empty()) + { + JSON_ASSERT(use_count); + oa->write_character(to_char_type('$')); + oa->write_character('U'); + } + + if (use_count) + { + 
oa->write_character(to_char_type('#')); + write_number_with_ubjson_prefix(j.m_value.binary->size(), true); + } + + if (use_type) + { + oa->write_characters( + reinterpret_cast(j.m_value.binary->data()), + j.m_value.binary->size()); + } + else + { + for (size_t i = 0; i < j.m_value.binary->size(); ++i) + { + oa->write_character(to_char_type('U')); + oa->write_character(j.m_value.binary->data()[i]); + } + } + + if (!use_count) + { + oa->write_character(to_char_type(']')); + } + + break; + } + + case value_t::object: + { + if (add_prefix) + { + oa->write_character(to_char_type('{')); + } + + bool prefix_required = true; + if (use_type && !j.m_value.object->empty()) + { + JSON_ASSERT(use_count); + const CharType first_prefix = ubjson_prefix(j.front()); + const bool same_prefix = std::all_of(j.begin(), j.end(), + [this, first_prefix](const BasicJsonType & v) + { + return ubjson_prefix(v) == first_prefix; + }); + + if (same_prefix) + { + prefix_required = false; + oa->write_character(to_char_type('$')); + oa->write_character(first_prefix); + } + } + + if (use_count) + { + oa->write_character(to_char_type('#')); + write_number_with_ubjson_prefix(j.m_value.object->size(), true); + } + + for (const auto& el : *j.m_value.object) + { + write_number_with_ubjson_prefix(el.first.size(), true); + oa->write_characters( + reinterpret_cast(el.first.c_str()), + el.first.size()); + write_ubjson(el.second, use_count, use_type, prefix_required); + } + + if (!use_count) + { + oa->write_character(to_char_type('}')); + } + + break; + } + + default: + break; + } + } + + private: + ////////// + // BSON // + ////////// + + /*! + @return The size of a BSON document entry header, including the id marker + and the entry name size (and its null-terminator). + */ + static std::size_t calc_bson_entry_header_size(const string_t& name, const BasicJsonType& j) + { + const auto it = name.find(static_cast(0)); + if (JSON_HEDLEY_UNLIKELY(it != BasicJsonType::string_t::npos)) + { + JSON_THROW(out_of_range::create(409, "BSON key cannot contain code point U+0000 (at byte " + std::to_string(it) + ")", j)); + } + + return /*id*/ 1ul + name.size() + /*zero-terminator*/1u; + } + + /*! + @brief Writes the given @a element_type and @a name to the output adapter + */ + void write_bson_entry_header(const string_t& name, + const std::uint8_t element_type) + { + oa->write_character(to_char_type(element_type)); // boolean + oa->write_characters( + reinterpret_cast(name.c_str()), + name.size() + 1u); + } + + /*! + @brief Writes a BSON element with key @a name and boolean value @a value + */ + void write_bson_boolean(const string_t& name, + const bool value) + { + write_bson_entry_header(name, 0x08); + oa->write_character(value ? to_char_type(0x01) : to_char_type(0x00)); + } + + /*! + @brief Writes a BSON element with key @a name and double value @a value + */ + void write_bson_double(const string_t& name, + const double value) + { + write_bson_entry_header(name, 0x01); + write_number(value); + } + + /*! + @return The size of the BSON-encoded string in @a value + */ + static std::size_t calc_bson_string_size(const string_t& value) + { + return sizeof(std::int32_t) + value.size() + 1ul; + } + + /*! + @brief Writes a BSON element with key @a name and string value @a value + */ + void write_bson_string(const string_t& name, + const string_t& value) + { + write_bson_entry_header(name, 0x02); + + write_number(static_cast(value.size() + 1ul)); + oa->write_characters( + reinterpret_cast(value.c_str()), + value.size() + 1); + } + + /*! 
+ @brief Writes a BSON element with key @a name and null value + */ + void write_bson_null(const string_t& name) + { + write_bson_entry_header(name, 0x0A); + } + + /*! + @return The size of the BSON-encoded integer @a value + */ + static std::size_t calc_bson_integer_size(const std::int64_t value) + { + return (std::numeric_limits::min)() <= value && value <= (std::numeric_limits::max)() + ? sizeof(std::int32_t) + : sizeof(std::int64_t); + } + + /*! + @brief Writes a BSON element with key @a name and integer @a value + */ + void write_bson_integer(const string_t& name, + const std::int64_t value) + { + if ((std::numeric_limits::min)() <= value && value <= (std::numeric_limits::max)()) + { + write_bson_entry_header(name, 0x10); // int32 + write_number(static_cast(value)); + } + else + { + write_bson_entry_header(name, 0x12); // int64 + write_number(static_cast(value)); + } + } + + /*! + @return The size of the BSON-encoded unsigned integer in @a j + */ + static constexpr std::size_t calc_bson_unsigned_size(const std::uint64_t value) noexcept + { + return (value <= static_cast((std::numeric_limits::max)())) + ? sizeof(std::int32_t) + : sizeof(std::int64_t); + } + + /*! + @brief Writes a BSON element with key @a name and unsigned @a value + */ + void write_bson_unsigned(const string_t& name, + const BasicJsonType& j) + { + if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + { + write_bson_entry_header(name, 0x10 /* int32 */); + write_number(static_cast(j.m_value.number_unsigned)); + } + else if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + { + write_bson_entry_header(name, 0x12 /* int64 */); + write_number(static_cast(j.m_value.number_unsigned)); + } + else + { + JSON_THROW(out_of_range::create(407, "integer number " + std::to_string(j.m_value.number_unsigned) + " cannot be represented by BSON as it does not fit int64", j)); + } + } + + /*! + @brief Writes a BSON element with key @a name and object @a value + */ + void write_bson_object_entry(const string_t& name, + const typename BasicJsonType::object_t& value) + { + write_bson_entry_header(name, 0x03); // object + write_bson_object(value); + } + + /*! + @return The size of the BSON-encoded array @a value + */ + static std::size_t calc_bson_array_size(const typename BasicJsonType::array_t& value) + { + std::size_t array_index = 0ul; + + const std::size_t embedded_document_size = std::accumulate(std::begin(value), std::end(value), std::size_t(0), [&array_index](std::size_t result, const typename BasicJsonType::array_t::value_type & el) + { + return result + calc_bson_element_size(std::to_string(array_index++), el); + }); + + return sizeof(std::int32_t) + embedded_document_size + 1ul; + } + + /*! + @return The size of the BSON-encoded binary array @a value + */ + static std::size_t calc_bson_binary_size(const typename BasicJsonType::binary_t& value) + { + return sizeof(std::int32_t) + value.size() + 1ul; + } + + /*! + @brief Writes a BSON element with key @a name and array @a value + */ + void write_bson_array(const string_t& name, + const typename BasicJsonType::array_t& value) + { + write_bson_entry_header(name, 0x04); // array + write_number(static_cast(calc_bson_array_size(value))); + + std::size_t array_index = 0ul; + + for (const auto& el : value) + { + write_bson_element(std::to_string(array_index++), el); + } + + oa->write_character(to_char_type(0x00)); + } + + /*! 
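Two details of the integer and array helpers above are easy to miss: a signed value is stored as int32 (element type 0x10) whenever it fits and only otherwise as int64 (0x12), and BSON has no native array encoding, so write_bson_array emits an embedded document whose keys are the decimal indices "0", "1", ... A small sketch of the width rule (a hypothetical helper, not part of the library):

#include <cstdint>
#include <limits>

std::uint8_t bson_int_element_type(const std::int64_t value)
{
    const bool fits32 =
        value >= (std::numeric_limits<std::int32_t>::min)() &&
        value <= (std::numeric_limits<std::int32_t>::max)();
    return fits32 ? 0x10   // int32 element
                  : 0x12;  // int64 element
}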
+ @brief Writes a BSON element with key @a name and binary value @a value + */ + void write_bson_binary(const string_t& name, + const binary_t& value) + { + write_bson_entry_header(name, 0x05); + + write_number(static_cast(value.size())); + write_number(value.has_subtype() ? value.subtype() : std::uint8_t(0x00)); + + oa->write_characters(reinterpret_cast(value.data()), value.size()); + } + + /*! + @brief Calculates the size necessary to serialize the JSON value @a j with its @a name + @return The calculated size for the BSON document entry for @a j with the given @a name. + */ + static std::size_t calc_bson_element_size(const string_t& name, + const BasicJsonType& j) + { + const auto header_size = calc_bson_entry_header_size(name, j); + switch (j.type()) + { + case value_t::object: + return header_size + calc_bson_object_size(*j.m_value.object); + + case value_t::array: + return header_size + calc_bson_array_size(*j.m_value.array); + + case value_t::binary: + return header_size + calc_bson_binary_size(*j.m_value.binary); + + case value_t::boolean: + return header_size + 1ul; + + case value_t::number_float: + return header_size + 8ul; + + case value_t::number_integer: + return header_size + calc_bson_integer_size(j.m_value.number_integer); + + case value_t::number_unsigned: + return header_size + calc_bson_unsigned_size(j.m_value.number_unsigned); + + case value_t::string: + return header_size + calc_bson_string_size(*j.m_value.string); + + case value_t::null: + return header_size + 0ul; + + // LCOV_EXCL_START + default: + JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) + return 0ul; + // LCOV_EXCL_STOP + } + } + + /*! + @brief Serializes the JSON value @a j to BSON and associates it with the + key @a name. + @param name The name to associate with the JSON entity @a j within the + current BSON document + */ + void write_bson_element(const string_t& name, + const BasicJsonType& j) + { + switch (j.type()) + { + case value_t::object: + return write_bson_object_entry(name, *j.m_value.object); + + case value_t::array: + return write_bson_array(name, *j.m_value.array); + + case value_t::binary: + return write_bson_binary(name, *j.m_value.binary); + + case value_t::boolean: + return write_bson_boolean(name, j.m_value.boolean); + + case value_t::number_float: + return write_bson_double(name, j.m_value.number_float); + + case value_t::number_integer: + return write_bson_integer(name, j.m_value.number_integer); + + case value_t::number_unsigned: + return write_bson_unsigned(name, j); + + case value_t::string: + return write_bson_string(name, *j.m_value.string); + + case value_t::null: + return write_bson_null(name); + + // LCOV_EXCL_START + default: + JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) + return; + // LCOV_EXCL_STOP + } + } + + /*! + @brief Calculates the size of the BSON serialization of the given + JSON-object @a j. + @param[in] value JSON value to serialize + @pre value.type() == value_t::object + */ + static std::size_t calc_bson_object_size(const typename BasicJsonType::object_t& value) + { + std::size_t document_size = std::accumulate(value.begin(), value.end(), std::size_t(0), + [](size_t result, const typename BasicJsonType::object_t::value_type & el) + { + return result += calc_bson_element_size(el.first, el.second); + }); + + return sizeof(std::int32_t) + document_size + 1ul; + } + + /*! 
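Because every BSON document begins with its own total byte length, the writer runs in two passes: calc_bson_object_size / calc_bson_element_size first compute the sizes, and only then are bytes emitted. A quick cross-check sketch of that invariant, using the public to_bson() helper:

#include <nlohmann/json.hpp>
#include <cassert>
#include <cstddef>

void bson_size_invariant_example()
{
    const nlohmann::json j = {{"n", 42}, {"ok", true}};
    const auto bytes = nlohmann::json::to_bson(j);
    // The first four bytes are the little-endian int32 size of the whole document.
    const std::size_t declared =
        static_cast<std::size_t>(bytes[0]) |
        (static_cast<std::size_t>(bytes[1]) << 8) |
        (static_cast<std::size_t>(bytes[2]) << 16) |
        (static_cast<std::size_t>(bytes[3]) << 24);
    assert(declared == bytes.size());
}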
+ @param[in] value JSON value to serialize + @pre value.type() == value_t::object + */ + void write_bson_object(const typename BasicJsonType::object_t& value) + { + write_number(static_cast(calc_bson_object_size(value))); + + for (const auto& el : value) + { + write_bson_element(el.first, el.second); + } + + oa->write_character(to_char_type(0x00)); + } + + ////////// + // CBOR // + ////////// + + static constexpr CharType get_cbor_float_prefix(float /*unused*/) + { + return to_char_type(0xFA); // Single-Precision Float + } + + static constexpr CharType get_cbor_float_prefix(double /*unused*/) + { + return to_char_type(0xFB); // Double-Precision Float + } + + ///////////// + // MsgPack // + ///////////// + + static constexpr CharType get_msgpack_float_prefix(float /*unused*/) + { + return to_char_type(0xCA); // float 32 + } + + static constexpr CharType get_msgpack_float_prefix(double /*unused*/) + { + return to_char_type(0xCB); // float 64 + } + + //////////// + // UBJSON // + //////////// + + // UBJSON: write number (floating point) + template::value, int>::type = 0> + void write_number_with_ubjson_prefix(const NumberType n, + const bool add_prefix) + { + if (add_prefix) + { + oa->write_character(get_ubjson_float_prefix(n)); + } + write_number(n); + } + + // UBJSON: write number (unsigned integer) + template::value, int>::type = 0> + void write_number_with_ubjson_prefix(const NumberType n, + const bool add_prefix) + { + if (n <= static_cast((std::numeric_limits::max)())) + { + if (add_prefix) + { + oa->write_character(to_char_type('i')); // int8 + } + write_number(static_cast(n)); + } + else if (n <= (std::numeric_limits::max)()) + { + if (add_prefix) + { + oa->write_character(to_char_type('U')); // uint8 + } + write_number(static_cast(n)); + } + else if (n <= static_cast((std::numeric_limits::max)())) + { + if (add_prefix) + { + oa->write_character(to_char_type('I')); // int16 + } + write_number(static_cast(n)); + } + else if (n <= static_cast((std::numeric_limits::max)())) + { + if (add_prefix) + { + oa->write_character(to_char_type('l')); // int32 + } + write_number(static_cast(n)); + } + else if (n <= static_cast((std::numeric_limits::max)())) + { + if (add_prefix) + { + oa->write_character(to_char_type('L')); // int64 + } + write_number(static_cast(n)); + } + else + { + if (add_prefix) + { + oa->write_character(to_char_type('H')); // high-precision number + } + + const auto number = BasicJsonType(n).dump(); + write_number_with_ubjson_prefix(number.size(), true); + for (std::size_t i = 0; i < number.size(); ++i) + { + oa->write_character(to_char_type(static_cast(number[i]))); + } + } + } + + // UBJSON: write number (signed integer) + template < typename NumberType, typename std::enable_if < + std::is_signed::value&& + !std::is_floating_point::value, int >::type = 0 > + void write_number_with_ubjson_prefix(const NumberType n, + const bool add_prefix) + { + if ((std::numeric_limits::min)() <= n && n <= (std::numeric_limits::max)()) + { + if (add_prefix) + { + oa->write_character(to_char_type('i')); // int8 + } + write_number(static_cast(n)); + } + else if (static_cast((std::numeric_limits::min)()) <= n && n <= static_cast((std::numeric_limits::max)())) + { + if (add_prefix) + { + oa->write_character(to_char_type('U')); // uint8 + } + write_number(static_cast(n)); + } + else if ((std::numeric_limits::min)() <= n && n <= (std::numeric_limits::max)()) + { + if (add_prefix) + { + oa->write_character(to_char_type('I')); // int16 + } + write_number(static_cast(n)); + } + else if 
((std::numeric_limits::min)() <= n && n <= (std::numeric_limits::max)()) + { + if (add_prefix) + { + oa->write_character(to_char_type('l')); // int32 + } + write_number(static_cast(n)); + } + else if ((std::numeric_limits::min)() <= n && n <= (std::numeric_limits::max)()) + { + if (add_prefix) + { + oa->write_character(to_char_type('L')); // int64 + } + write_number(static_cast(n)); + } + // LCOV_EXCL_START + else + { + if (add_prefix) + { + oa->write_character(to_char_type('H')); // high-precision number + } + + const auto number = BasicJsonType(n).dump(); + write_number_with_ubjson_prefix(number.size(), true); + for (std::size_t i = 0; i < number.size(); ++i) + { + oa->write_character(to_char_type(static_cast(number[i]))); + } + } + // LCOV_EXCL_STOP + } + + /*! + @brief determine the type prefix of container values + */ + CharType ubjson_prefix(const BasicJsonType& j) const noexcept + { + switch (j.type()) + { + case value_t::null: + return 'Z'; + + case value_t::boolean: + return j.m_value.boolean ? 'T' : 'F'; + + case value_t::number_integer: + { + if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + { + return 'i'; + } + if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + { + return 'U'; + } + if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + { + return 'I'; + } + if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + { + return 'l'; + } + if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + { + return 'L'; + } + // anything else is treated as high-precision number + return 'H'; // LCOV_EXCL_LINE + } + + case value_t::number_unsigned: + { + if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + { + return 'i'; + } + if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + { + return 'U'; + } + if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + { + return 'I'; + } + if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + { + return 'l'; + } + if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + { + return 'L'; + } + // anything else is treated as high-precision number + return 'H'; // LCOV_EXCL_LINE + } + + case value_t::number_float: + return get_ubjson_float_prefix(j.m_value.number_float); + + case value_t::string: + return 'S'; + + case value_t::array: // fallthrough + case value_t::binary: + return '['; + + case value_t::object: + return '{'; + + default: // discarded values + return 'N'; + } + } + + static constexpr CharType get_ubjson_float_prefix(float /*unused*/) + { + return 'd'; // float 32 + } + + static constexpr CharType get_ubjson_float_prefix(double /*unused*/) + { + return 'D'; // float 64 + } + + /////////////////////// + // Utility functions // + /////////////////////// + + /* + @brief write a number to output input + @param[in] n number of type @a NumberType + @tparam NumberType the type of the number + @tparam OutputIsLittleEndian Set to true if output data is + required to be little endian + + @note This function needs to respect the system's endianess, because bytes + in CBOR, MessagePack, and UBJSON are stored in network order (big + endian) and therefore need reordering on little endian systems. 
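As a concrete illustration of the two byte orders (an informal sketch, not from the library's test suite): the MessagePack encoding of the integer 0x1234 is the marker 0xCD followed by the big-endian payload 0x12 0x34, while a BSON int32 field with value 5 stores its payload little-endian as 05 00 00 00.

#include <nlohmann/json.hpp>

void endianness_example()
{
    // MessagePack: multi-byte integers are big-endian -> CD 12 34
    const auto mp = nlohmann::json::to_msgpack(nlohmann::json(0x1234));
    // BSON: multi-byte integers are little-endian -> the value 5 appears as 05 00 00 00
    const auto bs = nlohmann::json::to_bson(nlohmann::json{{"v", 5}});
}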
+ */ + template + void write_number(const NumberType n) + { + // step 1: write number to array of length NumberType + std::array vec{}; + std::memcpy(vec.data(), &n, sizeof(NumberType)); + + // step 2: write array to output (with possible reordering) + if (is_little_endian != OutputIsLittleEndian) + { + // reverse byte order prior to conversion if necessary + std::reverse(vec.begin(), vec.end()); + } + + oa->write_characters(vec.data(), sizeof(NumberType)); + } + + void write_compact_float(const number_float_t n, detail::input_format_t format) + { + if (static_cast(n) >= static_cast(std::numeric_limits::lowest()) && + static_cast(n) <= static_cast((std::numeric_limits::max)()) && + static_cast(static_cast(n)) == static_cast(n)) + { + oa->write_character(format == detail::input_format_t::cbor + ? get_cbor_float_prefix(static_cast(n)) + : get_msgpack_float_prefix(static_cast(n))); + write_number(static_cast(n)); + } + else + { + oa->write_character(format == detail::input_format_t::cbor + ? get_cbor_float_prefix(n) + : get_msgpack_float_prefix(n)); + write_number(n); + } + } + + public: + // The following to_char_type functions are implement the conversion + // between uint8_t and CharType. In case CharType is not unsigned, + // such a conversion is required to allow values greater than 128. + // See for a discussion. + template < typename C = CharType, + enable_if_t < std::is_signed::value && std::is_signed::value > * = nullptr > + static constexpr CharType to_char_type(std::uint8_t x) noexcept + { + return *reinterpret_cast(&x); + } + + template < typename C = CharType, + enable_if_t < std::is_signed::value && std::is_unsigned::value > * = nullptr > + static CharType to_char_type(std::uint8_t x) noexcept + { + static_assert(sizeof(std::uint8_t) == sizeof(CharType), "size of CharType must be equal to std::uint8_t"); + static_assert(std::is_trivial::value, "CharType must be trivial"); + CharType result; + std::memcpy(&result, &x, sizeof(x)); + return result; + } + + template::value>* = nullptr> + static constexpr CharType to_char_type(std::uint8_t x) noexcept + { + return x; + } + + template < typename InputCharType, typename C = CharType, + enable_if_t < + std::is_signed::value && + std::is_signed::value && + std::is_same::type>::value + > * = nullptr > + static constexpr CharType to_char_type(InputCharType x) noexcept + { + return x; + } + + private: + /// whether we can assume little endianess + const bool is_little_endian = little_endianess(); + + /// the output + output_adapter_t oa = nullptr; +}; +} // namespace detail +} // namespace nlohmann + +// #include + +// #include + + +#include // reverse, remove, fill, find, none_of +#include // array +#include // localeconv, lconv +#include // labs, isfinite, isnan, signbit +#include // size_t, ptrdiff_t +#include // uint8_t +#include // snprintf +#include // numeric_limits +#include // string, char_traits +#include // is_same +#include // move + +// #include + + +#include // array +#include // signbit, isfinite +#include // intN_t, uintN_t +#include // memcpy, memmove +#include // numeric_limits +#include // conditional + +// #include + + +namespace nlohmann +{ +namespace detail +{ + +/*! +@brief implements the Grisu2 algorithm for binary to decimal floating-point +conversion. + +This implementation is a slightly modified version of the reference +implementation which may be obtained from +http://florian.loitsch.com/publications (bench.tar.gz). + +The code is distributed under the MIT license, Copyright (c) 2009 Florian Loitsch. 
+ +For a detailed description of the algorithm see: + +[1] Loitsch, "Printing Floating-Point Numbers Quickly and Accurately with + Integers", Proceedings of the ACM SIGPLAN 2010 Conference on Programming + Language Design and Implementation, PLDI 2010 +[2] Burger, Dybvig, "Printing Floating-Point Numbers Quickly and Accurately", + Proceedings of the ACM SIGPLAN 1996 Conference on Programming Language + Design and Implementation, PLDI 1996 +*/ +namespace dtoa_impl +{ + +template +Target reinterpret_bits(const Source source) +{ + static_assert(sizeof(Target) == sizeof(Source), "size mismatch"); + + Target target; + std::memcpy(&target, &source, sizeof(Source)); + return target; +} + +struct diyfp // f * 2^e +{ + static constexpr int kPrecision = 64; // = q + + std::uint64_t f = 0; + int e = 0; + + constexpr diyfp(std::uint64_t f_, int e_) noexcept : f(f_), e(e_) {} + + /*! + @brief returns x - y + @pre x.e == y.e and x.f >= y.f + */ + static diyfp sub(const diyfp& x, const diyfp& y) noexcept + { + JSON_ASSERT(x.e == y.e); + JSON_ASSERT(x.f >= y.f); + + return {x.f - y.f, x.e}; + } + + /*! + @brief returns x * y + @note The result is rounded. (Only the upper q bits are returned.) + */ + static diyfp mul(const diyfp& x, const diyfp& y) noexcept + { + static_assert(kPrecision == 64, "internal error"); + + // Computes: + // f = round((x.f * y.f) / 2^q) + // e = x.e + y.e + q + + // Emulate the 64-bit * 64-bit multiplication: + // + // p = u * v + // = (u_lo + 2^32 u_hi) (v_lo + 2^32 v_hi) + // = (u_lo v_lo ) + 2^32 ((u_lo v_hi ) + (u_hi v_lo )) + 2^64 (u_hi v_hi ) + // = (p0 ) + 2^32 ((p1 ) + (p2 )) + 2^64 (p3 ) + // = (p0_lo + 2^32 p0_hi) + 2^32 ((p1_lo + 2^32 p1_hi) + (p2_lo + 2^32 p2_hi)) + 2^64 (p3 ) + // = (p0_lo ) + 2^32 (p0_hi + p1_lo + p2_lo ) + 2^64 (p1_hi + p2_hi + p3) + // = (p0_lo ) + 2^32 (Q ) + 2^64 (H ) + // = (p0_lo ) + 2^32 (Q_lo + 2^32 Q_hi ) + 2^64 (H ) + // + // (Since Q might be larger than 2^32 - 1) + // + // = (p0_lo + 2^32 Q_lo) + 2^64 (Q_hi + H) + // + // (Q_hi + H does not overflow a 64-bit int) + // + // = p_lo + 2^64 p_hi + + const std::uint64_t u_lo = x.f & 0xFFFFFFFFu; + const std::uint64_t u_hi = x.f >> 32u; + const std::uint64_t v_lo = y.f & 0xFFFFFFFFu; + const std::uint64_t v_hi = y.f >> 32u; + + const std::uint64_t p0 = u_lo * v_lo; + const std::uint64_t p1 = u_lo * v_hi; + const std::uint64_t p2 = u_hi * v_lo; + const std::uint64_t p3 = u_hi * v_hi; + + const std::uint64_t p0_hi = p0 >> 32u; + const std::uint64_t p1_lo = p1 & 0xFFFFFFFFu; + const std::uint64_t p1_hi = p1 >> 32u; + const std::uint64_t p2_lo = p2 & 0xFFFFFFFFu; + const std::uint64_t p2_hi = p2 >> 32u; + + std::uint64_t Q = p0_hi + p1_lo + p2_lo; + + // The full product might now be computed as + // + // p_hi = p3 + p2_hi + p1_hi + (Q >> 32) + // p_lo = p0_lo + (Q << 32) + // + // But in this particular case here, the full p_lo is not required. + // Effectively we only need to add the highest bit in p_lo to p_hi (and + // Q_hi + 1 does not overflow). + + Q += std::uint64_t{1} << (64u - 32u - 1u); // round, ties up + + const std::uint64_t h = p3 + p2_hi + p1_hi + (Q >> 32u); + + return {h, x.e + y.e + 64}; + } + + /*! + @brief normalize x such that the significand is >= 2^(q-1) + @pre x.f != 0 + */ + static diyfp normalize(diyfp x) noexcept + { + JSON_ASSERT(x.f != 0); + + while ((x.f >> 63u) == 0) + { + x.f <<= 1u; + x.e--; + } + + return x; + } + + /*! + @brief normalize x such that the result has the exponent E + @pre e >= x.e and the upper e - x.e bits of x.f must be zero. 
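diyfp::mul above emulates a 64x64 -> 128-bit multiplication in portable C++ and keeps only the rounded upper 64 bits (ties rounded up). On compilers that provide unsigned __int128 (an assumption; the library itself does not rely on it), the same value can be computed directly, which makes a convenient cross-check:

#include <cstdint>

// Rounded high half of a 64x64-bit product; this equals diyfp::mul(x, y).f
// for x = {f=a, e=ea}, y = {f=b, e=eb} (the exponents simply add: ea + eb + 64).
std::uint64_t mul_hi_rounded(const std::uint64_t a, const std::uint64_t b)
{
    const unsigned __int128 p = static_cast<unsigned __int128>(a) * b;
    return static_cast<std::uint64_t>((p + (static_cast<unsigned __int128>(1) << 63)) >> 64);
}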
+ */ + static diyfp normalize_to(const diyfp& x, const int target_exponent) noexcept + { + const int delta = x.e - target_exponent; + + JSON_ASSERT(delta >= 0); + JSON_ASSERT(((x.f << delta) >> delta) == x.f); + + return {x.f << delta, target_exponent}; + } +}; + +struct boundaries +{ + diyfp w; + diyfp minus; + diyfp plus; +}; + +/*! +Compute the (normalized) diyfp representing the input number 'value' and its +boundaries. + +@pre value must be finite and positive +*/ +template +boundaries compute_boundaries(FloatType value) +{ + JSON_ASSERT(std::isfinite(value)); + JSON_ASSERT(value > 0); + + // Convert the IEEE representation into a diyfp. + // + // If v is denormal: + // value = 0.F * 2^(1 - bias) = ( F) * 2^(1 - bias - (p-1)) + // If v is normalized: + // value = 1.F * 2^(E - bias) = (2^(p-1) + F) * 2^(E - bias - (p-1)) + + static_assert(std::numeric_limits::is_iec559, + "internal error: dtoa_short requires an IEEE-754 floating-point implementation"); + + constexpr int kPrecision = std::numeric_limits::digits; // = p (includes the hidden bit) + constexpr int kBias = std::numeric_limits::max_exponent - 1 + (kPrecision - 1); + constexpr int kMinExp = 1 - kBias; + constexpr std::uint64_t kHiddenBit = std::uint64_t{1} << (kPrecision - 1); // = 2^(p-1) + + using bits_type = typename std::conditional::type; + + const auto bits = static_cast(reinterpret_bits(value)); + const std::uint64_t E = bits >> (kPrecision - 1); + const std::uint64_t F = bits & (kHiddenBit - 1); + + const bool is_denormal = E == 0; + const diyfp v = is_denormal + ? diyfp(F, kMinExp) + : diyfp(F + kHiddenBit, static_cast(E) - kBias); + + // Compute the boundaries m- and m+ of the floating-point value + // v = f * 2^e. + // + // Determine v- and v+, the floating-point predecessor and successor if v, + // respectively. + // + // v- = v - 2^e if f != 2^(p-1) or e == e_min (A) + // = v - 2^(e-1) if f == 2^(p-1) and e > e_min (B) + // + // v+ = v + 2^e + // + // Let m- = (v- + v) / 2 and m+ = (v + v+) / 2. All real numbers _strictly_ + // between m- and m+ round to v, regardless of how the input rounding + // algorithm breaks ties. + // + // ---+-------------+-------------+-------------+-------------+--- (A) + // v- m- v m+ v+ + // + // -----------------+------+------+-------------+-------------+--- (B) + // v- m- v m+ v+ + + const bool lower_boundary_is_closer = F == 0 && E > 1; + const diyfp m_plus = diyfp(2 * v.f + 1, v.e - 1); + const diyfp m_minus = lower_boundary_is_closer + ? diyfp(4 * v.f - 1, v.e - 2) // (B) + : diyfp(2 * v.f - 1, v.e - 1); // (A) + + // Determine the normalized w+ = m+. + const diyfp w_plus = diyfp::normalize(m_plus); + + // Determine w- = m- such that e_(w-) = e_(w+). + const diyfp w_minus = diyfp::normalize_to(m_minus, w_plus.e); + + return {diyfp::normalize(v), w_minus, w_plus}; +} + +// Given normalized diyfp w, Grisu needs to find a (normalized) cached +// power-of-ten c, such that the exponent of the product c * w = f * 2^e lies +// within a certain range [alpha, gamma] (Definition 3.2 from [1]) +// +// alpha <= e = e_c + e_w + q <= gamma +// +// or +// +// f_c * f_w * 2^alpha <= f_c 2^(e_c) * f_w 2^(e_w) * 2^q +// <= f_c * f_w * 2^gamma +// +// Since c and w are normalized, i.e. 2^(q-1) <= f < 2^q, this implies +// +// 2^(q-1) * 2^(q-1) * 2^alpha <= c * w * 2^q < 2^q * 2^q * 2^gamma +// +// or +// +// 2^(q - 2 + alpha) <= c * w < 2^(q + gamma) +// +// The choice of (alpha,gamma) determines the size of the table and the form of +// the digit generation procedure. 
Using (alpha,gamma)=(-60,-32) works out well +// in practice: +// +// The idea is to cut the number c * w = f * 2^e into two parts, which can be +// processed independently: An integral part p1, and a fractional part p2: +// +// f * 2^e = ( (f div 2^-e) * 2^-e + (f mod 2^-e) ) * 2^e +// = (f div 2^-e) + (f mod 2^-e) * 2^e +// = p1 + p2 * 2^e +// +// The conversion of p1 into decimal form requires a series of divisions and +// modulos by (a power of) 10. These operations are faster for 32-bit than for +// 64-bit integers, so p1 should ideally fit into a 32-bit integer. This can be +// achieved by choosing +// +// -e >= 32 or e <= -32 := gamma +// +// In order to convert the fractional part +// +// p2 * 2^e = p2 / 2^-e = d[-1] / 10^1 + d[-2] / 10^2 + ... +// +// into decimal form, the fraction is repeatedly multiplied by 10 and the digits +// d[-i] are extracted in order: +// +// (10 * p2) div 2^-e = d[-1] +// (10 * p2) mod 2^-e = d[-2] / 10^1 + ... +// +// The multiplication by 10 must not overflow. It is sufficient to choose +// +// 10 * p2 < 16 * p2 = 2^4 * p2 <= 2^64. +// +// Since p2 = f mod 2^-e < 2^-e, +// +// -e <= 60 or e >= -60 := alpha + +constexpr int kAlpha = -60; +constexpr int kGamma = -32; + +struct cached_power // c = f * 2^e ~= 10^k +{ + std::uint64_t f; + int e; + int k; +}; + +/*! +For a normalized diyfp w = f * 2^e, this function returns a (normalized) cached +power-of-ten c = f_c * 2^e_c, such that the exponent of the product w * c +satisfies (Definition 3.2 from [1]) + + alpha <= e_c + e + q <= gamma. +*/ +inline cached_power get_cached_power_for_binary_exponent(int e) +{ + // Now + // + // alpha <= e_c + e + q <= gamma (1) + // ==> f_c * 2^alpha <= c * 2^e * 2^q + // + // and since the c's are normalized, 2^(q-1) <= f_c, + // + // ==> 2^(q - 1 + alpha) <= c * 2^(e + q) + // ==> 2^(alpha - e - 1) <= c + // + // If c were an exact power of ten, i.e. c = 10^k, one may determine k as + // + // k = ceil( log_10( 2^(alpha - e - 1) ) ) + // = ceil( (alpha - e - 1) * log_10(2) ) + // + // From the paper: + // "In theory the result of the procedure could be wrong since c is rounded, + // and the computation itself is approximated [...]. In practice, however, + // this simple function is sufficient." + // + // For IEEE double precision floating-point numbers converted into + // normalized diyfp's w = f * 2^e, with q = 64, + // + // e >= -1022 (min IEEE exponent) + // -52 (p - 1) + // -52 (p - 1, possibly normalize denormal IEEE numbers) + // -11 (normalize the diyfp) + // = -1137 + // + // and + // + // e <= +1023 (max IEEE exponent) + // -52 (p - 1) + // -11 (normalize the diyfp) + // = 960 + // + // This binary exponent range [-1137,960] results in a decimal exponent + // range [-307,324]. One does not need to store a cached power for each + // k in this range. For each such k it suffices to find a cached power + // such that the exponent of the product lies in [alpha,gamma]. + // This implies that the difference of the decimal exponents of adjacent + // table entries must be less than or equal to + // + // floor( (gamma - alpha) * log_10(2) ) = 8. + // + // (A smaller distance gamma-alpha would require a larger table.) + + // NB: + // Actually this function returns c, such that -60 <= e_c + e + 64 <= -34. 
+ + constexpr int kCachedPowersMinDecExp = -300; + constexpr int kCachedPowersDecStep = 8; + + static constexpr std::array kCachedPowers = + { + { + { 0xAB70FE17C79AC6CA, -1060, -300 }, + { 0xFF77B1FCBEBCDC4F, -1034, -292 }, + { 0xBE5691EF416BD60C, -1007, -284 }, + { 0x8DD01FAD907FFC3C, -980, -276 }, + { 0xD3515C2831559A83, -954, -268 }, + { 0x9D71AC8FADA6C9B5, -927, -260 }, + { 0xEA9C227723EE8BCB, -901, -252 }, + { 0xAECC49914078536D, -874, -244 }, + { 0x823C12795DB6CE57, -847, -236 }, + { 0xC21094364DFB5637, -821, -228 }, + { 0x9096EA6F3848984F, -794, -220 }, + { 0xD77485CB25823AC7, -768, -212 }, + { 0xA086CFCD97BF97F4, -741, -204 }, + { 0xEF340A98172AACE5, -715, -196 }, + { 0xB23867FB2A35B28E, -688, -188 }, + { 0x84C8D4DFD2C63F3B, -661, -180 }, + { 0xC5DD44271AD3CDBA, -635, -172 }, + { 0x936B9FCEBB25C996, -608, -164 }, + { 0xDBAC6C247D62A584, -582, -156 }, + { 0xA3AB66580D5FDAF6, -555, -148 }, + { 0xF3E2F893DEC3F126, -529, -140 }, + { 0xB5B5ADA8AAFF80B8, -502, -132 }, + { 0x87625F056C7C4A8B, -475, -124 }, + { 0xC9BCFF6034C13053, -449, -116 }, + { 0x964E858C91BA2655, -422, -108 }, + { 0xDFF9772470297EBD, -396, -100 }, + { 0xA6DFBD9FB8E5B88F, -369, -92 }, + { 0xF8A95FCF88747D94, -343, -84 }, + { 0xB94470938FA89BCF, -316, -76 }, + { 0x8A08F0F8BF0F156B, -289, -68 }, + { 0xCDB02555653131B6, -263, -60 }, + { 0x993FE2C6D07B7FAC, -236, -52 }, + { 0xE45C10C42A2B3B06, -210, -44 }, + { 0xAA242499697392D3, -183, -36 }, + { 0xFD87B5F28300CA0E, -157, -28 }, + { 0xBCE5086492111AEB, -130, -20 }, + { 0x8CBCCC096F5088CC, -103, -12 }, + { 0xD1B71758E219652C, -77, -4 }, + { 0x9C40000000000000, -50, 4 }, + { 0xE8D4A51000000000, -24, 12 }, + { 0xAD78EBC5AC620000, 3, 20 }, + { 0x813F3978F8940984, 30, 28 }, + { 0xC097CE7BC90715B3, 56, 36 }, + { 0x8F7E32CE7BEA5C70, 83, 44 }, + { 0xD5D238A4ABE98068, 109, 52 }, + { 0x9F4F2726179A2245, 136, 60 }, + { 0xED63A231D4C4FB27, 162, 68 }, + { 0xB0DE65388CC8ADA8, 189, 76 }, + { 0x83C7088E1AAB65DB, 216, 84 }, + { 0xC45D1DF942711D9A, 242, 92 }, + { 0x924D692CA61BE758, 269, 100 }, + { 0xDA01EE641A708DEA, 295, 108 }, + { 0xA26DA3999AEF774A, 322, 116 }, + { 0xF209787BB47D6B85, 348, 124 }, + { 0xB454E4A179DD1877, 375, 132 }, + { 0x865B86925B9BC5C2, 402, 140 }, + { 0xC83553C5C8965D3D, 428, 148 }, + { 0x952AB45CFA97A0B3, 455, 156 }, + { 0xDE469FBD99A05FE3, 481, 164 }, + { 0xA59BC234DB398C25, 508, 172 }, + { 0xF6C69A72A3989F5C, 534, 180 }, + { 0xB7DCBF5354E9BECE, 561, 188 }, + { 0x88FCF317F22241E2, 588, 196 }, + { 0xCC20CE9BD35C78A5, 614, 204 }, + { 0x98165AF37B2153DF, 641, 212 }, + { 0xE2A0B5DC971F303A, 667, 220 }, + { 0xA8D9D1535CE3B396, 694, 228 }, + { 0xFB9B7CD9A4A7443C, 720, 236 }, + { 0xBB764C4CA7A44410, 747, 244 }, + { 0x8BAB8EEFB6409C1A, 774, 252 }, + { 0xD01FEF10A657842C, 800, 260 }, + { 0x9B10A4E5E9913129, 827, 268 }, + { 0xE7109BFBA19C0C9D, 853, 276 }, + { 0xAC2820D9623BF429, 880, 284 }, + { 0x80444B5E7AA7CF85, 907, 292 }, + { 0xBF21E44003ACDD2D, 933, 300 }, + { 0x8E679C2F5E44FF8F, 960, 308 }, + { 0xD433179D9C8CB841, 986, 316 }, + { 0x9E19DB92B4E31BA9, 1013, 324 }, + } + }; + + // This computation gives exactly the same results for k as + // k = ceil((kAlpha - e - 1) * 0.30102999566398114) + // for |e| <= 1500, but doesn't require floating-point operations. 
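    // A hand-worked trace (not from the library's tests) for e = -63, the
    // normalized binary exponent of 1.0:
    //   f     = kAlpha - e - 1 = -60 + 63 - 1 = 2
    //   k     = (2 * 78913) / 2^18 + 1 = 1
    //   index = (300 + 1 + 7) / 8 = 38  ->  { 0x9C40000000000000, -50, 4 }  (~10^4)
    //   check: kAlpha <= -50 + (-63) + 64 = -49 <= kGamma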
+ // NB: log_10(2) ~= 78913 / 2^18 + JSON_ASSERT(e >= -1500); + JSON_ASSERT(e <= 1500); + const int f = kAlpha - e - 1; + const int k = (f * 78913) / (1 << 18) + static_cast(f > 0); + + const int index = (-kCachedPowersMinDecExp + k + (kCachedPowersDecStep - 1)) / kCachedPowersDecStep; + JSON_ASSERT(index >= 0); + JSON_ASSERT(static_cast(index) < kCachedPowers.size()); + + const cached_power cached = kCachedPowers[static_cast(index)]; + JSON_ASSERT(kAlpha <= cached.e + e + 64); + JSON_ASSERT(kGamma >= cached.e + e + 64); + + return cached; +} + +/*! +For n != 0, returns k, such that pow10 := 10^(k-1) <= n < 10^k. +For n == 0, returns 1 and sets pow10 := 1. +*/ +inline int find_largest_pow10(const std::uint32_t n, std::uint32_t& pow10) +{ + // LCOV_EXCL_START + if (n >= 1000000000) + { + pow10 = 1000000000; + return 10; + } + // LCOV_EXCL_STOP + if (n >= 100000000) + { + pow10 = 100000000; + return 9; + } + if (n >= 10000000) + { + pow10 = 10000000; + return 8; + } + if (n >= 1000000) + { + pow10 = 1000000; + return 7; + } + if (n >= 100000) + { + pow10 = 100000; + return 6; + } + if (n >= 10000) + { + pow10 = 10000; + return 5; + } + if (n >= 1000) + { + pow10 = 1000; + return 4; + } + if (n >= 100) + { + pow10 = 100; + return 3; + } + if (n >= 10) + { + pow10 = 10; + return 2; + } + + pow10 = 1; + return 1; +} + +inline void grisu2_round(char* buf, int len, std::uint64_t dist, std::uint64_t delta, + std::uint64_t rest, std::uint64_t ten_k) +{ + JSON_ASSERT(len >= 1); + JSON_ASSERT(dist <= delta); + JSON_ASSERT(rest <= delta); + JSON_ASSERT(ten_k > 0); + + // <--------------------------- delta ----> + // <---- dist ---------> + // --------------[------------------+-------------------]-------------- + // M- w M+ + // + // ten_k + // <------> + // <---- rest ----> + // --------------[------------------+----+--------------]-------------- + // w V + // = buf * 10^k + // + // ten_k represents a unit-in-the-last-place in the decimal representation + // stored in buf. + // Decrement buf by ten_k while this takes buf closer to w. + + // The tests are written in this order to avoid overflow in unsigned + // integer arithmetic. + + while (rest < dist + && delta - rest >= ten_k + && (rest + ten_k < dist || dist - rest > rest + ten_k - dist)) + { + JSON_ASSERT(buf[len - 1] != '0'); + buf[len - 1]--; + rest += ten_k; + } +} + +/*! +Generates V = buffer * 10^decimal_exponent, such that M- <= V <= M+. +M- and M+ must be normalized and share the same exponent -60 <= e <= -32. +*/ +inline void grisu2_digit_gen(char* buffer, int& length, int& decimal_exponent, + diyfp M_minus, diyfp w, diyfp M_plus) +{ + static_assert(kAlpha >= -60, "internal error"); + static_assert(kGamma <= -32, "internal error"); + + // Generates the digits (and the exponent) of a decimal floating-point + // number V = buffer * 10^decimal_exponent in the range [M-, M+]. The diyfp's + // w, M- and M+ share the same exponent e, which satisfies alpha <= e <= gamma. + // + // <--------------------------- delta ----> + // <---- dist ---------> + // --------------[------------------+-------------------]-------------- + // M- w M+ + // + // Grisu2 generates the digits of M+ from left to right and stops as soon as + // V is in [M-,M+]. 
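For intuition about the integral part, here is a stripped-down sketch (not the library's code) of the div/mod digit loop, without the early-stop and rounding logic that the real loop layers on top: for p1 = 4321, find_largest_pow10 yields k = 4 and pow10 = 1000, and the digits '4','3','2','1' are peeled off most-significant first.

#include <cstdint>
#include <string>

std::string integral_digits(std::uint32_t p1)    // e.g. 4321 -> "4321"
{
    std::uint32_t pow10 = 1;
    while (pow10 <= p1 / 10)                     // pow10 = 10^(k-1) <= p1
    {
        pow10 *= 10;
    }
    std::string out;
    while (pow10 > 0)
    {
        out += static_cast<char>('0' + p1 / pow10);  // next most-significant digit
        p1 %= pow10;
        pow10 /= 10;
    }
    return out;
}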
+ + JSON_ASSERT(M_plus.e >= kAlpha); + JSON_ASSERT(M_plus.e <= kGamma); + + std::uint64_t delta = diyfp::sub(M_plus, M_minus).f; // (significand of (M+ - M-), implicit exponent is e) + std::uint64_t dist = diyfp::sub(M_plus, w ).f; // (significand of (M+ - w ), implicit exponent is e) + + // Split M+ = f * 2^e into two parts p1 and p2 (note: e < 0): + // + // M+ = f * 2^e + // = ((f div 2^-e) * 2^-e + (f mod 2^-e)) * 2^e + // = ((p1 ) * 2^-e + (p2 )) * 2^e + // = p1 + p2 * 2^e + + const diyfp one(std::uint64_t{1} << -M_plus.e, M_plus.e); + + auto p1 = static_cast(M_plus.f >> -one.e); // p1 = f div 2^-e (Since -e >= 32, p1 fits into a 32-bit int.) + std::uint64_t p2 = M_plus.f & (one.f - 1); // p2 = f mod 2^-e + + // 1) + // + // Generate the digits of the integral part p1 = d[n-1]...d[1]d[0] + + JSON_ASSERT(p1 > 0); + + std::uint32_t pow10{}; + const int k = find_largest_pow10(p1, pow10); + + // 10^(k-1) <= p1 < 10^k, pow10 = 10^(k-1) + // + // p1 = (p1 div 10^(k-1)) * 10^(k-1) + (p1 mod 10^(k-1)) + // = (d[k-1] ) * 10^(k-1) + (p1 mod 10^(k-1)) + // + // M+ = p1 + p2 * 2^e + // = d[k-1] * 10^(k-1) + (p1 mod 10^(k-1)) + p2 * 2^e + // = d[k-1] * 10^(k-1) + ((p1 mod 10^(k-1)) * 2^-e + p2) * 2^e + // = d[k-1] * 10^(k-1) + ( rest) * 2^e + // + // Now generate the digits d[n] of p1 from left to right (n = k-1,...,0) + // + // p1 = d[k-1]...d[n] * 10^n + d[n-1]...d[0] + // + // but stop as soon as + // + // rest * 2^e = (d[n-1]...d[0] * 2^-e + p2) * 2^e <= delta * 2^e + + int n = k; + while (n > 0) + { + // Invariants: + // M+ = buffer * 10^n + (p1 + p2 * 2^e) (buffer = 0 for n = k) + // pow10 = 10^(n-1) <= p1 < 10^n + // + const std::uint32_t d = p1 / pow10; // d = p1 div 10^(n-1) + const std::uint32_t r = p1 % pow10; // r = p1 mod 10^(n-1) + // + // M+ = buffer * 10^n + (d * 10^(n-1) + r) + p2 * 2^e + // = (buffer * 10 + d) * 10^(n-1) + (r + p2 * 2^e) + // + JSON_ASSERT(d <= 9); + buffer[length++] = static_cast('0' + d); // buffer := buffer * 10 + d + // + // M+ = buffer * 10^(n-1) + (r + p2 * 2^e) + // + p1 = r; + n--; + // + // M+ = buffer * 10^n + (p1 + p2 * 2^e) + // pow10 = 10^n + // + + // Now check if enough digits have been generated. + // Compute + // + // p1 + p2 * 2^e = (p1 * 2^-e + p2) * 2^e = rest * 2^e + // + // Note: + // Since rest and delta share the same exponent e, it suffices to + // compare the significands. + const std::uint64_t rest = (std::uint64_t{p1} << -one.e) + p2; + if (rest <= delta) + { + // V = buffer * 10^n, with M- <= V <= M+. + + decimal_exponent += n; + + // We may now just stop. But instead look if the buffer could be + // decremented to bring V closer to w. + // + // pow10 = 10^n is now 1 ulp in the decimal representation V. + // The rounding procedure works with diyfp's with an implicit + // exponent of e. + // + // 10^n = (10^n * 2^-e) * 2^e = ulp * 2^e + // + const std::uint64_t ten_n = std::uint64_t{pow10} << -one.e; + grisu2_round(buffer, length, dist, delta, rest, ten_n); + + return; + } + + pow10 /= 10; + // + // pow10 = 10^(n-1) <= p1 < 10^n + // Invariants restored. + } + + // 2) + // + // The digits of the integral part have been generated: + // + // M+ = d[k-1]...d[1]d[0] + p2 * 2^e + // = buffer + p2 * 2^e + // + // Now generate the digits of the fractional part p2 * 2^e. + // + // Note: + // No decimal point is generated: the exponent is adjusted instead. + // + // p2 actually represents the fraction + // + // p2 * 2^e + // = p2 / 2^-e + // = d[-1] / 10^1 + d[-2] / 10^2 + ... 
+ // + // Now generate the digits d[-m] of p1 from left to right (m = 1,2,...) + // + // p2 * 2^e = d[-1]d[-2]...d[-m] * 10^-m + // + 10^-m * (d[-m-1] / 10^1 + d[-m-2] / 10^2 + ...) + // + // using + // + // 10^m * p2 = ((10^m * p2) div 2^-e) * 2^-e + ((10^m * p2) mod 2^-e) + // = ( d) * 2^-e + ( r) + // + // or + // 10^m * p2 * 2^e = d + r * 2^e + // + // i.e. + // + // M+ = buffer + p2 * 2^e + // = buffer + 10^-m * (d + r * 2^e) + // = (buffer * 10^m + d) * 10^-m + 10^-m * r * 2^e + // + // and stop as soon as 10^-m * r * 2^e <= delta * 2^e + + JSON_ASSERT(p2 > delta); + + int m = 0; + for (;;) + { + // Invariant: + // M+ = buffer * 10^-m + 10^-m * (d[-m-1] / 10 + d[-m-2] / 10^2 + ...) * 2^e + // = buffer * 10^-m + 10^-m * (p2 ) * 2^e + // = buffer * 10^-m + 10^-m * (1/10 * (10 * p2) ) * 2^e + // = buffer * 10^-m + 10^-m * (1/10 * ((10*p2 div 2^-e) * 2^-e + (10*p2 mod 2^-e)) * 2^e + // + JSON_ASSERT(p2 <= (std::numeric_limits::max)() / 10); + p2 *= 10; + const std::uint64_t d = p2 >> -one.e; // d = (10 * p2) div 2^-e + const std::uint64_t r = p2 & (one.f - 1); // r = (10 * p2) mod 2^-e + // + // M+ = buffer * 10^-m + 10^-m * (1/10 * (d * 2^-e + r) * 2^e + // = buffer * 10^-m + 10^-m * (1/10 * (d + r * 2^e)) + // = (buffer * 10 + d) * 10^(-m-1) + 10^(-m-1) * r * 2^e + // + JSON_ASSERT(d <= 9); + buffer[length++] = static_cast('0' + d); // buffer := buffer * 10 + d + // + // M+ = buffer * 10^(-m-1) + 10^(-m-1) * r * 2^e + // + p2 = r; + m++; + // + // M+ = buffer * 10^-m + 10^-m * p2 * 2^e + // Invariant restored. + + // Check if enough digits have been generated. + // + // 10^-m * p2 * 2^e <= delta * 2^e + // p2 * 2^e <= 10^m * delta * 2^e + // p2 <= 10^m * delta + delta *= 10; + dist *= 10; + if (p2 <= delta) + { + break; + } + } + + // V = buffer * 10^-m, with M- <= V <= M+. + + decimal_exponent -= m; + + // 1 ulp in the decimal representation is now 10^-m. + // Since delta and dist are now scaled by 10^m, we need to do the + // same with ulp in order to keep the units in sync. + // + // 10^m * 10^-m = 1 = 2^-e * 2^e = ten_m * 2^e + // + const std::uint64_t ten_m = one.f; + grisu2_round(buffer, length, dist, delta, p2, ten_m); + + // By construction this algorithm generates the shortest possible decimal + // number (Loitsch, Theorem 6.2) which rounds back to w. + // For an input number of precision p, at least + // + // N = 1 + ceil(p * log_10(2)) + // + // decimal digits are sufficient to identify all binary floating-point + // numbers (Matula, "In-and-Out conversions"). + // This implies that the algorithm does not produce more than N decimal + // digits. + // + // N = 17 for p = 53 (IEEE double precision) + // N = 9 for p = 24 (IEEE single precision) +} + +/*! +v = buf * 10^decimal_exponent +len is the length of the buffer (number of decimal digits) +The buffer must be large enough, i.e. >= max_digits10. +*/ +JSON_HEDLEY_NON_NULL(1) +inline void grisu2(char* buf, int& len, int& decimal_exponent, + diyfp m_minus, diyfp v, diyfp m_plus) +{ + JSON_ASSERT(m_plus.e == m_minus.e); + JSON_ASSERT(m_plus.e == v.e); + + // --------(-----------------------+-----------------------)-------- (A) + // m- v m+ + // + // --------------------(-----------+-----------------------)-------- (B) + // m- v m+ + // + // First scale v (and m- and m+) such that the exponent is in the range + // [alpha, gamma]. 
+ + const cached_power cached = get_cached_power_for_binary_exponent(m_plus.e); + + const diyfp c_minus_k(cached.f, cached.e); // = c ~= 10^-k + + // The exponent of the products is = v.e + c_minus_k.e + q and is in the range [alpha,gamma] + const diyfp w = diyfp::mul(v, c_minus_k); + const diyfp w_minus = diyfp::mul(m_minus, c_minus_k); + const diyfp w_plus = diyfp::mul(m_plus, c_minus_k); + + // ----(---+---)---------------(---+---)---------------(---+---)---- + // w- w w+ + // = c*m- = c*v = c*m+ + // + // diyfp::mul rounds its result and c_minus_k is approximated too. w, w- and + // w+ are now off by a small amount. + // In fact: + // + // w - v * 10^k < 1 ulp + // + // To account for this inaccuracy, add resp. subtract 1 ulp. + // + // --------+---[---------------(---+---)---------------]---+-------- + // w- M- w M+ w+ + // + // Now any number in [M-, M+] (bounds included) will round to w when input, + // regardless of how the input rounding algorithm breaks ties. + // + // And digit_gen generates the shortest possible such number in [M-, M+]. + // Note that this does not mean that Grisu2 always generates the shortest + // possible number in the interval (m-, m+). + const diyfp M_minus(w_minus.f + 1, w_minus.e); + const diyfp M_plus (w_plus.f - 1, w_plus.e ); + + decimal_exponent = -cached.k; // = -(-k) = k + + grisu2_digit_gen(buf, len, decimal_exponent, M_minus, w, M_plus); +} + +/*! +v = buf * 10^decimal_exponent +len is the length of the buffer (number of decimal digits) +The buffer must be large enough, i.e. >= max_digits10. +*/ +template +JSON_HEDLEY_NON_NULL(1) +void grisu2(char* buf, int& len, int& decimal_exponent, FloatType value) +{ + static_assert(diyfp::kPrecision >= std::numeric_limits::digits + 3, + "internal error: not enough precision"); + + JSON_ASSERT(std::isfinite(value)); + JSON_ASSERT(value > 0); + + // If the neighbors (and boundaries) of 'value' are always computed for double-precision + // numbers, all float's can be recovered using strtod (and strtof). However, the resulting + // decimal representations are not exactly "short". + // + // The documentation for 'std::to_chars' (https://en.cppreference.com/w/cpp/utility/to_chars) + // says "value is converted to a string as if by std::sprintf in the default ("C") locale" + // and since sprintf promotes float's to double's, I think this is exactly what 'std::to_chars' + // does. + // On the other hand, the documentation for 'std::to_chars' requires that "parsing the + // representation using the corresponding std::from_chars function recovers value exactly". That + // indicates that single precision floating-point numbers should be recovered using + // 'std::strtof'. + // + // NB: If the neighbors are computed for single-precision numbers, there is a single float + // (7.0385307e-26f) which can't be recovered using strtod. The resulting double precision + // value is off by 1 ulp. +#if 0 + const boundaries w = compute_boundaries(static_cast(value)); +#else + const boundaries w = compute_boundaries(value); +#endif + + grisu2(buf, len, decimal_exponent, w.minus, w.w, w.plus); +} + +/*! +@brief appends a decimal representation of e to buf +@return a pointer to the element following the exponent. 
+@pre -1000 < e < 1000 +*/ +JSON_HEDLEY_NON_NULL(1) +JSON_HEDLEY_RETURNS_NON_NULL +inline char* append_exponent(char* buf, int e) +{ + JSON_ASSERT(e > -1000); + JSON_ASSERT(e < 1000); + + if (e < 0) + { + e = -e; + *buf++ = '-'; + } + else + { + *buf++ = '+'; + } + + auto k = static_cast(e); + if (k < 10) + { + // Always print at least two digits in the exponent. + // This is for compatibility with printf("%g"). + *buf++ = '0'; + *buf++ = static_cast('0' + k); + } + else if (k < 100) + { + *buf++ = static_cast('0' + k / 10); + k %= 10; + *buf++ = static_cast('0' + k); + } + else + { + *buf++ = static_cast('0' + k / 100); + k %= 100; + *buf++ = static_cast('0' + k / 10); + k %= 10; + *buf++ = static_cast('0' + k); + } + + return buf; +} + +/*! +@brief prettify v = buf * 10^decimal_exponent + +If v is in the range [10^min_exp, 10^max_exp) it will be printed in fixed-point +notation. Otherwise it will be printed in exponential notation. + +@pre min_exp < 0 +@pre max_exp > 0 +*/ +JSON_HEDLEY_NON_NULL(1) +JSON_HEDLEY_RETURNS_NON_NULL +inline char* format_buffer(char* buf, int len, int decimal_exponent, + int min_exp, int max_exp) +{ + JSON_ASSERT(min_exp < 0); + JSON_ASSERT(max_exp > 0); + + const int k = len; + const int n = len + decimal_exponent; + + // v = buf * 10^(n-k) + // k is the length of the buffer (number of decimal digits) + // n is the position of the decimal point relative to the start of the buffer. + + if (k <= n && n <= max_exp) + { + // digits[000] + // len <= max_exp + 2 + + std::memset(buf + k, '0', static_cast(n) - static_cast(k)); + // Make it look like a floating-point number (#362, #378) + buf[n + 0] = '.'; + buf[n + 1] = '0'; + return buf + (static_cast(n) + 2); + } + + if (0 < n && n <= max_exp) + { + // dig.its + // len <= max_digits10 + 1 + + JSON_ASSERT(k > n); + + std::memmove(buf + (static_cast(n) + 1), buf + n, static_cast(k) - static_cast(n)); + buf[n] = '.'; + return buf + (static_cast(k) + 1U); + } + + if (min_exp < n && n <= 0) + { + // 0.[000]digits + // len <= 2 + (-min_exp - 1) + max_digits10 + + std::memmove(buf + (2 + static_cast(-n)), buf, static_cast(k)); + buf[0] = '0'; + buf[1] = '.'; + std::memset(buf + 2, '0', static_cast(-n)); + return buf + (2U + static_cast(-n) + static_cast(k)); + } + + if (k == 1) + { + // dE+123 + // len <= 1 + 5 + + buf += 1; + } + else + { + // d.igitsE+123 + // len <= max_digits10 + 1 + 5 + + std::memmove(buf + 2, buf + 1, static_cast(k) - 1); + buf[1] = '.'; + buf += 1 + static_cast(k); + } + + *buf++ = 'e'; + return append_exponent(buf, n - 1); +} + +} // namespace dtoa_impl + +/*! +@brief generates a decimal representation of the floating-point number value in [first, last). + +The format of the resulting decimal representation is similar to printf's %g +format. Returns an iterator pointing past-the-end of the decimal representation. + +@note The input number must be finite, i.e. NaN's and Inf's are not supported. +@note The buffer must be large enough. +@note The result is NOT null-terminated. +*/ +template +JSON_HEDLEY_NON_NULL(1, 2) +JSON_HEDLEY_RETURNS_NON_NULL +char* to_chars(char* first, const char* last, FloatType value) +{ + static_cast(last); // maybe unused - fix warning + JSON_ASSERT(std::isfinite(value)); + + // Use signbit(value) instead of (value < 0) since signbit works for -0. 
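    // Hand-computed examples of the four output shapes produced by
    // format_buffer (with kMinExp = -4 and kMaxExp = 15 as used for double),
    // all for the digit buffer "123" (len = 3):
    //   decimal_exponent = +2  ->  n =  5  ->  "12300.0"
    //   decimal_exponent = -1  ->  n =  2  ->  "12.3"
    //   decimal_exponent = -5  ->  n = -2  ->  "0.00123"
    //   decimal_exponent = -12 ->  n = -9  ->  "1.23e-10"
    // The exponent form always carries a sign and at least two digits
    // (append_exponent), matching printf("%g").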
+ if (std::signbit(value)) + { + value = -value; + *first++ = '-'; + } + + if (value == 0) // +-0 + { + *first++ = '0'; + // Make it look like a floating-point number (#362, #378) + *first++ = '.'; + *first++ = '0'; + return first; + } + + JSON_ASSERT(last - first >= std::numeric_limits::max_digits10); + + // Compute v = buffer * 10^decimal_exponent. + // The decimal digits are stored in the buffer, which needs to be interpreted + // as an unsigned decimal integer. + // len is the length of the buffer, i.e. the number of decimal digits. + int len = 0; + int decimal_exponent = 0; + dtoa_impl::grisu2(first, len, decimal_exponent, value); + + JSON_ASSERT(len <= std::numeric_limits::max_digits10); + + // Format the buffer like printf("%.*g", prec, value) + constexpr int kMinExp = -4; + // Use digits10 here to increase compatibility with version 2. + constexpr int kMaxExp = std::numeric_limits::digits10; + + JSON_ASSERT(last - first >= kMaxExp + 2); + JSON_ASSERT(last - first >= 2 + (-kMinExp - 1) + std::numeric_limits::max_digits10); + JSON_ASSERT(last - first >= std::numeric_limits::max_digits10 + 6); + + return dtoa_impl::format_buffer(first, len, decimal_exponent, kMinExp, kMaxExp); +} + +} // namespace detail +} // namespace nlohmann + +// #include + +// #include + +// #include + +// #include + +// #include + +// #include + + +namespace nlohmann +{ +namespace detail +{ +/////////////////// +// serialization // +/////////////////// + +/// how to treat decoding errors +enum class error_handler_t +{ + strict, ///< throw a type_error exception in case of invalid UTF-8 + replace, ///< replace invalid UTF-8 sequences with U+FFFD + ignore ///< ignore invalid UTF-8 sequences +}; + +template +class serializer +{ + using string_t = typename BasicJsonType::string_t; + using number_float_t = typename BasicJsonType::number_float_t; + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using binary_char_t = typename BasicJsonType::binary_t::value_type; + static constexpr std::uint8_t UTF8_ACCEPT = 0; + static constexpr std::uint8_t UTF8_REJECT = 1; + + public: + /*! + @param[in] s output stream to serialize to + @param[in] ichar indentation character to use + @param[in] error_handler_ how to react on decoding errors + */ + serializer(output_adapter_t s, const char ichar, + error_handler_t error_handler_ = error_handler_t::strict) + : o(std::move(s)) + , loc(std::localeconv()) + , thousands_sep(loc->thousands_sep == nullptr ? '\0' : std::char_traits::to_char_type(* (loc->thousands_sep))) + , decimal_point(loc->decimal_point == nullptr ? '\0' : std::char_traits::to_char_type(* (loc->decimal_point))) + , indent_char(ichar) + , indent_string(512, indent_char) + , error_handler(error_handler_) + {} + + // delete because of pointer members + serializer(const serializer&) = delete; + serializer& operator=(const serializer&) = delete; + serializer(serializer&&) = delete; + serializer& operator=(serializer&&) = delete; + ~serializer() = default; + + /*! + @brief internal implementation of the serialization function + + This function is called by the public member function dump and organizes + the serialization internally. The indentation level is propagated as + additional parameter. In case of arrays and objects, the function is + called recursively. 
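For orientation, this is the machinery behind the public dump() member function; an illustrative snippet using only documented parameters:

nlohmann::json j = {{"pi", 3.141}, {"happy", true}};
j.dump();               // {"happy":true,"pi":3.141}   (keys sorted by the default std::map)
j.dump(4);              // pretty-printed with a 4-space indent
j.dump(2, ' ', true);   // additionally escapes non-ASCII characters as \uXXXX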
+ + - strings and object keys are escaped using `escape_string()` + - integer numbers are converted implicitly via `operator<<` + - floating-point numbers are converted to a string using `"%g"` format + - binary values are serialized as objects containing the subtype and the + byte array + + @param[in] val value to serialize + @param[in] pretty_print whether the output shall be pretty-printed + @param[in] ensure_ascii If @a ensure_ascii is true, all non-ASCII characters + in the output are escaped with `\uXXXX` sequences, and the result consists + of ASCII characters only. + @param[in] indent_step the indent level + @param[in] current_indent the current indent level (only used internally) + */ + void dump(const BasicJsonType& val, + const bool pretty_print, + const bool ensure_ascii, + const unsigned int indent_step, + const unsigned int current_indent = 0) + { + switch (val.m_type) + { + case value_t::object: + { + if (val.m_value.object->empty()) + { + o->write_characters("{}", 2); + return; + } + + if (pretty_print) + { + o->write_characters("{\n", 2); + + // variable to hold indentation for recursive calls + const auto new_indent = current_indent + indent_step; + if (JSON_HEDLEY_UNLIKELY(indent_string.size() < new_indent)) + { + indent_string.resize(indent_string.size() * 2, ' '); + } + + // first n-1 elements + auto i = val.m_value.object->cbegin(); + for (std::size_t cnt = 0; cnt < val.m_value.object->size() - 1; ++cnt, ++i) + { + o->write_characters(indent_string.c_str(), new_indent); + o->write_character('\"'); + dump_escaped(i->first, ensure_ascii); + o->write_characters("\": ", 3); + dump(i->second, true, ensure_ascii, indent_step, new_indent); + o->write_characters(",\n", 2); + } + + // last element + JSON_ASSERT(i != val.m_value.object->cend()); + JSON_ASSERT(std::next(i) == val.m_value.object->cend()); + o->write_characters(indent_string.c_str(), new_indent); + o->write_character('\"'); + dump_escaped(i->first, ensure_ascii); + o->write_characters("\": ", 3); + dump(i->second, true, ensure_ascii, indent_step, new_indent); + + o->write_character('\n'); + o->write_characters(indent_string.c_str(), current_indent); + o->write_character('}'); + } + else + { + o->write_character('{'); + + // first n-1 elements + auto i = val.m_value.object->cbegin(); + for (std::size_t cnt = 0; cnt < val.m_value.object->size() - 1; ++cnt, ++i) + { + o->write_character('\"'); + dump_escaped(i->first, ensure_ascii); + o->write_characters("\":", 2); + dump(i->second, false, ensure_ascii, indent_step, current_indent); + o->write_character(','); + } + + // last element + JSON_ASSERT(i != val.m_value.object->cend()); + JSON_ASSERT(std::next(i) == val.m_value.object->cend()); + o->write_character('\"'); + dump_escaped(i->first, ensure_ascii); + o->write_characters("\":", 2); + dump(i->second, false, ensure_ascii, indent_step, current_indent); + + o->write_character('}'); + } + + return; + } + + case value_t::array: + { + if (val.m_value.array->empty()) + { + o->write_characters("[]", 2); + return; + } + + if (pretty_print) + { + o->write_characters("[\n", 2); + + // variable to hold indentation for recursive calls + const auto new_indent = current_indent + indent_step; + if (JSON_HEDLEY_UNLIKELY(indent_string.size() < new_indent)) + { + indent_string.resize(indent_string.size() * 2, ' '); + } + + // first n-1 elements + for (auto i = val.m_value.array->cbegin(); + i != val.m_value.array->cend() - 1; ++i) + { + o->write_characters(indent_string.c_str(), new_indent); + dump(*i, true, ensure_ascii, 
indent_step, new_indent); + o->write_characters(",\n", 2); + } + + // last element + JSON_ASSERT(!val.m_value.array->empty()); + o->write_characters(indent_string.c_str(), new_indent); + dump(val.m_value.array->back(), true, ensure_ascii, indent_step, new_indent); + + o->write_character('\n'); + o->write_characters(indent_string.c_str(), current_indent); + o->write_character(']'); + } + else + { + o->write_character('['); + + // first n-1 elements + for (auto i = val.m_value.array->cbegin(); + i != val.m_value.array->cend() - 1; ++i) + { + dump(*i, false, ensure_ascii, indent_step, current_indent); + o->write_character(','); + } + + // last element + JSON_ASSERT(!val.m_value.array->empty()); + dump(val.m_value.array->back(), false, ensure_ascii, indent_step, current_indent); + + o->write_character(']'); + } + + return; + } + + case value_t::string: + { + o->write_character('\"'); + dump_escaped(*val.m_value.string, ensure_ascii); + o->write_character('\"'); + return; + } + + case value_t::binary: + { + if (pretty_print) + { + o->write_characters("{\n", 2); + + // variable to hold indentation for recursive calls + const auto new_indent = current_indent + indent_step; + if (JSON_HEDLEY_UNLIKELY(indent_string.size() < new_indent)) + { + indent_string.resize(indent_string.size() * 2, ' '); + } + + o->write_characters(indent_string.c_str(), new_indent); + + o->write_characters("\"bytes\": [", 10); + + if (!val.m_value.binary->empty()) + { + for (auto i = val.m_value.binary->cbegin(); + i != val.m_value.binary->cend() - 1; ++i) + { + dump_integer(*i); + o->write_characters(", ", 2); + } + dump_integer(val.m_value.binary->back()); + } + + o->write_characters("],\n", 3); + o->write_characters(indent_string.c_str(), new_indent); + + o->write_characters("\"subtype\": ", 11); + if (val.m_value.binary->has_subtype()) + { + dump_integer(val.m_value.binary->subtype()); + } + else + { + o->write_characters("null", 4); + } + o->write_character('\n'); + o->write_characters(indent_string.c_str(), current_indent); + o->write_character('}'); + } + else + { + o->write_characters("{\"bytes\":[", 10); + + if (!val.m_value.binary->empty()) + { + for (auto i = val.m_value.binary->cbegin(); + i != val.m_value.binary->cend() - 1; ++i) + { + dump_integer(*i); + o->write_character(','); + } + dump_integer(val.m_value.binary->back()); + } + + o->write_characters("],\"subtype\":", 12); + if (val.m_value.binary->has_subtype()) + { + dump_integer(val.m_value.binary->subtype()); + o->write_character('}'); + } + else + { + o->write_characters("null}", 5); + } + } + return; + } + + case value_t::boolean: + { + if (val.m_value.boolean) + { + o->write_characters("true", 4); + } + else + { + o->write_characters("false", 5); + } + return; + } + + case value_t::number_integer: + { + dump_integer(val.m_value.number_integer); + return; + } + + case value_t::number_unsigned: + { + dump_integer(val.m_value.number_unsigned); + return; + } + + case value_t::number_float: + { + dump_float(val.m_value.number_float); + return; + } + + case value_t::discarded: + { + o->write_characters("", 11); + return; + } + + case value_t::null: + { + o->write_characters("null", 4); + return; + } + + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE + } + } + + JSON_PRIVATE_UNLESS_TESTED: + /*! 
+ @brief dump escaped string + + Escape a string by replacing certain special characters by a sequence of an + escape character (backslash) and another character and other control + characters by a sequence of "\u" followed by a four-digit hex + representation. The escaped string is written to output stream @a o. + + @param[in] s the string to escape + @param[in] ensure_ascii whether to escape non-ASCII characters with + \uXXXX sequences + + @complexity Linear in the length of string @a s. + */ + void dump_escaped(const string_t& s, const bool ensure_ascii) + { + std::uint32_t codepoint{}; + std::uint8_t state = UTF8_ACCEPT; + std::size_t bytes = 0; // number of bytes written to string_buffer + + // number of bytes written at the point of the last valid byte + std::size_t bytes_after_last_accept = 0; + std::size_t undumped_chars = 0; + + for (std::size_t i = 0; i < s.size(); ++i) + { + const auto byte = static_cast(s[i]); + + switch (decode(state, codepoint, byte)) + { + case UTF8_ACCEPT: // decode found a new code point + { + switch (codepoint) + { + case 0x08: // backspace + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'b'; + break; + } + + case 0x09: // horizontal tab + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 't'; + break; + } + + case 0x0A: // newline + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'n'; + break; + } + + case 0x0C: // formfeed + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'f'; + break; + } + + case 0x0D: // carriage return + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'r'; + break; + } + + case 0x22: // quotation mark + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = '\"'; + break; + } + + case 0x5C: // reverse solidus + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = '\\'; + break; + } + + default: + { + // escape control characters (0x00..0x1F) or, if + // ensure_ascii parameter is used, non-ASCII characters + if ((codepoint <= 0x1F) || (ensure_ascii && (codepoint >= 0x7F))) + { + if (codepoint <= 0xFFFF) + { + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,hicpp-vararg) + (std::snprintf)(string_buffer.data() + bytes, 7, "\\u%04x", + static_cast(codepoint)); + bytes += 6; + } + else + { + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,hicpp-vararg) + (std::snprintf)(string_buffer.data() + bytes, 13, "\\u%04x\\u%04x", + static_cast(0xD7C0u + (codepoint >> 10u)), + static_cast(0xDC00u + (codepoint & 0x3FFu))); + bytes += 12; + } + } + else + { + // copy byte to buffer (all previous bytes + // been copied have in default case above) + string_buffer[bytes++] = s[i]; + } + break; + } + } + + // write buffer and reset index; there must be 13 bytes + // left, as this is the maximal number of bytes to be + // written ("\uxxxx\uxxxx\0") for one code point + if (string_buffer.size() - bytes < 13) + { + o->write_characters(string_buffer.data(), bytes); + bytes = 0; + } + + // remember the byte position of this accept + bytes_after_last_accept = bytes; + undumped_chars = 0; + break; + } + + case UTF8_REJECT: // decode found invalid UTF-8 byte + { + switch (error_handler) + { + case error_handler_t::strict: + { + std::string sn(3, '\0'); + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,hicpp-vararg) + (std::snprintf)(&sn[0], sn.size(), "%.2X", byte); + JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn, BasicJsonType())); + } + + case error_handler_t::ignore: + case error_handler_t::replace: + { + // 
in case we saw this character the first time, we + // would like to read it again, because the byte + // may be OK for itself, but just not OK for the + // previous sequence + if (undumped_chars > 0) + { + --i; + } + + // reset length buffer to the last accepted index; + // thus removing/ignoring the invalid characters + bytes = bytes_after_last_accept; + + if (error_handler == error_handler_t::replace) + { + // add a replacement character + if (ensure_ascii) + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'u'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'd'; + } + else + { + string_buffer[bytes++] = detail::binary_writer::to_char_type('\xEF'); + string_buffer[bytes++] = detail::binary_writer::to_char_type('\xBF'); + string_buffer[bytes++] = detail::binary_writer::to_char_type('\xBD'); + } + + // write buffer and reset index; there must be 13 bytes + // left, as this is the maximal number of bytes to be + // written ("\uxxxx\uxxxx\0") for one code point + if (string_buffer.size() - bytes < 13) + { + o->write_characters(string_buffer.data(), bytes); + bytes = 0; + } + + bytes_after_last_accept = bytes; + } + + undumped_chars = 0; + + // continue processing the string + state = UTF8_ACCEPT; + break; + } + + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE + } + break; + } + + default: // decode found yet incomplete multi-byte code point + { + if (!ensure_ascii) + { + // code point will not be escaped - copy byte to buffer + string_buffer[bytes++] = s[i]; + } + ++undumped_chars; + break; + } + } + } + + // we finished processing the string + if (JSON_HEDLEY_LIKELY(state == UTF8_ACCEPT)) + { + // write buffer + if (bytes > 0) + { + o->write_characters(string_buffer.data(), bytes); + } + } + else + { + // we finish reading, but do not accept: string was incomplete + switch (error_handler) + { + case error_handler_t::strict: + { + std::string sn(3, '\0'); + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,hicpp-vararg) + (std::snprintf)(&sn[0], sn.size(), "%.2X", static_cast(s.back())); + JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn, BasicJsonType())); + } + + case error_handler_t::ignore: + { + // write all accepted bytes + o->write_characters(string_buffer.data(), bytes_after_last_accept); + break; + } + + case error_handler_t::replace: + { + // write all accepted bytes + o->write_characters(string_buffer.data(), bytes_after_last_accept); + // add a replacement character + if (ensure_ascii) + { + o->write_characters("\\ufffd", 6); + } + else + { + o->write_characters("\xEF\xBF\xBD", 3); + } + break; + } + + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE + } + } + } + + private: + /*! + @brief count digits + + Count the number of decimal (base 10) digits for an input unsigned integer. + + @param[in] x unsigned integer number to count its digits + @return number of decimal digits + */ + inline unsigned int count_digits(number_unsigned_t x) noexcept + { + unsigned int n_digits = 1; + for (;;) + { + if (x < 10) + { + return n_digits; + } + if (x < 100) + { + return n_digits + 1; + } + if (x < 1000) + { + return n_digits + 2; + } + if (x < 10000) + { + return n_digits + 3; + } + x = x / 10000u; + n_digits += 4; + } + } + + /*! + @brief dump an integer + + Dump a given integer to output stream @a o. 
Works internally with + @a number_buffer. + + @param[in] x integer number (signed or unsigned) to dump + @tparam NumberType either @a number_integer_t or @a number_unsigned_t + */ + template < typename NumberType, detail::enable_if_t < + std::is_same::value || + std::is_same::value || + std::is_same::value, + int > = 0 > + void dump_integer(NumberType x) + { + static constexpr std::array, 100> digits_to_99 + { + { + {{'0', '0'}}, {{'0', '1'}}, {{'0', '2'}}, {{'0', '3'}}, {{'0', '4'}}, {{'0', '5'}}, {{'0', '6'}}, {{'0', '7'}}, {{'0', '8'}}, {{'0', '9'}}, + {{'1', '0'}}, {{'1', '1'}}, {{'1', '2'}}, {{'1', '3'}}, {{'1', '4'}}, {{'1', '5'}}, {{'1', '6'}}, {{'1', '7'}}, {{'1', '8'}}, {{'1', '9'}}, + {{'2', '0'}}, {{'2', '1'}}, {{'2', '2'}}, {{'2', '3'}}, {{'2', '4'}}, {{'2', '5'}}, {{'2', '6'}}, {{'2', '7'}}, {{'2', '8'}}, {{'2', '9'}}, + {{'3', '0'}}, {{'3', '1'}}, {{'3', '2'}}, {{'3', '3'}}, {{'3', '4'}}, {{'3', '5'}}, {{'3', '6'}}, {{'3', '7'}}, {{'3', '8'}}, {{'3', '9'}}, + {{'4', '0'}}, {{'4', '1'}}, {{'4', '2'}}, {{'4', '3'}}, {{'4', '4'}}, {{'4', '5'}}, {{'4', '6'}}, {{'4', '7'}}, {{'4', '8'}}, {{'4', '9'}}, + {{'5', '0'}}, {{'5', '1'}}, {{'5', '2'}}, {{'5', '3'}}, {{'5', '4'}}, {{'5', '5'}}, {{'5', '6'}}, {{'5', '7'}}, {{'5', '8'}}, {{'5', '9'}}, + {{'6', '0'}}, {{'6', '1'}}, {{'6', '2'}}, {{'6', '3'}}, {{'6', '4'}}, {{'6', '5'}}, {{'6', '6'}}, {{'6', '7'}}, {{'6', '8'}}, {{'6', '9'}}, + {{'7', '0'}}, {{'7', '1'}}, {{'7', '2'}}, {{'7', '3'}}, {{'7', '4'}}, {{'7', '5'}}, {{'7', '6'}}, {{'7', '7'}}, {{'7', '8'}}, {{'7', '9'}}, + {{'8', '0'}}, {{'8', '1'}}, {{'8', '2'}}, {{'8', '3'}}, {{'8', '4'}}, {{'8', '5'}}, {{'8', '6'}}, {{'8', '7'}}, {{'8', '8'}}, {{'8', '9'}}, + {{'9', '0'}}, {{'9', '1'}}, {{'9', '2'}}, {{'9', '3'}}, {{'9', '4'}}, {{'9', '5'}}, {{'9', '6'}}, {{'9', '7'}}, {{'9', '8'}}, {{'9', '9'}}, + } + }; + + // special case for "0" + if (x == 0) + { + o->write_character('0'); + return; + } + + // use a pointer to fill the buffer + auto buffer_ptr = number_buffer.begin(); // NOLINT(llvm-qualified-auto,readability-qualified-auto,cppcoreguidelines-pro-type-vararg,hicpp-vararg) + + const bool is_negative = std::is_same::value && !(x >= 0); // see issue #755 + number_unsigned_t abs_value; + + unsigned int n_chars{}; + + if (is_negative) + { + *buffer_ptr = '-'; + abs_value = remove_sign(static_cast(x)); + + // account one more byte for the minus sign + n_chars = 1 + count_digits(abs_value); + } + else + { + abs_value = static_cast(x); + n_chars = count_digits(abs_value); + } + + // spare 1 byte for '\0' + JSON_ASSERT(n_chars < number_buffer.size() - 1); + + // jump to the end to generate the string from backward + // so we later avoid reversing the result + buffer_ptr += n_chars; + + // Fast int2ascii implementation inspired by "Fastware" talk by Andrei Alexandrescu + // See: https://www.youtube.com/watch?v=o4-CwDo2zpg + while (abs_value >= 100) + { + const auto digits_index = static_cast((abs_value % 100)); + abs_value /= 100; + *(--buffer_ptr) = digits_to_99[digits_index][1]; + *(--buffer_ptr) = digits_to_99[digits_index][0]; + } + + if (abs_value >= 10) + { + const auto digits_index = static_cast(abs_value); + *(--buffer_ptr) = digits_to_99[digits_index][1]; + *(--buffer_ptr) = digits_to_99[digits_index][0]; + } + else + { + *(--buffer_ptr) = static_cast('0' + abs_value); + } + + o->write_characters(number_buffer.data(), n_chars); + } + + /*! + @brief dump a floating-point number + + Dump a given floating-point number to output stream @a o. 
Works internally + with @a number_buffer. + + @param[in] x floating-point number to dump + */ + void dump_float(number_float_t x) + { + // NaN / inf + if (!std::isfinite(x)) + { + o->write_characters("null", 4); + return; + } + + // If number_float_t is an IEEE-754 single or double precision number, + // use the Grisu2 algorithm to produce short numbers which are + // guaranteed to round-trip, using strtof and strtod, resp. + // + // NB: The test below works if == . + static constexpr bool is_ieee_single_or_double + = (std::numeric_limits::is_iec559 && std::numeric_limits::digits == 24 && std::numeric_limits::max_exponent == 128) || + (std::numeric_limits::is_iec559 && std::numeric_limits::digits == 53 && std::numeric_limits::max_exponent == 1024); + + dump_float(x, std::integral_constant()); + } + + void dump_float(number_float_t x, std::true_type /*is_ieee_single_or_double*/) + { + auto* begin = number_buffer.data(); + auto* end = ::nlohmann::detail::to_chars(begin, begin + number_buffer.size(), x); + + o->write_characters(begin, static_cast(end - begin)); + } + + void dump_float(number_float_t x, std::false_type /*is_ieee_single_or_double*/) + { + // get number of digits for a float -> text -> float round-trip + static constexpr auto d = std::numeric_limits::max_digits10; + + // the actual conversion + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,hicpp-vararg) + std::ptrdiff_t len = (std::snprintf)(number_buffer.data(), number_buffer.size(), "%.*g", d, x); + + // negative value indicates an error + JSON_ASSERT(len > 0); + // check if buffer was large enough + JSON_ASSERT(static_cast(len) < number_buffer.size()); + + // erase thousands separator + if (thousands_sep != '\0') + { + auto* const end = std::remove(number_buffer.begin(), + number_buffer.begin() + len, thousands_sep); + std::fill(end, number_buffer.end(), '\0'); + JSON_ASSERT((end - number_buffer.begin()) <= len); + len = (end - number_buffer.begin()); + } + + // convert decimal point to '.' + if (decimal_point != '\0' && decimal_point != '.') + { + auto* const dec_pos = std::find(number_buffer.begin(), number_buffer.end(), decimal_point); + if (dec_pos != number_buffer.end()) + { + *dec_pos = '.'; + } + } + + o->write_characters(number_buffer.data(), static_cast(len)); + + // determine if need to append ".0" + const bool value_is_int_like = + std::none_of(number_buffer.begin(), number_buffer.begin() + len + 1, + [](char c) + { + return c == '.' || c == 'e'; + }); + + if (value_is_int_like) + { + o->write_characters(".0", 2); + } + } + + /*! + @brief check whether a string is UTF-8 encoded + + The function checks each byte of a string whether it is UTF-8 encoded. The + result of the check is stored in the @a state parameter. The function must + be called initially with state 0 (accept). State 1 means the string must + be rejected, because the current byte is not allowed. If the string is + completely processed, but the state is non-zero, the string ended + prematurely; that is, the last byte indicated more bytes should have + followed. + + @param[in,out] state the state of the decoding + @param[in,out] codep codepoint (valid only if resulting state is UTF8_ACCEPT) + @param[in] byte next byte to decode + @return new state + + @note The function has been edited: a std::array is used. 
+ + @copyright Copyright (c) 2008-2009 Bjoern Hoehrmann + @sa http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + */ + static std::uint8_t decode(std::uint8_t& state, std::uint32_t& codep, const std::uint8_t byte) noexcept + { + static const std::array utf8d = + { + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..1F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..3F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..5F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..7F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 80..9F + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // A0..BF + 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0..DF + 0xA, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, // E0..EF + 0xB, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, // F0..FF + 0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, // s0..s0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, // s1..s2 + 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // s3..s4 + 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s5..s6 + 1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // s7..s8 + } + }; + + JSON_ASSERT(byte < utf8d.size()); + const std::uint8_t type = utf8d[byte]; + + codep = (state != UTF8_ACCEPT) + ? (byte & 0x3fu) | (codep << 6u) + : (0xFFu >> type) & (byte); + + std::size_t index = 256u + static_cast(state) * 16u + static_cast(type); + JSON_ASSERT(index < 400); + state = utf8d[index]; + return state; + } + + /* + * Overload to make the compiler happy while it is instantiating + * dump_integer for number_unsigned_t. + * Must never be called. + */ + number_unsigned_t remove_sign(number_unsigned_t x) + { + JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE + return x; // LCOV_EXCL_LINE + } + + /* + * Helper function for dump_integer + * + * This function takes a negative signed integer and returns its absolute + * value as unsigned integer. The plus/minus shuffling is necessary as we can + * not directly remove the sign of an arbitrary signed integer as the + * absolute values of INT_MIN and INT_MAX are usually not the same. See + * #1708 for details. 
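+     *
+     * As an illustrative example (an editorial sketch, assuming the default
+     * 64-bit two's-complement number_integer_t): for x == -5, -(x + 1) == 4 is
+     * computed without overflow, and the cast followed by the trailing "+ 1"
+     * yields 5. The same steps stay well-defined for x == INT64_MIN, where a
+     * plain "-x" would overflow.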
+ */ + inline number_unsigned_t remove_sign(number_integer_t x) noexcept + { + JSON_ASSERT(x < 0 && x < (std::numeric_limits::max)()); // NOLINT(misc-redundant-expression) + return static_cast(-(x + 1)) + 1; + } + + private: + /// the output of the serializer + output_adapter_t o = nullptr; + + /// a (hopefully) large enough character buffer + std::array number_buffer{{}}; + + /// the locale + const std::lconv* loc = nullptr; + /// the locale's thousand separator character + const char thousands_sep = '\0'; + /// the locale's decimal point character + const char decimal_point = '\0'; + + /// string buffer + std::array string_buffer{{}}; + + /// the indentation character + const char indent_char; + /// the indentation string + string_t indent_string; + + /// error_handler how to react on decoding errors + const error_handler_t error_handler; +}; +} // namespace detail +} // namespace nlohmann + +// #include + +// #include + +// #include + + +#include // less +#include // initializer_list +#include // input_iterator_tag, iterator_traits +#include // allocator +#include // for out_of_range +#include // enable_if, is_convertible +#include // pair +#include // vector + +// #include + + +namespace nlohmann +{ + +/// ordered_map: a minimal map-like container that preserves insertion order +/// for use within nlohmann::basic_json +template , + class Allocator = std::allocator>> + struct ordered_map : std::vector, Allocator> +{ + using key_type = Key; + using mapped_type = T; + using Container = std::vector, Allocator>; + using typename Container::iterator; + using typename Container::const_iterator; + using typename Container::size_type; + using typename Container::value_type; + + // Explicit constructors instead of `using Container::Container` + // otherwise older compilers choke on it (GCC <= 5.5, xcode <= 9.4) + ordered_map(const Allocator& alloc = Allocator()) : Container{alloc} {} + template + ordered_map(It first, It last, const Allocator& alloc = Allocator()) + : Container{first, last, alloc} {} + ordered_map(std::initializer_list init, const Allocator& alloc = Allocator() ) + : Container{init, alloc} {} + + std::pair emplace(const key_type& key, T&& t) + { + for (auto it = this->begin(); it != this->end(); ++it) + { + if (it->first == key) + { + return {it, false}; + } + } + Container::emplace_back(key, t); + return {--this->end(), true}; + } + + T& operator[](const Key& key) + { + return emplace(key, T{}).first->second; + } + + const T& operator[](const Key& key) const + { + return at(key); + } + + T& at(const Key& key) + { + for (auto it = this->begin(); it != this->end(); ++it) + { + if (it->first == key) + { + return it->second; + } + } + + JSON_THROW(std::out_of_range("key not found")); + } + + const T& at(const Key& key) const + { + for (auto it = this->begin(); it != this->end(); ++it) + { + if (it->first == key) + { + return it->second; + } + } + + JSON_THROW(std::out_of_range("key not found")); + } + + size_type erase(const Key& key) + { + for (auto it = this->begin(); it != this->end(); ++it) + { + if (it->first == key) + { + // Since we cannot move const Keys, re-construct them in place + for (auto next = it; ++next != this->end(); ++it) + { + it->~value_type(); // Destroy but keep allocation + new (&*it) value_type{std::move(*next)}; + } + Container::pop_back(); + return 1; + } + } + return 0; + } + + iterator erase(iterator pos) + { + auto it = pos; + + // Since we cannot move const Keys, re-construct them in place + for (auto next = it; ++next != this->end(); ++it) + { + 
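+            // net effect: every pair after `pos` moves one slot toward the front;
+            // the leftover moved-from last element is then dropped by pop_back()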
it->~value_type(); // Destroy but keep allocation + new (&*it) value_type{std::move(*next)}; + } + Container::pop_back(); + return pos; + } + + size_type count(const Key& key) const + { + for (auto it = this->begin(); it != this->end(); ++it) + { + if (it->first == key) + { + return 1; + } + } + return 0; + } + + iterator find(const Key& key) + { + for (auto it = this->begin(); it != this->end(); ++it) + { + if (it->first == key) + { + return it; + } + } + return Container::end(); + } + + const_iterator find(const Key& key) const + { + for (auto it = this->begin(); it != this->end(); ++it) + { + if (it->first == key) + { + return it; + } + } + return Container::end(); + } + + std::pair insert( value_type&& value ) + { + return emplace(value.first, std::move(value.second)); + } + + std::pair insert( const value_type& value ) + { + for (auto it = this->begin(); it != this->end(); ++it) + { + if (it->first == value.first) + { + return {it, false}; + } + } + Container::push_back(value); + return {--this->end(), true}; + } + + template + using require_input_iter = typename std::enable_if::iterator_category, + std::input_iterator_tag>::value>::type; + + template> + void insert(InputIt first, InputIt last) + { + for (auto it = first; it != last; ++it) + { + insert(*it); + } + } +}; + +} // namespace nlohmann + + +#if defined(JSON_HAS_CPP_17) + #include +#endif + +/*! +@brief namespace for Niels Lohmann +@see https://github.com/nlohmann +@since version 1.0.0 +*/ +namespace nlohmann +{ + +/*! +@brief a class to store JSON values + +@tparam ObjectType type for JSON objects (`std::map` by default; will be used +in @ref object_t) +@tparam ArrayType type for JSON arrays (`std::vector` by default; will be used +in @ref array_t) +@tparam StringType type for JSON strings and object keys (`std::string` by +default; will be used in @ref string_t) +@tparam BooleanType type for JSON booleans (`bool` by default; will be used +in @ref boolean_t) +@tparam NumberIntegerType type for JSON integer numbers (`int64_t` by +default; will be used in @ref number_integer_t) +@tparam NumberUnsignedType type for JSON unsigned integer numbers (@c +`uint64_t` by default; will be used in @ref number_unsigned_t) +@tparam NumberFloatType type for JSON floating-point numbers (`double` by +default; will be used in @ref number_float_t) +@tparam BinaryType type for packed binary data for compatibility with binary +serialization formats (`std::vector` by default; will be used in +@ref binary_t) +@tparam AllocatorType type of the allocator to use (`std::allocator` by +default) +@tparam JSONSerializer the serializer to resolve internal calls to `to_json()` +and `from_json()` (@ref adl_serializer by default) + +@requirement The class satisfies the following concept requirements: +- Basic + - [DefaultConstructible](https://en.cppreference.com/w/cpp/named_req/DefaultConstructible): + JSON values can be default constructed. The result will be a JSON null + value. + - [MoveConstructible](https://en.cppreference.com/w/cpp/named_req/MoveConstructible): + A JSON value can be constructed from an rvalue argument. + - [CopyConstructible](https://en.cppreference.com/w/cpp/named_req/CopyConstructible): + A JSON value can be copy-constructed from an lvalue expression. + - [MoveAssignable](https://en.cppreference.com/w/cpp/named_req/MoveAssignable): + A JSON value van be assigned from an rvalue argument. + - [CopyAssignable](https://en.cppreference.com/w/cpp/named_req/CopyAssignable): + A JSON value can be copy-assigned from an lvalue expression. 
+  - [Destructible](https://en.cppreference.com/w/cpp/named_req/Destructible):
+    JSON values can be destructed.
+- Layout
+  - [StandardLayoutType](https://en.cppreference.com/w/cpp/named_req/StandardLayoutType):
+    JSON values have
+    [standard layout](https://en.cppreference.com/w/cpp/language/data_members#Standard_layout):
+    All non-static data members are private and standard layout types, and the
+    class has no virtual functions or (virtual) base classes.
+- Library-wide
+  - [EqualityComparable](https://en.cppreference.com/w/cpp/named_req/EqualityComparable):
+    JSON values can be compared with `==`, see @ref
+    operator==(const_reference,const_reference).
+  - [LessThanComparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable):
+    JSON values can be compared with `<`, see @ref
+    operator<(const_reference,const_reference).
+  - [Swappable](https://en.cppreference.com/w/cpp/named_req/Swappable):
+    Any JSON lvalue or rvalue can be swapped with any lvalue or rvalue of
+    other compatible types, using unqualified function call @ref swap().
+  - [NullablePointer](https://en.cppreference.com/w/cpp/named_req/NullablePointer):
+    JSON values can be compared against `std::nullptr_t` objects which are used
+    to model the `null` value.
+- Container
+  - [Container](https://en.cppreference.com/w/cpp/named_req/Container):
+    JSON values can be used like STL containers and provide iterator access.
+  - [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer):
+    JSON values can be used like STL containers and provide reverse iterator
+    access.
+
+@invariant The member variables @a m_value and @a m_type have the following
+relationship:
+- If `m_type == value_t::object`, then `m_value.object != nullptr`.
+- If `m_type == value_t::array`, then `m_value.array != nullptr`.
+- If `m_type == value_t::string`, then `m_value.string != nullptr`.
+The invariants are checked by member function assert_invariant().
+ +@internal +@note ObjectType trick from https://stackoverflow.com/a/9860911 +@endinternal + +@see [RFC 8259: The JavaScript Object Notation (JSON) Data Interchange +Format](https://tools.ietf.org/html/rfc8259) + +@since version 1.0.0 + +@nosubgrouping +*/ +NLOHMANN_BASIC_JSON_TPL_DECLARATION +class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-special-member-functions) +{ + private: + template friend struct detail::external_constructor; + friend ::nlohmann::json_pointer; + + template + friend class ::nlohmann::detail::parser; + friend ::nlohmann::detail::serializer; + template + friend class ::nlohmann::detail::iter_impl; + template + friend class ::nlohmann::detail::binary_writer; + template + friend class ::nlohmann::detail::binary_reader; + template + friend class ::nlohmann::detail::json_sax_dom_parser; + template + friend class ::nlohmann::detail::json_sax_dom_callback_parser; + friend class ::nlohmann::detail::exception; + + /// workaround type for MSVC + using basic_json_t = NLOHMANN_BASIC_JSON_TPL; + + JSON_PRIVATE_UNLESS_TESTED: + // convenience aliases for types residing in namespace detail; + using lexer = ::nlohmann::detail::lexer_base; + + template + static ::nlohmann::detail::parser parser( + InputAdapterType adapter, + detail::parser_callback_tcb = nullptr, + const bool allow_exceptions = true, + const bool ignore_comments = false + ) + { + return ::nlohmann::detail::parser(std::move(adapter), + std::move(cb), allow_exceptions, ignore_comments); + } + + private: + using primitive_iterator_t = ::nlohmann::detail::primitive_iterator_t; + template + using internal_iterator = ::nlohmann::detail::internal_iterator; + template + using iter_impl = ::nlohmann::detail::iter_impl; + template + using iteration_proxy = ::nlohmann::detail::iteration_proxy; + template using json_reverse_iterator = ::nlohmann::detail::json_reverse_iterator; + + template + using output_adapter_t = ::nlohmann::detail::output_adapter_t; + + template + using binary_reader = ::nlohmann::detail::binary_reader; + template using binary_writer = ::nlohmann::detail::binary_writer; + + JSON_PRIVATE_UNLESS_TESTED: + using serializer = ::nlohmann::detail::serializer; + + public: + using value_t = detail::value_t; + /// JSON Pointer, see @ref nlohmann::json_pointer + using json_pointer = ::nlohmann::json_pointer; + template + using json_serializer = JSONSerializer; + /// how to treat decoding errors + using error_handler_t = detail::error_handler_t; + /// how to treat CBOR tags + using cbor_tag_handler_t = detail::cbor_tag_handler_t; + /// helper type for initializer lists of basic_json values + using initializer_list_t = std::initializer_list>; + + using input_format_t = detail::input_format_t; + /// SAX interface type, see @ref nlohmann::json_sax + using json_sax_t = json_sax; + + //////////////// + // exceptions // + //////////////// + + /// @name exceptions + /// Classes to implement user-defined exceptions. 
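+    /// For illustration, an editorial sketch of catching any of these errors
+    /// through the common base class (each listed type derives from @ref exception):
+    /// @code {.cpp}
+    /// try { auto j = nlohmann::json::parse("not valid json"); }
+    /// catch (const nlohmann::json::exception& e) { std::cout << e.what() << '\n'; }
+    /// @endcode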
+ /// @{ + + /// @copydoc detail::exception + using exception = detail::exception; + /// @copydoc detail::parse_error + using parse_error = detail::parse_error; + /// @copydoc detail::invalid_iterator + using invalid_iterator = detail::invalid_iterator; + /// @copydoc detail::type_error + using type_error = detail::type_error; + /// @copydoc detail::out_of_range + using out_of_range = detail::out_of_range; + /// @copydoc detail::other_error + using other_error = detail::other_error; + + /// @} + + + ///////////////////// + // container types // + ///////////////////// + + /// @name container types + /// The canonic container types to use @ref basic_json like any other STL + /// container. + /// @{ + + /// the type of elements in a basic_json container + using value_type = basic_json; + + /// the type of an element reference + using reference = value_type&; + /// the type of an element const reference + using const_reference = const value_type&; + + /// a type to represent differences between iterators + using difference_type = std::ptrdiff_t; + /// a type to represent container sizes + using size_type = std::size_t; + + /// the allocator type + using allocator_type = AllocatorType; + + /// the type of an element pointer + using pointer = typename std::allocator_traits::pointer; + /// the type of an element const pointer + using const_pointer = typename std::allocator_traits::const_pointer; + + /// an iterator for a basic_json container + using iterator = iter_impl; + /// a const iterator for a basic_json container + using const_iterator = iter_impl; + /// a reverse iterator for a basic_json container + using reverse_iterator = json_reverse_iterator; + /// a const reverse iterator for a basic_json container + using const_reverse_iterator = json_reverse_iterator; + + /// @} + + + /*! + @brief returns the allocator associated with the container + */ + static allocator_type get_allocator() + { + return allocator_type(); + } + + /*! + @brief returns version information on the library + + This function returns a JSON object with information about the library, + including the version number and information on the platform and compiler. + + @return JSON object holding version information + key | description + ----------- | --------------- + `compiler` | Information on the used compiler. It is an object with the following keys: `c++` (the used C++ standard), `family` (the compiler family; possible values are `clang`, `icc`, `gcc`, `ilecpp`, `msvc`, `pgcpp`, `sunpro`, and `unknown`), and `version` (the compiler version). + `copyright` | The copyright line for the library as string. + `name` | The name of the library as string. + `platform` | The used platform as string. Possible values are `win32`, `linux`, `apple`, `unix`, and `unknown`. + `url` | The URL of the project as string. + `version` | The version of the library. It is an object with the following keys: `major`, `minor`, and `patch` as defined by [Semantic Versioning](http://semver.org), and `string` (the version string). + + @liveexample{The following code shows an example output of the `meta()` + function.,meta} + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes to any JSON value. + + @complexity Constant. 
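+
+    A minimal editorial sketch of querying the result (key names as in the
+    table above):
+    @code {.cpp}
+    const auto version = nlohmann::json::meta()["version"]["string"].get<std::string>();
+    @endcode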
+ + @since 2.1.0 + */ + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json meta() + { + basic_json result; + + result["copyright"] = "(C) 2013-2021 Niels Lohmann"; + result["name"] = "JSON for Modern C++"; + result["url"] = "https://github.com/nlohmann/json"; + result["version"]["string"] = + std::to_string(NLOHMANN_JSON_VERSION_MAJOR) + "." + + std::to_string(NLOHMANN_JSON_VERSION_MINOR) + "." + + std::to_string(NLOHMANN_JSON_VERSION_PATCH); + result["version"]["major"] = NLOHMANN_JSON_VERSION_MAJOR; + result["version"]["minor"] = NLOHMANN_JSON_VERSION_MINOR; + result["version"]["patch"] = NLOHMANN_JSON_VERSION_PATCH; + +#ifdef _WIN32 + result["platform"] = "win32"; +#elif defined __linux__ + result["platform"] = "linux"; +#elif defined __APPLE__ + result["platform"] = "apple"; +#elif defined __unix__ + result["platform"] = "unix"; +#else + result["platform"] = "unknown"; +#endif + +#if defined(__ICC) || defined(__INTEL_COMPILER) + result["compiler"] = {{"family", "icc"}, {"version", __INTEL_COMPILER}}; +#elif defined(__clang__) + result["compiler"] = {{"family", "clang"}, {"version", __clang_version__}}; +#elif defined(__GNUC__) || defined(__GNUG__) + result["compiler"] = {{"family", "gcc"}, {"version", std::to_string(__GNUC__) + "." + std::to_string(__GNUC_MINOR__) + "." + std::to_string(__GNUC_PATCHLEVEL__)}}; +#elif defined(__HP_cc) || defined(__HP_aCC) + result["compiler"] = "hp" +#elif defined(__IBMCPP__) + result["compiler"] = {{"family", "ilecpp"}, {"version", __IBMCPP__}}; +#elif defined(_MSC_VER) + result["compiler"] = {{"family", "msvc"}, {"version", _MSC_VER}}; +#elif defined(__PGI) + result["compiler"] = {{"family", "pgcpp"}, {"version", __PGI}}; +#elif defined(__SUNPRO_CC) + result["compiler"] = {{"family", "sunpro"}, {"version", __SUNPRO_CC}}; +#else + result["compiler"] = {{"family", "unknown"}, {"version", "unknown"}}; +#endif + +#ifdef __cplusplus + result["compiler"]["c++"] = std::to_string(__cplusplus); +#else + result["compiler"]["c++"] = "unknown"; +#endif + return result; + } + + + /////////////////////////// + // JSON value data types // + /////////////////////////// + + /// @name JSON value data types + /// The data types to store a JSON value. These types are derived from + /// the template arguments passed to class @ref basic_json. + /// @{ + +#if defined(JSON_HAS_CPP_14) + // Use transparent comparator if possible, combined with perfect forwarding + // on find() and count() calls prevents unnecessary string construction. + using object_comparator_t = std::less<>; +#else + using object_comparator_t = std::less; +#endif + + /*! + @brief a type for an object + + [RFC 8259](https://tools.ietf.org/html/rfc8259) describes JSON objects as follows: + > An object is an unordered collection of zero or more name/value pairs, + > where a name is a string and a value is a string, number, boolean, null, + > object, or array. + + To store objects in C++, a type is defined by the template parameters + described below. + + @tparam ObjectType the container to store objects (e.g., `std::map` or + `std::unordered_map`) + @tparam StringType the type of the keys or names (e.g., `std::string`). + The comparison function `std::less` is used to order elements + inside the container. 
+ @tparam AllocatorType the allocator to use for objects (e.g., + `std::allocator`) + + #### Default type + + With the default values for @a ObjectType (`std::map`), @a StringType + (`std::string`), and @a AllocatorType (`std::allocator`), the default + value for @a object_t is: + + @code {.cpp} + std::map< + std::string, // key_type + basic_json, // value_type + std::less, // key_compare + std::allocator> // allocator_type + > + @endcode + + #### Behavior + + The choice of @a object_t influences the behavior of the JSON class. With + the default type, objects have the following behavior: + + - When all names are unique, objects will be interoperable in the sense + that all software implementations receiving that object will agree on + the name-value mappings. + - When the names within an object are not unique, it is unspecified which + one of the values for a given key will be chosen. For instance, + `{"key": 2, "key": 1}` could be equal to either `{"key": 1}` or + `{"key": 2}`. + - Internally, name/value pairs are stored in lexicographical order of the + names. Objects will also be serialized (see @ref dump) in this order. + For instance, `{"b": 1, "a": 2}` and `{"a": 2, "b": 1}` will be stored + and serialized as `{"a": 2, "b": 1}`. + - When comparing objects, the order of the name/value pairs is irrelevant. + This makes objects interoperable in the sense that they will not be + affected by these differences. For instance, `{"b": 1, "a": 2}` and + `{"a": 2, "b": 1}` will be treated as equal. + + #### Limits + + [RFC 8259](https://tools.ietf.org/html/rfc8259) specifies: + > An implementation may set limits on the maximum depth of nesting. + + In this class, the object's limit of nesting is not explicitly constrained. + However, a maximum depth of nesting may be introduced by the compiler or + runtime environment. A theoretical limit can be queried by calling the + @ref max_size function of a JSON object. + + #### Storage + + Objects are stored as pointers in a @ref basic_json type. That is, for any + access to object values, a pointer of type `object_t*` must be + dereferenced. + + @sa see @ref array_t -- type for an array value + + @since version 1.0.0 + + @note The order name/value pairs are added to the object is *not* + preserved by the library. Therefore, iterating an object may return + name/value pairs in a different order than they were originally stored. In + fact, keys will be traversed in alphabetical order as `std::map` with + `std::less` is used by default. Please note this behavior conforms to [RFC + 8259](https://tools.ietf.org/html/rfc8259), because any order implements the + specified "unordered" nature of JSON objects. + */ + using object_t = ObjectType>>; + + /*! + @brief a type for an array + + [RFC 8259](https://tools.ietf.org/html/rfc8259) describes JSON arrays as follows: + > An array is an ordered sequence of zero or more values. + + To store objects in C++, a type is defined by the template parameters + explained below. 
+ + @tparam ArrayType container type to store arrays (e.g., `std::vector` or + `std::list`) + @tparam AllocatorType allocator to use for arrays (e.g., `std::allocator`) + + #### Default type + + With the default values for @a ArrayType (`std::vector`) and @a + AllocatorType (`std::allocator`), the default value for @a array_t is: + + @code {.cpp} + std::vector< + basic_json, // value_type + std::allocator // allocator_type + > + @endcode + + #### Limits + + [RFC 8259](https://tools.ietf.org/html/rfc8259) specifies: + > An implementation may set limits on the maximum depth of nesting. + + In this class, the array's limit of nesting is not explicitly constrained. + However, a maximum depth of nesting may be introduced by the compiler or + runtime environment. A theoretical limit can be queried by calling the + @ref max_size function of a JSON array. + + #### Storage + + Arrays are stored as pointers in a @ref basic_json type. That is, for any + access to array values, a pointer of type `array_t*` must be dereferenced. + + @sa see @ref object_t -- type for an object value + + @since version 1.0.0 + */ + using array_t = ArrayType>; + + /*! + @brief a type for a string + + [RFC 8259](https://tools.ietf.org/html/rfc8259) describes JSON strings as follows: + > A string is a sequence of zero or more Unicode characters. + + To store objects in C++, a type is defined by the template parameter + described below. Unicode values are split by the JSON class into + byte-sized characters during deserialization. + + @tparam StringType the container to store strings (e.g., `std::string`). + Note this container is used for keys/names in objects, see @ref object_t. + + #### Default type + + With the default values for @a StringType (`std::string`), the default + value for @a string_t is: + + @code {.cpp} + std::string + @endcode + + #### Encoding + + Strings are stored in UTF-8 encoding. Therefore, functions like + `std::string::size()` or `std::string::length()` return the number of + bytes in the string rather than the number of characters or glyphs. + + #### String comparison + + [RFC 8259](https://tools.ietf.org/html/rfc8259) states: + > Software implementations are typically required to test names of object + > members for equality. Implementations that transform the textual + > representation into sequences of Unicode code units and then perform the + > comparison numerically, code unit by code unit, are interoperable in the + > sense that implementations will agree in all cases on equality or + > inequality of two strings. For example, implementations that compare + > strings with escaped characters unconverted may incorrectly find that + > `"a\\b"` and `"a\u005Cb"` are not equal. + + This implementation is interoperable as it does compare strings code unit + by code unit. + + #### Storage + + String values are stored as pointers in a @ref basic_json type. That is, + for any access to string values, a pointer of type `string_t*` must be + dereferenced. + + @since version 1.0.0 + */ + using string_t = StringType; + + /*! + @brief a type for a boolean + + [RFC 8259](https://tools.ietf.org/html/rfc8259) implicitly describes a boolean as a + type which differentiates the two literals `true` and `false`. + + To store objects in C++, a type is defined by the template parameter @a + BooleanType which chooses the type to use. 
+ + #### Default type + + With the default values for @a BooleanType (`bool`), the default value for + @a boolean_t is: + + @code {.cpp} + bool + @endcode + + #### Storage + + Boolean values are stored directly inside a @ref basic_json type. + + @since version 1.0.0 + */ + using boolean_t = BooleanType; + + /*! + @brief a type for a number (integer) + + [RFC 8259](https://tools.ietf.org/html/rfc8259) describes numbers as follows: + > The representation of numbers is similar to that used in most + > programming languages. A number is represented in base 10 using decimal + > digits. It contains an integer component that may be prefixed with an + > optional minus sign, which may be followed by a fraction part and/or an + > exponent part. Leading zeros are not allowed. (...) Numeric values that + > cannot be represented in the grammar below (such as Infinity and NaN) + > are not permitted. + + This description includes both integer and floating-point numbers. + However, C++ allows more precise storage if it is known whether the number + is a signed integer, an unsigned integer or a floating-point number. + Therefore, three different types, @ref number_integer_t, @ref + number_unsigned_t and @ref number_float_t are used. + + To store integer numbers in C++, a type is defined by the template + parameter @a NumberIntegerType which chooses the type to use. + + #### Default type + + With the default values for @a NumberIntegerType (`int64_t`), the default + value for @a number_integer_t is: + + @code {.cpp} + int64_t + @endcode + + #### Default behavior + + - The restrictions about leading zeros is not enforced in C++. Instead, + leading zeros in integer literals lead to an interpretation as octal + number. Internally, the value will be stored as decimal number. For + instance, the C++ integer literal `010` will be serialized to `8`. + During deserialization, leading zeros yield an error. + - Not-a-number (NaN) values will be serialized to `null`. + + #### Limits + + [RFC 8259](https://tools.ietf.org/html/rfc8259) specifies: + > An implementation may set limits on the range and precision of numbers. + + When the default type is used, the maximal integer number that can be + stored is `9223372036854775807` (INT64_MAX) and the minimal integer number + that can be stored is `-9223372036854775808` (INT64_MIN). Integer numbers + that are out of range will yield over/underflow when used in a + constructor. During deserialization, too large or small integer numbers + will be automatically be stored as @ref number_unsigned_t or @ref + number_float_t. + + [RFC 8259](https://tools.ietf.org/html/rfc8259) further states: + > Note that when such software is used, numbers that are integers and are + > in the range \f$[-2^{53}+1, 2^{53}-1]\f$ are interoperable in the sense + > that implementations will agree exactly on their numeric values. + + As this range is a subrange of the exactly supported range [INT64_MIN, + INT64_MAX], this class's integer type is interoperable. + + #### Storage + + Integer number values are stored directly inside a @ref basic_json type. + + @sa see @ref number_float_t -- type for number values (floating-point) + + @sa see @ref number_unsigned_t -- type for number values (unsigned integer) + + @since version 1.0.0 + */ + using number_integer_t = NumberIntegerType; + + /*! + @brief a type for a number (unsigned) + + [RFC 8259](https://tools.ietf.org/html/rfc8259) describes numbers as follows: + > The representation of numbers is similar to that used in most + > programming languages. 
A number is represented in base 10 using decimal + > digits. It contains an integer component that may be prefixed with an + > optional minus sign, which may be followed by a fraction part and/or an + > exponent part. Leading zeros are not allowed. (...) Numeric values that + > cannot be represented in the grammar below (such as Infinity and NaN) + > are not permitted. + + This description includes both integer and floating-point numbers. + However, C++ allows more precise storage if it is known whether the number + is a signed integer, an unsigned integer or a floating-point number. + Therefore, three different types, @ref number_integer_t, @ref + number_unsigned_t and @ref number_float_t are used. + + To store unsigned integer numbers in C++, a type is defined by the + template parameter @a NumberUnsignedType which chooses the type to use. + + #### Default type + + With the default values for @a NumberUnsignedType (`uint64_t`), the + default value for @a number_unsigned_t is: + + @code {.cpp} + uint64_t + @endcode + + #### Default behavior + + - The restrictions about leading zeros is not enforced in C++. Instead, + leading zeros in integer literals lead to an interpretation as octal + number. Internally, the value will be stored as decimal number. For + instance, the C++ integer literal `010` will be serialized to `8`. + During deserialization, leading zeros yield an error. + - Not-a-number (NaN) values will be serialized to `null`. + + #### Limits + + [RFC 8259](https://tools.ietf.org/html/rfc8259) specifies: + > An implementation may set limits on the range and precision of numbers. + + When the default type is used, the maximal integer number that can be + stored is `18446744073709551615` (UINT64_MAX) and the minimal integer + number that can be stored is `0`. Integer numbers that are out of range + will yield over/underflow when used in a constructor. During + deserialization, too large or small integer numbers will be automatically + be stored as @ref number_integer_t or @ref number_float_t. + + [RFC 8259](https://tools.ietf.org/html/rfc8259) further states: + > Note that when such software is used, numbers that are integers and are + > in the range \f$[-2^{53}+1, 2^{53}-1]\f$ are interoperable in the sense + > that implementations will agree exactly on their numeric values. + + As this range is a subrange (when considered in conjunction with the + number_integer_t type) of the exactly supported range [0, UINT64_MAX], + this class's integer type is interoperable. + + #### Storage + + Integer number values are stored directly inside a @ref basic_json type. + + @sa see @ref number_float_t -- type for number values (floating-point) + @sa see @ref number_integer_t -- type for number values (integer) + + @since version 2.0.0 + */ + using number_unsigned_t = NumberUnsignedType; + + /*! + @brief a type for a number (floating-point) + + [RFC 8259](https://tools.ietf.org/html/rfc8259) describes numbers as follows: + > The representation of numbers is similar to that used in most + > programming languages. A number is represented in base 10 using decimal + > digits. It contains an integer component that may be prefixed with an + > optional minus sign, which may be followed by a fraction part and/or an + > exponent part. Leading zeros are not allowed. (...) Numeric values that + > cannot be represented in the grammar below (such as Infinity and NaN) + > are not permitted. + + This description includes both integer and floating-point numbers. 
+ However, C++ allows more precise storage if it is known whether the number + is a signed integer, an unsigned integer or a floating-point number. + Therefore, three different types, @ref number_integer_t, @ref + number_unsigned_t and @ref number_float_t are used. + + To store floating-point numbers in C++, a type is defined by the template + parameter @a NumberFloatType which chooses the type to use. + + #### Default type + + With the default values for @a NumberFloatType (`double`), the default + value for @a number_float_t is: + + @code {.cpp} + double + @endcode + + #### Default behavior + + - The restrictions about leading zeros is not enforced in C++. Instead, + leading zeros in floating-point literals will be ignored. Internally, + the value will be stored as decimal number. For instance, the C++ + floating-point literal `01.2` will be serialized to `1.2`. During + deserialization, leading zeros yield an error. + - Not-a-number (NaN) values will be serialized to `null`. + + #### Limits + + [RFC 8259](https://tools.ietf.org/html/rfc8259) states: + > This specification allows implementations to set limits on the range and + > precision of numbers accepted. Since software that implements IEEE + > 754-2008 binary64 (double precision) numbers is generally available and + > widely used, good interoperability can be achieved by implementations + > that expect no more precision or range than these provide, in the sense + > that implementations will approximate JSON numbers within the expected + > precision. + + This implementation does exactly follow this approach, as it uses double + precision floating-point numbers. Note values smaller than + `-1.79769313486232e+308` and values greater than `1.79769313486232e+308` + will be stored as NaN internally and be serialized to `null`. + + #### Storage + + Floating-point number values are stored directly inside a @ref basic_json + type. + + @sa see @ref number_integer_t -- type for number values (integer) + + @sa see @ref number_unsigned_t -- type for number values (unsigned integer) + + @since version 1.0.0 + */ + using number_float_t = NumberFloatType; + + /*! + @brief a type for a packed binary type + + This type is a type designed to carry binary data that appears in various + serialized formats, such as CBOR's Major Type 2, MessagePack's bin, and + BSON's generic binary subtype. This type is NOT a part of standard JSON and + exists solely for compatibility with these binary types. As such, it is + simply defined as an ordered sequence of zero or more byte values. + + Additionally, as an implementation detail, the subtype of the binary data is + carried around as a `std::uint8_t`, which is compatible with both of the + binary data formats that use binary subtyping, (though the specific + numbering is incompatible with each other, and it is up to the user to + translate between them). + + [CBOR's RFC 7049](https://tools.ietf.org/html/rfc7049) describes this type + as: + > Major type 2: a byte string. The string's length in bytes is represented + > following the rules for positive integers (major type 0). + + [MessagePack's documentation on the bin type + family](https://github.com/msgpack/msgpack/blob/master/spec.md#bin-format-family) + describes this type as: + > Bin format family stores an byte array in 2, 3, or 5 bytes of extra bytes + > in addition to the size of the byte array. 
+ + [BSON's specifications](http://bsonspec.org/spec.html) describe several + binary types; however, this type is intended to represent the generic binary + type which has the description: + > Generic binary subtype - This is the most commonly used binary subtype and + > should be the 'default' for drivers and tools. + + None of these impose any limitations on the internal representation other + than the basic unit of storage be some type of array whose parts are + decomposable into bytes. + + The default representation of this binary format is a + `std::vector`, which is a very common way to represent a byte + array in modern C++. + + #### Default type + + The default values for @a BinaryType is `std::vector` + + #### Storage + + Binary Arrays are stored as pointers in a @ref basic_json type. That is, + for any access to array values, a pointer of the type `binary_t*` must be + dereferenced. + + #### Notes on subtypes + + - CBOR + - Binary values are represented as byte strings. No subtypes are + supported and will be ignored when CBOR is written. + - MessagePack + - If a subtype is given and the binary array contains exactly 1, 2, 4, 8, + or 16 elements, the fixext family (fixext1, fixext2, fixext4, fixext8) + is used. For other sizes, the ext family (ext8, ext16, ext32) is used. + The subtype is then added as singed 8-bit integer. + - If no subtype is given, the bin family (bin8, bin16, bin32) is used. + - BSON + - If a subtype is given, it is used and added as unsigned 8-bit integer. + - If no subtype is given, the generic binary subtype 0x00 is used. + + @sa see @ref binary -- create a binary array + + @since version 3.8.0 + */ + using binary_t = nlohmann::byte_container_with_subtype; + /// @} + + private: + + /// helper for exception-safe object creation + template + JSON_HEDLEY_RETURNS_NON_NULL + static T* create(Args&& ... args) + { + AllocatorType alloc; + using AllocatorTraits = std::allocator_traits>; + + auto deleter = [&](T * obj) + { + AllocatorTraits::deallocate(alloc, obj, 1); + }; + std::unique_ptr obj(AllocatorTraits::allocate(alloc, 1), deleter); + AllocatorTraits::construct(alloc, obj.get(), std::forward(args)...); + JSON_ASSERT(obj != nullptr); + return obj.release(); + } + + //////////////////////// + // JSON value storage // + //////////////////////// + + JSON_PRIVATE_UNLESS_TESTED: + /*! + @brief a JSON value + + The actual storage for a JSON value of the @ref basic_json class. This + union combines the different storage types for the JSON value types + defined in @ref value_t. + + JSON type | value_t type | used type + --------- | --------------- | ------------------------ + object | object | pointer to @ref object_t + array | array | pointer to @ref array_t + string | string | pointer to @ref string_t + boolean | boolean | @ref boolean_t + number | number_integer | @ref number_integer_t + number | number_unsigned | @ref number_unsigned_t + number | number_float | @ref number_float_t + binary | binary | pointer to @ref binary_t + null | null | *no value is stored* + + @note Variable-length types (objects, arrays, and strings) are stored as + pointers. The size of the union should not exceed 64 bits if the default + value types are used. 
+ + @since version 1.0.0 + */ + union json_value + { + /// object (stored with pointer to save storage) + object_t* object; + /// array (stored with pointer to save storage) + array_t* array; + /// string (stored with pointer to save storage) + string_t* string; + /// binary (stored with pointer to save storage) + binary_t* binary; + /// boolean + boolean_t boolean; + /// number (integer) + number_integer_t number_integer; + /// number (unsigned integer) + number_unsigned_t number_unsigned; + /// number (floating-point) + number_float_t number_float; + + /// default constructor (for null values) + json_value() = default; + /// constructor for booleans + json_value(boolean_t v) noexcept : boolean(v) {} + /// constructor for numbers (integer) + json_value(number_integer_t v) noexcept : number_integer(v) {} + /// constructor for numbers (unsigned) + json_value(number_unsigned_t v) noexcept : number_unsigned(v) {} + /// constructor for numbers (floating-point) + json_value(number_float_t v) noexcept : number_float(v) {} + /// constructor for empty values of a given type + json_value(value_t t) + { + switch (t) + { + case value_t::object: + { + object = create(); + break; + } + + case value_t::array: + { + array = create(); + break; + } + + case value_t::string: + { + string = create(""); + break; + } + + case value_t::binary: + { + binary = create(); + break; + } + + case value_t::boolean: + { + boolean = boolean_t(false); + break; + } + + case value_t::number_integer: + { + number_integer = number_integer_t(0); + break; + } + + case value_t::number_unsigned: + { + number_unsigned = number_unsigned_t(0); + break; + } + + case value_t::number_float: + { + number_float = number_float_t(0.0); + break; + } + + case value_t::null: + { + object = nullptr; // silence warning, see #821 + break; + } + + default: + { + object = nullptr; // silence warning, see #821 + if (JSON_HEDLEY_UNLIKELY(t == value_t::null)) + { + JSON_THROW(other_error::create(500, "961c151d2e87f2686a955a9be24d316f1362bf21 3.9.1", basic_json())); // LCOV_EXCL_LINE + } + break; + } + } + } + + /// constructor for strings + json_value(const string_t& value) + { + string = create(value); + } + + /// constructor for rvalue strings + json_value(string_t&& value) + { + string = create(std::move(value)); + } + + /// constructor for objects + json_value(const object_t& value) + { + object = create(value); + } + + /// constructor for rvalue objects + json_value(object_t&& value) + { + object = create(std::move(value)); + } + + /// constructor for arrays + json_value(const array_t& value) + { + array = create(value); + } + + /// constructor for rvalue arrays + json_value(array_t&& value) + { + array = create(std::move(value)); + } + + /// constructor for binary arrays + json_value(const typename binary_t::container_type& value) + { + binary = create(value); + } + + /// constructor for rvalue binary arrays + json_value(typename binary_t::container_type&& value) + { + binary = create(std::move(value)); + } + + /// constructor for binary arrays (internal type) + json_value(const binary_t& value) + { + binary = create(value); + } + + /// constructor for rvalue binary arrays (internal type) + json_value(binary_t&& value) + { + binary = create(std::move(value)); + } + + void destroy(value_t t) noexcept + { + // flatten the current json_value to a heap-allocated stack + std::vector stack; + + // move the top-level items to stack + if (t == value_t::array) + { + stack.reserve(array->size()); + std::move(array->begin(), array->end(), 
std::back_inserter(stack)); + } + else if (t == value_t::object) + { + stack.reserve(object->size()); + for (auto&& it : *object) + { + stack.push_back(std::move(it.second)); + } + } + + while (!stack.empty()) + { + // move the last item to local variable to be processed + basic_json current_item(std::move(stack.back())); + stack.pop_back(); + + // if current_item is array/object, move + // its children to the stack to be processed later + if (current_item.is_array()) + { + std::move(current_item.m_value.array->begin(), current_item.m_value.array->end(), + std::back_inserter(stack)); + + current_item.m_value.array->clear(); + } + else if (current_item.is_object()) + { + for (auto&& it : *current_item.m_value.object) + { + stack.push_back(std::move(it.second)); + } + + current_item.m_value.object->clear(); + } + + // it's now safe that current_item get destructed + // since it doesn't have any children + } + + switch (t) + { + case value_t::object: + { + AllocatorType alloc; + std::allocator_traits::destroy(alloc, object); + std::allocator_traits::deallocate(alloc, object, 1); + break; + } + + case value_t::array: + { + AllocatorType alloc; + std::allocator_traits::destroy(alloc, array); + std::allocator_traits::deallocate(alloc, array, 1); + break; + } + + case value_t::string: + { + AllocatorType alloc; + std::allocator_traits::destroy(alloc, string); + std::allocator_traits::deallocate(alloc, string, 1); + break; + } + + case value_t::binary: + { + AllocatorType alloc; + std::allocator_traits::destroy(alloc, binary); + std::allocator_traits::deallocate(alloc, binary, 1); + break; + } + + default: + { + break; + } + } + } + }; + + private: + /*! + @brief checks the class invariants + + This function asserts the class invariants. It needs to be called at the + end of every constructor to make sure that created objects respect the + invariant. Furthermore, it has to be called each time the type of a JSON + value is changed, because the invariant expresses a relationship between + @a m_type and @a m_value. + + Furthermore, the parent relation is checked for arrays and objects: If + @a check_parents true and the value is an array or object, then the + container's elements must have the current value as parent. + + @param[in] check_parents whether the parent relation should be checked. + The value is true by default and should only be set to false + during destruction of objects when the invariant does not + need to hold. + */ + void assert_invariant(bool check_parents = true) const noexcept + { + JSON_ASSERT(m_type != value_t::object || m_value.object != nullptr); + JSON_ASSERT(m_type != value_t::array || m_value.array != nullptr); + JSON_ASSERT(m_type != value_t::string || m_value.string != nullptr); + JSON_ASSERT(m_type != value_t::binary || m_value.binary != nullptr); + +#if JSON_DIAGNOSTICS + JSON_TRY + { + // cppcheck-suppress assertWithSideEffect + JSON_ASSERT(!check_parents || !is_structured() || std::all_of(begin(), end(), [this](const basic_json & j) + { + return j.m_parent == this; + })); + } + JSON_CATCH(...) 
{} // LCOV_EXCL_LINE +#endif + static_cast(check_parents); + } + + void set_parents() + { +#if JSON_DIAGNOSTICS + switch (m_type) + { + case value_t::array: + { + for (auto& element : *m_value.array) + { + element.m_parent = this; + } + break; + } + + case value_t::object: + { + for (auto& element : *m_value.object) + { + element.second.m_parent = this; + } + break; + } + + default: + break; + } +#endif + } + + iterator set_parents(iterator it, typename iterator::difference_type count) + { +#if JSON_DIAGNOSTICS + for (typename iterator::difference_type i = 0; i < count; ++i) + { + (it + i)->m_parent = this; + } +#else + static_cast(count); +#endif + return it; + } + + reference set_parent(reference j) + { +#if JSON_DIAGNOSTICS + j.m_parent = this; +#else + static_cast(j); +#endif + return j; + } + + public: + ////////////////////////// + // JSON parser callback // + ////////////////////////// + + /*! + @brief parser event types + + The parser callback distinguishes the following events: + - `object_start`: the parser read `{` and started to process a JSON object + - `key`: the parser read a key of a value in an object + - `object_end`: the parser read `}` and finished processing a JSON object + - `array_start`: the parser read `[` and started to process a JSON array + - `array_end`: the parser read `]` and finished processing a JSON array + - `value`: the parser finished reading a JSON value + + @image html callback_events.png "Example when certain parse events are triggered" + + @sa see @ref parser_callback_t for more information and examples + */ + using parse_event_t = detail::parse_event_t; + + /*! + @brief per-element parser callback type + + With a parser callback function, the result of parsing a JSON text can be + influenced. When passed to @ref parse, it is called on certain events + (passed as @ref parse_event_t via parameter @a event) with a set recursion + depth @a depth and context JSON value @a parsed. The return value of the + callback function is a boolean indicating whether the element that emitted + the callback shall be kept or not. + + We distinguish six scenarios (determined by the event type) in which the + callback function can be called. The following table describes the values + of the parameters @a depth, @a event, and @a parsed. 
+ + parameter @a event | description | parameter @a depth | parameter @a parsed + ------------------ | ----------- | ------------------ | ------------------- + parse_event_t::object_start | the parser read `{` and started to process a JSON object | depth of the parent of the JSON object | a JSON value with type discarded + parse_event_t::key | the parser read a key of a value in an object | depth of the currently parsed JSON object | a JSON string containing the key + parse_event_t::object_end | the parser read `}` and finished processing a JSON object | depth of the parent of the JSON object | the parsed JSON object + parse_event_t::array_start | the parser read `[` and started to process a JSON array | depth of the parent of the JSON array | a JSON value with type discarded + parse_event_t::array_end | the parser read `]` and finished processing a JSON array | depth of the parent of the JSON array | the parsed JSON array + parse_event_t::value | the parser finished reading a JSON value | depth of the value | the parsed JSON value + + @image html callback_events.png "Example when certain parse events are triggered" + + Discarding a value (i.e., returning `false`) has different effects + depending on the context in which function was called: + + - Discarded values in structured types are skipped. That is, the parser + will behave as if the discarded value was never read. + - In case a value outside a structured type is skipped, it is replaced + with `null`. This case happens if the top-level element is skipped. + + @param[in] depth the depth of the recursion during parsing + + @param[in] event an event of type parse_event_t indicating the context in + the callback function has been called + + @param[in,out] parsed the current intermediate parse result; note that + writing to this value has no effect for parse_event_t::key events + + @return Whether the JSON value which called the function during parsing + should be kept (`true`) or not (`false`). In the latter case, it is either + skipped completely or replaced by an empty discarded object. + + @sa see @ref parse for examples + + @since version 1.0.0 + */ + using parser_callback_t = detail::parser_callback_t; + + ////////////////// + // constructors // + ////////////////// + + /// @name constructors and destructors + /// Constructors of class @ref basic_json, copy/move constructor, copy + /// assignment, static functions creating objects, and the destructor. + /// @{ + + /*! + @brief create an empty value with a given type + + Create an empty JSON value with a given type. The value will be default + initialized with an empty value which depends on the type: + + Value type | initial value + ----------- | ------------- + null | `null` + boolean | `false` + string | `""` + number | `0` + object | `{}` + array | `[]` + binary | empty array + + @param[in] v the type of the value to create + + @complexity Constant. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes to any JSON value. + + @liveexample{The following code shows the constructor for different @ref + value_t values,basic_json__value_t} + + @sa see @ref clear() -- restores the postcondition of this constructor + + @since version 1.0.0 + */ + basic_json(const value_t v) + : m_type(v), m_value(v) + { + assert_invariant(); + } + + /*! + @brief create a null object + + Create a `null` JSON value. It either takes a null pointer as parameter + (explicitly creating `null`) or no parameter (implicitly creating `null`). 
+ The passed null pointer itself is not read -- it is only used to choose + the right constructor. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this constructor never throws + exceptions. + + @liveexample{The following code shows the constructor with and without a + null pointer parameter.,basic_json__nullptr_t} + + @since version 1.0.0 + */ + basic_json(std::nullptr_t = nullptr) noexcept + : basic_json(value_t::null) + { + assert_invariant(); + } + + /*! + @brief create a JSON value + + This is a "catch all" constructor for all compatible JSON types; that is, + types for which a `to_json()` method exists. The constructor forwards the + parameter @a val to that method (to `json_serializer::to_json` method + with `U = uncvref_t`, to be exact). + + Template type @a CompatibleType includes, but is not limited to, the + following types: + - **arrays**: @ref array_t and all kinds of compatible containers such as + `std::vector`, `std::deque`, `std::list`, `std::forward_list`, + `std::array`, `std::valarray`, `std::set`, `std::unordered_set`, + `std::multiset`, and `std::unordered_multiset` with a `value_type` from + which a @ref basic_json value can be constructed. + - **objects**: @ref object_t and all kinds of compatible associative + containers such as `std::map`, `std::unordered_map`, `std::multimap`, + and `std::unordered_multimap` with a `key_type` compatible to + @ref string_t and a `value_type` from which a @ref basic_json value can + be constructed. + - **strings**: @ref string_t, string literals, and all compatible string + containers can be used. + - **numbers**: @ref number_integer_t, @ref number_unsigned_t, + @ref number_float_t, and all convertible number types such as `int`, + `size_t`, `int64_t`, `float` or `double` can be used. + - **boolean**: @ref boolean_t / `bool` can be used. + - **binary**: @ref binary_t / `std::vector` may be used, + unfortunately because string literals cannot be distinguished from binary + character arrays by the C++ type system, all types compatible with `const + char*` will be directed to the string constructor instead. This is both + for backwards compatibility, and due to the fact that a binary type is not + a standard JSON type. + + See the examples below. + + @tparam CompatibleType a type such that: + - @a CompatibleType is not derived from `std::istream`, + - @a CompatibleType is not @ref basic_json (to avoid hijacking copy/move + constructors), + - @a CompatibleType is not a different @ref basic_json type (i.e. with different template arguments) + - @a CompatibleType is not a @ref basic_json nested type (e.g., + @ref json_pointer, @ref iterator, etc ...) + - `json_serializer` has a `to_json(basic_json_t&, CompatibleType&&)` method + + @tparam U = `uncvref_t` + + @param[in] val the value to be forwarded to the respective constructor + + @complexity Usually linear in the size of the passed @a val, also + depending on the implementation of the called `to_json()` + method. + + @exceptionsafety Depends on the called constructor. For types directly + supported by the library (i.e., all types for which no `to_json()` function + was provided), strong guarantee holds: if an exception is thrown, there are + no changes to any JSON value. 
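+
+    As a minimal sketch (not one of the official examples; the variable names
+    are illustrative and the default `nlohmann::json` specialization is
+    assumed):
+
+    @code {.cpp}
+    nlohmann::json j_arr(std::vector<int> {1, 2, 3});                          // -> JSON array
+    nlohmann::json j_obj(std::map<std::string, int> {{"one", 1}, {"two", 2}}); // -> JSON object
+    nlohmann::json j_str("hello");                                             // -> JSON string
+    nlohmann::json j_num(3.141);                                               // -> JSON number
+    nlohmann::json j_bool(true);                                               // -> JSON boolean
+    @endcode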
+ + @liveexample{The following code shows the constructor with several + compatible types.,basic_json__CompatibleType} + + @since version 2.1.0 + */ + template < typename CompatibleType, + typename U = detail::uncvref_t, + detail::enable_if_t < + !detail::is_basic_json::value && detail::is_compatible_type::value, int > = 0 > + basic_json(CompatibleType && val) noexcept(noexcept( // NOLINT(bugprone-forwarding-reference-overload,bugprone-exception-escape) + JSONSerializer::to_json(std::declval(), + std::forward(val)))) + { + JSONSerializer::to_json(*this, std::forward(val)); + set_parents(); + assert_invariant(); + } + + /*! + @brief create a JSON value from an existing one + + This is a constructor for existing @ref basic_json types. + It does not hijack copy/move constructors, since the parameter has different + template arguments than the current ones. + + The constructor tries to convert the internal @ref m_value of the parameter. + + @tparam BasicJsonType a type such that: + - @a BasicJsonType is a @ref basic_json type. + - @a BasicJsonType has different template arguments than @ref basic_json_t. + + @param[in] val the @ref basic_json value to be converted. + + @complexity Usually linear in the size of the passed @a val, also + depending on the implementation of the called `to_json()` + method. + + @exceptionsafety Depends on the called constructor. For types directly + supported by the library (i.e., all types for which no `to_json()` function + was provided), strong guarantee holds: if an exception is thrown, there are + no changes to any JSON value. + + @since version 3.2.0 + */ + template < typename BasicJsonType, + detail::enable_if_t < + detail::is_basic_json::value&& !std::is_same::value, int > = 0 > + basic_json(const BasicJsonType& val) + { + using other_boolean_t = typename BasicJsonType::boolean_t; + using other_number_float_t = typename BasicJsonType::number_float_t; + using other_number_integer_t = typename BasicJsonType::number_integer_t; + using other_number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using other_string_t = typename BasicJsonType::string_t; + using other_object_t = typename BasicJsonType::object_t; + using other_array_t = typename BasicJsonType::array_t; + using other_binary_t = typename BasicJsonType::binary_t; + + switch (val.type()) + { + case value_t::boolean: + JSONSerializer::to_json(*this, val.template get()); + break; + case value_t::number_float: + JSONSerializer::to_json(*this, val.template get()); + break; + case value_t::number_integer: + JSONSerializer::to_json(*this, val.template get()); + break; + case value_t::number_unsigned: + JSONSerializer::to_json(*this, val.template get()); + break; + case value_t::string: + JSONSerializer::to_json(*this, val.template get_ref()); + break; + case value_t::object: + JSONSerializer::to_json(*this, val.template get_ref()); + break; + case value_t::array: + JSONSerializer::to_json(*this, val.template get_ref()); + break; + case value_t::binary: + JSONSerializer::to_json(*this, val.template get_ref()); + break; + case value_t::null: + *this = nullptr; + break; + case value_t::discarded: + m_type = value_t::discarded; + break; + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE + } + set_parents(); + assert_invariant(); + } + + /*! + @brief create a container (array or object) from an initializer list + + Creates a JSON value of type array or object from the passed initializer + list @a init. 
In case @a type_deduction is `true` (default), the type of + the JSON value to be created is deducted from the initializer list @a init + according to the following rules: + + 1. If the list is empty, an empty JSON object value `{}` is created. + 2. If the list consists of pairs whose first element is a string, a JSON + object value is created where the first elements of the pairs are + treated as keys and the second elements are as values. + 3. In all other cases, an array is created. + + The rules aim to create the best fit between a C++ initializer list and + JSON values. The rationale is as follows: + + 1. The empty initializer list is written as `{}` which is exactly an empty + JSON object. + 2. C++ has no way of describing mapped types other than to list a list of + pairs. As JSON requires that keys must be of type string, rule 2 is the + weakest constraint one can pose on initializer lists to interpret them + as an object. + 3. In all other cases, the initializer list could not be interpreted as + JSON object type, so interpreting it as JSON array type is safe. + + With the rules described above, the following JSON values cannot be + expressed by an initializer list: + + - the empty array (`[]`): use @ref array(initializer_list_t) + with an empty initializer list in this case + - arrays whose elements satisfy rule 2: use @ref + array(initializer_list_t) with the same initializer list + in this case + + @note When used without parentheses around an empty initializer list, @ref + basic_json() is called instead of this function, yielding the JSON null + value. + + @param[in] init initializer list with JSON values + + @param[in] type_deduction internal parameter; when set to `true`, the type + of the JSON value is deducted from the initializer list @a init; when set + to `false`, the type provided via @a manual_type is forced. This mode is + used by the functions @ref array(initializer_list_t) and + @ref object(initializer_list_t). + + @param[in] manual_type internal parameter; when @a type_deduction is set + to `false`, the created JSON value will use the provided type (only @ref + value_t::array and @ref value_t::object are valid); when @a type_deduction + is set to `true`, this parameter has no effect + + @throw type_error.301 if @a type_deduction is `false`, @a manual_type is + `value_t::object`, but @a init contains an element which is not a pair + whose first element is a string. In this case, the constructor could not + create an object. If @a type_deduction would have be `true`, an array + would have been created. See @ref object(initializer_list_t) + for an example. + + @complexity Linear in the size of the initializer list @a init. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes to any JSON value. 
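+
+    A small sketch of the deduction rules above (illustrative values only):
+
+    @code {.cpp}
+    nlohmann::json j_object {{"one", 1}, {"two", 2}};   // rule 2: all elements are {string, value} pairs -> object
+    nlohmann::json j_array  {1, "two", 3.0, false};     // rule 3: mixed elements -> array
+    nlohmann::json j_empty  = nlohmann::json({});       // rule 1: empty list -> empty object {}
+    @endcode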
+ + @liveexample{The example below shows how JSON values are created from + initializer lists.,basic_json__list_init_t} + + @sa see @ref array(initializer_list_t) -- create a JSON array + value from an initializer list + @sa see @ref object(initializer_list_t) -- create a JSON object + value from an initializer list + + @since version 1.0.0 + */ + basic_json(initializer_list_t init, + bool type_deduction = true, + value_t manual_type = value_t::array) + { + // check if each element is an array with two elements whose first + // element is a string + bool is_an_object = std::all_of(init.begin(), init.end(), + [](const detail::json_ref& element_ref) + { + return element_ref->is_array() && element_ref->size() == 2 && (*element_ref)[0].is_string(); + }); + + // adjust type if type deduction is not wanted + if (!type_deduction) + { + // if array is wanted, do not create an object though possible + if (manual_type == value_t::array) + { + is_an_object = false; + } + + // if object is wanted but impossible, throw an exception + if (JSON_HEDLEY_UNLIKELY(manual_type == value_t::object && !is_an_object)) + { + JSON_THROW(type_error::create(301, "cannot create object from initializer list", basic_json())); + } + } + + if (is_an_object) + { + // the initializer list is a list of pairs -> create object + m_type = value_t::object; + m_value = value_t::object; + + for (auto& element_ref : init) + { + auto element = element_ref.moved_or_copied(); + m_value.object->emplace( + std::move(*((*element.m_value.array)[0].m_value.string)), + std::move((*element.m_value.array)[1])); + } + } + else + { + // the initializer list describes an array -> create array + m_type = value_t::array; + m_value.array = create(init.begin(), init.end()); + } + + set_parents(); + assert_invariant(); + } + + /*! + @brief explicitly create a binary array (without subtype) + + Creates a JSON binary array value from a given binary container. Binary + values are part of various binary formats, such as CBOR, MessagePack, and + BSON. This constructor is used to create a value for serialization to those + formats. + + @note Note, this function exists because of the difficulty in correctly + specifying the correct template overload in the standard value ctor, as both + JSON arrays and JSON binary arrays are backed with some form of a + `std::vector`. Because JSON binary arrays are a non-standard extension it + was decided that it would be best to prevent automatic initialization of a + binary array type, for backwards compatibility and so it does not happen on + accident. + + @param[in] init container containing bytes to use as binary type + + @return JSON binary array value + + @complexity Linear in the size of @a init. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes to any JSON value. + + @since version 3.8.0 + */ + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json binary(const typename binary_t::container_type& init) + { + auto res = basic_json(); + res.m_type = value_t::binary; + res.m_value = init; + return res; + } + + /*! + @brief explicitly create a binary array (with subtype) + + Creates a JSON binary array value from a given binary container. Binary + values are part of various binary formats, such as CBOR, MessagePack, and + BSON. This constructor is used to create a value for serialization to those + formats. 
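+
+    A brief usage sketch (the byte values and the subtype are illustrative):
+
+    @code {.cpp}
+    // four bytes tagged with subtype 0x10; the subtype is emitted for
+    // MessagePack (ext/fixext family) and BSON
+    auto j = nlohmann::json::binary({0xDE, 0xAD, 0xBE, 0xEF}, 0x10);
+    @endcode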
+ + @note Note, this function exists because of the difficulty in correctly + specifying the correct template overload in the standard value ctor, as both + JSON arrays and JSON binary arrays are backed with some form of a + `std::vector`. Because JSON binary arrays are a non-standard extension it + was decided that it would be best to prevent automatic initialization of a + binary array type, for backwards compatibility and so it does not happen on + accident. + + @param[in] init container containing bytes to use as binary type + @param[in] subtype subtype to use in MessagePack and BSON + + @return JSON binary array value + + @complexity Linear in the size of @a init. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes to any JSON value. + + @since version 3.8.0 + */ + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json binary(const typename binary_t::container_type& init, std::uint8_t subtype) + { + auto res = basic_json(); + res.m_type = value_t::binary; + res.m_value = binary_t(init, subtype); + return res; + } + + /// @copydoc binary(const typename binary_t::container_type&) + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json binary(typename binary_t::container_type&& init) + { + auto res = basic_json(); + res.m_type = value_t::binary; + res.m_value = std::move(init); + return res; + } + + /// @copydoc binary(const typename binary_t::container_type&, std::uint8_t) + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json binary(typename binary_t::container_type&& init, std::uint8_t subtype) + { + auto res = basic_json(); + res.m_type = value_t::binary; + res.m_value = binary_t(std::move(init), subtype); + return res; + } + + /*! + @brief explicitly create an array from an initializer list + + Creates a JSON array value from a given initializer list. That is, given a + list of values `a, b, c`, creates the JSON value `[a, b, c]`. If the + initializer list is empty, the empty array `[]` is created. + + @note This function is only needed to express two edge cases that cannot + be realized with the initializer list constructor (@ref + basic_json(initializer_list_t, bool, value_t)). These cases + are: + 1. creating an array whose elements are all pairs whose first element is a + string -- in this case, the initializer list constructor would create an + object, taking the first elements as keys + 2. creating an empty array -- passing the empty initializer list to the + initializer list constructor yields an empty object + + @param[in] init initializer list with JSON values to create an array from + (optional) + + @return JSON array value + + @complexity Linear in the size of @a init. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes to any JSON value. + + @liveexample{The following code shows an example for the `array` + function.,array} + + @sa see @ref basic_json(initializer_list_t, bool, value_t) -- + create a JSON value from an initializer list + @sa see @ref object(initializer_list_t) -- create a JSON object + value from an initializer list + + @since version 1.0.0 + */ + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json array(initializer_list_t init = {}) + { + return basic_json(init, false, value_t::array); + } + + /*! + @brief explicitly create an object from an initializer list + + Creates a JSON object value from a given initializer list. The initializer + lists elements must be pairs, and their first elements must be strings. If + the initializer list is empty, the empty object `{}` is created. 
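+
+    For instance (a minimal sketch, mirroring the @liveexample referenced
+    below):
+
+    @code {.cpp}
+    auto obj   = nlohmann::json::object({{"one", 1}, {"two", 2}});   // {"one": 1, "two": 2}
+    auto empty = nlohmann::json::object();                           // {}
+    @endcode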
+ + @note This function is only added for symmetry reasons. In contrast to the + related function @ref array(initializer_list_t), there are + no cases which can only be expressed by this function. That is, any + initializer list @a init can also be passed to the initializer list + constructor @ref basic_json(initializer_list_t, bool, value_t). + + @param[in] init initializer list to create an object from (optional) + + @return JSON object value + + @throw type_error.301 if @a init is not a list of pairs whose first + elements are strings. In this case, no object can be created. When such a + value is passed to @ref basic_json(initializer_list_t, bool, value_t), + an array would have been created from the passed initializer list @a init. + See example below. + + @complexity Linear in the size of @a init. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes to any JSON value. + + @liveexample{The following code shows an example for the `object` + function.,object} + + @sa see @ref basic_json(initializer_list_t, bool, value_t) -- + create a JSON value from an initializer list + @sa see @ref array(initializer_list_t) -- create a JSON array + value from an initializer list + + @since version 1.0.0 + */ + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json object(initializer_list_t init = {}) + { + return basic_json(init, false, value_t::object); + } + + /*! + @brief construct an array with count copies of given value + + Constructs a JSON array value by creating @a cnt copies of a passed value. + In case @a cnt is `0`, an empty array is created. + + @param[in] cnt the number of JSON copies of @a val to create + @param[in] val the JSON value to copy + + @post `std::distance(begin(),end()) == cnt` holds. + + @complexity Linear in @a cnt. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes to any JSON value. + + @liveexample{The following code shows examples for the @ref + basic_json(size_type\, const basic_json&) + constructor.,basic_json__size_type_basic_json} + + @since version 1.0.0 + */ + basic_json(size_type cnt, const basic_json& val) + : m_type(value_t::array) + { + m_value.array = create(cnt, val); + set_parents(); + assert_invariant(); + } + + /*! + @brief construct a JSON container given an iterator range + + Constructs the JSON value with the contents of the range `[first, last)`. + The semantics depends on the different types a JSON value can have: + - In case of a null type, invalid_iterator.206 is thrown. + - In case of other primitive types (number, boolean, or string), @a first + must be `begin()` and @a last must be `end()`. In this case, the value is + copied. Otherwise, invalid_iterator.204 is thrown. + - In case of structured types (array, object), the constructor behaves as + similar versions for `std::vector` or `std::map`; that is, a JSON array + or object is constructed from the values in the range. + + @tparam InputIT an input iterator type (@ref iterator or @ref + const_iterator) + + @param[in] first begin of the range to copy from (included) + @param[in] last end of the range to copy from (excluded) + + @pre Iterators @a first and @a last must be initialized. **This + precondition is enforced with an assertion (see warning).** If + assertions are switched off, a violation of this precondition yields + undefined behavior. + + @pre Range `[first, last)` is valid. Usually, this precondition cannot be + checked efficiently. 
Only certain edge cases are detected; see the + description of the exceptions below. A violation of this precondition + yields undefined behavior. + + @warning A precondition is enforced with a runtime assertion that will + result in calling `std::abort` if this precondition is not met. + Assertions can be disabled by defining `NDEBUG` at compile time. + See https://en.cppreference.com/w/cpp/error/assert for more + information. + + @throw invalid_iterator.201 if iterators @a first and @a last are not + compatible (i.e., do not belong to the same JSON value). In this case, + the range `[first, last)` is undefined. + @throw invalid_iterator.204 if iterators @a first and @a last belong to a + primitive type (number, boolean, or string), but @a first does not point + to the first element any more. In this case, the range `[first, last)` is + undefined. See example code below. + @throw invalid_iterator.206 if iterators @a first and @a last belong to a + null value. In this case, the range `[first, last)` is undefined. + + @complexity Linear in distance between @a first and @a last. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes to any JSON value. + + @liveexample{The example below shows several ways to create JSON values by + specifying a subrange with iterators.,basic_json__InputIt_InputIt} + + @since version 1.0.0 + */ + template < class InputIT, typename std::enable_if < + std::is_same::value || + std::is_same::value, int >::type = 0 > + basic_json(InputIT first, InputIT last) + { + JSON_ASSERT(first.m_object != nullptr); + JSON_ASSERT(last.m_object != nullptr); + + // make sure iterator fits the current value + if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object)) + { + JSON_THROW(invalid_iterator::create(201, "iterators are not compatible", basic_json())); + } + + // copy type from first iterator + m_type = first.m_object->m_type; + + // check if iterator range is complete for primitive values + switch (m_type) + { + case value_t::boolean: + case value_t::number_float: + case value_t::number_integer: + case value_t::number_unsigned: + case value_t::string: + { + if (JSON_HEDLEY_UNLIKELY(!first.m_it.primitive_iterator.is_begin() + || !last.m_it.primitive_iterator.is_end())) + { + JSON_THROW(invalid_iterator::create(204, "iterators out of range", *first.m_object)); + } + break; + } + + default: + break; + } + + switch (m_type) + { + case value_t::number_integer: + { + m_value.number_integer = first.m_object->m_value.number_integer; + break; + } + + case value_t::number_unsigned: + { + m_value.number_unsigned = first.m_object->m_value.number_unsigned; + break; + } + + case value_t::number_float: + { + m_value.number_float = first.m_object->m_value.number_float; + break; + } + + case value_t::boolean: + { + m_value.boolean = first.m_object->m_value.boolean; + break; + } + + case value_t::string: + { + m_value = *first.m_object->m_value.string; + break; + } + + case value_t::object: + { + m_value.object = create(first.m_it.object_iterator, + last.m_it.object_iterator); + break; + } + + case value_t::array: + { + m_value.array = create(first.m_it.array_iterator, + last.m_it.array_iterator); + break; + } + + case value_t::binary: + { + m_value = *first.m_object->m_value.binary; + break; + } + + default: + JSON_THROW(invalid_iterator::create(206, "cannot construct with iterators from " + std::string(first.m_object->type_name()), *first.m_object)); + } + + set_parents(); + assert_invariant(); + } + + + /////////////////////////////////////// + // other 
constructors and destructor // + /////////////////////////////////////// + + template, + std::is_same>::value, int> = 0 > + basic_json(const JsonRef& ref) : basic_json(ref.moved_or_copied()) {} + + /*! + @brief copy constructor + + Creates a copy of a given JSON value. + + @param[in] other the JSON value to copy + + @post `*this == other` + + @complexity Linear in the size of @a other. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes to any JSON value. + + @requirement This function helps `basic_json` satisfying the + [Container](https://en.cppreference.com/w/cpp/named_req/Container) + requirements: + - The complexity is linear. + - As postcondition, it holds: `other == basic_json(other)`. + + @liveexample{The following code shows an example for the copy + constructor.,basic_json__basic_json} + + @since version 1.0.0 + */ + basic_json(const basic_json& other) + : m_type(other.m_type) + { + // check of passed value is valid + other.assert_invariant(); + + switch (m_type) + { + case value_t::object: + { + m_value = *other.m_value.object; + break; + } + + case value_t::array: + { + m_value = *other.m_value.array; + break; + } + + case value_t::string: + { + m_value = *other.m_value.string; + break; + } + + case value_t::boolean: + { + m_value = other.m_value.boolean; + break; + } + + case value_t::number_integer: + { + m_value = other.m_value.number_integer; + break; + } + + case value_t::number_unsigned: + { + m_value = other.m_value.number_unsigned; + break; + } + + case value_t::number_float: + { + m_value = other.m_value.number_float; + break; + } + + case value_t::binary: + { + m_value = *other.m_value.binary; + break; + } + + default: + break; + } + + set_parents(); + assert_invariant(); + } + + /*! + @brief move constructor + + Move constructor. Constructs a JSON value with the contents of the given + value @a other using move semantics. It "steals" the resources from @a + other and leaves it as JSON null value. + + @param[in,out] other value to move to this object + + @post `*this` has the same value as @a other before the call. + @post @a other is a JSON null value. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this constructor never throws + exceptions. + + @requirement This function helps `basic_json` satisfying the + [MoveConstructible](https://en.cppreference.com/w/cpp/named_req/MoveConstructible) + requirements. + + @liveexample{The code below shows the move constructor explicitly called + via std::move.,basic_json__moveconstructor} + + @since version 1.0.0 + */ + basic_json(basic_json&& other) noexcept + : m_type(std::move(other.m_type)), + m_value(std::move(other.m_value)) + { + // check that passed value is valid + other.assert_invariant(false); + + // invalidate payload + other.m_type = value_t::null; + other.m_value = {}; + + set_parents(); + assert_invariant(); + } + + /*! + @brief copy assignment + + Copy assignment operator. Copies a JSON value via the "copy and swap" + strategy: It is expressed in terms of the copy constructor, destructor, + and the `swap()` member function. + + @param[in] other value to copy from + + @complexity Linear. + + @requirement This function helps `basic_json` satisfying the + [Container](https://en.cppreference.com/w/cpp/named_req/Container) + requirements: + - The complexity is linear. + + @liveexample{The code below shows and example for the copy assignment. It + creates a copy of value `a` which is then swapped with `b`. 
Finally\, the + copy of `a` (which is the null value after the swap) is + destroyed.,basic_json__copyassignment} + + @since version 1.0.0 + */ + basic_json& operator=(basic_json other) noexcept ( + std::is_nothrow_move_constructible::value&& + std::is_nothrow_move_assignable::value&& + std::is_nothrow_move_constructible::value&& + std::is_nothrow_move_assignable::value + ) + { + // check that passed value is valid + other.assert_invariant(); + + using std::swap; + swap(m_type, other.m_type); + swap(m_value, other.m_value); + + set_parents(); + assert_invariant(); + return *this; + } + + /*! + @brief destructor + + Destroys the JSON value and frees all allocated memory. + + @complexity Linear. + + @requirement This function helps `basic_json` satisfying the + [Container](https://en.cppreference.com/w/cpp/named_req/Container) + requirements: + - The complexity is linear. + - All stored elements are destroyed and all memory is freed. + + @since version 1.0.0 + */ + ~basic_json() noexcept + { + assert_invariant(false); + m_value.destroy(m_type); + } + + /// @} + + public: + /////////////////////// + // object inspection // + /////////////////////// + + /// @name object inspection + /// Functions to inspect the type of a JSON value. + /// @{ + + /*! + @brief serialization + + Serialization function for JSON values. The function tries to mimic + Python's `json.dumps()` function, and currently supports its @a indent + and @a ensure_ascii parameters. + + @param[in] indent If indent is nonnegative, then array elements and object + members will be pretty-printed with that indent level. An indent level of + `0` will only insert newlines. `-1` (the default) selects the most compact + representation. + @param[in] indent_char The character to use for indentation if @a indent is + greater than `0`. The default is ` ` (space). + @param[in] ensure_ascii If @a ensure_ascii is true, all non-ASCII characters + in the output are escaped with `\uXXXX` sequences, and the result consists + of ASCII characters only. + @param[in] error_handler how to react on decoding errors; there are three + possible values: `strict` (throws and exception in case a decoding error + occurs; default), `replace` (replace invalid UTF-8 sequences with U+FFFD), + and `ignore` (ignore invalid UTF-8 sequences during serialization; all + bytes are copied to the output unchanged). + + @return string containing the serialization of the JSON value + + @throw type_error.316 if a string stored inside the JSON value is not + UTF-8 encoded and @a error_handler is set to strict + + @note Binary values are serialized as object containing two keys: + - "bytes": an array of bytes as integers + - "subtype": the subtype as integer or "null" if the binary has no subtype + + @complexity Linear. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes in the JSON value. + + @liveexample{The following example shows the effect of different @a indent\, + @a indent_char\, and @a ensure_ascii parameters to the result of the + serialization.,dump} + + @see https://docs.python.org/2/library/json.html#json.dump + + @since version 1.0.0; indentation character @a indent_char, option + @a ensure_ascii and exceptions added in version 3.0.0; error + handlers added in version 3.4.0; serialization of binary values added + in version 3.8.0. 
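+
+    As a short usage sketch (the exact output spacing is illustrative):
+
+    @code {.cpp}
+    nlohmann::json j = {{"happy", true}, {"pi", 3.141}};
+    std::string compact = j.dump();    // {"happy":true,"pi":3.141}
+    std::string pretty  = j.dump(4);   // pretty-printed with a 4-space indent
+    @endcode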
+ */ + string_t dump(const int indent = -1, + const char indent_char = ' ', + const bool ensure_ascii = false, + const error_handler_t error_handler = error_handler_t::strict) const + { + string_t result; + serializer s(detail::output_adapter(result), indent_char, error_handler); + + if (indent >= 0) + { + s.dump(*this, true, ensure_ascii, static_cast(indent)); + } + else + { + s.dump(*this, false, ensure_ascii, 0); + } + + return result; + } + + /*! + @brief return the type of the JSON value (explicit) + + Return the type of the JSON value as a value from the @ref value_t + enumeration. + + @return the type of the JSON value + Value type | return value + ------------------------- | ------------------------- + null | value_t::null + boolean | value_t::boolean + string | value_t::string + number (integer) | value_t::number_integer + number (unsigned integer) | value_t::number_unsigned + number (floating-point) | value_t::number_float + object | value_t::object + array | value_t::array + binary | value_t::binary + discarded | value_t::discarded + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `type()` for all JSON + types.,type} + + @sa see @ref operator value_t() -- return the type of the JSON value (implicit) + @sa see @ref type_name() -- return the type as string + + @since version 1.0.0 + */ + constexpr value_t type() const noexcept + { + return m_type; + } + + /*! + @brief return whether type is primitive + + This function returns true if and only if the JSON type is primitive + (string, number, boolean, or null). + + @return `true` if type is primitive (string, number, boolean, or null), + `false` otherwise. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `is_primitive()` for all JSON + types.,is_primitive} + + @sa see @ref is_structured() -- returns whether JSON value is structured + @sa see @ref is_null() -- returns whether JSON value is `null` + @sa see @ref is_string() -- returns whether JSON value is a string + @sa see @ref is_boolean() -- returns whether JSON value is a boolean + @sa see @ref is_number() -- returns whether JSON value is a number + @sa see @ref is_binary() -- returns whether JSON value is a binary array + + @since version 1.0.0 + */ + constexpr bool is_primitive() const noexcept + { + return is_null() || is_string() || is_boolean() || is_number() || is_binary(); + } + + /*! + @brief return whether type is structured + + This function returns true if and only if the JSON type is structured + (array or object). + + @return `true` if type is structured (array or object), `false` otherwise. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `is_structured()` for all JSON + types.,is_structured} + + @sa see @ref is_primitive() -- returns whether value is primitive + @sa see @ref is_array() -- returns whether value is an array + @sa see @ref is_object() -- returns whether value is an object + + @since version 1.0.0 + */ + constexpr bool is_structured() const noexcept + { + return is_array() || is_object(); + } + + /*! + @brief return whether value is null + + This function returns true if and only if the JSON value is null. + + @return `true` if type is null, `false` otherwise. + + @complexity Constant. 
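+
+    For example (a minimal sketch):
+
+    @code {.cpp}
+    nlohmann::json j;        // default construction yields null
+    bool b1 = j.is_null();   // true
+    j = 17;
+    bool b2 = j.is_null();   // false
+    @endcode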
+ + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `is_null()` for all JSON + types.,is_null} + + @since version 1.0.0 + */ + constexpr bool is_null() const noexcept + { + return m_type == value_t::null; + } + + /*! + @brief return whether value is a boolean + + This function returns true if and only if the JSON value is a boolean. + + @return `true` if type is boolean, `false` otherwise. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `is_boolean()` for all JSON + types.,is_boolean} + + @since version 1.0.0 + */ + constexpr bool is_boolean() const noexcept + { + return m_type == value_t::boolean; + } + + /*! + @brief return whether value is a number + + This function returns true if and only if the JSON value is a number. This + includes both integer (signed and unsigned) and floating-point values. + + @return `true` if type is number (regardless whether integer, unsigned + integer or floating-type), `false` otherwise. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `is_number()` for all JSON + types.,is_number} + + @sa see @ref is_number_integer() -- check if value is an integer or unsigned + integer number + @sa see @ref is_number_unsigned() -- check if value is an unsigned integer + number + @sa see @ref is_number_float() -- check if value is a floating-point number + + @since version 1.0.0 + */ + constexpr bool is_number() const noexcept + { + return is_number_integer() || is_number_float(); + } + + /*! + @brief return whether value is an integer number + + This function returns true if and only if the JSON value is a signed or + unsigned integer number. This excludes floating-point values. + + @return `true` if type is an integer or unsigned integer number, `false` + otherwise. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `is_number_integer()` for all + JSON types.,is_number_integer} + + @sa see @ref is_number() -- check if value is a number + @sa see @ref is_number_unsigned() -- check if value is an unsigned integer + number + @sa see @ref is_number_float() -- check if value is a floating-point number + + @since version 1.0.0 + */ + constexpr bool is_number_integer() const noexcept + { + return m_type == value_t::number_integer || m_type == value_t::number_unsigned; + } + + /*! + @brief return whether value is an unsigned integer number + + This function returns true if and only if the JSON value is an unsigned + integer number. This excludes floating-point and signed integer values. + + @return `true` if type is an unsigned integer number, `false` otherwise. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `is_number_unsigned()` for all + JSON types.,is_number_unsigned} + + @sa see @ref is_number() -- check if value is a number + @sa see @ref is_number_integer() -- check if value is an integer or unsigned + integer number + @sa see @ref is_number_float() -- check if value is a floating-point number + + @since version 2.0.0 + */ + constexpr bool is_number_unsigned() const noexcept + { + return m_type == value_t::number_unsigned; + } + + /*! 
+ @brief return whether value is a floating-point number + + This function returns true if and only if the JSON value is a + floating-point number. This excludes signed and unsigned integer values. + + @return `true` if type is a floating-point number, `false` otherwise. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `is_number_float()` for all + JSON types.,is_number_float} + + @sa see @ref is_number() -- check if value is number + @sa see @ref is_number_integer() -- check if value is an integer number + @sa see @ref is_number_unsigned() -- check if value is an unsigned integer + number + + @since version 1.0.0 + */ + constexpr bool is_number_float() const noexcept + { + return m_type == value_t::number_float; + } + + /*! + @brief return whether value is an object + + This function returns true if and only if the JSON value is an object. + + @return `true` if type is object, `false` otherwise. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `is_object()` for all JSON + types.,is_object} + + @since version 1.0.0 + */ + constexpr bool is_object() const noexcept + { + return m_type == value_t::object; + } + + /*! + @brief return whether value is an array + + This function returns true if and only if the JSON value is an array. + + @return `true` if type is array, `false` otherwise. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `is_array()` for all JSON + types.,is_array} + + @since version 1.0.0 + */ + constexpr bool is_array() const noexcept + { + return m_type == value_t::array; + } + + /*! + @brief return whether value is a string + + This function returns true if and only if the JSON value is a string. + + @return `true` if type is string, `false` otherwise. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `is_string()` for all JSON + types.,is_string} + + @since version 1.0.0 + */ + constexpr bool is_string() const noexcept + { + return m_type == value_t::string; + } + + /*! + @brief return whether value is a binary array + + This function returns true if and only if the JSON value is a binary array. + + @return `true` if type is binary array, `false` otherwise. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies `is_binary()` for all JSON + types.,is_binary} + + @since version 3.8.0 + */ + constexpr bool is_binary() const noexcept + { + return m_type == value_t::binary; + } + + /*! + @brief return whether value is discarded + + This function returns true if and only if the JSON value was discarded + during parsing with a callback function (see @ref parser_callback_t). + + @note This function will always be `false` for JSON values after parsing. + That is, discarded values can only occur during parsing, but will be + removed when inside a structured value or replaced by null in other cases. + + @return `true` if type is discarded, `false` otherwise. + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. 
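+
+    A rough sketch of how values become discarded during parsing (the
+    callback and input are illustrative; see @ref parser_callback_t):
+
+    @code {.cpp}
+    auto drop_numbers = [](int /*depth*/, nlohmann::json::parse_event_t event,
+                           nlohmann::json& parsed)
+    {
+        // discard every completed value that is a number
+        return !(event == nlohmann::json::parse_event_t::value && parsed.is_number());
+    };
+    auto j = nlohmann::json::parse("[1, \"keep\", 2]", drop_numbers);
+    // the discarded numbers were removed inside the array: j == ["keep"]
+    @endcode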
+ + @liveexample{The following code exemplifies `is_discarded()` for all JSON + types.,is_discarded} + + @since version 1.0.0 + */ + constexpr bool is_discarded() const noexcept + { + return m_type == value_t::discarded; + } + + /*! + @brief return the type of the JSON value (implicit) + + Implicitly return the type of the JSON value as a value from the @ref + value_t enumeration. + + @return the type of the JSON value + + @complexity Constant. + + @exceptionsafety No-throw guarantee: this member function never throws + exceptions. + + @liveexample{The following code exemplifies the @ref value_t operator for + all JSON types.,operator__value_t} + + @sa see @ref type() -- return the type of the JSON value (explicit) + @sa see @ref type_name() -- return the type as string + + @since version 1.0.0 + */ + constexpr operator value_t() const noexcept + { + return m_type; + } + + /// @} + + private: + ////////////////// + // value access // + ////////////////// + + /// get a boolean (explicit) + boolean_t get_impl(boolean_t* /*unused*/) const + { + if (JSON_HEDLEY_LIKELY(is_boolean())) + { + return m_value.boolean; + } + + JSON_THROW(type_error::create(302, "type must be boolean, but is " + std::string(type_name()), *this)); + } + + /// get a pointer to the value (object) + object_t* get_impl_ptr(object_t* /*unused*/) noexcept + { + return is_object() ? m_value.object : nullptr; + } + + /// get a pointer to the value (object) + constexpr const object_t* get_impl_ptr(const object_t* /*unused*/) const noexcept + { + return is_object() ? m_value.object : nullptr; + } + + /// get a pointer to the value (array) + array_t* get_impl_ptr(array_t* /*unused*/) noexcept + { + return is_array() ? m_value.array : nullptr; + } + + /// get a pointer to the value (array) + constexpr const array_t* get_impl_ptr(const array_t* /*unused*/) const noexcept + { + return is_array() ? m_value.array : nullptr; + } + + /// get a pointer to the value (string) + string_t* get_impl_ptr(string_t* /*unused*/) noexcept + { + return is_string() ? m_value.string : nullptr; + } + + /// get a pointer to the value (string) + constexpr const string_t* get_impl_ptr(const string_t* /*unused*/) const noexcept + { + return is_string() ? m_value.string : nullptr; + } + + /// get a pointer to the value (boolean) + boolean_t* get_impl_ptr(boolean_t* /*unused*/) noexcept + { + return is_boolean() ? &m_value.boolean : nullptr; + } + + /// get a pointer to the value (boolean) + constexpr const boolean_t* get_impl_ptr(const boolean_t* /*unused*/) const noexcept + { + return is_boolean() ? &m_value.boolean : nullptr; + } + + /// get a pointer to the value (integer number) + number_integer_t* get_impl_ptr(number_integer_t* /*unused*/) noexcept + { + return is_number_integer() ? &m_value.number_integer : nullptr; + } + + /// get a pointer to the value (integer number) + constexpr const number_integer_t* get_impl_ptr(const number_integer_t* /*unused*/) const noexcept + { + return is_number_integer() ? &m_value.number_integer : nullptr; + } + + /// get a pointer to the value (unsigned number) + number_unsigned_t* get_impl_ptr(number_unsigned_t* /*unused*/) noexcept + { + return is_number_unsigned() ? &m_value.number_unsigned : nullptr; + } + + /// get a pointer to the value (unsigned number) + constexpr const number_unsigned_t* get_impl_ptr(const number_unsigned_t* /*unused*/) const noexcept + { + return is_number_unsigned() ? 
&m_value.number_unsigned : nullptr; + } + + /// get a pointer to the value (floating-point number) + number_float_t* get_impl_ptr(number_float_t* /*unused*/) noexcept + { + return is_number_float() ? &m_value.number_float : nullptr; + } + + /// get a pointer to the value (floating-point number) + constexpr const number_float_t* get_impl_ptr(const number_float_t* /*unused*/) const noexcept + { + return is_number_float() ? &m_value.number_float : nullptr; + } + + /// get a pointer to the value (binary) + binary_t* get_impl_ptr(binary_t* /*unused*/) noexcept + { + return is_binary() ? m_value.binary : nullptr; + } + + /// get a pointer to the value (binary) + constexpr const binary_t* get_impl_ptr(const binary_t* /*unused*/) const noexcept + { + return is_binary() ? m_value.binary : nullptr; + } + + /*! + @brief helper function to implement get_ref() + + This function helps to implement get_ref() without code duplication for + const and non-const overloads + + @tparam ThisType will be deduced as `basic_json` or `const basic_json` + + @throw type_error.303 if ReferenceType does not match underlying value + type of the current JSON + */ + template + static ReferenceType get_ref_impl(ThisType& obj) + { + // delegate the call to get_ptr<>() + auto* ptr = obj.template get_ptr::type>(); + + if (JSON_HEDLEY_LIKELY(ptr != nullptr)) + { + return *ptr; + } + + JSON_THROW(type_error::create(303, "incompatible ReferenceType for get_ref, actual type is " + std::string(obj.type_name()), obj)); + } + + public: + /// @name value access + /// Direct access to the stored value of a JSON value. + /// @{ + + /*! + @brief get a pointer value (implicit) + + Implicit pointer access to the internally stored JSON value. No copies are + made. + + @warning Writing data to the pointee of the result yields an undefined + state. + + @tparam PointerType pointer type; must be a pointer to @ref array_t, @ref + object_t, @ref string_t, @ref boolean_t, @ref number_integer_t, + @ref number_unsigned_t, or @ref number_float_t. Enforced by a static + assertion. + + @return pointer to the internally stored JSON value if the requested + pointer type @a PointerType fits to the JSON value; `nullptr` otherwise + + @complexity Constant. + + @liveexample{The example below shows how pointers to internal values of a + JSON value can be requested. Note that no type conversions are made and a + `nullptr` is returned if the value and the requested pointer type does not + match.,get_ptr} + + @since version 1.0.0 + */ + template::value, int>::type = 0> + auto get_ptr() noexcept -> decltype(std::declval().get_impl_ptr(std::declval())) + { + // delegate the call to get_impl_ptr<>() + return get_impl_ptr(static_cast(nullptr)); + } + + /*! + @brief get a pointer value (implicit) + @copydoc get_ptr() + */ + template < typename PointerType, typename std::enable_if < + std::is_pointer::value&& + std::is_const::type>::value, int >::type = 0 > + constexpr auto get_ptr() const noexcept -> decltype(std::declval().get_impl_ptr(std::declval())) + { + // delegate the call to get_impl_ptr<>() const + return get_impl_ptr(static_cast(nullptr)); + } + + private: + /*! + @brief get a value (explicit) + + Explicit type conversion between the JSON value and a compatible value + which is [CopyConstructible](https://en.cppreference.com/w/cpp/named_req/CopyConstructible) + and [DefaultConstructible](https://en.cppreference.com/w/cpp/named_req/DefaultConstructible). + The value is converted by calling the @ref json_serializer + `from_json()` method. 
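+
+    A brief sketch of the public `get()` call that ends up in this overload
+    (types and values are illustrative):
+
+    @code {.cpp}
+    nlohmann::json j = {{"one", 1}, {"two", 2}};
+    auto m = j.get<std::map<std::string, int>>();   // converted via from_json()
+    @endcode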
+ + The function is equivalent to executing + @code {.cpp} + ValueType ret; + JSONSerializer::from_json(*this, ret); + return ret; + @endcode + + This overloads is chosen if: + - @a ValueType is not @ref basic_json, + - @ref json_serializer has a `from_json()` method of the form + `void from_json(const basic_json&, ValueType&)`, and + - @ref json_serializer does not have a `from_json()` method of + the form `ValueType from_json(const basic_json&)` + + @tparam ValueType the returned value type + + @return copy of the JSON value, converted to @a ValueType + + @throw what @ref json_serializer `from_json()` method throws + + @liveexample{The example below shows several conversions from JSON values + to other types. There a few things to note: (1) Floating-point numbers can + be converted to integers\, (2) A JSON array can be converted to a standard + `std::vector`\, (3) A JSON object can be converted to C++ + associative containers such as `std::unordered_map`.,get__ValueType_const} + + @since version 2.1.0 + */ + template < typename ValueType, + detail::enable_if_t < + detail::is_default_constructible::value&& + detail::has_from_json::value, + int > = 0 > + ValueType get_impl(detail::priority_tag<0> /*unused*/) const noexcept(noexcept( + JSONSerializer::from_json(std::declval(), std::declval()))) + { + ValueType ret{}; + JSONSerializer::from_json(*this, ret); + return ret; + } + + /*! + @brief get a value (explicit); special case + + Explicit type conversion between the JSON value and a compatible value + which is **not** [CopyConstructible](https://en.cppreference.com/w/cpp/named_req/CopyConstructible) + and **not** [DefaultConstructible](https://en.cppreference.com/w/cpp/named_req/DefaultConstructible). + The value is converted by calling the @ref json_serializer + `from_json()` method. + + The function is equivalent to executing + @code {.cpp} + return JSONSerializer::from_json(*this); + @endcode + + This overloads is chosen if: + - @a ValueType is not @ref basic_json and + - @ref json_serializer has a `from_json()` method of the form + `ValueType from_json(const basic_json&)` + + @note If @ref json_serializer has both overloads of + `from_json()`, this one is chosen. + + @tparam ValueType the returned value type + + @return copy of the JSON value, converted to @a ValueType + + @throw what @ref json_serializer `from_json()` method throws + + @since version 2.1.0 + */ + template < typename ValueType, + detail::enable_if_t < + detail::has_non_default_from_json::value, + int > = 0 > + ValueType get_impl(detail::priority_tag<1> /*unused*/) const noexcept(noexcept( + JSONSerializer::from_json(std::declval()))) + { + return JSONSerializer::from_json(*this); + } + + /*! + @brief get special-case overload + + This overloads converts the current @ref basic_json in a different + @ref basic_json type + + @tparam BasicJsonType == @ref basic_json + + @return a copy of *this, converted into @a BasicJsonType + + @complexity Depending on the implementation of the called `from_json()` + method. + + @since version 3.2.0 + */ + template < typename BasicJsonType, + detail::enable_if_t < + detail::is_basic_json::value, + int > = 0 > + BasicJsonType get_impl(detail::priority_tag<2> /*unused*/) const + { + return *this; + } + + /*! + @brief get special-case overload + + This overloads avoids a lot of template boilerplate, it can be seen as the + identity method + + @tparam BasicJsonType == @ref basic_json + + @return a copy of *this + + @complexity Constant. 
+ + @since version 2.1.0 + */ + template::value, + int> = 0> + basic_json get_impl(detail::priority_tag<3> /*unused*/) const + { + return *this; + } + + /*! + @brief get a pointer value (explicit) + @copydoc get() + */ + template::value, + int> = 0> + constexpr auto get_impl(detail::priority_tag<4> /*unused*/) const noexcept + -> decltype(std::declval().template get_ptr()) + { + // delegate the call to get_ptr + return get_ptr(); + } + + public: + /*! + @brief get a (pointer) value (explicit) + + Performs explicit type conversion between the JSON value and a compatible value if required. + + - If the requested type is a pointer to the internally stored JSON value that pointer is returned. + No copies are made. + + - If the requested type is the current @ref basic_json, or a different @ref basic_json convertible + from the current @ref basic_json. + + - Otherwise the value is converted by calling the @ref json_serializer `from_json()` + method. + + @tparam ValueTypeCV the provided value type + @tparam ValueType the returned value type + + @return copy of the JSON value, converted to @tparam ValueType if necessary + + @throw what @ref json_serializer `from_json()` method throws if conversion is required + + @since version 2.1.0 + */ + template < typename ValueTypeCV, typename ValueType = detail::uncvref_t> +#if defined(JSON_HAS_CPP_14) + constexpr +#endif + auto get() const noexcept( + noexcept(std::declval().template get_impl(detail::priority_tag<4> {}))) + -> decltype(std::declval().template get_impl(detail::priority_tag<4> {})) + { + // we cannot static_assert on ValueTypeCV being non-const, because + // there is support for get(), which is why we + // still need the uncvref + static_assert(!std::is_reference::value, + "get() cannot be used with reference types, you might want to use get_ref()"); + return get_impl(detail::priority_tag<4> {}); + } + + /*! + @brief get a pointer value (explicit) + + Explicit pointer access to the internally stored JSON value. No copies are + made. + + @warning The pointer becomes invalid if the underlying JSON object + changes. + + @tparam PointerType pointer type; must be a pointer to @ref array_t, @ref + object_t, @ref string_t, @ref boolean_t, @ref number_integer_t, + @ref number_unsigned_t, or @ref number_float_t. + + @return pointer to the internally stored JSON value if the requested + pointer type @a PointerType fits to the JSON value; `nullptr` otherwise + + @complexity Constant. + + @liveexample{The example below shows how pointers to internal values of a + JSON value can be requested. Note that no type conversions are made and a + `nullptr` is returned if the value and the requested pointer type does not + match.,get__PointerType} + + @sa see @ref get_ptr() for explicit pointer-member access + + @since version 1.0.0 + */ + template::value, int>::type = 0> + auto get() noexcept -> decltype(std::declval().template get_ptr()) + { + // delegate the call to get_ptr + return get_ptr(); + } + + /*! + @brief get a value (explicit) + + Explicit type conversion between the JSON value and a compatible value. + The value is filled into the input parameter by calling the @ref json_serializer + `from_json()` method. 
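+
+    A minimal sketch of the intended use (the `json` alias and the concrete
+    target types below are illustrative):
+
+    @code{.cpp}
+    json j = json::parse(R"({"pi": 3.14, "tags": ["a", "b"]})");
+
+    double pi = 0.0;
+    j.at("pi").get_to(pi);               // fills the existing variable
+
+    std::vector<std::string> tags;
+    j.at("tags").get_to(tags);           // containers are filled element-wise
+    @endcode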
+ + The function is equivalent to executing + @code {.cpp} + ValueType v; + JSONSerializer::from_json(*this, v); + @endcode + + This overloads is chosen if: + - @a ValueType is not @ref basic_json, + - @ref json_serializer has a `from_json()` method of the form + `void from_json(const basic_json&, ValueType&)`, and + + @tparam ValueType the input parameter type. + + @return the input parameter, allowing chaining calls. + + @throw what @ref json_serializer `from_json()` method throws + + @liveexample{The example below shows several conversions from JSON values + to other types. There a few things to note: (1) Floating-point numbers can + be converted to integers\, (2) A JSON array can be converted to a standard + `std::vector`\, (3) A JSON object can be converted to C++ + associative containers such as `std::unordered_map`.,get_to} + + @since version 3.3.0 + */ + template < typename ValueType, + detail::enable_if_t < + !detail::is_basic_json::value&& + detail::has_from_json::value, + int > = 0 > + ValueType & get_to(ValueType& v) const noexcept(noexcept( + JSONSerializer::from_json(std::declval(), v))) + { + JSONSerializer::from_json(*this, v); + return v; + } + + // specialization to allow to call get_to with a basic_json value + // see https://github.com/nlohmann/json/issues/2175 + template::value, + int> = 0> + ValueType & get_to(ValueType& v) const + { + v = *this; + return v; + } + + template < + typename T, std::size_t N, + typename Array = T (&)[N], // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays) + detail::enable_if_t < + detail::has_from_json::value, int > = 0 > + Array get_to(T (&v)[N]) const // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays) + noexcept(noexcept(JSONSerializer::from_json( + std::declval(), v))) + { + JSONSerializer::from_json(*this, v); + return v; + } + + /*! + @brief get a reference value (implicit) + + Implicit reference access to the internally stored JSON value. No copies + are made. + + @warning Writing data to the referee of the result yields an undefined + state. + + @tparam ReferenceType reference type; must be a reference to @ref array_t, + @ref object_t, @ref string_t, @ref boolean_t, @ref number_integer_t, or + @ref number_float_t. Enforced by static assertion. + + @return reference to the internally stored JSON value if the requested + reference type @a ReferenceType fits to the JSON value; throws + type_error.303 otherwise + + @throw type_error.303 in case passed type @a ReferenceType is incompatible + with the stored JSON value; see example below + + @complexity Constant. + + @liveexample{The example shows several calls to `get_ref()`.,get_ref} + + @since version 1.1.0 + */ + template::value, int>::type = 0> + ReferenceType get_ref() + { + // delegate call to get_ref_impl + return get_ref_impl(*this); + } + + /*! + @brief get a reference value (implicit) + @copydoc get_ref() + */ + template < typename ReferenceType, typename std::enable_if < + std::is_reference::value&& + std::is_const::type>::value, int >::type = 0 > + ReferenceType get_ref() const + { + // delegate call to get_ref_impl + return get_ref_impl(*this); + } + + /*! + @brief get a value (implicit) + + Implicit type conversion between the JSON value and a compatible value. + The call is realized by calling @ref get() const. + + @tparam ValueType non-pointer type compatible to the JSON value, for + instance `int` for JSON integer numbers, `bool` for JSON booleans, or + `std::vector` types for JSON arrays. 
The character type of @ref string_t + as well as an initializer list of this type is excluded to avoid + ambiguities as these types implicitly convert to `std::string`. + + @return copy of the JSON value, converted to type @a ValueType + + @throw type_error.302 in case passed type @a ValueType is incompatible + to the JSON value type (e.g., the JSON value is of type boolean, but a + string is requested); see example below + + @complexity Linear in the size of the JSON value. + + @liveexample{The example below shows several conversions from JSON values + to other types. There a few things to note: (1) Floating-point numbers can + be converted to integers\, (2) A JSON array can be converted to a standard + `std::vector`\, (3) A JSON object can be converted to C++ + associative containers such as `std::unordered_map`.,operator__ValueType} + + @since version 1.0.0 + */ + template < typename ValueType, typename std::enable_if < + !std::is_pointer::value&& + !std::is_same>::value&& + !std::is_same::value&& + !detail::is_basic_json::value + && !std::is_same>::value +#if defined(JSON_HAS_CPP_17) && (defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER >= 1910 && _MSC_VER <= 1914)) + && !std::is_same::value +#endif + && detail::is_detected::value + , int >::type = 0 > + JSON_EXPLICIT operator ValueType() const + { + // delegate the call to get<>() const + return get(); + } + + /*! + @return reference to the binary value + + @throw type_error.302 if the value is not binary + + @sa see @ref is_binary() to check if the value is binary + + @since version 3.8.0 + */ + binary_t& get_binary() + { + if (!is_binary()) + { + JSON_THROW(type_error::create(302, "type must be binary, but is " + std::string(type_name()), *this)); + } + + return *get_ptr(); + } + + /// @copydoc get_binary() + const binary_t& get_binary() const + { + if (!is_binary()) + { + JSON_THROW(type_error::create(302, "type must be binary, but is " + std::string(type_name()), *this)); + } + + return *get_ptr(); + } + + /// @} + + + //////////////////// + // element access // + //////////////////// + + /// @name element access + /// Access to the JSON value. + /// @{ + + /*! + @brief access specified array element with bounds checking + + Returns a reference to the element at specified location @a idx, with + bounds checking. + + @param[in] idx index of the element to access + + @return reference to the element at index @a idx + + @throw type_error.304 if the JSON value is not an array; in this case, + calling `at` with an index makes no sense. See example below. + @throw out_of_range.401 if the index @a idx is out of range of the array; + that is, `idx >= size()`. See example below. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes in the JSON value. + + @complexity Constant. + + @since version 1.0.0 + + @liveexample{The example below shows how array elements can be read and + written using `at()`. It also demonstrates the different exceptions that + can be thrown.,at__size_type} + */ + reference at(size_type idx) + { + // at only works for arrays + if (JSON_HEDLEY_LIKELY(is_array())) + { + JSON_TRY + { + return set_parent(m_value.array->at(idx)); + } + JSON_CATCH (std::out_of_range&) + { + // create better exception explanation + JSON_THROW(out_of_range::create(401, "array index " + std::to_string(idx) + " is out of range", *this)); + } + } + else + { + JSON_THROW(type_error::create(304, "cannot use at() with " + std::string(type_name()), *this)); + } + } + + /*! 
+ @brief access specified array element with bounds checking + + Returns a const reference to the element at specified location @a idx, + with bounds checking. + + @param[in] idx index of the element to access + + @return const reference to the element at index @a idx + + @throw type_error.304 if the JSON value is not an array; in this case, + calling `at` with an index makes no sense. See example below. + @throw out_of_range.401 if the index @a idx is out of range of the array; + that is, `idx >= size()`. See example below. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes in the JSON value. + + @complexity Constant. + + @since version 1.0.0 + + @liveexample{The example below shows how array elements can be read using + `at()`. It also demonstrates the different exceptions that can be thrown., + at__size_type_const} + */ + const_reference at(size_type idx) const + { + // at only works for arrays + if (JSON_HEDLEY_LIKELY(is_array())) + { + JSON_TRY + { + return m_value.array->at(idx); + } + JSON_CATCH (std::out_of_range&) + { + // create better exception explanation + JSON_THROW(out_of_range::create(401, "array index " + std::to_string(idx) + " is out of range", *this)); + } + } + else + { + JSON_THROW(type_error::create(304, "cannot use at() with " + std::string(type_name()), *this)); + } + } + + /*! + @brief access specified object element with bounds checking + + Returns a reference to the element at with specified key @a key, with + bounds checking. + + @param[in] key key of the element to access + + @return reference to the element at key @a key + + @throw type_error.304 if the JSON value is not an object; in this case, + calling `at` with a key makes no sense. See example below. + @throw out_of_range.403 if the key @a key is is not stored in the object; + that is, `find(key) == end()`. See example below. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes in the JSON value. + + @complexity Logarithmic in the size of the container. + + @sa see @ref operator[](const typename object_t::key_type&) for unchecked + access by reference + @sa see @ref value() for access by value with a default value + + @since version 1.0.0 + + @liveexample{The example below shows how object elements can be read and + written using `at()`. It also demonstrates the different exceptions that + can be thrown.,at__object_t_key_type} + */ + reference at(const typename object_t::key_type& key) + { + // at only works for objects + if (JSON_HEDLEY_LIKELY(is_object())) + { + JSON_TRY + { + return set_parent(m_value.object->at(key)); + } + JSON_CATCH (std::out_of_range&) + { + // create better exception explanation + JSON_THROW(out_of_range::create(403, "key '" + key + "' not found", *this)); + } + } + else + { + JSON_THROW(type_error::create(304, "cannot use at() with " + std::string(type_name()), *this)); + } + } + + /*! + @brief access specified object element with bounds checking + + Returns a const reference to the element at with specified key @a key, + with bounds checking. + + @param[in] key key of the element to access + + @return const reference to the element at key @a key + + @throw type_error.304 if the JSON value is not an object; in this case, + calling `at` with a key makes no sense. See example below. + @throw out_of_range.403 if the key @a key is is not stored in the object; + that is, `find(key) == end()`. See example below. 
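+
+    A hedged usage sketch (assuming the usual `json` alias; the keys are illustrative):
+
+    @code{.cpp}
+    json j = {{"name", "Ada"}};
+    j.at("name") = "Grace";              // checked access: the key exists
+
+    try
+    {
+        j.at("missing") = 1;             // throws out_of_range.403
+    }
+    catch (const json::out_of_range& e)
+    {
+        // handle the missing key
+    }
+    @endcode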
+ + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes in the JSON value. + + @complexity Logarithmic in the size of the container. + + @sa see @ref operator[](const typename object_t::key_type&) for unchecked + access by reference + @sa see @ref value() for access by value with a default value + + @since version 1.0.0 + + @liveexample{The example below shows how object elements can be read using + `at()`. It also demonstrates the different exceptions that can be thrown., + at__object_t_key_type_const} + */ + const_reference at(const typename object_t::key_type& key) const + { + // at only works for objects + if (JSON_HEDLEY_LIKELY(is_object())) + { + JSON_TRY + { + return m_value.object->at(key); + } + JSON_CATCH (std::out_of_range&) + { + // create better exception explanation + JSON_THROW(out_of_range::create(403, "key '" + key + "' not found", *this)); + } + } + else + { + JSON_THROW(type_error::create(304, "cannot use at() with " + std::string(type_name()), *this)); + } + } + + /*! + @brief access specified array element + + Returns a reference to the element at specified location @a idx. + + @note If @a idx is beyond the range of the array (i.e., `idx >= size()`), + then the array is silently filled up with `null` values to make `idx` a + valid reference to the last stored element. + + @param[in] idx index of the element to access + + @return reference to the element at index @a idx + + @throw type_error.305 if the JSON value is not an array or null; in that + cases, using the [] operator with an index makes no sense. + + @complexity Constant if @a idx is in the range of the array. Otherwise + linear in `idx - size()`. + + @liveexample{The example below shows how array elements can be read and + written using `[]` operator. Note the addition of `null` + values.,operatorarray__size_type} + + @since version 1.0.0 + */ + reference operator[](size_type idx) + { + // implicitly convert null value to an empty array + if (is_null()) + { + m_type = value_t::array; + m_value.array = create(); + assert_invariant(); + } + + // operator[] only works for arrays + if (JSON_HEDLEY_LIKELY(is_array())) + { + // fill up array with null values if given idx is outside range + if (idx >= m_value.array->size()) + { +#if JSON_DIAGNOSTICS + // remember array size before resizing + const auto previous_size = m_value.array->size(); +#endif + m_value.array->resize(idx + 1); + +#if JSON_DIAGNOSTICS + // set parent for values added above + set_parents(begin() + static_cast(previous_size), static_cast(idx + 1 - previous_size)); +#endif + } + + return m_value.array->operator[](idx); + } + + JSON_THROW(type_error::create(305, "cannot use operator[] with a numeric argument with " + std::string(type_name()), *this)); + } + + /*! + @brief access specified array element + + Returns a const reference to the element at specified location @a idx. + + @param[in] idx index of the element to access + + @return const reference to the element at index @a idx + + @throw type_error.305 if the JSON value is not an array; in that case, + using the [] operator with an index makes no sense. + + @complexity Constant. 
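+
+    A hedged sketch of index access (assuming the usual `json` alias):
+
+    @code{.cpp}
+    json arr = {1, 2, 3};
+    arr[1] = 42;                         // unchecked write: [1, 42, 3]
+
+    json filled;                         // null
+    filled[2] = "x";                     // becomes an array: [null, null, "x"]
+
+    const json carr = arr;
+    int first = carr[0];                 // const overload: read-only access
+    @endcode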
+ + @liveexample{The example below shows how array elements can be read using + the `[]` operator.,operatorarray__size_type_const} + + @since version 1.0.0 + */ + const_reference operator[](size_type idx) const + { + // const operator[] only works for arrays + if (JSON_HEDLEY_LIKELY(is_array())) + { + return m_value.array->operator[](idx); + } + + JSON_THROW(type_error::create(305, "cannot use operator[] with a numeric argument with " + std::string(type_name()), *this)); + } + + /*! + @brief access specified object element + + Returns a reference to the element at with specified key @a key. + + @note If @a key is not found in the object, then it is silently added to + the object and filled with a `null` value to make `key` a valid reference. + In case the value was `null` before, it is converted to an object. + + @param[in] key key of the element to access + + @return reference to the element at key @a key + + @throw type_error.305 if the JSON value is not an object or null; in that + cases, using the [] operator with a key makes no sense. + + @complexity Logarithmic in the size of the container. + + @liveexample{The example below shows how object elements can be read and + written using the `[]` operator.,operatorarray__key_type} + + @sa see @ref at(const typename object_t::key_type&) for access by reference + with range checking + @sa see @ref value() for access by value with a default value + + @since version 1.0.0 + */ + reference operator[](const typename object_t::key_type& key) + { + // implicitly convert null value to an empty object + if (is_null()) + { + m_type = value_t::object; + m_value.object = create(); + assert_invariant(); + } + + // operator[] only works for objects + if (JSON_HEDLEY_LIKELY(is_object())) + { + return set_parent(m_value.object->operator[](key)); + } + + JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name()), *this)); + } + + /*! + @brief read-only access specified object element + + Returns a const reference to the element at with specified key @a key. No + bounds checking is performed. + + @warning If the element with key @a key does not exist, the behavior is + undefined. + + @param[in] key key of the element to access + + @return const reference to the element at key @a key + + @pre The element with key @a key must exist. **This precondition is + enforced with an assertion.** + + @throw type_error.305 if the JSON value is not an object; in that case, + using the [] operator with a key makes no sense. + + @complexity Logarithmic in the size of the container. + + @liveexample{The example below shows how object elements can be read using + the `[]` operator.,operatorarray__key_type_const} + + @sa see @ref at(const typename object_t::key_type&) for access by reference + with range checking + @sa see @ref value() for access by value with a default value + + @since version 1.0.0 + */ + const_reference operator[](const typename object_t::key_type& key) const + { + // const operator[] only works for objects + if (JSON_HEDLEY_LIKELY(is_object())) + { + JSON_ASSERT(m_value.object->find(key) != m_value.object->end()); + return m_value.object->find(key)->second; + } + + JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name()), *this)); + } + + /*! + @brief access specified object element + + Returns a reference to the element at with specified key @a key. 
+ + @note If @a key is not found in the object, then it is silently added to + the object and filled with a `null` value to make `key` a valid reference. + In case the value was `null` before, it is converted to an object. + + @param[in] key key of the element to access + + @return reference to the element at key @a key + + @throw type_error.305 if the JSON value is not an object or null; in that + cases, using the [] operator with a key makes no sense. + + @complexity Logarithmic in the size of the container. + + @liveexample{The example below shows how object elements can be read and + written using the `[]` operator.,operatorarray__key_type} + + @sa see @ref at(const typename object_t::key_type&) for access by reference + with range checking + @sa see @ref value() for access by value with a default value + + @since version 1.1.0 + */ + template + JSON_HEDLEY_NON_NULL(2) + reference operator[](T* key) + { + // implicitly convert null to object + if (is_null()) + { + m_type = value_t::object; + m_value = value_t::object; + assert_invariant(); + } + + // at only works for objects + if (JSON_HEDLEY_LIKELY(is_object())) + { + return set_parent(m_value.object->operator[](key)); + } + + JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name()), *this)); + } + + /*! + @brief read-only access specified object element + + Returns a const reference to the element at with specified key @a key. No + bounds checking is performed. + + @warning If the element with key @a key does not exist, the behavior is + undefined. + + @param[in] key key of the element to access + + @return const reference to the element at key @a key + + @pre The element with key @a key must exist. **This precondition is + enforced with an assertion.** + + @throw type_error.305 if the JSON value is not an object; in that case, + using the [] operator with a key makes no sense. + + @complexity Logarithmic in the size of the container. + + @liveexample{The example below shows how object elements can be read using + the `[]` operator.,operatorarray__key_type_const} + + @sa see @ref at(const typename object_t::key_type&) for access by reference + with range checking + @sa see @ref value() for access by value with a default value + + @since version 1.1.0 + */ + template + JSON_HEDLEY_NON_NULL(2) + const_reference operator[](T* key) const + { + // at only works for objects + if (JSON_HEDLEY_LIKELY(is_object())) + { + JSON_ASSERT(m_value.object->find(key) != m_value.object->end()); + return m_value.object->find(key)->second; + } + + JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name()), *this)); + } + + /*! + @brief access specified object element with default value + + Returns either a copy of an object's element at the specified key @a key + or a given default value if no element with key @a key exists. + + The function is basically equivalent to executing + @code {.cpp} + try { + return at(key); + } catch(out_of_range) { + return default_value; + } + @endcode + + @note Unlike @ref at(const typename object_t::key_type&), this function + does not throw if the given key @a key was not found. + + @note Unlike @ref operator[](const typename object_t::key_type& key), this + function does not implicitly add an element to the position defined by @a + key. This function is furthermore also applicable to const objects. 
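+
+    A hedged sketch (the keys and default values below are illustrative):
+
+    @code{.cpp}
+    json opts = {{"threads", 8}};
+
+    int threads = opts.value("threads", 1);      // 8: the key exists
+    int retries = opts.value("retries", 3);      // 3: falls back to the default
+    std::string log = opts.value("log", "off");  // const char* overload yields string_t
+    @endcode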
+ + @param[in] key key of the element to access + @param[in] default_value the value to return if @a key is not found + + @tparam ValueType type compatible to JSON values, for instance `int` for + JSON integer numbers, `bool` for JSON booleans, or `std::vector` types for + JSON arrays. Note the type of the expected value at @a key and the default + value @a default_value must be compatible. + + @return copy of the element at key @a key or @a default_value if @a key + is not found + + @throw type_error.302 if @a default_value does not match the type of the + value at @a key + @throw type_error.306 if the JSON value is not an object; in that case, + using `value()` with a key makes no sense. + + @complexity Logarithmic in the size of the container. + + @liveexample{The example below shows how object elements can be queried + with a default value.,basic_json__value} + + @sa see @ref at(const typename object_t::key_type&) for access by reference + with range checking + @sa see @ref operator[](const typename object_t::key_type&) for unchecked + access by reference + + @since version 1.0.0 + */ + // using std::is_convertible in a std::enable_if will fail when using explicit conversions + template < class ValueType, typename std::enable_if < + detail::is_getable::value + && !std::is_same::value, int >::type = 0 > + ValueType value(const typename object_t::key_type& key, const ValueType& default_value) const + { + // at only works for objects + if (JSON_HEDLEY_LIKELY(is_object())) + { + // if key is found, return value and given default value otherwise + const auto it = find(key); + if (it != end()) + { + return it->template get(); + } + + return default_value; + } + + JSON_THROW(type_error::create(306, "cannot use value() with " + std::string(type_name()), *this)); + } + + /*! + @brief overload for a default value of type const char* + @copydoc basic_json::value(const typename object_t::key_type&, const ValueType&) const + */ + string_t value(const typename object_t::key_type& key, const char* default_value) const + { + return value(key, string_t(default_value)); + } + + /*! + @brief access specified object element via JSON Pointer with default value + + Returns either a copy of an object's element at the specified key @a key + or a given default value if no element with key @a key exists. + + The function is basically equivalent to executing + @code {.cpp} + try { + return at(ptr); + } catch(out_of_range) { + return default_value; + } + @endcode + + @note Unlike @ref at(const json_pointer&), this function does not throw + if the given key @a key was not found. + + @param[in] ptr a JSON pointer to the element to access + @param[in] default_value the value to return if @a ptr found no value + + @tparam ValueType type compatible to JSON values, for instance `int` for + JSON integer numbers, `bool` for JSON booleans, or `std::vector` types for + JSON arrays. Note the type of the expected value at @a key and the default + value @a default_value must be compatible. + + @return copy of the element at key @a key or @a default_value if @a key + is not found + + @throw type_error.302 if @a default_value does not match the type of the + value at @a ptr + @throw type_error.306 if the JSON value is not an object; in that case, + using `value()` with a key makes no sense. + + @complexity Logarithmic in the size of the container. 
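+
+    A hedged sketch using a JSON pointer (the paths below are illustrative):
+
+    @code{.cpp}
+    json cfg = {{"server", {{"port", 8080}}}};
+
+    int port = cfg.value(json::json_pointer("/server/port"), 80);   // 8080
+    int ttl  = cfg.value(json::json_pointer("/server/ttl"), 60);    // 60: not present
+    @endcode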
+ + @liveexample{The example below shows how object elements can be queried + with a default value.,basic_json__value_ptr} + + @sa see @ref operator[](const json_pointer&) for unchecked access by reference + + @since version 2.0.2 + */ + template::value, int>::type = 0> + ValueType value(const json_pointer& ptr, const ValueType& default_value) const + { + // at only works for objects + if (JSON_HEDLEY_LIKELY(is_object())) + { + // if pointer resolves a value, return it or use default value + JSON_TRY + { + return ptr.get_checked(this).template get(); + } + JSON_INTERNAL_CATCH (out_of_range&) + { + return default_value; + } + } + + JSON_THROW(type_error::create(306, "cannot use value() with " + std::string(type_name()), *this)); + } + + /*! + @brief overload for a default value of type const char* + @copydoc basic_json::value(const json_pointer&, ValueType) const + */ + JSON_HEDLEY_NON_NULL(3) + string_t value(const json_pointer& ptr, const char* default_value) const + { + return value(ptr, string_t(default_value)); + } + + /*! + @brief access the first element + + Returns a reference to the first element in the container. For a JSON + container `c`, the expression `c.front()` is equivalent to `*c.begin()`. + + @return In case of a structured type (array or object), a reference to the + first element is returned. In case of number, string, boolean, or binary + values, a reference to the value is returned. + + @complexity Constant. + + @pre The JSON value must not be `null` (would throw `std::out_of_range`) + or an empty array or object (undefined behavior, **guarded by + assertions**). + @post The JSON value remains unchanged. + + @throw invalid_iterator.214 when called on `null` value + + @liveexample{The following code shows an example for `front()`.,front} + + @sa see @ref back() -- access the last element + + @since version 1.0.0 + */ + reference front() + { + return *begin(); + } + + /*! + @copydoc basic_json::front() + */ + const_reference front() const + { + return *cbegin(); + } + + /*! + @brief access the last element + + Returns a reference to the last element in the container. For a JSON + container `c`, the expression `c.back()` is equivalent to + @code {.cpp} + auto tmp = c.end(); + --tmp; + return *tmp; + @endcode + + @return In case of a structured type (array or object), a reference to the + last element is returned. In case of number, string, boolean, or binary + values, a reference to the value is returned. + + @complexity Constant. + + @pre The JSON value must not be `null` (would throw `std::out_of_range`) + or an empty array or object (undefined behavior, **guarded by + assertions**). + @post The JSON value remains unchanged. + + @throw invalid_iterator.214 when called on a `null` value. See example + below. + + @liveexample{The following code shows an example for `back()`.,back} + + @sa see @ref front() -- access the first element + + @since version 1.0.0 + */ + reference back() + { + auto tmp = end(); + --tmp; + return *tmp; + } + + /*! + @copydoc basic_json::back() + */ + const_reference back() const + { + auto tmp = cend(); + --tmp; + return *tmp; + } + + /*! + @brief remove element given an iterator + + Removes the element specified by iterator @a pos. The iterator @a pos must + be valid and dereferenceable. Thus the `end()` iterator (which is valid, + but is not dereferenceable) cannot be used as a value for @a pos. + + If called on a primitive type other than `null`, the resulting JSON value + will be `null`. 
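+
+    A hedged sketch of erasing through an iterator (assuming the usual `json` alias):
+
+    @code{.cpp}
+    json arr = {"a", "b", "c"};
+    auto it = arr.begin() + 1;
+    it = arr.erase(it);                  // arr == ["a", "c"], it refers to "c"
+
+    json obj = {{"k", 1}, {"m", 2}};
+    obj.erase(obj.find("k"));            // obj == {"m": 2}
+    @endcode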
+ + @param[in] pos iterator to the element to remove + @return Iterator following the last removed element. If the iterator @a + pos refers to the last element, the `end()` iterator is returned. + + @tparam IteratorType an @ref iterator or @ref const_iterator + + @post Invalidates iterators and references at or after the point of the + erase, including the `end()` iterator. + + @throw type_error.307 if called on a `null` value; example: `"cannot use + erase() with null"` + @throw invalid_iterator.202 if called on an iterator which does not belong + to the current JSON value; example: `"iterator does not fit current + value"` + @throw invalid_iterator.205 if called on a primitive type with invalid + iterator (i.e., any iterator which is not `begin()`); example: `"iterator + out of range"` + + @complexity The complexity depends on the type: + - objects: amortized constant + - arrays: linear in distance between @a pos and the end of the container + - strings and binary: linear in the length of the member + - other types: constant + + @liveexample{The example shows the result of `erase()` for different JSON + types.,erase__IteratorType} + + @sa see @ref erase(IteratorType, IteratorType) -- removes the elements in + the given range + @sa see @ref erase(const typename object_t::key_type&) -- removes the element + from an object at the given key + @sa see @ref erase(const size_type) -- removes the element from an array at + the given index + + @since version 1.0.0 + */ + template < class IteratorType, typename std::enable_if < + std::is_same::value || + std::is_same::value, int >::type + = 0 > + IteratorType erase(IteratorType pos) + { + // make sure iterator fits the current value + if (JSON_HEDLEY_UNLIKELY(this != pos.m_object)) + { + JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value", *this)); + } + + IteratorType result = end(); + + switch (m_type) + { + case value_t::boolean: + case value_t::number_float: + case value_t::number_integer: + case value_t::number_unsigned: + case value_t::string: + case value_t::binary: + { + if (JSON_HEDLEY_UNLIKELY(!pos.m_it.primitive_iterator.is_begin())) + { + JSON_THROW(invalid_iterator::create(205, "iterator out of range", *this)); + } + + if (is_string()) + { + AllocatorType alloc; + std::allocator_traits::destroy(alloc, m_value.string); + std::allocator_traits::deallocate(alloc, m_value.string, 1); + m_value.string = nullptr; + } + else if (is_binary()) + { + AllocatorType alloc; + std::allocator_traits::destroy(alloc, m_value.binary); + std::allocator_traits::deallocate(alloc, m_value.binary, 1); + m_value.binary = nullptr; + } + + m_type = value_t::null; + assert_invariant(); + break; + } + + case value_t::object: + { + result.m_it.object_iterator = m_value.object->erase(pos.m_it.object_iterator); + break; + } + + case value_t::array: + { + result.m_it.array_iterator = m_value.array->erase(pos.m_it.array_iterator); + break; + } + + default: + JSON_THROW(type_error::create(307, "cannot use erase() with " + std::string(type_name()), *this)); + } + + return result; + } + + /*! + @brief remove elements given an iterator range + + Removes the element specified by the range `[first; last)`. The iterator + @a first does not need to be dereferenceable if `first == last`: erasing + an empty range is a no-op. + + If called on a primitive type other than `null`, the resulting JSON value + will be `null`. 
+ + @param[in] first iterator to the beginning of the range to remove + @param[in] last iterator past the end of the range to remove + @return Iterator following the last removed element. If the iterator @a + second refers to the last element, the `end()` iterator is returned. + + @tparam IteratorType an @ref iterator or @ref const_iterator + + @post Invalidates iterators and references at or after the point of the + erase, including the `end()` iterator. + + @throw type_error.307 if called on a `null` value; example: `"cannot use + erase() with null"` + @throw invalid_iterator.203 if called on iterators which does not belong + to the current JSON value; example: `"iterators do not fit current value"` + @throw invalid_iterator.204 if called on a primitive type with invalid + iterators (i.e., if `first != begin()` and `last != end()`); example: + `"iterators out of range"` + + @complexity The complexity depends on the type: + - objects: `log(size()) + std::distance(first, last)` + - arrays: linear in the distance between @a first and @a last, plus linear + in the distance between @a last and end of the container + - strings and binary: linear in the length of the member + - other types: constant + + @liveexample{The example shows the result of `erase()` for different JSON + types.,erase__IteratorType_IteratorType} + + @sa see @ref erase(IteratorType) -- removes the element at a given position + @sa see @ref erase(const typename object_t::key_type&) -- removes the element + from an object at the given key + @sa see @ref erase(const size_type) -- removes the element from an array at + the given index + + @since version 1.0.0 + */ + template < class IteratorType, typename std::enable_if < + std::is_same::value || + std::is_same::value, int >::type + = 0 > + IteratorType erase(IteratorType first, IteratorType last) + { + // make sure iterator fits the current value + if (JSON_HEDLEY_UNLIKELY(this != first.m_object || this != last.m_object)) + { + JSON_THROW(invalid_iterator::create(203, "iterators do not fit current value", *this)); + } + + IteratorType result = end(); + + switch (m_type) + { + case value_t::boolean: + case value_t::number_float: + case value_t::number_integer: + case value_t::number_unsigned: + case value_t::string: + case value_t::binary: + { + if (JSON_HEDLEY_LIKELY(!first.m_it.primitive_iterator.is_begin() + || !last.m_it.primitive_iterator.is_end())) + { + JSON_THROW(invalid_iterator::create(204, "iterators out of range", *this)); + } + + if (is_string()) + { + AllocatorType alloc; + std::allocator_traits::destroy(alloc, m_value.string); + std::allocator_traits::deallocate(alloc, m_value.string, 1); + m_value.string = nullptr; + } + else if (is_binary()) + { + AllocatorType alloc; + std::allocator_traits::destroy(alloc, m_value.binary); + std::allocator_traits::deallocate(alloc, m_value.binary, 1); + m_value.binary = nullptr; + } + + m_type = value_t::null; + assert_invariant(); + break; + } + + case value_t::object: + { + result.m_it.object_iterator = m_value.object->erase(first.m_it.object_iterator, + last.m_it.object_iterator); + break; + } + + case value_t::array: + { + result.m_it.array_iterator = m_value.array->erase(first.m_it.array_iterator, + last.m_it.array_iterator); + break; + } + + default: + JSON_THROW(type_error::create(307, "cannot use erase() with " + std::string(type_name()), *this)); + } + + return result; + } + + /*! + @brief remove element from a JSON object given a key + + Removes elements from a JSON object with the key value @a key. 
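+
+    A hedged sketch (the keys and indices below are illustrative):
+
+    @code{.cpp}
+    json obj = {{"a", 1}, {"b", 2}};
+    std::size_t n = obj.erase("a");      // n == 1, obj == {"b": 2}
+    n = obj.erase("missing");            // n == 0, obj unchanged
+
+    json arr = {1, 2, 3};
+    arr.erase(1);                        // removes the element at index 1: [1, 3]
+    @endcode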
+ + @param[in] key value of the elements to remove + + @return Number of elements removed. If @a ObjectType is the default + `std::map` type, the return value will always be `0` (@a key was not + found) or `1` (@a key was found). + + @post References and iterators to the erased elements are invalidated. + Other references and iterators are not affected. + + @throw type_error.307 when called on a type other than JSON object; + example: `"cannot use erase() with null"` + + @complexity `log(size()) + count(key)` + + @liveexample{The example shows the effect of `erase()`.,erase__key_type} + + @sa see @ref erase(IteratorType) -- removes the element at a given position + @sa see @ref erase(IteratorType, IteratorType) -- removes the elements in + the given range + @sa see @ref erase(const size_type) -- removes the element from an array at + the given index + + @since version 1.0.0 + */ + size_type erase(const typename object_t::key_type& key) + { + // this erase only works for objects + if (JSON_HEDLEY_LIKELY(is_object())) + { + return m_value.object->erase(key); + } + + JSON_THROW(type_error::create(307, "cannot use erase() with " + std::string(type_name()), *this)); + } + + /*! + @brief remove element from a JSON array given an index + + Removes element from a JSON array at the index @a idx. + + @param[in] idx index of the element to remove + + @throw type_error.307 when called on a type other than JSON object; + example: `"cannot use erase() with null"` + @throw out_of_range.401 when `idx >= size()`; example: `"array index 17 + is out of range"` + + @complexity Linear in distance between @a idx and the end of the container. + + @liveexample{The example shows the effect of `erase()`.,erase__size_type} + + @sa see @ref erase(IteratorType) -- removes the element at a given position + @sa see @ref erase(IteratorType, IteratorType) -- removes the elements in + the given range + @sa see @ref erase(const typename object_t::key_type&) -- removes the element + from an object at the given key + + @since version 1.0.0 + */ + void erase(const size_type idx) + { + // this erase only works for arrays + if (JSON_HEDLEY_LIKELY(is_array())) + { + if (JSON_HEDLEY_UNLIKELY(idx >= size())) + { + JSON_THROW(out_of_range::create(401, "array index " + std::to_string(idx) + " is out of range", *this)); + } + + m_value.array->erase(m_value.array->begin() + static_cast(idx)); + } + else + { + JSON_THROW(type_error::create(307, "cannot use erase() with " + std::string(type_name()), *this)); + } + } + + /// @} + + + //////////// + // lookup // + //////////// + + /// @name lookup + /// @{ + + /*! + @brief find an element in a JSON object + + Finds an element in a JSON object with key equivalent to @a key. If the + element is not found or the JSON value is not an object, end() is + returned. + + @note This method always returns @ref end() when executed on a JSON type + that is not an object. + + @param[in] key key value of the element to search for. + + @return Iterator to an element with key equivalent to @a key. If no such + element is found or the JSON value is not an object, past-the-end (see + @ref end()) iterator is returned. + + @complexity Logarithmic in the size of the JSON object. 
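+
+    A hedged sketch (assuming the usual `json` alias; the key is illustrative):
+
+    @code{.cpp}
+    json obj = {{"name", "Ada"}};
+
+    auto it = obj.find("name");
+    if (it != obj.end())
+    {
+        std::cout << it.key() << " = " << it.value() << '\n';
+    }
+
+    json arr = {1, 2, 3};
+    auto miss = arr.find("name");        // not an object: always end()
+    @endcode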
+ + @liveexample{The example shows how `find()` is used.,find__key_type} + + @sa see @ref contains(KeyT&&) const -- checks whether a key exists + + @since version 1.0.0 + */ + template + iterator find(KeyT&& key) + { + auto result = end(); + + if (is_object()) + { + result.m_it.object_iterator = m_value.object->find(std::forward(key)); + } + + return result; + } + + /*! + @brief find an element in a JSON object + @copydoc find(KeyT&&) + */ + template + const_iterator find(KeyT&& key) const + { + auto result = cend(); + + if (is_object()) + { + result.m_it.object_iterator = m_value.object->find(std::forward(key)); + } + + return result; + } + + /*! + @brief returns the number of occurrences of a key in a JSON object + + Returns the number of elements with key @a key. If ObjectType is the + default `std::map` type, the return value will always be `0` (@a key was + not found) or `1` (@a key was found). + + @note This method always returns `0` when executed on a JSON type that is + not an object. + + @param[in] key key value of the element to count + + @return Number of elements with key @a key. If the JSON value is not an + object, the return value will be `0`. + + @complexity Logarithmic in the size of the JSON object. + + @liveexample{The example shows how `count()` is used.,count} + + @since version 1.0.0 + */ + template + size_type count(KeyT&& key) const + { + // return 0 for all nonobject types + return is_object() ? m_value.object->count(std::forward(key)) : 0; + } + + /*! + @brief check the existence of an element in a JSON object + + Check whether an element exists in a JSON object with key equivalent to + @a key. If the element is not found or the JSON value is not an object, + false is returned. + + @note This method always returns false when executed on a JSON type + that is not an object. + + @param[in] key key value to check its existence. + + @return true if an element with specified @a key exists. If no such + element with such key is found or the JSON value is not an object, + false is returned. + + @complexity Logarithmic in the size of the JSON object. + + @liveexample{The following code shows an example for `contains()`.,contains} + + @sa see @ref find(KeyT&&) -- returns an iterator to an object element + @sa see @ref contains(const json_pointer&) const -- checks the existence for a JSON pointer + + @since version 3.6.0 + */ + template < typename KeyT, typename std::enable_if < + !std::is_same::type, json_pointer>::value, int >::type = 0 > + bool contains(KeyT && key) const + { + return is_object() && m_value.object->find(std::forward(key)) != m_value.object->end(); + } + + /*! + @brief check the existence of an element in a JSON object given a JSON pointer + + Check whether the given JSON pointer @a ptr can be resolved in the current + JSON value. + + @note This method can be executed on any JSON value type. + + @param[in] ptr JSON pointer to check its existence. + + @return true if the JSON pointer can be resolved to a stored value, false + otherwise. + + @post If `j.contains(ptr)` returns true, it is safe to call `j[ptr]`. + + @throw parse_error.106 if an array index begins with '0' + @throw parse_error.109 if an array index was not a number + + @complexity Logarithmic in the size of the JSON object. 
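+
+    A hedged sketch showing both `contains()` overloads (the paths below are illustrative):
+
+    @code{.cpp}
+    json j = {{"server", {{"port", 8080}}}};
+
+    bool a = j.contains("server");                               // true
+    bool b = j.contains(json::json_pointer("/server/port"));     // true
+    bool c = j.contains(json::json_pointer("/server/ttl"));      // false
+    @endcode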
+ + @liveexample{The following code shows an example for `contains()`.,contains_json_pointer} + + @sa see @ref contains(KeyT &&) const -- checks the existence of a key + + @since version 3.7.0 + */ + bool contains(const json_pointer& ptr) const + { + return ptr.contains(this); + } + + /// @} + + + /////////////// + // iterators // + /////////////// + + /// @name iterators + /// @{ + + /*! + @brief returns an iterator to the first element + + Returns an iterator to the first element. + + @image html range-begin-end.svg "Illustration from cppreference.com" + + @return iterator to the first element + + @complexity Constant. + + @requirement This function helps `basic_json` satisfying the + [Container](https://en.cppreference.com/w/cpp/named_req/Container) + requirements: + - The complexity is constant. + + @liveexample{The following code shows an example for `begin()`.,begin} + + @sa see @ref cbegin() -- returns a const iterator to the beginning + @sa see @ref end() -- returns an iterator to the end + @sa see @ref cend() -- returns a const iterator to the end + + @since version 1.0.0 + */ + iterator begin() noexcept + { + iterator result(this); + result.set_begin(); + return result; + } + + /*! + @copydoc basic_json::cbegin() + */ + const_iterator begin() const noexcept + { + return cbegin(); + } + + /*! + @brief returns a const iterator to the first element + + Returns a const iterator to the first element. + + @image html range-begin-end.svg "Illustration from cppreference.com" + + @return const iterator to the first element + + @complexity Constant. + + @requirement This function helps `basic_json` satisfying the + [Container](https://en.cppreference.com/w/cpp/named_req/Container) + requirements: + - The complexity is constant. + - Has the semantics of `const_cast(*this).begin()`. + + @liveexample{The following code shows an example for `cbegin()`.,cbegin} + + @sa see @ref begin() -- returns an iterator to the beginning + @sa see @ref end() -- returns an iterator to the end + @sa see @ref cend() -- returns a const iterator to the end + + @since version 1.0.0 + */ + const_iterator cbegin() const noexcept + { + const_iterator result(this); + result.set_begin(); + return result; + } + + /*! + @brief returns an iterator to one past the last element + + Returns an iterator to one past the last element. + + @image html range-begin-end.svg "Illustration from cppreference.com" + + @return iterator one past the last element + + @complexity Constant. + + @requirement This function helps `basic_json` satisfying the + [Container](https://en.cppreference.com/w/cpp/named_req/Container) + requirements: + - The complexity is constant. + + @liveexample{The following code shows an example for `end()`.,end} + + @sa see @ref cend() -- returns a const iterator to the end + @sa see @ref begin() -- returns an iterator to the beginning + @sa see @ref cbegin() -- returns a const iterator to the beginning + + @since version 1.0.0 + */ + iterator end() noexcept + { + iterator result(this); + result.set_end(); + return result; + } + + /*! + @copydoc basic_json::cend() + */ + const_iterator end() const noexcept + { + return cend(); + } + + /*! + @brief returns a const iterator to one past the last element + + Returns a const iterator to one past the last element. + + @image html range-begin-end.svg "Illustration from cppreference.com" + + @return const iterator one past the last element + + @complexity Constant. 
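+
+    A hedged sketch of plain const iteration (assuming the usual `json` alias):
+
+    @code{.cpp}
+    const json arr = {1, 2, 3};
+    int sum = 0;
+    for (auto it = arr.cbegin(); it != arr.cend(); ++it)
+    {
+        sum += it->get<int>();           // sum == 6 afterwards
+    }
+    @endcode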
+ + @requirement This function helps `basic_json` satisfying the + [Container](https://en.cppreference.com/w/cpp/named_req/Container) + requirements: + - The complexity is constant. + - Has the semantics of `const_cast(*this).end()`. + + @liveexample{The following code shows an example for `cend()`.,cend} + + @sa see @ref end() -- returns an iterator to the end + @sa see @ref begin() -- returns an iterator to the beginning + @sa see @ref cbegin() -- returns a const iterator to the beginning + + @since version 1.0.0 + */ + const_iterator cend() const noexcept + { + const_iterator result(this); + result.set_end(); + return result; + } + + /*! + @brief returns an iterator to the reverse-beginning + + Returns an iterator to the reverse-beginning; that is, the last element. + + @image html range-rbegin-rend.svg "Illustration from cppreference.com" + + @complexity Constant. + + @requirement This function helps `basic_json` satisfying the + [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer) + requirements: + - The complexity is constant. + - Has the semantics of `reverse_iterator(end())`. + + @liveexample{The following code shows an example for `rbegin()`.,rbegin} + + @sa see @ref crbegin() -- returns a const reverse iterator to the beginning + @sa see @ref rend() -- returns a reverse iterator to the end + @sa see @ref crend() -- returns a const reverse iterator to the end + + @since version 1.0.0 + */ + reverse_iterator rbegin() noexcept + { + return reverse_iterator(end()); + } + + /*! + @copydoc basic_json::crbegin() + */ + const_reverse_iterator rbegin() const noexcept + { + return crbegin(); + } + + /*! + @brief returns an iterator to the reverse-end + + Returns an iterator to the reverse-end; that is, one before the first + element. + + @image html range-rbegin-rend.svg "Illustration from cppreference.com" + + @complexity Constant. + + @requirement This function helps `basic_json` satisfying the + [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer) + requirements: + - The complexity is constant. + - Has the semantics of `reverse_iterator(begin())`. + + @liveexample{The following code shows an example for `rend()`.,rend} + + @sa see @ref crend() -- returns a const reverse iterator to the end + @sa see @ref rbegin() -- returns a reverse iterator to the beginning + @sa see @ref crbegin() -- returns a const reverse iterator to the beginning + + @since version 1.0.0 + */ + reverse_iterator rend() noexcept + { + return reverse_iterator(begin()); + } + + /*! + @copydoc basic_json::crend() + */ + const_reverse_iterator rend() const noexcept + { + return crend(); + } + + /*! + @brief returns a const reverse iterator to the last element + + Returns a const iterator to the reverse-beginning; that is, the last + element. + + @image html range-rbegin-rend.svg "Illustration from cppreference.com" + + @complexity Constant. + + @requirement This function helps `basic_json` satisfying the + [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer) + requirements: + - The complexity is constant. + - Has the semantics of `const_cast(*this).rbegin()`. 
+ + @liveexample{The following code shows an example for `crbegin()`.,crbegin} + + @sa see @ref rbegin() -- returns a reverse iterator to the beginning + @sa see @ref rend() -- returns a reverse iterator to the end + @sa see @ref crend() -- returns a const reverse iterator to the end + + @since version 1.0.0 + */ + const_reverse_iterator crbegin() const noexcept + { + return const_reverse_iterator(cend()); + } + + /*! + @brief returns a const reverse iterator to one before the first + + Returns a const reverse iterator to the reverse-end; that is, one before + the first element. + + @image html range-rbegin-rend.svg "Illustration from cppreference.com" + + @complexity Constant. + + @requirement This function helps `basic_json` satisfying the + [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer) + requirements: + - The complexity is constant. + - Has the semantics of `const_cast(*this).rend()`. + + @liveexample{The following code shows an example for `crend()`.,crend} + + @sa see @ref rend() -- returns a reverse iterator to the end + @sa see @ref rbegin() -- returns a reverse iterator to the beginning + @sa see @ref crbegin() -- returns a const reverse iterator to the beginning + + @since version 1.0.0 + */ + const_reverse_iterator crend() const noexcept + { + return const_reverse_iterator(cbegin()); + } + + public: + /*! + @brief wrapper to access iterator member functions in range-based for + + This function allows to access @ref iterator::key() and @ref + iterator::value() during range-based for loops. In these loops, a + reference to the JSON values is returned, so there is no access to the + underlying iterator. + + For loop without iterator_wrapper: + + @code{cpp} + for (auto it = j_object.begin(); it != j_object.end(); ++it) + { + std::cout << "key: " << it.key() << ", value:" << it.value() << '\n'; + } + @endcode + + Range-based for loop without iterator proxy: + + @code{cpp} + for (auto it : j_object) + { + // "it" is of type json::reference and has no key() member + std::cout << "value: " << it << '\n'; + } + @endcode + + Range-based for loop with iterator proxy: + + @code{cpp} + for (auto it : json::iterator_wrapper(j_object)) + { + std::cout << "key: " << it.key() << ", value:" << it.value() << '\n'; + } + @endcode + + @note When iterating over an array, `key()` will return the index of the + element as string (see example). + + @param[in] ref reference to a JSON value + @return iteration proxy object wrapping @a ref with an interface to use in + range-based for loops + + @liveexample{The following code shows how the wrapper is used,iterator_wrapper} + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes in the JSON value. + + @complexity Constant. + + @note The name of this function is not yet final and may change in the + future. + + @deprecated This stream operator is deprecated and will be removed in + future 4.0.0 of the library. Please use @ref items() instead; + that is, replace `json::iterator_wrapper(j)` with `j.items()`. + */ + JSON_HEDLEY_DEPRECATED_FOR(3.1.0, items()) + static iteration_proxy iterator_wrapper(reference ref) noexcept + { + return ref.items(); + } + + /*! + @copydoc iterator_wrapper(reference) + */ + JSON_HEDLEY_DEPRECATED_FOR(3.1.0, items()) + static iteration_proxy iterator_wrapper(const_reference ref) noexcept + { + return ref.items(); + } + + /*! 
+ @brief helper to access iterator member functions in range-based for + + This function allows to access @ref iterator::key() and @ref + iterator::value() during range-based for loops. In these loops, a + reference to the JSON values is returned, so there is no access to the + underlying iterator. + + For loop without `items()` function: + + @code{cpp} + for (auto it = j_object.begin(); it != j_object.end(); ++it) + { + std::cout << "key: " << it.key() << ", value:" << it.value() << '\n'; + } + @endcode + + Range-based for loop without `items()` function: + + @code{cpp} + for (auto it : j_object) + { + // "it" is of type json::reference and has no key() member + std::cout << "value: " << it << '\n'; + } + @endcode + + Range-based for loop with `items()` function: + + @code{cpp} + for (auto& el : j_object.items()) + { + std::cout << "key: " << el.key() << ", value:" << el.value() << '\n'; + } + @endcode + + The `items()` function also allows to use + [structured bindings](https://en.cppreference.com/w/cpp/language/structured_binding) + (C++17): + + @code{cpp} + for (auto& [key, val] : j_object.items()) + { + std::cout << "key: " << key << ", value:" << val << '\n'; + } + @endcode + + @note When iterating over an array, `key()` will return the index of the + element as string (see example). For primitive types (e.g., numbers), + `key()` returns an empty string. + + @warning Using `items()` on temporary objects is dangerous. Make sure the + object's lifetime exeeds the iteration. See + for more + information. + + @return iteration proxy object wrapping @a ref with an interface to use in + range-based for loops + + @liveexample{The following code shows how the function is used.,items} + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes in the JSON value. + + @complexity Constant. + + @since version 3.1.0, structured bindings support since 3.5.0. + */ + iteration_proxy items() noexcept + { + return iteration_proxy(*this); + } + + /*! + @copydoc items() + */ + iteration_proxy items() const noexcept + { + return iteration_proxy(*this); + } + + /// @} + + + ////////////// + // capacity // + ////////////// + + /// @name capacity + /// @{ + + /*! + @brief checks whether the container is empty. + + Checks if a JSON value has no elements (i.e. whether its @ref size is `0`). + + @return The return value depends on the different types and is + defined as follows: + Value type | return value + ----------- | ------------- + null | `true` + boolean | `false` + string | `false` + number | `false` + binary | `false` + object | result of function `object_t::empty()` + array | result of function `array_t::empty()` + + @liveexample{The following code uses `empty()` to check if a JSON + object contains any elements.,empty} + + @complexity Constant, as long as @ref array_t and @ref object_t satisfy + the Container concept; that is, their `empty()` functions have constant + complexity. + + @iterators No changes. + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + + @note This function does not return whether a string stored as JSON value + is empty - it returns whether the JSON container itself is empty which is + false in the case of a string. + + @requirement This function helps `basic_json` satisfying the + [Container](https://en.cppreference.com/w/cpp/named_req/Container) + requirements: + - The complexity is constant. + - Has the semantics of `begin() == end()`. 
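+
+    A hedged sketch (the values below are illustrative):
+
+    @code{.cpp}
+    json j_null;
+    json j_arr = {1, 2, 3};
+    json j_str = "";
+
+    bool e1 = j_null.empty();   // true: null is empty
+    bool e2 = j_arr.empty();    // false
+    bool e3 = j_str.empty();    // false: a string counts as one element, even if ""
+    @endcode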
+ + @sa see @ref size() -- returns the number of elements + + @since version 1.0.0 + */ + bool empty() const noexcept + { + switch (m_type) + { + case value_t::null: + { + // null values are empty + return true; + } + + case value_t::array: + { + // delegate call to array_t::empty() + return m_value.array->empty(); + } + + case value_t::object: + { + // delegate call to object_t::empty() + return m_value.object->empty(); + } + + default: + { + // all other types are nonempty + return false; + } + } + } + + /*! + @brief returns the number of elements + + Returns the number of elements in a JSON value. + + @return The return value depends on the different types and is + defined as follows: + Value type | return value + ----------- | ------------- + null | `0` + boolean | `1` + string | `1` + number | `1` + binary | `1` + object | result of function object_t::size() + array | result of function array_t::size() + + @liveexample{The following code calls `size()` on the different value + types.,size} + + @complexity Constant, as long as @ref array_t and @ref object_t satisfy + the Container concept; that is, their size() functions have constant + complexity. + + @iterators No changes. + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + + @note This function does not return the length of a string stored as JSON + value - it returns the number of elements in the JSON value which is 1 in + the case of a string. + + @requirement This function helps `basic_json` satisfying the + [Container](https://en.cppreference.com/w/cpp/named_req/Container) + requirements: + - The complexity is constant. + - Has the semantics of `std::distance(begin(), end())`. + + @sa see @ref empty() -- checks whether the container is empty + @sa see @ref max_size() -- returns the maximal number of elements + + @since version 1.0.0 + */ + size_type size() const noexcept + { + switch (m_type) + { + case value_t::null: + { + // null values are empty + return 0; + } + + case value_t::array: + { + // delegate call to array_t::size() + return m_value.array->size(); + } + + case value_t::object: + { + // delegate call to object_t::size() + return m_value.object->size(); + } + + default: + { + // all other types have size 1 + return 1; + } + } + } + + /*! + @brief returns the maximum possible number of elements + + Returns the maximum number of elements a JSON value is able to hold due to + system or library implementation limitations, i.e. `std::distance(begin(), + end())` for the JSON value. + + @return The return value depends on the different types and is + defined as follows: + Value type | return value + ----------- | ------------- + null | `0` (same as `size()`) + boolean | `1` (same as `size()`) + string | `1` (same as `size()`) + number | `1` (same as `size()`) + binary | `1` (same as `size()`) + object | result of function `object_t::max_size()` + array | result of function `array_t::max_size()` + + @liveexample{The following code calls `max_size()` on the different value + types. Note the output is implementation specific.,max_size} + + @complexity Constant, as long as @ref array_t and @ref object_t satisfy + the Container concept; that is, their `max_size()` functions have constant + complexity. + + @iterators No changes. + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + + @requirement This function helps `basic_json` satisfying the + [Container](https://en.cppreference.com/w/cpp/named_req/Container) + requirements: + - The complexity is constant. 
+ - Has the semantics of returning `b.size()` where `b` is the largest + possible JSON value. + + @sa see @ref size() -- returns the number of elements + + @since version 1.0.0 + */ + size_type max_size() const noexcept + { + switch (m_type) + { + case value_t::array: + { + // delegate call to array_t::max_size() + return m_value.array->max_size(); + } + + case value_t::object: + { + // delegate call to object_t::max_size() + return m_value.object->max_size(); + } + + default: + { + // all other types have max_size() == size() + return size(); + } + } + } + + /// @} + + + /////////////// + // modifiers // + /////////////// + + /// @name modifiers + /// @{ + + /*! + @brief clears the contents + + Clears the content of a JSON value and resets it to the default value as + if @ref basic_json(value_t) would have been called with the current value + type from @ref type(): + + Value type | initial value + ----------- | ------------- + null | `null` + boolean | `false` + string | `""` + number | `0` + binary | An empty byte vector + object | `{}` + array | `[]` + + @post Has the same effect as calling + @code {.cpp} + *this = basic_json(type()); + @endcode + + @liveexample{The example below shows the effect of `clear()` to different + JSON types.,clear} + + @complexity Linear in the size of the JSON value. + + @iterators All iterators, pointers and references related to this container + are invalidated. + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + + @sa see @ref basic_json(value_t) -- constructor that creates an object with the + same value than calling `clear()` + + @since version 1.0.0 + */ + void clear() noexcept + { + switch (m_type) + { + case value_t::number_integer: + { + m_value.number_integer = 0; + break; + } + + case value_t::number_unsigned: + { + m_value.number_unsigned = 0; + break; + } + + case value_t::number_float: + { + m_value.number_float = 0.0; + break; + } + + case value_t::boolean: + { + m_value.boolean = false; + break; + } + + case value_t::string: + { + m_value.string->clear(); + break; + } + + case value_t::binary: + { + m_value.binary->clear(); + break; + } + + case value_t::array: + { + m_value.array->clear(); + break; + } + + case value_t::object: + { + m_value.object->clear(); + break; + } + + default: + break; + } + } + + /*! + @brief add an object to an array + + Appends the given element @a val to the end of the JSON value. If the + function is called on a JSON null value, an empty array is created before + appending @a val. + + @param[in] val the value to add to the JSON array + + @throw type_error.308 when called on a type other than JSON array or + null; example: `"cannot use push_back() with number"` + + @complexity Amortized constant. + + @liveexample{The example shows how `push_back()` and `+=` can be used to + add elements to a JSON array. 
Note how the `null` value was silently + converted to a JSON array.,push_back} + + @since version 1.0.0 + */ + void push_back(basic_json&& val) + { + // push_back only works for null objects or arrays + if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_array()))) + { + JSON_THROW(type_error::create(308, "cannot use push_back() with " + std::string(type_name()), *this)); + } + + // transform null object into an array + if (is_null()) + { + m_type = value_t::array; + m_value = value_t::array; + assert_invariant(); + } + + // add element to array (move semantics) + m_value.array->push_back(std::move(val)); + set_parent(m_value.array->back()); + // if val is moved from, basic_json move constructor marks it null so we do not call the destructor + } + + /*! + @brief add an object to an array + @copydoc push_back(basic_json&&) + */ + reference operator+=(basic_json&& val) + { + push_back(std::move(val)); + return *this; + } + + /*! + @brief add an object to an array + @copydoc push_back(basic_json&&) + */ + void push_back(const basic_json& val) + { + // push_back only works for null objects or arrays + if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_array()))) + { + JSON_THROW(type_error::create(308, "cannot use push_back() with " + std::string(type_name()), *this)); + } + + // transform null object into an array + if (is_null()) + { + m_type = value_t::array; + m_value = value_t::array; + assert_invariant(); + } + + // add element to array + m_value.array->push_back(val); + set_parent(m_value.array->back()); + } + + /*! + @brief add an object to an array + @copydoc push_back(basic_json&&) + */ + reference operator+=(const basic_json& val) + { + push_back(val); + return *this; + } + + /*! + @brief add an object to an object + + Inserts the given element @a val to the JSON object. If the function is + called on a JSON null value, an empty object is created before inserting + @a val. + + @param[in] val the value to add to the JSON object + + @throw type_error.308 when called on a type other than JSON object or + null; example: `"cannot use push_back() with number"` + + @complexity Logarithmic in the size of the container, O(log(`size()`)). + + @liveexample{The example shows how `push_back()` and `+=` can be used to + add elements to a JSON object. Note how the `null` value was silently + converted to a JSON object.,push_back__object_t__value} + + @since version 1.0.0 + */ + void push_back(const typename object_t::value_type& val) + { + // push_back only works for null objects or objects + if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_object()))) + { + JSON_THROW(type_error::create(308, "cannot use push_back() with " + std::string(type_name()), *this)); + } + + // transform null object into an object + if (is_null()) + { + m_type = value_t::object; + m_value = value_t::object; + assert_invariant(); + } + + // add element to object + auto res = m_value.object->insert(val); + set_parent(res.first->second); + } + + /*! + @brief add an object to an object + @copydoc push_back(const typename object_t::value_type&) + */ + reference operator+=(const typename object_t::value_type& val) + { + push_back(val); + return *this; + } + + /*! + @brief add an object to an object + + This function allows to use `push_back` with an initializer list. In case + + 1. the current value is an object, + 2. the initializer list @a init contains only two elements, and + 3. the first element of @a init is a string, + + @a init is converted into an object element and added using + @ref push_back(const typename object_t::value_type&). 
Otherwise, @a init + is converted to a JSON value and added using @ref push_back(basic_json&&). + + @param[in] init an initializer list + + @complexity Linear in the size of the initializer list @a init. + + @note This function is required to resolve an ambiguous overload error, + because pairs like `{"key", "value"}` can be both interpreted as + `object_t::value_type` or `std::initializer_list`, see + https://github.com/nlohmann/json/issues/235 for more information. + + @liveexample{The example shows how initializer lists are treated as + objects when possible.,push_back__initializer_list} + */ + void push_back(initializer_list_t init) + { + if (is_object() && init.size() == 2 && (*init.begin())->is_string()) + { + basic_json&& key = init.begin()->moved_or_copied(); + push_back(typename object_t::value_type( + std::move(key.get_ref()), (init.begin() + 1)->moved_or_copied())); + } + else + { + push_back(basic_json(init)); + } + } + + /*! + @brief add an object to an object + @copydoc push_back(initializer_list_t) + */ + reference operator+=(initializer_list_t init) + { + push_back(init); + return *this; + } + + /*! + @brief add an object to an array + + Creates a JSON value from the passed parameters @a args to the end of the + JSON value. If the function is called on a JSON null value, an empty array + is created before appending the value created from @a args. + + @param[in] args arguments to forward to a constructor of @ref basic_json + @tparam Args compatible types to create a @ref basic_json object + + @return reference to the inserted element + + @throw type_error.311 when called on a type other than JSON array or + null; example: `"cannot use emplace_back() with number"` + + @complexity Amortized constant. + + @liveexample{The example shows how `push_back()` can be used to add + elements to a JSON array. Note how the `null` value was silently converted + to a JSON array.,emplace_back} + + @since version 2.0.8, returns reference since 3.7.0 + */ + template + reference emplace_back(Args&& ... args) + { + // emplace_back only works for null objects or arrays + if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_array()))) + { + JSON_THROW(type_error::create(311, "cannot use emplace_back() with " + std::string(type_name()), *this)); + } + + // transform null object into an array + if (is_null()) + { + m_type = value_t::array; + m_value = value_t::array; + assert_invariant(); + } + + // add element to array (perfect forwarding) +#ifdef JSON_HAS_CPP_17 + return set_parent(m_value.array->emplace_back(std::forward(args)...)); +#else + m_value.array->emplace_back(std::forward(args)...); + return set_parent(m_value.array->back()); +#endif + } + + /*! + @brief add an object to an object if key does not exist + + Inserts a new element into a JSON object constructed in-place with the + given @a args if there is no element with the key in the container. If the + function is called on a JSON null value, an empty object is created before + appending the value created from @a args. + + @param[in] args arguments to forward to a constructor of @ref basic_json + @tparam Args compatible types to create a @ref basic_json object + + @return a pair consisting of an iterator to the inserted element, or the + already-existing element if no insertion happened, and a bool + denoting whether the insertion took place. + + @throw type_error.311 when called on a type other than JSON object or + null; example: `"cannot use emplace() with number"` + + @complexity Logarithmic in the size of the container, O(log(`size()`)). 
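+
+ A small sketch of the behaviour described above (the names and values are
+ illustrative):
+ @code{cpp}
+ json j;                               // null value
+ auto res1 = j.emplace("answer", 42);  // j becomes {"answer": 42}, res1.second == true
+ auto res2 = j.emplace("answer", 23);  // key already present: res2.second == false
+ @endcode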
+ + @liveexample{The example shows how `emplace()` can be used to add elements + to a JSON object. Note how the `null` value was silently converted to a + JSON object. Further note how no value is added if there was already one + value stored with the same key.,emplace} + + @since version 2.0.8 + */ + template + std::pair emplace(Args&& ... args) + { + // emplace only works for null objects or arrays + if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_object()))) + { + JSON_THROW(type_error::create(311, "cannot use emplace() with " + std::string(type_name()), *this)); + } + + // transform null object into an object + if (is_null()) + { + m_type = value_t::object; + m_value = value_t::object; + assert_invariant(); + } + + // add element to array (perfect forwarding) + auto res = m_value.object->emplace(std::forward(args)...); + set_parent(res.first->second); + + // create result iterator and set iterator to the result of emplace + auto it = begin(); + it.m_it.object_iterator = res.first; + + // return pair of iterator and boolean + return {it, res.second}; + } + + /// Helper for insertion of an iterator + /// @note: This uses std::distance to support GCC 4.8, + /// see https://github.com/nlohmann/json/pull/1257 + template + iterator insert_iterator(const_iterator pos, Args&& ... args) + { + iterator result(this); + JSON_ASSERT(m_value.array != nullptr); + + auto insert_pos = std::distance(m_value.array->begin(), pos.m_it.array_iterator); + m_value.array->insert(pos.m_it.array_iterator, std::forward(args)...); + result.m_it.array_iterator = m_value.array->begin() + insert_pos; + + // This could have been written as: + // result.m_it.array_iterator = m_value.array->insert(pos.m_it.array_iterator, cnt, val); + // but the return value of insert is missing in GCC 4.8, so it is written this way instead. + + return result; + } + + /*! + @brief inserts element + + Inserts element @a val before iterator @a pos. + + @param[in] pos iterator before which the content will be inserted; may be + the end() iterator + @param[in] val element to insert + @return iterator pointing to the inserted @a val. + + @throw type_error.309 if called on JSON values other than arrays; + example: `"cannot use insert() with string"` + @throw invalid_iterator.202 if @a pos is not an iterator of *this; + example: `"iterator does not fit current value"` + + @complexity Constant plus linear in the distance between @a pos and end of + the container. + + @liveexample{The example shows how `insert()` is used.,insert} + + @since version 1.0.0 + */ + iterator insert(const_iterator pos, const basic_json& val) + { + // insert only works for arrays + if (JSON_HEDLEY_LIKELY(is_array())) + { + // check if iterator pos fits to this JSON value + if (JSON_HEDLEY_UNLIKELY(pos.m_object != this)) + { + JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value", *this)); + } + + // insert to array and return iterator + return set_parents(insert_iterator(pos, val), static_cast(1)); + } + + JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name()), *this)); + } + + /*! + @brief inserts element + @copydoc insert(const_iterator, const basic_json&) + */ + iterator insert(const_iterator pos, basic_json&& val) + { + return insert(pos, val); + } + + /*! + @brief inserts elements + + Inserts @a cnt copies of @a val before iterator @a pos. 
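+
+ For illustration (a sketch with made-up values):
+ @code{cpp}
+ json j = {1, 4};
+ auto it = j.insert(j.begin() + 1, 3, 2);  // j becomes [1, 2, 2, 2, 4]
+ // `it` points to the first of the three inserted elements
+ @endcode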
+ + @param[in] pos iterator before which the content will be inserted; may be + the end() iterator + @param[in] cnt number of copies of @a val to insert + @param[in] val element to insert + @return iterator pointing to the first element inserted, or @a pos if + `cnt==0` + + @throw type_error.309 if called on JSON values other than arrays; example: + `"cannot use insert() with string"` + @throw invalid_iterator.202 if @a pos is not an iterator of *this; + example: `"iterator does not fit current value"` + + @complexity Linear in @a cnt plus linear in the distance between @a pos + and end of the container. + + @liveexample{The example shows how `insert()` is used.,insert__count} + + @since version 1.0.0 + */ + iterator insert(const_iterator pos, size_type cnt, const basic_json& val) + { + // insert only works for arrays + if (JSON_HEDLEY_LIKELY(is_array())) + { + // check if iterator pos fits to this JSON value + if (JSON_HEDLEY_UNLIKELY(pos.m_object != this)) + { + JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value", *this)); + } + + // insert to array and return iterator + return set_parents(insert_iterator(pos, cnt, val), static_cast(cnt)); + } + + JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name()), *this)); + } + + /*! + @brief inserts elements + + Inserts elements from range `[first, last)` before iterator @a pos. + + @param[in] pos iterator before which the content will be inserted; may be + the end() iterator + @param[in] first begin of the range of elements to insert + @param[in] last end of the range of elements to insert + + @throw type_error.309 if called on JSON values other than arrays; example: + `"cannot use insert() with string"` + @throw invalid_iterator.202 if @a pos is not an iterator of *this; + example: `"iterator does not fit current value"` + @throw invalid_iterator.210 if @a first and @a last do not belong to the + same JSON value; example: `"iterators do not fit"` + @throw invalid_iterator.211 if @a first or @a last are iterators into + container for which insert is called; example: `"passed iterators may not + belong to container"` + + @return iterator pointing to the first element inserted, or @a pos if + `first==last` + + @complexity Linear in `std::distance(first, last)` plus linear in the + distance between @a pos and end of the container. + + @liveexample{The example shows how `insert()` is used.,insert__range} + + @since version 1.0.0 + */ + iterator insert(const_iterator pos, const_iterator first, const_iterator last) + { + // insert only works for arrays + if (JSON_HEDLEY_UNLIKELY(!is_array())) + { + JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name()), *this)); + } + + // check if iterator pos fits to this JSON value + if (JSON_HEDLEY_UNLIKELY(pos.m_object != this)) + { + JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value", *this)); + } + + // check if range iterators belong to the same JSON object + if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object)) + { + JSON_THROW(invalid_iterator::create(210, "iterators do not fit", *this)); + } + + if (JSON_HEDLEY_UNLIKELY(first.m_object == this)) + { + JSON_THROW(invalid_iterator::create(211, "passed iterators may not belong to container", *this)); + } + + // insert to array and return iterator + return set_parents(insert_iterator(pos, first.m_it.array_iterator, last.m_it.array_iterator), std::distance(first, last)); + } + + /*! 
+ @brief inserts elements + + Inserts elements from initializer list @a ilist before iterator @a pos. + + @param[in] pos iterator before which the content will be inserted; may be + the end() iterator + @param[in] ilist initializer list to insert the values from + + @throw type_error.309 if called on JSON values other than arrays; example: + `"cannot use insert() with string"` + @throw invalid_iterator.202 if @a pos is not an iterator of *this; + example: `"iterator does not fit current value"` + + @return iterator pointing to the first element inserted, or @a pos if + `ilist` is empty + + @complexity Linear in `ilist.size()` plus linear in the distance between + @a pos and end of the container. + + @liveexample{The example shows how `insert()` is used.,insert__ilist} + + @since version 1.0.0 + */ + iterator insert(const_iterator pos, initializer_list_t ilist) + { + // insert only works for arrays + if (JSON_HEDLEY_UNLIKELY(!is_array())) + { + JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name()), *this)); + } + + // check if iterator pos fits to this JSON value + if (JSON_HEDLEY_UNLIKELY(pos.m_object != this)) + { + JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value", *this)); + } + + // insert to array and return iterator + return set_parents(insert_iterator(pos, ilist.begin(), ilist.end()), static_cast(ilist.size())); + } + + /*! + @brief inserts elements + + Inserts elements from range `[first, last)`. + + @param[in] first begin of the range of elements to insert + @param[in] last end of the range of elements to insert + + @throw type_error.309 if called on JSON values other than objects; example: + `"cannot use insert() with string"` + @throw invalid_iterator.202 if iterator @a first or @a last does does not + point to an object; example: `"iterators first and last must point to + objects"` + @throw invalid_iterator.210 if @a first and @a last do not belong to the + same JSON value; example: `"iterators do not fit"` + + @complexity Logarithmic: `O(N*log(size() + N))`, where `N` is the number + of elements to insert. + + @liveexample{The example shows how `insert()` is used.,insert__range_object} + + @since version 3.0.0 + */ + void insert(const_iterator first, const_iterator last) + { + // insert only works for objects + if (JSON_HEDLEY_UNLIKELY(!is_object())) + { + JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name()), *this)); + } + + // check if range iterators belong to the same JSON object + if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object)) + { + JSON_THROW(invalid_iterator::create(210, "iterators do not fit", *this)); + } + + // passed iterators must belong to objects + if (JSON_HEDLEY_UNLIKELY(!first.m_object->is_object())) + { + JSON_THROW(invalid_iterator::create(202, "iterators first and last must point to objects", *this)); + } + + m_value.object->insert(first.m_it.object_iterator, last.m_it.object_iterator); + } + + /*! + @brief updates a JSON object from another object, overwriting existing keys + + Inserts all values from JSON object @a j and overwrites existing keys. + + @param[in] j JSON object to read values from + + @throw type_error.312 if called on JSON values other than objects; example: + `"cannot use update() with string"` + + @complexity O(N*log(size() + N)), where N is the number of elements to + insert. 
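+
+ A brief sketch of the overwrite semantics (values are illustrative):
+ @code{cpp}
+ json j     = {{"one", 1}, {"two", 2}};
+ json patch = {{"two", 22}, {"three", 3}};
+ j.update(patch);   // j becomes {"one": 1, "two": 22, "three": 3}
+ @endcode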
+ + @liveexample{The example shows how `update()` is used.,update} + + @sa https://docs.python.org/3.6/library/stdtypes.html#dict.update + + @since version 3.0.0 + */ + void update(const_reference j) + { + // implicitly convert null value to an empty object + if (is_null()) + { + m_type = value_t::object; + m_value.object = create(); + assert_invariant(); + } + + if (JSON_HEDLEY_UNLIKELY(!is_object())) + { + JSON_THROW(type_error::create(312, "cannot use update() with " + std::string(type_name()), *this)); + } + if (JSON_HEDLEY_UNLIKELY(!j.is_object())) + { + JSON_THROW(type_error::create(312, "cannot use update() with " + std::string(j.type_name()), *this)); + } + + for (auto it = j.cbegin(); it != j.cend(); ++it) + { + m_value.object->operator[](it.key()) = it.value(); + } + } + + /*! + @brief updates a JSON object from another object, overwriting existing keys + + Inserts all values from from range `[first, last)` and overwrites existing + keys. + + @param[in] first begin of the range of elements to insert + @param[in] last end of the range of elements to insert + + @throw type_error.312 if called on JSON values other than objects; example: + `"cannot use update() with string"` + @throw invalid_iterator.202 if iterator @a first or @a last does does not + point to an object; example: `"iterators first and last must point to + objects"` + @throw invalid_iterator.210 if @a first and @a last do not belong to the + same JSON value; example: `"iterators do not fit"` + + @complexity O(N*log(size() + N)), where N is the number of elements to + insert. + + @liveexample{The example shows how `update()` is used__range.,update} + + @sa https://docs.python.org/3.6/library/stdtypes.html#dict.update + + @since version 3.0.0 + */ + void update(const_iterator first, const_iterator last) + { + // implicitly convert null value to an empty object + if (is_null()) + { + m_type = value_t::object; + m_value.object = create(); + assert_invariant(); + } + + if (JSON_HEDLEY_UNLIKELY(!is_object())) + { + JSON_THROW(type_error::create(312, "cannot use update() with " + std::string(type_name()), *this)); + } + + // check if range iterators belong to the same JSON object + if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object)) + { + JSON_THROW(invalid_iterator::create(210, "iterators do not fit", *this)); + } + + // passed iterators must belong to objects + if (JSON_HEDLEY_UNLIKELY(!first.m_object->is_object() + || !last.m_object->is_object())) + { + JSON_THROW(invalid_iterator::create(202, "iterators first and last must point to objects", *this)); + } + + for (auto it = first; it != last; ++it) + { + m_value.object->operator[](it.key()) = it.value(); + } + } + + /*! + @brief exchanges the values + + Exchanges the contents of the JSON value with those of @a other. Does not + invoke any move, copy, or swap operations on individual elements. All + iterators and references remain valid. The past-the-end iterator is + invalidated. + + @param[in,out] other JSON value to exchange the contents with + + @complexity Constant. 
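+
+ Minimal sketch (values are illustrative):
+ @code{cpp}
+ json a = {1, 2, 3};
+ json b = "hello";
+ a.swap(b);   // a == "hello", b == [1, 2, 3]
+ @endcode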
+ + @liveexample{The example below shows how JSON values can be swapped with + `swap()`.,swap__reference} + + @since version 1.0.0 + */ + void swap(reference other) noexcept ( + std::is_nothrow_move_constructible::value&& + std::is_nothrow_move_assignable::value&& + std::is_nothrow_move_constructible::value&& + std::is_nothrow_move_assignable::value + ) + { + std::swap(m_type, other.m_type); + std::swap(m_value, other.m_value); + + set_parents(); + other.set_parents(); + assert_invariant(); + } + + /*! + @brief exchanges the values + + Exchanges the contents of the JSON value from @a left with those of @a right. Does not + invoke any move, copy, or swap operations on individual elements. All + iterators and references remain valid. The past-the-end iterator is + invalidated. implemented as a friend function callable via ADL. + + @param[in,out] left JSON value to exchange the contents with + @param[in,out] right JSON value to exchange the contents with + + @complexity Constant. + + @liveexample{The example below shows how JSON values can be swapped with + `swap()`.,swap__reference} + + @since version 1.0.0 + */ + friend void swap(reference left, reference right) noexcept ( + std::is_nothrow_move_constructible::value&& + std::is_nothrow_move_assignable::value&& + std::is_nothrow_move_constructible::value&& + std::is_nothrow_move_assignable::value + ) + { + left.swap(right); + } + + /*! + @brief exchanges the values + + Exchanges the contents of a JSON array with those of @a other. Does not + invoke any move, copy, or swap operations on individual elements. All + iterators and references remain valid. The past-the-end iterator is + invalidated. + + @param[in,out] other array to exchange the contents with + + @throw type_error.310 when JSON value is not an array; example: `"cannot + use swap() with string"` + + @complexity Constant. + + @liveexample{The example below shows how arrays can be swapped with + `swap()`.,swap__array_t} + + @since version 1.0.0 + */ + void swap(array_t& other) // NOLINT(bugprone-exception-escape) + { + // swap only works for arrays + if (JSON_HEDLEY_LIKELY(is_array())) + { + std::swap(*(m_value.array), other); + } + else + { + JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name()), *this)); + } + } + + /*! + @brief exchanges the values + + Exchanges the contents of a JSON object with those of @a other. Does not + invoke any move, copy, or swap operations on individual elements. All + iterators and references remain valid. The past-the-end iterator is + invalidated. + + @param[in,out] other object to exchange the contents with + + @throw type_error.310 when JSON value is not an object; example: + `"cannot use swap() with string"` + + @complexity Constant. + + @liveexample{The example below shows how objects can be swapped with + `swap()`.,swap__object_t} + + @since version 1.0.0 + */ + void swap(object_t& other) // NOLINT(bugprone-exception-escape) + { + // swap only works for objects + if (JSON_HEDLEY_LIKELY(is_object())) + { + std::swap(*(m_value.object), other); + } + else + { + JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name()), *this)); + } + } + + /*! + @brief exchanges the values + + Exchanges the contents of a JSON string with those of @a other. Does not + invoke any move, copy, or swap operations on individual elements. All + iterators and references remain valid. The past-the-end iterator is + invalidated. 
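+
+ For illustration (assumes the default `string_t` and a JSON value that
+ currently holds a string; otherwise type_error.310 is thrown, as
+ documented below):
+ @code{cpp}
+ json j = "old";
+ std::string s = "new";
+ j.swap(s);   // j == "new", s == "old"
+ @endcode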
+ + @param[in,out] other string to exchange the contents with + + @throw type_error.310 when JSON value is not a string; example: `"cannot + use swap() with boolean"` + + @complexity Constant. + + @liveexample{The example below shows how strings can be swapped with + `swap()`.,swap__string_t} + + @since version 1.0.0 + */ + void swap(string_t& other) // NOLINT(bugprone-exception-escape) + { + // swap only works for strings + if (JSON_HEDLEY_LIKELY(is_string())) + { + std::swap(*(m_value.string), other); + } + else + { + JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name()), *this)); + } + } + + /*! + @brief exchanges the values + + Exchanges the contents of a JSON string with those of @a other. Does not + invoke any move, copy, or swap operations on individual elements. All + iterators and references remain valid. The past-the-end iterator is + invalidated. + + @param[in,out] other binary to exchange the contents with + + @throw type_error.310 when JSON value is not a string; example: `"cannot + use swap() with boolean"` + + @complexity Constant. + + @liveexample{The example below shows how strings can be swapped with + `swap()`.,swap__binary_t} + + @since version 3.8.0 + */ + void swap(binary_t& other) // NOLINT(bugprone-exception-escape) + { + // swap only works for strings + if (JSON_HEDLEY_LIKELY(is_binary())) + { + std::swap(*(m_value.binary), other); + } + else + { + JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name()), *this)); + } + } + + /// @copydoc swap(binary_t&) + void swap(typename binary_t::container_type& other) // NOLINT(bugprone-exception-escape) + { + // swap only works for strings + if (JSON_HEDLEY_LIKELY(is_binary())) + { + std::swap(*(m_value.binary), other); + } + else + { + JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name()), *this)); + } + } + + /// @} + + public: + ////////////////////////////////////////// + // lexicographical comparison operators // + ////////////////////////////////////////// + + /// @name lexicographical comparison operators + /// @{ + + /*! + @brief comparison: equal + + Compares two JSON values for equality according to the following rules: + - Two JSON values are equal if (1) they are from the same type and (2) + their stored values are the same according to their respective + `operator==`. + - Integer and floating-point numbers are automatically converted before + comparison. Note that two NaN values are always treated as unequal. + - Two JSON null values are equal. + + @note Floating-point inside JSON values numbers are compared with + `json::number_float_t::operator==` which is `double::operator==` by + default. To compare floating-point while respecting an epsilon, an alternative + [comparison function](https://github.com/mariokonrad/marnav/blob/master/include/marnav/math/floatingpoint.hpp#L34-#L39) + could be used, for instance + @code {.cpp} + template::value, T>::type> + inline bool is_same(T a, T b, T epsilon = std::numeric_limits::epsilon()) noexcept + { + return std::abs(a - b) <= epsilon; + } + @endcode + Or you can self-defined operator equal function like this: + @code {.cpp} + bool my_equal(const_reference lhs, const_reference rhs) { + const auto lhs_type lhs.type(); + const auto rhs_type rhs.type(); + if (lhs_type == rhs_type) { + switch(lhs_type) + // self_defined case + case value_t::number_float: + return std::abs(lhs - rhs) <= std::numeric_limits::epsilon(); + // other cases remain the same with the original + ... 
+ } + ... + } + @endcode + + @note NaN values never compare equal to themselves or to other NaN values. + + @param[in] lhs first JSON value to consider + @param[in] rhs second JSON value to consider + @return whether the values @a lhs and @a rhs are equal + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + + @complexity Linear. + + @liveexample{The example demonstrates comparing several JSON + types.,operator__equal} + + @since version 1.0.0 + */ + friend bool operator==(const_reference lhs, const_reference rhs) noexcept + { + const auto lhs_type = lhs.type(); + const auto rhs_type = rhs.type(); + + if (lhs_type == rhs_type) + { + switch (lhs_type) + { + case value_t::array: + return *lhs.m_value.array == *rhs.m_value.array; + + case value_t::object: + return *lhs.m_value.object == *rhs.m_value.object; + + case value_t::null: + return true; + + case value_t::string: + return *lhs.m_value.string == *rhs.m_value.string; + + case value_t::boolean: + return lhs.m_value.boolean == rhs.m_value.boolean; + + case value_t::number_integer: + return lhs.m_value.number_integer == rhs.m_value.number_integer; + + case value_t::number_unsigned: + return lhs.m_value.number_unsigned == rhs.m_value.number_unsigned; + + case value_t::number_float: + return lhs.m_value.number_float == rhs.m_value.number_float; + + case value_t::binary: + return *lhs.m_value.binary == *rhs.m_value.binary; + + default: + return false; + } + } + else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_float) + { + return static_cast(lhs.m_value.number_integer) == rhs.m_value.number_float; + } + else if (lhs_type == value_t::number_float && rhs_type == value_t::number_integer) + { + return lhs.m_value.number_float == static_cast(rhs.m_value.number_integer); + } + else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_float) + { + return static_cast(lhs.m_value.number_unsigned) == rhs.m_value.number_float; + } + else if (lhs_type == value_t::number_float && rhs_type == value_t::number_unsigned) + { + return lhs.m_value.number_float == static_cast(rhs.m_value.number_unsigned); + } + else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_integer) + { + return static_cast(lhs.m_value.number_unsigned) == rhs.m_value.number_integer; + } + else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_unsigned) + { + return lhs.m_value.number_integer == static_cast(rhs.m_value.number_unsigned); + } + + return false; + } + + /*! + @brief comparison: equal + @copydoc operator==(const_reference, const_reference) + */ + template::value, int>::type = 0> + friend bool operator==(const_reference lhs, ScalarType rhs) noexcept + { + return lhs == basic_json(rhs); + } + + /*! + @brief comparison: equal + @copydoc operator==(const_reference, const_reference) + */ + template::value, int>::type = 0> + friend bool operator==(ScalarType lhs, const_reference rhs) noexcept + { + return basic_json(lhs) == rhs; + } + + /*! + @brief comparison: not equal + + Compares two JSON values for inequality by calculating `not (lhs == rhs)`. + + @param[in] lhs first JSON value to consider + @param[in] rhs second JSON value to consider + @return whether the values @a lhs and @a rhs are not equal + + @complexity Linear. + + @exceptionsafety No-throw guarantee: this function never throws exceptions. 
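+
+ A short sketch of the rules above (mixed integer/floating-point operands
+ are converted before comparison; values of different non-numeric types
+ are never equal):
+ @code{cpp}
+ bool b1 = json(1) == json(1.0);            // true, numeric conversion
+ bool b2 = json(nullptr) == json(nullptr);  // true, two nulls are equal
+ bool b3 = json("1") == json(1);            // false, different types
+ bool b4 = json(1) != json(2);              // true
+ @endcode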
+ + @liveexample{The example demonstrates comparing several JSON + types.,operator__notequal} + + @since version 1.0.0 + */ + friend bool operator!=(const_reference lhs, const_reference rhs) noexcept + { + return !(lhs == rhs); + } + + /*! + @brief comparison: not equal + @copydoc operator!=(const_reference, const_reference) + */ + template::value, int>::type = 0> + friend bool operator!=(const_reference lhs, ScalarType rhs) noexcept + { + return lhs != basic_json(rhs); + } + + /*! + @brief comparison: not equal + @copydoc operator!=(const_reference, const_reference) + */ + template::value, int>::type = 0> + friend bool operator!=(ScalarType lhs, const_reference rhs) noexcept + { + return basic_json(lhs) != rhs; + } + + /*! + @brief comparison: less than + + Compares whether one JSON value @a lhs is less than another JSON value @a + rhs according to the following rules: + - If @a lhs and @a rhs have the same type, the values are compared using + the default `<` operator. + - Integer and floating-point numbers are automatically converted before + comparison + - In case @a lhs and @a rhs have different types, the values are ignored + and the order of the types is considered, see + @ref operator<(const value_t, const value_t). + + @param[in] lhs first JSON value to consider + @param[in] rhs second JSON value to consider + @return whether @a lhs is less than @a rhs + + @complexity Linear. + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + + @liveexample{The example demonstrates comparing several JSON + types.,operator__less} + + @since version 1.0.0 + */ + friend bool operator<(const_reference lhs, const_reference rhs) noexcept + { + const auto lhs_type = lhs.type(); + const auto rhs_type = rhs.type(); + + if (lhs_type == rhs_type) + { + switch (lhs_type) + { + case value_t::array: + // note parentheses are necessary, see + // https://github.com/nlohmann/json/issues/1530 + return (*lhs.m_value.array) < (*rhs.m_value.array); + + case value_t::object: + return (*lhs.m_value.object) < (*rhs.m_value.object); + + case value_t::null: + return false; + + case value_t::string: + return (*lhs.m_value.string) < (*rhs.m_value.string); + + case value_t::boolean: + return (lhs.m_value.boolean) < (rhs.m_value.boolean); + + case value_t::number_integer: + return (lhs.m_value.number_integer) < (rhs.m_value.number_integer); + + case value_t::number_unsigned: + return (lhs.m_value.number_unsigned) < (rhs.m_value.number_unsigned); + + case value_t::number_float: + return (lhs.m_value.number_float) < (rhs.m_value.number_float); + + case value_t::binary: + return (*lhs.m_value.binary) < (*rhs.m_value.binary); + + default: + return false; + } + } + else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_float) + { + return static_cast(lhs.m_value.number_integer) < rhs.m_value.number_float; + } + else if (lhs_type == value_t::number_float && rhs_type == value_t::number_integer) + { + return lhs.m_value.number_float < static_cast(rhs.m_value.number_integer); + } + else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_float) + { + return static_cast(lhs.m_value.number_unsigned) < rhs.m_value.number_float; + } + else if (lhs_type == value_t::number_float && rhs_type == value_t::number_unsigned) + { + return lhs.m_value.number_float < static_cast(rhs.m_value.number_unsigned); + } + else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_unsigned) + { + return lhs.m_value.number_integer < 
static_cast(rhs.m_value.number_unsigned); + } + else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_integer) + { + return static_cast(lhs.m_value.number_unsigned) < rhs.m_value.number_integer; + } + + // We only reach this line if we cannot compare values. In that case, + // we compare types. Note we have to call the operator explicitly, + // because MSVC has problems otherwise. + return operator<(lhs_type, rhs_type); + } + + /*! + @brief comparison: less than + @copydoc operator<(const_reference, const_reference) + */ + template::value, int>::type = 0> + friend bool operator<(const_reference lhs, ScalarType rhs) noexcept + { + return lhs < basic_json(rhs); + } + + /*! + @brief comparison: less than + @copydoc operator<(const_reference, const_reference) + */ + template::value, int>::type = 0> + friend bool operator<(ScalarType lhs, const_reference rhs) noexcept + { + return basic_json(lhs) < rhs; + } + + /*! + @brief comparison: less than or equal + + Compares whether one JSON value @a lhs is less than or equal to another + JSON value by calculating `not (rhs < lhs)`. + + @param[in] lhs first JSON value to consider + @param[in] rhs second JSON value to consider + @return whether @a lhs is less than or equal to @a rhs + + @complexity Linear. + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + + @liveexample{The example demonstrates comparing several JSON + types.,operator__greater} + + @since version 1.0.0 + */ + friend bool operator<=(const_reference lhs, const_reference rhs) noexcept + { + return !(rhs < lhs); + } + + /*! + @brief comparison: less than or equal + @copydoc operator<=(const_reference, const_reference) + */ + template::value, int>::type = 0> + friend bool operator<=(const_reference lhs, ScalarType rhs) noexcept + { + return lhs <= basic_json(rhs); + } + + /*! + @brief comparison: less than or equal + @copydoc operator<=(const_reference, const_reference) + */ + template::value, int>::type = 0> + friend bool operator<=(ScalarType lhs, const_reference rhs) noexcept + { + return basic_json(lhs) <= rhs; + } + + /*! + @brief comparison: greater than + + Compares whether one JSON value @a lhs is greater than another + JSON value by calculating `not (lhs <= rhs)`. + + @param[in] lhs first JSON value to consider + @param[in] rhs second JSON value to consider + @return whether @a lhs is greater than to @a rhs + + @complexity Linear. + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + + @liveexample{The example demonstrates comparing several JSON + types.,operator__lessequal} + + @since version 1.0.0 + */ + friend bool operator>(const_reference lhs, const_reference rhs) noexcept + { + return !(lhs <= rhs); + } + + /*! + @brief comparison: greater than + @copydoc operator>(const_reference, const_reference) + */ + template::value, int>::type = 0> + friend bool operator>(const_reference lhs, ScalarType rhs) noexcept + { + return lhs > basic_json(rhs); + } + + /*! + @brief comparison: greater than + @copydoc operator>(const_reference, const_reference) + */ + template::value, int>::type = 0> + friend bool operator>(ScalarType lhs, const_reference rhs) noexcept + { + return basic_json(lhs) > rhs; + } + + /*! + @brief comparison: greater than or equal + + Compares whether one JSON value @a lhs is greater than or equal to another + JSON value by calculating `not (lhs < rhs)`. 
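+
+ Illustrative sketch (same-type operands use their own `<`; mixed
+ integer/floating-point operands are converted, as for @ref operator<):
+ @code{cpp}
+ bool b1 = json(1) < json(2);          // true, numbers compared numerically
+ bool b2 = json("abc") < json("abd");  // true, strings compared lexicographically
+ bool b3 = json(2) >= json(2.0);       // true, mixed number types are converted
+ @endcode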
+ + @param[in] lhs first JSON value to consider + @param[in] rhs second JSON value to consider + @return whether @a lhs is greater than or equal to @a rhs + + @complexity Linear. + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + + @liveexample{The example demonstrates comparing several JSON + types.,operator__greaterequal} + + @since version 1.0.0 + */ + friend bool operator>=(const_reference lhs, const_reference rhs) noexcept + { + return !(lhs < rhs); + } + + /*! + @brief comparison: greater than or equal + @copydoc operator>=(const_reference, const_reference) + */ + template::value, int>::type = 0> + friend bool operator>=(const_reference lhs, ScalarType rhs) noexcept + { + return lhs >= basic_json(rhs); + } + + /*! + @brief comparison: greater than or equal + @copydoc operator>=(const_reference, const_reference) + */ + template::value, int>::type = 0> + friend bool operator>=(ScalarType lhs, const_reference rhs) noexcept + { + return basic_json(lhs) >= rhs; + } + + /// @} + + /////////////////// + // serialization // + /////////////////// + + /// @name serialization + /// @{ +#ifndef JSON_NO_IO + /*! + @brief serialize to stream + + Serialize the given JSON value @a j to the output stream @a o. The JSON + value will be serialized using the @ref dump member function. + + - The indentation of the output can be controlled with the member variable + `width` of the output stream @a o. For instance, using the manipulator + `std::setw(4)` on @a o sets the indentation level to `4` and the + serialization result is the same as calling `dump(4)`. + + - The indentation character can be controlled with the member variable + `fill` of the output stream @a o. For instance, the manipulator + `std::setfill('\\t')` sets indentation to use a tab character rather than + the default space character. + + @param[in,out] o stream to serialize to + @param[in] j JSON value to serialize + + @return the stream @a o + + @throw type_error.316 if a string stored inside the JSON value is not + UTF-8 encoded + + @complexity Linear. + + @liveexample{The example below shows the serialization with different + parameters to `width` to adjust the indentation level.,operator_serialize} + + @since version 1.0.0; indentation character added in version 3.0.0 + */ + friend std::ostream& operator<<(std::ostream& o, const basic_json& j) + { + // read width member and use it as indentation parameter if nonzero + const bool pretty_print = o.width() > 0; + const auto indentation = pretty_print ? o.width() : 0; + + // reset width to 0 for subsequent calls to this stream + o.width(0); + + // do the actual serialization + serializer s(detail::output_adapter(o), o.fill()); + s.dump(j, pretty_print, false, static_cast(indentation)); + return o; + } + + /*! + @brief serialize to stream + @deprecated This stream operator is deprecated and will be removed in + future 4.0.0 of the library. Please use + @ref operator<<(std::ostream&, const basic_json&) + instead; that is, replace calls like `j >> o;` with `o << j;`. + @since version 1.0.0; deprecated since version 3.0.0 + */ + JSON_HEDLEY_DEPRECATED_FOR(3.0.0, operator<<(std::ostream&, const basic_json&)) + friend std::ostream& operator>>(const basic_json& j, std::ostream& o) + { + return o << j; + } +#endif // JSON_NO_IO + /// @} + + + ///////////////////// + // deserialization // + ///////////////////// + + /// @name deserialization + /// @{ + + /*! 
+ @brief deserialize from a compatible input + + @tparam InputType A compatible input, for instance + - an std::istream object + - a FILE pointer + - a C-style array of characters + - a pointer to a null-terminated string of single byte characters + - an object obj for which begin(obj) and end(obj) produces a valid pair of + iterators. + + @param[in] i input to read from + @param[in] cb a parser callback function of type @ref parser_callback_t + which is used to control the deserialization by filtering unwanted values + (optional) + @param[in] allow_exceptions whether to throw exceptions in case of a + parse error (optional, true by default) + @param[in] ignore_comments whether comments should be ignored and treated + like whitespace (true) or yield a parse error (true); (optional, false by + default) + + @return deserialized JSON value; in case of a parse error and + @a allow_exceptions set to `false`, the return value will be + value_t::discarded. + + @throw parse_error.101 if a parse error occurs; example: `""unexpected end + of input; expected string literal""` + @throw parse_error.102 if to_unicode fails or surrogate error + @throw parse_error.103 if to_unicode fails + + @complexity Linear in the length of the input. The parser is a predictive + LL(1) parser. The complexity can be higher if the parser callback function + @a cb or reading from the input @a i has a super-linear complexity. + + @note A UTF-8 byte order mark is silently ignored. + + @liveexample{The example below demonstrates the `parse()` function reading + from an array.,parse__array__parser_callback_t} + + @liveexample{The example below demonstrates the `parse()` function with + and without callback function.,parse__string__parser_callback_t} + + @liveexample{The example below demonstrates the `parse()` function with + and without callback function.,parse__istream__parser_callback_t} + + @liveexample{The example below demonstrates the `parse()` function reading + from a contiguous container.,parse__contiguouscontainer__parser_callback_t} + + @since version 2.0.3 (contiguous containers); version 3.9.0 allowed to + ignore comments. + */ + template + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json parse(InputType&& i, + const parser_callback_t cb = nullptr, + const bool allow_exceptions = true, + const bool ignore_comments = false) + { + basic_json result; + parser(detail::input_adapter(std::forward(i)), cb, allow_exceptions, ignore_comments).parse(true, result); + return result; + } + + /*! + @brief deserialize from a pair of character iterators + + The value_type of the iterator must be a integral type with size of 1, 2 or + 4 bytes, which will be interpreted respectively as UTF-8, UTF-16 and UTF-32. + + @param[in] first iterator to start of character range + @param[in] last iterator to end of character range + @param[in] cb a parser callback function of type @ref parser_callback_t + which is used to control the deserialization by filtering unwanted values + (optional) + @param[in] allow_exceptions whether to throw exceptions in case of a + parse error (optional, true by default) + @param[in] ignore_comments whether comments should be ignored and treated + like whitespace (true) or yield a parse error (true); (optional, false by + default) + + @return deserialized JSON value; in case of a parse error and + @a allow_exceptions set to `false`, the return value will be + value_t::discarded. 
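+
+ A sketch of the non-throwing mode described above (the input is
+ deliberately truncated):
+ @code{cpp}
+ std::string text = R"({"valid": )";
+ json j = json::parse(text.begin(), text.end(),
+                      /* cb */ nullptr,
+                      /* allow_exceptions */ false);
+ // j.is_discarded() == true instead of a thrown parse_error
+ @endcode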
+ + @throw parse_error.101 if a parse error occurs; example: `""unexpected end + of input; expected string literal""` + @throw parse_error.102 if to_unicode fails or surrogate error + @throw parse_error.103 if to_unicode fails + */ + template + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json parse(IteratorType first, + IteratorType last, + const parser_callback_t cb = nullptr, + const bool allow_exceptions = true, + const bool ignore_comments = false) + { + basic_json result; + parser(detail::input_adapter(std::move(first), std::move(last)), cb, allow_exceptions, ignore_comments).parse(true, result); + return result; + } + + JSON_HEDLEY_WARN_UNUSED_RESULT + JSON_HEDLEY_DEPRECATED_FOR(3.8.0, parse(ptr, ptr + len)) + static basic_json parse(detail::span_input_adapter&& i, + const parser_callback_t cb = nullptr, + const bool allow_exceptions = true, + const bool ignore_comments = false) + { + basic_json result; + parser(i.get(), cb, allow_exceptions, ignore_comments).parse(true, result); + return result; + } + + /*! + @brief check if the input is valid JSON + + Unlike the @ref parse(InputType&&, const parser_callback_t,const bool) + function, this function neither throws an exception in case of invalid JSON + input (i.e., a parse error) nor creates diagnostic information. + + @tparam InputType A compatible input, for instance + - an std::istream object + - a FILE pointer + - a C-style array of characters + - a pointer to a null-terminated string of single byte characters + - an object obj for which begin(obj) and end(obj) produces a valid pair of + iterators. + + @param[in] i input to read from + @param[in] ignore_comments whether comments should be ignored and treated + like whitespace (true) or yield a parse error (true); (optional, false by + default) + + @return Whether the input read from @a i is valid JSON. + + @complexity Linear in the length of the input. The parser is a predictive + LL(1) parser. + + @note A UTF-8 byte order mark is silently ignored. + + @liveexample{The example below demonstrates the `accept()` function reading + from a string.,accept__string} + */ + template + static bool accept(InputType&& i, + const bool ignore_comments = false) + { + return parser(detail::input_adapter(std::forward(i)), nullptr, false, ignore_comments).accept(true); + } + + template + static bool accept(IteratorType first, IteratorType last, + const bool ignore_comments = false) + { + return parser(detail::input_adapter(std::move(first), std::move(last)), nullptr, false, ignore_comments).accept(true); + } + + JSON_HEDLEY_WARN_UNUSED_RESULT + JSON_HEDLEY_DEPRECATED_FOR(3.8.0, accept(ptr, ptr + len)) + static bool accept(detail::span_input_adapter&& i, + const bool ignore_comments = false) + { + return parser(i.get(), nullptr, false, ignore_comments).accept(true); + } + + /*! + @brief generate SAX events + + The SAX event lister must follow the interface of @ref json_sax. + + This function reads from a compatible input. Examples are: + - an std::istream object + - a FILE pointer + - a C-style array of characters + - a pointer to a null-terminated string of single byte characters + - an object obj for which begin(obj) and end(obj) produces a valid pair of + iterators. 
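+
+ The same kinds of input are also accepted by @ref parse and @ref accept;
+ a brief sketch (the file name is illustrative):
+ @code{cpp}
+ bool ok1 = json::accept(R"({"valid": true})");   // null-terminated string
+ std::ifstream f("input.json");                   // any std::istream
+ bool ok2 = json::accept(f);
+ @endcode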
+ + @param[in] i input to read from + @param[in,out] sax SAX event listener + @param[in] format the format to parse (JSON, CBOR, MessagePack, or UBJSON) + @param[in] strict whether the input has to be consumed completely + @param[in] ignore_comments whether comments should be ignored and treated + like whitespace (true) or yield a parse error (true); (optional, false by + default); only applies to the JSON file format. + + @return return value of the last processed SAX event + + @throw parse_error.101 if a parse error occurs; example: `""unexpected end + of input; expected string literal""` + @throw parse_error.102 if to_unicode fails or surrogate error + @throw parse_error.103 if to_unicode fails + + @complexity Linear in the length of the input. The parser is a predictive + LL(1) parser. The complexity can be higher if the SAX consumer @a sax has + a super-linear complexity. + + @note A UTF-8 byte order mark is silently ignored. + + @liveexample{The example below demonstrates the `sax_parse()` function + reading from string and processing the events with a user-defined SAX + event consumer.,sax_parse} + + @since version 3.2.0 + */ + template + JSON_HEDLEY_NON_NULL(2) + static bool sax_parse(InputType&& i, SAX* sax, + input_format_t format = input_format_t::json, + const bool strict = true, + const bool ignore_comments = false) + { + auto ia = detail::input_adapter(std::forward(i)); + return format == input_format_t::json + ? parser(std::move(ia), nullptr, true, ignore_comments).sax_parse(sax, strict) + : detail::binary_reader(std::move(ia)).sax_parse(format, sax, strict); + } + + template + JSON_HEDLEY_NON_NULL(3) + static bool sax_parse(IteratorType first, IteratorType last, SAX* sax, + input_format_t format = input_format_t::json, + const bool strict = true, + const bool ignore_comments = false) + { + auto ia = detail::input_adapter(std::move(first), std::move(last)); + return format == input_format_t::json + ? parser(std::move(ia), nullptr, true, ignore_comments).sax_parse(sax, strict) + : detail::binary_reader(std::move(ia)).sax_parse(format, sax, strict); + } + + template + JSON_HEDLEY_DEPRECATED_FOR(3.8.0, sax_parse(ptr, ptr + len, ...)) + JSON_HEDLEY_NON_NULL(2) + static bool sax_parse(detail::span_input_adapter&& i, SAX* sax, + input_format_t format = input_format_t::json, + const bool strict = true, + const bool ignore_comments = false) + { + auto ia = i.get(); + return format == input_format_t::json + // NOLINTNEXTLINE(hicpp-move-const-arg,performance-move-const-arg) + ? parser(std::move(ia), nullptr, true, ignore_comments).sax_parse(sax, strict) + // NOLINTNEXTLINE(hicpp-move-const-arg,performance-move-const-arg) + : detail::binary_reader(std::move(ia)).sax_parse(format, sax, strict); + } +#ifndef JSON_NO_IO + /*! + @brief deserialize from stream + @deprecated This stream operator is deprecated and will be removed in + version 4.0.0 of the library. Please use + @ref operator>>(std::istream&, basic_json&) + instead; that is, replace calls like `j << i;` with `i >> j;`. + @since version 1.0.0; deprecated since version 3.0.0 + */ + JSON_HEDLEY_DEPRECATED_FOR(3.0.0, operator>>(std::istream&, basic_json&)) + friend std::istream& operator<<(basic_json& j, std::istream& i) + { + return operator>>(i, j); + } + + /*! + @brief deserialize from stream + + Deserializes an input stream to a JSON value. 
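+
+ Minimal sketch:
+ @code{cpp}
+ std::istringstream iss(R"({"pi": 3.141})");
+ json j;
+ iss >> j;   // j now holds the parsed object
+ @endcode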
+ + @param[in,out] i input stream to read a serialized JSON value from + @param[in,out] j JSON value to write the deserialized input to + + @throw parse_error.101 in case of an unexpected token + @throw parse_error.102 if to_unicode fails or surrogate error + @throw parse_error.103 if to_unicode fails + + @complexity Linear in the length of the input. The parser is a predictive + LL(1) parser. + + @note A UTF-8 byte order mark is silently ignored. + + @liveexample{The example below shows how a JSON value is constructed by + reading a serialization from a stream.,operator_deserialize} + + @sa parse(std::istream&, const parser_callback_t) for a variant with a + parser callback function to filter values while parsing + + @since version 1.0.0 + */ + friend std::istream& operator>>(std::istream& i, basic_json& j) + { + parser(detail::input_adapter(i)).parse(false, j); + return i; + } +#endif // JSON_NO_IO + /// @} + + /////////////////////////// + // convenience functions // + /////////////////////////// + + /*! + @brief return the type as string + + Returns the type name as string to be used in error messages - usually to + indicate that a function was called on a wrong JSON type. + + @return a string representation of a the @a m_type member: + Value type | return value + ----------- | ------------- + null | `"null"` + boolean | `"boolean"` + string | `"string"` + number | `"number"` (for all number types) + object | `"object"` + array | `"array"` + binary | `"binary"` + discarded | `"discarded"` + + @exceptionsafety No-throw guarantee: this function never throws exceptions. + + @complexity Constant. + + @liveexample{The following code exemplifies `type_name()` for all JSON + types.,type_name} + + @sa see @ref type() -- return the type of the JSON value + @sa see @ref operator value_t() -- return the type of the JSON value (implicit) + + @since version 1.0.0, public since 2.1.0, `const char*` and `noexcept` + since 3.0.0 + */ + JSON_HEDLEY_RETURNS_NON_NULL + const char* type_name() const noexcept + { + { + switch (m_type) + { + case value_t::null: + return "null"; + case value_t::object: + return "object"; + case value_t::array: + return "array"; + case value_t::string: + return "string"; + case value_t::boolean: + return "boolean"; + case value_t::binary: + return "binary"; + case value_t::discarded: + return "discarded"; + default: + return "number"; + } + } + } + + + JSON_PRIVATE_UNLESS_TESTED: + ////////////////////// + // member variables // + ////////////////////// + + /// the type of the current element + value_t m_type = value_t::null; + + /// the value of the current element + json_value m_value = {}; + +#if JSON_DIAGNOSTICS + /// a pointer to a parent value (for debugging purposes) + basic_json* m_parent = nullptr; +#endif + + ////////////////////////////////////////// + // binary serialization/deserialization // + ////////////////////////////////////////// + + /// @name binary serialization/deserialization support + /// @{ + + public: + /*! + @brief create a CBOR serialization of a given JSON value + + Serializes a given JSON value @a j to a byte vector using the CBOR (Concise + Binary Object Representation) serialization format. CBOR is a binary + serialization format which aims to be more compact than JSON itself, yet + more efficient to parse. 
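+
+    A minimal usage sketch (the value of `j` is an arbitrary example):
+    @code {.cpp}
+    nlohmann::json j = {{"compact", true}, {"schema", 0}};
+    std::vector<std::uint8_t> v = nlohmann::json::to_cbor(j);
+    @endcode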
+ + The library uses the following mapping from JSON values types to + CBOR types according to the CBOR specification (RFC 7049): + + JSON value type | value/range | CBOR type | first byte + --------------- | ------------------------------------------ | ---------------------------------- | --------------- + null | `null` | Null | 0xF6 + boolean | `true` | True | 0xF5 + boolean | `false` | False | 0xF4 + number_integer | -9223372036854775808..-2147483649 | Negative integer (8 bytes follow) | 0x3B + number_integer | -2147483648..-32769 | Negative integer (4 bytes follow) | 0x3A + number_integer | -32768..-129 | Negative integer (2 bytes follow) | 0x39 + number_integer | -128..-25 | Negative integer (1 byte follow) | 0x38 + number_integer | -24..-1 | Negative integer | 0x20..0x37 + number_integer | 0..23 | Integer | 0x00..0x17 + number_integer | 24..255 | Unsigned integer (1 byte follow) | 0x18 + number_integer | 256..65535 | Unsigned integer (2 bytes follow) | 0x19 + number_integer | 65536..4294967295 | Unsigned integer (4 bytes follow) | 0x1A + number_integer | 4294967296..18446744073709551615 | Unsigned integer (8 bytes follow) | 0x1B + number_unsigned | 0..23 | Integer | 0x00..0x17 + number_unsigned | 24..255 | Unsigned integer (1 byte follow) | 0x18 + number_unsigned | 256..65535 | Unsigned integer (2 bytes follow) | 0x19 + number_unsigned | 65536..4294967295 | Unsigned integer (4 bytes follow) | 0x1A + number_unsigned | 4294967296..18446744073709551615 | Unsigned integer (8 bytes follow) | 0x1B + number_float | *any value representable by a float* | Single-Precision Float | 0xFA + number_float | *any value NOT representable by a float* | Double-Precision Float | 0xFB + string | *length*: 0..23 | UTF-8 string | 0x60..0x77 + string | *length*: 23..255 | UTF-8 string (1 byte follow) | 0x78 + string | *length*: 256..65535 | UTF-8 string (2 bytes follow) | 0x79 + string | *length*: 65536..4294967295 | UTF-8 string (4 bytes follow) | 0x7A + string | *length*: 4294967296..18446744073709551615 | UTF-8 string (8 bytes follow) | 0x7B + array | *size*: 0..23 | array | 0x80..0x97 + array | *size*: 23..255 | array (1 byte follow) | 0x98 + array | *size*: 256..65535 | array (2 bytes follow) | 0x99 + array | *size*: 65536..4294967295 | array (4 bytes follow) | 0x9A + array | *size*: 4294967296..18446744073709551615 | array (8 bytes follow) | 0x9B + object | *size*: 0..23 | map | 0xA0..0xB7 + object | *size*: 23..255 | map (1 byte follow) | 0xB8 + object | *size*: 256..65535 | map (2 bytes follow) | 0xB9 + object | *size*: 65536..4294967295 | map (4 bytes follow) | 0xBA + object | *size*: 4294967296..18446744073709551615 | map (8 bytes follow) | 0xBB + binary | *size*: 0..23 | byte string | 0x40..0x57 + binary | *size*: 23..255 | byte string (1 byte follow) | 0x58 + binary | *size*: 256..65535 | byte string (2 bytes follow) | 0x59 + binary | *size*: 65536..4294967295 | byte string (4 bytes follow) | 0x5A + binary | *size*: 4294967296..18446744073709551615 | byte string (8 bytes follow) | 0x5B + + @note The mapping is **complete** in the sense that any JSON value type + can be converted to a CBOR value. + + @note If NaN or Infinity are stored inside a JSON number, they are + serialized properly. This behavior differs from the @ref dump() + function which serializes NaN or Infinity to `null`. 
+
+    @note The following CBOR types are not used in the conversion:
+          - UTF-8 strings terminated by "break" (0x7F)
+          - arrays terminated by "break" (0x9F)
+          - maps terminated by "break" (0xBF)
+          - byte strings terminated by "break" (0x5F)
+          - date/time (0xC0..0xC1)
+          - bignum (0xC2..0xC3)
+          - decimal fraction (0xC4)
+          - bigfloat (0xC5)
+          - expected conversions (0xD5..0xD7)
+          - simple values (0xE0..0xF3, 0xF8)
+          - undefined (0xF7)
+          - half-precision floats (0xF9)
+          - break (0xFF)
+
+    @param[in] j JSON value to serialize
+    @return CBOR serialization as byte vector
+
+    @complexity Linear in the size of the JSON value @a j.
+
+    @liveexample{The example shows the serialization of a JSON value to a byte
+    vector in CBOR format.,to_cbor}
+
+    @sa http://cbor.io
+    @sa see @ref from_cbor(InputType&&, const bool, const bool, const cbor_tag_handler_t) for the
+        analogous deserialization
+    @sa see @ref to_msgpack(const basic_json&) for the related MessagePack format
+    @sa see @ref to_ubjson(const basic_json&, const bool, const bool) for the
+        related UBJSON format
+
+    @since version 2.0.9; compact representation of floating-point numbers
+           since version 3.8.0
+    */
+    static std::vector<std::uint8_t> to_cbor(const basic_json& j)
+    {
+        std::vector<std::uint8_t> result;
+        to_cbor(j, result);
+        return result;
+    }
+
+    static void to_cbor(const basic_json& j, detail::output_adapter<std::uint8_t> o)
+    {
+        binary_writer<std::uint8_t>(o).write_cbor(j);
+    }
+
+    static void to_cbor(const basic_json& j, detail::output_adapter<char> o)
+    {
+        binary_writer<char>(o).write_cbor(j);
+    }
+
+    /*!
+    @brief create a MessagePack serialization of a given JSON value
+
+    Serializes a given JSON value @a j to a byte vector using the MessagePack
+    serialization format. MessagePack is a binary serialization format which
+    aims to be more compact than JSON itself, yet more efficient to parse.
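+
+    A minimal usage sketch (the value of `j` is an arbitrary example):
+    @code {.cpp}
+    nlohmann::json j = {{"compact", true}, {"schema", 0}};
+    std::vector<std::uint8_t> v = nlohmann::json::to_msgpack(j);
+    @endcode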
+ + The library uses the following mapping from JSON values types to + MessagePack types according to the MessagePack specification: + + JSON value type | value/range | MessagePack type | first byte + --------------- | --------------------------------- | ---------------- | ---------- + null | `null` | nil | 0xC0 + boolean | `true` | true | 0xC3 + boolean | `false` | false | 0xC2 + number_integer | -9223372036854775808..-2147483649 | int64 | 0xD3 + number_integer | -2147483648..-32769 | int32 | 0xD2 + number_integer | -32768..-129 | int16 | 0xD1 + number_integer | -128..-33 | int8 | 0xD0 + number_integer | -32..-1 | negative fixint | 0xE0..0xFF + number_integer | 0..127 | positive fixint | 0x00..0x7F + number_integer | 128..255 | uint 8 | 0xCC + number_integer | 256..65535 | uint 16 | 0xCD + number_integer | 65536..4294967295 | uint 32 | 0xCE + number_integer | 4294967296..18446744073709551615 | uint 64 | 0xCF + number_unsigned | 0..127 | positive fixint | 0x00..0x7F + number_unsigned | 128..255 | uint 8 | 0xCC + number_unsigned | 256..65535 | uint 16 | 0xCD + number_unsigned | 65536..4294967295 | uint 32 | 0xCE + number_unsigned | 4294967296..18446744073709551615 | uint 64 | 0xCF + number_float | *any value representable by a float* | float 32 | 0xCA + number_float | *any value NOT representable by a float* | float 64 | 0xCB + string | *length*: 0..31 | fixstr | 0xA0..0xBF + string | *length*: 32..255 | str 8 | 0xD9 + string | *length*: 256..65535 | str 16 | 0xDA + string | *length*: 65536..4294967295 | str 32 | 0xDB + array | *size*: 0..15 | fixarray | 0x90..0x9F + array | *size*: 16..65535 | array 16 | 0xDC + array | *size*: 65536..4294967295 | array 32 | 0xDD + object | *size*: 0..15 | fix map | 0x80..0x8F + object | *size*: 16..65535 | map 16 | 0xDE + object | *size*: 65536..4294967295 | map 32 | 0xDF + binary | *size*: 0..255 | bin 8 | 0xC4 + binary | *size*: 256..65535 | bin 16 | 0xC5 + binary | *size*: 65536..4294967295 | bin 32 | 0xC6 + + @note The mapping is **complete** in the sense that any JSON value type + can be converted to a MessagePack value. + + @note The following values can **not** be converted to a MessagePack value: + - strings with more than 4294967295 bytes + - byte strings with more than 4294967295 bytes + - arrays with more than 4294967295 elements + - objects with more than 4294967295 elements + + @note Any MessagePack output created @ref to_msgpack can be successfully + parsed by @ref from_msgpack. + + @note If NaN or Infinity are stored inside a JSON number, they are + serialized properly. This behavior differs from the @ref dump() + function which serializes NaN or Infinity to `null`. + + @param[in] j JSON value to serialize + @return MessagePack serialization as byte vector + + @complexity Linear in the size of the JSON value @a j. 
+ + @liveexample{The example shows the serialization of a JSON value to a byte + vector in MessagePack format.,to_msgpack} + + @sa http://msgpack.org + @sa see @ref from_msgpack for the analogous deserialization + @sa see @ref to_cbor(const basic_json& for the related CBOR format + @sa see @ref to_ubjson(const basic_json&, const bool, const bool) for the + related UBJSON format + + @since version 2.0.9 + */ + static std::vector to_msgpack(const basic_json& j) + { + std::vector result; + to_msgpack(j, result); + return result; + } + + static void to_msgpack(const basic_json& j, detail::output_adapter o) + { + binary_writer(o).write_msgpack(j); + } + + static void to_msgpack(const basic_json& j, detail::output_adapter o) + { + binary_writer(o).write_msgpack(j); + } + + /*! + @brief create a UBJSON serialization of a given JSON value + + Serializes a given JSON value @a j to a byte vector using the UBJSON + (Universal Binary JSON) serialization format. UBJSON aims to be more compact + than JSON itself, yet more efficient to parse. + + The library uses the following mapping from JSON values types to + UBJSON types according to the UBJSON specification: + + JSON value type | value/range | UBJSON type | marker + --------------- | --------------------------------- | ----------- | ------ + null | `null` | null | `Z` + boolean | `true` | true | `T` + boolean | `false` | false | `F` + number_integer | -9223372036854775808..-2147483649 | int64 | `L` + number_integer | -2147483648..-32769 | int32 | `l` + number_integer | -32768..-129 | int16 | `I` + number_integer | -128..127 | int8 | `i` + number_integer | 128..255 | uint8 | `U` + number_integer | 256..32767 | int16 | `I` + number_integer | 32768..2147483647 | int32 | `l` + number_integer | 2147483648..9223372036854775807 | int64 | `L` + number_unsigned | 0..127 | int8 | `i` + number_unsigned | 128..255 | uint8 | `U` + number_unsigned | 256..32767 | int16 | `I` + number_unsigned | 32768..2147483647 | int32 | `l` + number_unsigned | 2147483648..9223372036854775807 | int64 | `L` + number_unsigned | 2147483649..18446744073709551615 | high-precision | `H` + number_float | *any value* | float64 | `D` + string | *with shortest length indicator* | string | `S` + array | *see notes on optimized format* | array | `[` + object | *see notes on optimized format* | map | `{` + + @note The mapping is **complete** in the sense that any JSON value type + can be converted to a UBJSON value. + + @note The following values can **not** be converted to a UBJSON value: + - strings with more than 9223372036854775807 bytes (theoretical) + + @note The following markers are not used in the conversion: + - `Z`: no-op values are not created. + - `C`: single-byte strings are serialized with `S` markers. + + @note Any UBJSON output created @ref to_ubjson can be successfully parsed + by @ref from_ubjson. + + @note If NaN or Infinity are stored inside a JSON number, they are + serialized properly. This behavior differs from the @ref dump() + function which serializes NaN or Infinity to `null`. + + @note The optimized formats for containers are supported: Parameter + @a use_size adds size information to the beginning of a container and + removes the closing marker. Parameter @a use_type further checks + whether all elements of a container have the same type and adds the + type marker to the beginning of the container. The @a use_type + parameter must only be used together with @a use_size = true. 
Note + that @a use_size = true alone may result in larger representations - + the benefit of this parameter is that the receiving side is + immediately informed on the number of elements of the container. + + @note If the JSON data contains the binary type, the value stored is a list + of integers, as suggested by the UBJSON documentation. In particular, + this means that serialization and the deserialization of a JSON + containing binary values into UBJSON and back will result in a + different JSON object. + + @param[in] j JSON value to serialize + @param[in] use_size whether to add size annotations to container types + @param[in] use_type whether to add type annotations to container types + (must be combined with @a use_size = true) + @return UBJSON serialization as byte vector + + @complexity Linear in the size of the JSON value @a j. + + @liveexample{The example shows the serialization of a JSON value to a byte + vector in UBJSON format.,to_ubjson} + + @sa http://ubjson.org + @sa see @ref from_ubjson(InputType&&, const bool, const bool) for the + analogous deserialization + @sa see @ref to_cbor(const basic_json& for the related CBOR format + @sa see @ref to_msgpack(const basic_json&) for the related MessagePack format + + @since version 3.1.0 + */ + static std::vector to_ubjson(const basic_json& j, + const bool use_size = false, + const bool use_type = false) + { + std::vector result; + to_ubjson(j, result, use_size, use_type); + return result; + } + + static void to_ubjson(const basic_json& j, detail::output_adapter o, + const bool use_size = false, const bool use_type = false) + { + binary_writer(o).write_ubjson(j, use_size, use_type); + } + + static void to_ubjson(const basic_json& j, detail::output_adapter o, + const bool use_size = false, const bool use_type = false) + { + binary_writer(o).write_ubjson(j, use_size, use_type); + } + + + /*! + @brief Serializes the given JSON object `j` to BSON and returns a vector + containing the corresponding BSON-representation. + + BSON (Binary JSON) is a binary format in which zero or more ordered key/value pairs are + stored as a single entity (a so-called document). + + The library uses the following mapping from JSON values types to BSON types: + + JSON value type | value/range | BSON type | marker + --------------- | --------------------------------- | ----------- | ------ + null | `null` | null | 0x0A + boolean | `true`, `false` | boolean | 0x08 + number_integer | -9223372036854775808..-2147483649 | int64 | 0x12 + number_integer | -2147483648..2147483647 | int32 | 0x10 + number_integer | 2147483648..9223372036854775807 | int64 | 0x12 + number_unsigned | 0..2147483647 | int32 | 0x10 + number_unsigned | 2147483648..9223372036854775807 | int64 | 0x12 + number_unsigned | 9223372036854775808..18446744073709551615| -- | -- + number_float | *any value* | double | 0x01 + string | *any value* | string | 0x02 + array | *any value* | document | 0x04 + object | *any value* | document | 0x03 + binary | *any value* | binary | 0x05 + + @warning The mapping is **incomplete**, since only JSON-objects (and things + contained therein) can be serialized to BSON. + Also, integers larger than 9223372036854775807 cannot be serialized to BSON, + and the keys may not contain U+0000, since they are serialized a + zero-terminated c-strings. 
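+
+    A minimal usage sketch (the value of `j` is an arbitrary example; note the
+    precondition that `j` must be an object):
+    @code {.cpp}
+    nlohmann::json j = {{"entry", 1}, {"name", "example"}};
+    std::vector<std::uint8_t> v = nlohmann::json::to_bson(j);
+    @endcode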
+
+    @throw out_of_range.407 if `j.is_number_unsigned() && j.get<std::uint64_t>() > 9223372036854775807`
+    @throw out_of_range.409 if a key in `j` contains a NULL (U+0000)
+    @throw type_error.317 if `!j.is_object()`
+
+    @pre The input `j` is required to be an object: `j.is_object() == true`.
+
+    @note Any BSON output created via @ref to_bson can be successfully parsed
+          by @ref from_bson.
+
+    @param[in] j JSON value to serialize
+    @return BSON serialization as byte vector
+
+    @complexity Linear in the size of the JSON value @a j.
+
+    @liveexample{The example shows the serialization of a JSON value to a byte
+    vector in BSON format.,to_bson}
+
+    @sa http://bsonspec.org/spec.html
+    @sa see @ref from_bson(detail::input_adapter&&, const bool strict) for the
+        analogous deserialization
+    @sa see @ref to_ubjson(const basic_json&, const bool, const bool) for the
+        related UBJSON format
+    @sa see @ref to_cbor(const basic_json&) for the related CBOR format
+    @sa see @ref to_msgpack(const basic_json&) for the related MessagePack format
+    */
+    static std::vector<std::uint8_t> to_bson(const basic_json& j)
+    {
+        std::vector<std::uint8_t> result;
+        to_bson(j, result);
+        return result;
+    }
+
+    /*!
+    @brief Serializes the given JSON object `j` to BSON and forwards the
+           corresponding BSON-representation to the given output_adapter `o`.
+    @param j The JSON object to convert to BSON.
+    @param o The output adapter that receives the binary BSON representation.
+    @pre The input `j` shall be an object: `j.is_object() == true`
+    @sa see @ref to_bson(const basic_json&)
+    */
+    static void to_bson(const basic_json& j, detail::output_adapter<std::uint8_t> o)
+    {
+        binary_writer<std::uint8_t>(o).write_bson(j);
+    }
+
+    /*!
+    @copydoc to_bson(const basic_json&, detail::output_adapter<std::uint8_t>)
+    */
+    static void to_bson(const basic_json& j, detail::output_adapter<char> o)
+    {
+        binary_writer<char>(o).write_bson(j);
+    }
+
+
+    /*!
+    @brief create a JSON value from an input in CBOR format
+
+    Deserializes a given input @a i to a JSON value using the CBOR (Concise
+    Binary Object Representation) serialization format.
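+
+    A minimal round-trip sketch (the value of `j` is an arbitrary example):
+    @code {.cpp}
+    nlohmann::json j = {{"pi", 3.141}, {"happy", true}};
+    std::vector<std::uint8_t> v = nlohmann::json::to_cbor(j);
+    nlohmann::json restored = nlohmann::json::from_cbor(v);  // restored == j
+    @endcode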
+ + The library maps CBOR types to JSON value types as follows: + + CBOR type | JSON value type | first byte + ---------------------- | --------------- | ---------- + Integer | number_unsigned | 0x00..0x17 + Unsigned integer | number_unsigned | 0x18 + Unsigned integer | number_unsigned | 0x19 + Unsigned integer | number_unsigned | 0x1A + Unsigned integer | number_unsigned | 0x1B + Negative integer | number_integer | 0x20..0x37 + Negative integer | number_integer | 0x38 + Negative integer | number_integer | 0x39 + Negative integer | number_integer | 0x3A + Negative integer | number_integer | 0x3B + Byte string | binary | 0x40..0x57 + Byte string | binary | 0x58 + Byte string | binary | 0x59 + Byte string | binary | 0x5A + Byte string | binary | 0x5B + UTF-8 string | string | 0x60..0x77 + UTF-8 string | string | 0x78 + UTF-8 string | string | 0x79 + UTF-8 string | string | 0x7A + UTF-8 string | string | 0x7B + UTF-8 string | string | 0x7F + array | array | 0x80..0x97 + array | array | 0x98 + array | array | 0x99 + array | array | 0x9A + array | array | 0x9B + array | array | 0x9F + map | object | 0xA0..0xB7 + map | object | 0xB8 + map | object | 0xB9 + map | object | 0xBA + map | object | 0xBB + map | object | 0xBF + False | `false` | 0xF4 + True | `true` | 0xF5 + Null | `null` | 0xF6 + Half-Precision Float | number_float | 0xF9 + Single-Precision Float | number_float | 0xFA + Double-Precision Float | number_float | 0xFB + + @warning The mapping is **incomplete** in the sense that not all CBOR + types can be converted to a JSON value. The following CBOR types + are not supported and will yield parse errors (parse_error.112): + - date/time (0xC0..0xC1) + - bignum (0xC2..0xC3) + - decimal fraction (0xC4) + - bigfloat (0xC5) + - expected conversions (0xD5..0xD7) + - simple values (0xE0..0xF3, 0xF8) + - undefined (0xF7) + + @warning CBOR allows map keys of any type, whereas JSON only allows + strings as keys in object values. Therefore, CBOR maps with keys + other than UTF-8 strings are rejected (parse_error.113). + + @note Any CBOR output created @ref to_cbor can be successfully parsed by + @ref from_cbor. + + @param[in] i an input in CBOR format convertible to an input adapter + @param[in] strict whether to expect the input to be consumed until EOF + (true by default) + @param[in] allow_exceptions whether to throw exceptions in case of a + parse error (optional, true by default) + @param[in] tag_handler how to treat CBOR tags (optional, error by default) + + @return deserialized JSON value; in case of a parse error and + @a allow_exceptions set to `false`, the return value will be + value_t::discarded. + + @throw parse_error.110 if the given input ends prematurely or the end of + file was not reached when @a strict was set to true + @throw parse_error.112 if unsupported features from CBOR were + used in the given input @a v or if the input is not valid CBOR + @throw parse_error.113 if a string was expected as map key, but not found + + @complexity Linear in the size of the input @a i. 
+ + @liveexample{The example shows the deserialization of a byte vector in CBOR + format to a JSON value.,from_cbor} + + @sa http://cbor.io + @sa see @ref to_cbor(const basic_json&) for the analogous serialization + @sa see @ref from_msgpack(InputType&&, const bool, const bool) for the + related MessagePack format + @sa see @ref from_ubjson(InputType&&, const bool, const bool) for the + related UBJSON format + + @since version 2.0.9; parameter @a start_index since 2.1.1; changed to + consume input adapters, removed start_index parameter, and added + @a strict parameter since 3.0.0; added @a allow_exceptions parameter + since 3.2.0; added @a tag_handler parameter since 3.9.0. + */ + template + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json from_cbor(InputType&& i, + const bool strict = true, + const bool allow_exceptions = true, + const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error) + { + basic_json result; + detail::json_sax_dom_parser sdp(result, allow_exceptions); + auto ia = detail::input_adapter(std::forward(i)); + const bool res = binary_reader(std::move(ia)).sax_parse(input_format_t::cbor, &sdp, strict, tag_handler); + return res ? result : basic_json(value_t::discarded); + } + + /*! + @copydoc from_cbor(InputType&&, const bool, const bool, const cbor_tag_handler_t) + */ + template + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json from_cbor(IteratorType first, IteratorType last, + const bool strict = true, + const bool allow_exceptions = true, + const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error) + { + basic_json result; + detail::json_sax_dom_parser sdp(result, allow_exceptions); + auto ia = detail::input_adapter(std::move(first), std::move(last)); + const bool res = binary_reader(std::move(ia)).sax_parse(input_format_t::cbor, &sdp, strict, tag_handler); + return res ? result : basic_json(value_t::discarded); + } + + template + JSON_HEDLEY_WARN_UNUSED_RESULT + JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_cbor(ptr, ptr + len)) + static basic_json from_cbor(const T* ptr, std::size_t len, + const bool strict = true, + const bool allow_exceptions = true, + const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error) + { + return from_cbor(ptr, ptr + len, strict, allow_exceptions, tag_handler); + } + + + JSON_HEDLEY_WARN_UNUSED_RESULT + JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_cbor(ptr, ptr + len)) + static basic_json from_cbor(detail::span_input_adapter&& i, + const bool strict = true, + const bool allow_exceptions = true, + const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error) + { + basic_json result; + detail::json_sax_dom_parser sdp(result, allow_exceptions); + auto ia = i.get(); + // NOLINTNEXTLINE(hicpp-move-const-arg,performance-move-const-arg) + const bool res = binary_reader(std::move(ia)).sax_parse(input_format_t::cbor, &sdp, strict, tag_handler); + return res ? result : basic_json(value_t::discarded); + } + + /*! + @brief create a JSON value from an input in MessagePack format + + Deserializes a given input @a i to a JSON value using the MessagePack + serialization format. 
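+
+    A minimal round-trip sketch (the value of `j` is an arbitrary example):
+    @code {.cpp}
+    nlohmann::json j = {{"pi", 3.141}, {"happy", true}};
+    std::vector<std::uint8_t> v = nlohmann::json::to_msgpack(j);
+    nlohmann::json restored = nlohmann::json::from_msgpack(v);  // restored == j
+    @endcode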
+ + The library maps MessagePack types to JSON value types as follows: + + MessagePack type | JSON value type | first byte + ---------------- | --------------- | ---------- + positive fixint | number_unsigned | 0x00..0x7F + fixmap | object | 0x80..0x8F + fixarray | array | 0x90..0x9F + fixstr | string | 0xA0..0xBF + nil | `null` | 0xC0 + false | `false` | 0xC2 + true | `true` | 0xC3 + float 32 | number_float | 0xCA + float 64 | number_float | 0xCB + uint 8 | number_unsigned | 0xCC + uint 16 | number_unsigned | 0xCD + uint 32 | number_unsigned | 0xCE + uint 64 | number_unsigned | 0xCF + int 8 | number_integer | 0xD0 + int 16 | number_integer | 0xD1 + int 32 | number_integer | 0xD2 + int 64 | number_integer | 0xD3 + str 8 | string | 0xD9 + str 16 | string | 0xDA + str 32 | string | 0xDB + array 16 | array | 0xDC + array 32 | array | 0xDD + map 16 | object | 0xDE + map 32 | object | 0xDF + bin 8 | binary | 0xC4 + bin 16 | binary | 0xC5 + bin 32 | binary | 0xC6 + ext 8 | binary | 0xC7 + ext 16 | binary | 0xC8 + ext 32 | binary | 0xC9 + fixext 1 | binary | 0xD4 + fixext 2 | binary | 0xD5 + fixext 4 | binary | 0xD6 + fixext 8 | binary | 0xD7 + fixext 16 | binary | 0xD8 + negative fixint | number_integer | 0xE0-0xFF + + @note Any MessagePack output created @ref to_msgpack can be successfully + parsed by @ref from_msgpack. + + @param[in] i an input in MessagePack format convertible to an input + adapter + @param[in] strict whether to expect the input to be consumed until EOF + (true by default) + @param[in] allow_exceptions whether to throw exceptions in case of a + parse error (optional, true by default) + + @return deserialized JSON value; in case of a parse error and + @a allow_exceptions set to `false`, the return value will be + value_t::discarded. + + @throw parse_error.110 if the given input ends prematurely or the end of + file was not reached when @a strict was set to true + @throw parse_error.112 if unsupported features from MessagePack were + used in the given input @a i or if the input is not valid MessagePack + @throw parse_error.113 if a string was expected as map key, but not found + + @complexity Linear in the size of the input @a i. + + @liveexample{The example shows the deserialization of a byte vector in + MessagePack format to a JSON value.,from_msgpack} + + @sa http://msgpack.org + @sa see @ref to_msgpack(const basic_json&) for the analogous serialization + @sa see @ref from_cbor(InputType&&, const bool, const bool, const cbor_tag_handler_t) for the + related CBOR format + @sa see @ref from_ubjson(InputType&&, const bool, const bool) for + the related UBJSON format + @sa see @ref from_bson(InputType&&, const bool, const bool) for + the related BSON format + + @since version 2.0.9; parameter @a start_index since 2.1.1; changed to + consume input adapters, removed start_index parameter, and added + @a strict parameter since 3.0.0; added @a allow_exceptions parameter + since 3.2.0 + */ + template + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json from_msgpack(InputType&& i, + const bool strict = true, + const bool allow_exceptions = true) + { + basic_json result; + detail::json_sax_dom_parser sdp(result, allow_exceptions); + auto ia = detail::input_adapter(std::forward(i)); + const bool res = binary_reader(std::move(ia)).sax_parse(input_format_t::msgpack, &sdp, strict); + return res ? result : basic_json(value_t::discarded); + } + + /*! 
+ @copydoc from_msgpack(InputType&&, const bool, const bool) + */ + template + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json from_msgpack(IteratorType first, IteratorType last, + const bool strict = true, + const bool allow_exceptions = true) + { + basic_json result; + detail::json_sax_dom_parser sdp(result, allow_exceptions); + auto ia = detail::input_adapter(std::move(first), std::move(last)); + const bool res = binary_reader(std::move(ia)).sax_parse(input_format_t::msgpack, &sdp, strict); + return res ? result : basic_json(value_t::discarded); + } + + + template + JSON_HEDLEY_WARN_UNUSED_RESULT + JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_msgpack(ptr, ptr + len)) + static basic_json from_msgpack(const T* ptr, std::size_t len, + const bool strict = true, + const bool allow_exceptions = true) + { + return from_msgpack(ptr, ptr + len, strict, allow_exceptions); + } + + JSON_HEDLEY_WARN_UNUSED_RESULT + JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_msgpack(ptr, ptr + len)) + static basic_json from_msgpack(detail::span_input_adapter&& i, + const bool strict = true, + const bool allow_exceptions = true) + { + basic_json result; + detail::json_sax_dom_parser sdp(result, allow_exceptions); + auto ia = i.get(); + // NOLINTNEXTLINE(hicpp-move-const-arg,performance-move-const-arg) + const bool res = binary_reader(std::move(ia)).sax_parse(input_format_t::msgpack, &sdp, strict); + return res ? result : basic_json(value_t::discarded); + } + + + /*! + @brief create a JSON value from an input in UBJSON format + + Deserializes a given input @a i to a JSON value using the UBJSON (Universal + Binary JSON) serialization format. + + The library maps UBJSON types to JSON value types as follows: + + UBJSON type | JSON value type | marker + ----------- | --------------------------------------- | ------ + no-op | *no value, next value is read* | `N` + null | `null` | `Z` + false | `false` | `F` + true | `true` | `T` + float32 | number_float | `d` + float64 | number_float | `D` + uint8 | number_unsigned | `U` + int8 | number_integer | `i` + int16 | number_integer | `I` + int32 | number_integer | `l` + int64 | number_integer | `L` + high-precision number | number_integer, number_unsigned, or number_float - depends on number string | 'H' + string | string | `S` + char | string | `C` + array | array (optimized values are supported) | `[` + object | object (optimized values are supported) | `{` + + @note The mapping is **complete** in the sense that any UBJSON value can + be converted to a JSON value. + + @param[in] i an input in UBJSON format convertible to an input adapter + @param[in] strict whether to expect the input to be consumed until EOF + (true by default) + @param[in] allow_exceptions whether to throw exceptions in case of a + parse error (optional, true by default) + + @return deserialized JSON value; in case of a parse error and + @a allow_exceptions set to `false`, the return value will be + value_t::discarded. + + @throw parse_error.110 if the given input ends prematurely or the end of + file was not reached when @a strict was set to true + @throw parse_error.112 if a parse error occurs + @throw parse_error.113 if a string could not be parsed successfully + + @complexity Linear in the size of the input @a i. 
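+
+    A minimal round-trip sketch (the value of `j` is an arbitrary example):
+    @code {.cpp}
+    nlohmann::json j = {{"pi", 3.141}, {"list", {1, 2, 3}}};
+    std::vector<std::uint8_t> v = nlohmann::json::to_ubjson(j, /*use_size=*/true);
+    nlohmann::json restored = nlohmann::json::from_ubjson(v);  // restored == j
+    @endcode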
+ + @liveexample{The example shows the deserialization of a byte vector in + UBJSON format to a JSON value.,from_ubjson} + + @sa http://ubjson.org + @sa see @ref to_ubjson(const basic_json&, const bool, const bool) for the + analogous serialization + @sa see @ref from_cbor(InputType&&, const bool, const bool, const cbor_tag_handler_t) for the + related CBOR format + @sa see @ref from_msgpack(InputType&&, const bool, const bool) for + the related MessagePack format + @sa see @ref from_bson(InputType&&, const bool, const bool) for + the related BSON format + + @since version 3.1.0; added @a allow_exceptions parameter since 3.2.0 + */ + template + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json from_ubjson(InputType&& i, + const bool strict = true, + const bool allow_exceptions = true) + { + basic_json result; + detail::json_sax_dom_parser sdp(result, allow_exceptions); + auto ia = detail::input_adapter(std::forward(i)); + const bool res = binary_reader(std::move(ia)).sax_parse(input_format_t::ubjson, &sdp, strict); + return res ? result : basic_json(value_t::discarded); + } + + /*! + @copydoc from_ubjson(InputType&&, const bool, const bool) + */ + template + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json from_ubjson(IteratorType first, IteratorType last, + const bool strict = true, + const bool allow_exceptions = true) + { + basic_json result; + detail::json_sax_dom_parser sdp(result, allow_exceptions); + auto ia = detail::input_adapter(std::move(first), std::move(last)); + const bool res = binary_reader(std::move(ia)).sax_parse(input_format_t::ubjson, &sdp, strict); + return res ? result : basic_json(value_t::discarded); + } + + template + JSON_HEDLEY_WARN_UNUSED_RESULT + JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_ubjson(ptr, ptr + len)) + static basic_json from_ubjson(const T* ptr, std::size_t len, + const bool strict = true, + const bool allow_exceptions = true) + { + return from_ubjson(ptr, ptr + len, strict, allow_exceptions); + } + + JSON_HEDLEY_WARN_UNUSED_RESULT + JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_ubjson(ptr, ptr + len)) + static basic_json from_ubjson(detail::span_input_adapter&& i, + const bool strict = true, + const bool allow_exceptions = true) + { + basic_json result; + detail::json_sax_dom_parser sdp(result, allow_exceptions); + auto ia = i.get(); + // NOLINTNEXTLINE(hicpp-move-const-arg,performance-move-const-arg) + const bool res = binary_reader(std::move(ia)).sax_parse(input_format_t::ubjson, &sdp, strict); + return res ? result : basic_json(value_t::discarded); + } + + + /*! + @brief Create a JSON value from an input in BSON format + + Deserializes a given input @a i to a JSON value using the BSON (Binary JSON) + serialization format. + + The library maps BSON record types to JSON value types as follows: + + BSON type | BSON marker byte | JSON value type + --------------- | ---------------- | --------------------------- + double | 0x01 | number_float + string | 0x02 | string + document | 0x03 | object + array | 0x04 | array + binary | 0x05 | binary + undefined | 0x06 | still unsupported + ObjectId | 0x07 | still unsupported + boolean | 0x08 | boolean + UTC Date-Time | 0x09 | still unsupported + null | 0x0A | null + Regular Expr. 
| 0x0B | still unsupported + DB Pointer | 0x0C | still unsupported + JavaScript Code | 0x0D | still unsupported + Symbol | 0x0E | still unsupported + JavaScript Code | 0x0F | still unsupported + int32 | 0x10 | number_integer + Timestamp | 0x11 | still unsupported + 128-bit decimal float | 0x13 | still unsupported + Max Key | 0x7F | still unsupported + Min Key | 0xFF | still unsupported + + @warning The mapping is **incomplete**. The unsupported mappings + are indicated in the table above. + + @param[in] i an input in BSON format convertible to an input adapter + @param[in] strict whether to expect the input to be consumed until EOF + (true by default) + @param[in] allow_exceptions whether to throw exceptions in case of a + parse error (optional, true by default) + + @return deserialized JSON value; in case of a parse error and + @a allow_exceptions set to `false`, the return value will be + value_t::discarded. + + @throw parse_error.114 if an unsupported BSON record type is encountered + + @complexity Linear in the size of the input @a i. + + @liveexample{The example shows the deserialization of a byte vector in + BSON format to a JSON value.,from_bson} + + @sa http://bsonspec.org/spec.html + @sa see @ref to_bson(const basic_json&) for the analogous serialization + @sa see @ref from_cbor(InputType&&, const bool, const bool, const cbor_tag_handler_t) for the + related CBOR format + @sa see @ref from_msgpack(InputType&&, const bool, const bool) for + the related MessagePack format + @sa see @ref from_ubjson(InputType&&, const bool, const bool) for the + related UBJSON format + */ + template + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json from_bson(InputType&& i, + const bool strict = true, + const bool allow_exceptions = true) + { + basic_json result; + detail::json_sax_dom_parser sdp(result, allow_exceptions); + auto ia = detail::input_adapter(std::forward(i)); + const bool res = binary_reader(std::move(ia)).sax_parse(input_format_t::bson, &sdp, strict); + return res ? result : basic_json(value_t::discarded); + } + + /*! + @copydoc from_bson(InputType&&, const bool, const bool) + */ + template + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json from_bson(IteratorType first, IteratorType last, + const bool strict = true, + const bool allow_exceptions = true) + { + basic_json result; + detail::json_sax_dom_parser sdp(result, allow_exceptions); + auto ia = detail::input_adapter(std::move(first), std::move(last)); + const bool res = binary_reader(std::move(ia)).sax_parse(input_format_t::bson, &sdp, strict); + return res ? result : basic_json(value_t::discarded); + } + + template + JSON_HEDLEY_WARN_UNUSED_RESULT + JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_bson(ptr, ptr + len)) + static basic_json from_bson(const T* ptr, std::size_t len, + const bool strict = true, + const bool allow_exceptions = true) + { + return from_bson(ptr, ptr + len, strict, allow_exceptions); + } + + JSON_HEDLEY_WARN_UNUSED_RESULT + JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_bson(ptr, ptr + len)) + static basic_json from_bson(detail::span_input_adapter&& i, + const bool strict = true, + const bool allow_exceptions = true) + { + basic_json result; + detail::json_sax_dom_parser sdp(result, allow_exceptions); + auto ia = i.get(); + // NOLINTNEXTLINE(hicpp-move-const-arg,performance-move-const-arg) + const bool res = binary_reader(std::move(ia)).sax_parse(input_format_t::bson, &sdp, strict); + return res ? 
result : basic_json(value_t::discarded); + } + /// @} + + ////////////////////////// + // JSON Pointer support // + ////////////////////////// + + /// @name JSON Pointer functions + /// @{ + + /*! + @brief access specified element via JSON Pointer + + Uses a JSON pointer to retrieve a reference to the respective JSON value. + No bound checking is performed. Similar to @ref operator[](const typename + object_t::key_type&), `null` values are created in arrays and objects if + necessary. + + In particular: + - If the JSON pointer points to an object key that does not exist, it + is created an filled with a `null` value before a reference to it + is returned. + - If the JSON pointer points to an array index that does not exist, it + is created an filled with a `null` value before a reference to it + is returned. All indices between the current maximum and the given + index are also filled with `null`. + - The special value `-` is treated as a synonym for the index past the + end. + + @param[in] ptr a JSON pointer + + @return reference to the element pointed to by @a ptr + + @complexity Constant. + + @throw parse_error.106 if an array index begins with '0' + @throw parse_error.109 if an array index was not a number + @throw out_of_range.404 if the JSON pointer can not be resolved + + @liveexample{The behavior is shown in the example.,operatorjson_pointer} + + @since version 2.0.0 + */ + reference operator[](const json_pointer& ptr) + { + return ptr.get_unchecked(this); + } + + /*! + @brief access specified element via JSON Pointer + + Uses a JSON pointer to retrieve a reference to the respective JSON value. + No bound checking is performed. The function does not change the JSON + value; no `null` values are created. In particular, the special value + `-` yields an exception. + + @param[in] ptr JSON pointer to the desired element + + @return const reference to the element pointed to by @a ptr + + @complexity Constant. + + @throw parse_error.106 if an array index begins with '0' + @throw parse_error.109 if an array index was not a number + @throw out_of_range.402 if the array index '-' is used + @throw out_of_range.404 if the JSON pointer can not be resolved + + @liveexample{The behavior is shown in the example.,operatorjson_pointer_const} + + @since version 2.0.0 + */ + const_reference operator[](const json_pointer& ptr) const + { + return ptr.get_unchecked(this); + } + + /*! + @brief access specified element via JSON Pointer + + Returns a reference to the element at with specified JSON pointer @a ptr, + with bounds checking. + + @param[in] ptr JSON pointer to the desired element + + @return reference to the element pointed to by @a ptr + + @throw parse_error.106 if an array index in the passed JSON pointer @a ptr + begins with '0'. See example below. + + @throw parse_error.109 if an array index in the passed JSON pointer @a ptr + is not a number. See example below. + + @throw out_of_range.401 if an array index in the passed JSON pointer @a ptr + is out of range. See example below. + + @throw out_of_range.402 if the array index '-' is used in the passed JSON + pointer @a ptr. As `at` provides checked access (and no elements are + implicitly inserted), the index '-' is always invalid. See example below. + + @throw out_of_range.403 if the JSON pointer describes a key of an object + which cannot be found. See example below. + + @throw out_of_range.404 if the JSON pointer @a ptr can not be resolved. + See example below. 
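+
+    A minimal sketch (the document and the pointer string are arbitrary
+    examples):
+    @code {.cpp}
+    nlohmann::json j = {{"numbers", {10, 20, 30}}};
+    const int second = j.at(nlohmann::json::json_pointer("/numbers/1")).get<int>();  // 20
+    @endcode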
+ + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes in the JSON value. + + @complexity Constant. + + @since version 2.0.0 + + @liveexample{The behavior is shown in the example.,at_json_pointer} + */ + reference at(const json_pointer& ptr) + { + return ptr.get_checked(this); + } + + /*! + @brief access specified element via JSON Pointer + + Returns a const reference to the element at with specified JSON pointer @a + ptr, with bounds checking. + + @param[in] ptr JSON pointer to the desired element + + @return reference to the element pointed to by @a ptr + + @throw parse_error.106 if an array index in the passed JSON pointer @a ptr + begins with '0'. See example below. + + @throw parse_error.109 if an array index in the passed JSON pointer @a ptr + is not a number. See example below. + + @throw out_of_range.401 if an array index in the passed JSON pointer @a ptr + is out of range. See example below. + + @throw out_of_range.402 if the array index '-' is used in the passed JSON + pointer @a ptr. As `at` provides checked access (and no elements are + implicitly inserted), the index '-' is always invalid. See example below. + + @throw out_of_range.403 if the JSON pointer describes a key of an object + which cannot be found. See example below. + + @throw out_of_range.404 if the JSON pointer @a ptr can not be resolved. + See example below. + + @exceptionsafety Strong guarantee: if an exception is thrown, there are no + changes in the JSON value. + + @complexity Constant. + + @since version 2.0.0 + + @liveexample{The behavior is shown in the example.,at_json_pointer_const} + */ + const_reference at(const json_pointer& ptr) const + { + return ptr.get_checked(this); + } + + /*! + @brief return flattened JSON value + + The function creates a JSON object whose keys are JSON pointers (see [RFC + 6901](https://tools.ietf.org/html/rfc6901)) and whose values are all + primitive. The original JSON value can be restored using the @ref + unflatten() function. + + @return an object that maps JSON pointers to primitive values + + @note Empty objects and arrays are flattened to `null` and will not be + reconstructed correctly by the @ref unflatten() function. + + @complexity Linear in the size the JSON value. + + @liveexample{The following code shows how a JSON object is flattened to an + object whose keys consist of JSON pointers.,flatten} + + @sa see @ref unflatten() for the reverse function + + @since version 2.0.0 + */ + basic_json flatten() const + { + basic_json result(value_t::object); + json_pointer::flatten("", *this, result); + return result; + } + + /*! + @brief unflatten a previously flattened JSON value + + The function restores the arbitrary nesting of a JSON value that has been + flattened before using the @ref flatten() function. The JSON value must + meet certain constraints: + 1. The value must be an object. + 2. The keys must be JSON pointers (see + [RFC 6901](https://tools.ietf.org/html/rfc6901)) + 3. The mapped values must be primitive JSON types. + + @return the original JSON from a flattened version + + @note Empty objects and arrays are flattened by @ref flatten() to `null` + values and can not unflattened to their original type. Apart from + this example, for a JSON value `j`, the following is always true: + `j == j.flatten().unflatten()`. + + @complexity Linear in the size the JSON value. 
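+
+    A minimal round-trip sketch (the document is an arbitrary example):
+    @code {.cpp}
+    nlohmann::json j = {{"name", "example"}, {"ints", {1, 2}}};
+    nlohmann::json flat = j.flatten();        // {"/ints/0": 1, "/ints/1": 2, "/name": "example"}
+    nlohmann::json nested = flat.unflatten(); // nested == j
+    @endcode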
+ + @throw type_error.314 if value is not an object + @throw type_error.315 if object values are not primitive + + @liveexample{The following code shows how a flattened JSON object is + unflattened into the original nested JSON object.,unflatten} + + @sa see @ref flatten() for the reverse function + + @since version 2.0.0 + */ + basic_json unflatten() const + { + return json_pointer::unflatten(*this); + } + + /// @} + + ////////////////////////// + // JSON Patch functions // + ////////////////////////// + + /// @name JSON Patch functions + /// @{ + + /*! + @brief applies a JSON patch + + [JSON Patch](http://jsonpatch.com) defines a JSON document structure for + expressing a sequence of operations to apply to a JSON) document. With + this function, a JSON Patch is applied to the current JSON value by + executing all operations from the patch. + + @param[in] json_patch JSON patch document + @return patched document + + @note The application of a patch is atomic: Either all operations succeed + and the patched document is returned or an exception is thrown. In + any case, the original value is not changed: the patch is applied + to a copy of the value. + + @throw parse_error.104 if the JSON patch does not consist of an array of + objects + + @throw parse_error.105 if the JSON patch is malformed (e.g., mandatory + attributes are missing); example: `"operation add must have member path"` + + @throw out_of_range.401 if an array index is out of range. + + @throw out_of_range.403 if a JSON pointer inside the patch could not be + resolved successfully in the current JSON value; example: `"key baz not + found"` + + @throw out_of_range.405 if JSON pointer has no parent ("add", "remove", + "move") + + @throw other_error.501 if "test" operation was unsuccessful + + @complexity Linear in the size of the JSON value and the length of the + JSON patch. As usually only a fraction of the JSON value is affected by + the patch, the complexity can usually be neglected. 
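+
+    A minimal sketch (the document and the patch are arbitrary examples):
+    @code {.cpp}
+    nlohmann::json doc = {{"name", "old"}, {"tags", {"a"}}};
+    nlohmann::json p = nlohmann::json::parse(
+        R"([{"op": "replace", "path": "/name", "value": "new"},
+            {"op": "add", "path": "/tags/-", "value": "b"}])");
+    nlohmann::json patched = doc.patch(p);  // doc itself is left unchanged
+    @endcode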
+ + @liveexample{The following code shows how a JSON patch is applied to a + value.,patch} + + @sa see @ref diff -- create a JSON patch by comparing two JSON values + + @sa [RFC 6902 (JSON Patch)](https://tools.ietf.org/html/rfc6902) + @sa [RFC 6901 (JSON Pointer)](https://tools.ietf.org/html/rfc6901) + + @since version 2.0.0 + */ + basic_json patch(const basic_json& json_patch) const + { + // make a working copy to apply the patch to + basic_json result = *this; + + // the valid JSON Patch operations + enum class patch_operations {add, remove, replace, move, copy, test, invalid}; + + const auto get_op = [](const std::string & op) + { + if (op == "add") + { + return patch_operations::add; + } + if (op == "remove") + { + return patch_operations::remove; + } + if (op == "replace") + { + return patch_operations::replace; + } + if (op == "move") + { + return patch_operations::move; + } + if (op == "copy") + { + return patch_operations::copy; + } + if (op == "test") + { + return patch_operations::test; + } + + return patch_operations::invalid; + }; + + // wrapper for "add" operation; add value at ptr + const auto operation_add = [&result](json_pointer & ptr, basic_json val) + { + // adding to the root of the target document means replacing it + if (ptr.empty()) + { + result = val; + return; + } + + // make sure the top element of the pointer exists + json_pointer top_pointer = ptr.top(); + if (top_pointer != ptr) + { + result.at(top_pointer); + } + + // get reference to parent of JSON pointer ptr + const auto last_path = ptr.back(); + ptr.pop_back(); + basic_json& parent = result[ptr]; + + switch (parent.m_type) + { + case value_t::null: + case value_t::object: + { + // use operator[] to add value + parent[last_path] = val; + break; + } + + case value_t::array: + { + if (last_path == "-") + { + // special case: append to back + parent.push_back(val); + } + else + { + const auto idx = json_pointer::array_index(last_path); + if (JSON_HEDLEY_UNLIKELY(idx > parent.size())) + { + // avoid undefined behavior + JSON_THROW(out_of_range::create(401, "array index " + std::to_string(idx) + " is out of range", parent)); + } + + // default case: insert add offset + parent.insert(parent.begin() + static_cast(idx), val); + } + break; + } + + // if there exists a parent it cannot be primitive + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE + } + }; + + // wrapper for "remove" operation; remove value at ptr + const auto operation_remove = [this, &result](json_pointer & ptr) + { + // get reference to parent of JSON pointer ptr + const auto last_path = ptr.back(); + ptr.pop_back(); + basic_json& parent = result.at(ptr); + + // remove child + if (parent.is_object()) + { + // perform range check + auto it = parent.find(last_path); + if (JSON_HEDLEY_LIKELY(it != parent.end())) + { + parent.erase(it); + } + else + { + JSON_THROW(out_of_range::create(403, "key '" + last_path + "' not found", *this)); + } + } + else if (parent.is_array()) + { + // note erase performs range check + parent.erase(json_pointer::array_index(last_path)); + } + }; + + // type check: top level value must be an array + if (JSON_HEDLEY_UNLIKELY(!json_patch.is_array())) + { + JSON_THROW(parse_error::create(104, 0, "JSON patch must be an array of objects", json_patch)); + } + + // iterate and apply the operations + for (const auto& val : json_patch) + { + // wrapper to get a value for an operation + const auto get_value = [&val](const std::string & op, + const 
std::string & member, + bool string_type) -> basic_json & + { + // find value + auto it = val.m_value.object->find(member); + + // context-sensitive error message + const auto error_msg = (op == "op") ? "operation" : "operation '" + op + "'"; + + // check if desired value is present + if (JSON_HEDLEY_UNLIKELY(it == val.m_value.object->end())) + { + // NOLINTNEXTLINE(performance-inefficient-string-concatenation) + JSON_THROW(parse_error::create(105, 0, error_msg + " must have member '" + member + "'", val)); + } + + // check if result is of type string + if (JSON_HEDLEY_UNLIKELY(string_type && !it->second.is_string())) + { + // NOLINTNEXTLINE(performance-inefficient-string-concatenation) + JSON_THROW(parse_error::create(105, 0, error_msg + " must have string member '" + member + "'", val)); + } + + // no error: return value + return it->second; + }; + + // type check: every element of the array must be an object + if (JSON_HEDLEY_UNLIKELY(!val.is_object())) + { + JSON_THROW(parse_error::create(104, 0, "JSON patch must be an array of objects", val)); + } + + // collect mandatory members + const auto op = get_value("op", "op", true).template get(); + const auto path = get_value(op, "path", true).template get(); + json_pointer ptr(path); + + switch (get_op(op)) + { + case patch_operations::add: + { + operation_add(ptr, get_value("add", "value", false)); + break; + } + + case patch_operations::remove: + { + operation_remove(ptr); + break; + } + + case patch_operations::replace: + { + // the "path" location must exist - use at() + result.at(ptr) = get_value("replace", "value", false); + break; + } + + case patch_operations::move: + { + const auto from_path = get_value("move", "from", true).template get(); + json_pointer from_ptr(from_path); + + // the "from" location must exist - use at() + basic_json v = result.at(from_ptr); + + // The move operation is functionally identical to a + // "remove" operation on the "from" location, followed + // immediately by an "add" operation at the target + // location with the value that was just removed. + operation_remove(from_ptr); + operation_add(ptr, v); + break; + } + + case patch_operations::copy: + { + const auto from_path = get_value("copy", "from", true).template get(); + const json_pointer from_ptr(from_path); + + // the "from" location must exist - use at() + basic_json v = result.at(from_ptr); + + // The copy is functionally identical to an "add" + // operation at the target location using the value + // specified in the "from" member. + operation_add(ptr, v); + break; + } + + case patch_operations::test: + { + bool success = false; + JSON_TRY + { + // check if "value" matches the one at "path" + // the "path" location must exist - use at() + success = (result.at(ptr) == get_value("test", "value", false)); + } + JSON_INTERNAL_CATCH (out_of_range&) + { + // ignore out of range errors: success remains false + } + + // throw an exception if test fails + if (JSON_HEDLEY_UNLIKELY(!success)) + { + JSON_THROW(other_error::create(501, "unsuccessful: " + val.dump(), val)); + } + + break; + } + + default: + { + // op must be "add", "remove", "replace", "move", "copy", or + // "test" + JSON_THROW(parse_error::create(105, 0, "operation value '" + op + "' is invalid", val)); + } + } + } + + return result; + } + + /*! + @brief creates a diff as a JSON patch + + Creates a [JSON Patch](http://jsonpatch.com) so that value @a source can + be changed into the value @a target by calling @ref patch function. 
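+
+    A minimal sketch (both documents are arbitrary examples):
+    @code {.cpp}
+    nlohmann::json source = {{"name", "old"}, {"count", 1}};
+    nlohmann::json target = {{"name", "new"}, {"count", 1}};
+    nlohmann::json p = nlohmann::json::diff(source, target);
+    // p is a JSON Patch; applying it to source yields target:
+    // source.patch(p) == target
+    @endcode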
+ + @invariant For two JSON values @a source and @a target, the following code + yields always `true`: + @code {.cpp} + source.patch(diff(source, target)) == target; + @endcode + + @note Currently, only `remove`, `add`, and `replace` operations are + generated. + + @param[in] source JSON value to compare from + @param[in] target JSON value to compare against + @param[in] path helper value to create JSON pointers + + @return a JSON patch to convert the @a source to @a target + + @complexity Linear in the lengths of @a source and @a target. + + @liveexample{The following code shows how a JSON patch is created as a + diff for two JSON values.,diff} + + @sa see @ref patch -- apply a JSON patch + @sa see @ref merge_patch -- apply a JSON Merge Patch + + @sa [RFC 6902 (JSON Patch)](https://tools.ietf.org/html/rfc6902) + + @since version 2.0.0 + */ + JSON_HEDLEY_WARN_UNUSED_RESULT + static basic_json diff(const basic_json& source, const basic_json& target, + const std::string& path = "") + { + // the patch + basic_json result(value_t::array); + + // if the values are the same, return empty patch + if (source == target) + { + return result; + } + + if (source.type() != target.type()) + { + // different types: replace value + result.push_back( + { + {"op", "replace"}, {"path", path}, {"value", target} + }); + return result; + } + + switch (source.type()) + { + case value_t::array: + { + // first pass: traverse common elements + std::size_t i = 0; + while (i < source.size() && i < target.size()) + { + // recursive call to compare array values at index i + auto temp_diff = diff(source[i], target[i], path + "/" + std::to_string(i)); + result.insert(result.end(), temp_diff.begin(), temp_diff.end()); + ++i; + } + + // i now reached the end of at least one array + // in a second pass, traverse the remaining elements + + // remove my remaining elements + const auto end_index = static_cast(result.size()); + while (i < source.size()) + { + // add operations in reverse order to avoid invalid + // indices + result.insert(result.begin() + end_index, object( + { + {"op", "remove"}, + {"path", path + "/" + std::to_string(i)} + })); + ++i; + } + + // add other remaining elements + while (i < target.size()) + { + result.push_back( + { + {"op", "add"}, + {"path", path + "/-"}, + {"value", target[i]} + }); + ++i; + } + + break; + } + + case value_t::object: + { + // first pass: traverse this object's elements + for (auto it = source.cbegin(); it != source.cend(); ++it) + { + // escape the key name to be used in a JSON patch + const auto path_key = path + "/" + detail::escape(it.key()); + + if (target.find(it.key()) != target.end()) + { + // recursive call to compare object values at key it + auto temp_diff = diff(it.value(), target[it.key()], path_key); + result.insert(result.end(), temp_diff.begin(), temp_diff.end()); + } + else + { + // found a key that is not in o -> remove it + result.push_back(object( + { + {"op", "remove"}, {"path", path_key} + })); + } + } + + // second pass: traverse other object's elements + for (auto it = target.cbegin(); it != target.cend(); ++it) + { + if (source.find(it.key()) == source.end()) + { + // found a key that is not in this -> add it + const auto path_key = path + "/" + detail::escape(it.key()); + result.push_back( + { + {"op", "add"}, {"path", path_key}, + {"value", it.value()} + }); + } + } + + break; + } + + default: + { + // both primitive type: replace value + result.push_back( + { + {"op", "replace"}, {"path", path}, {"value", target} + }); + break; + } + } + + return 
result; + } + + /// @} + + //////////////////////////////// + // JSON Merge Patch functions // + //////////////////////////////// + + /// @name JSON Merge Patch functions + /// @{ + + /*! + @brief applies a JSON Merge Patch + + The merge patch format is primarily intended for use with the HTTP PATCH + method as a means of describing a set of modifications to a target + resource's content. This function applies a merge patch to the current + JSON value. + + The function implements the following algorithm from Section 2 of + [RFC 7396 (JSON Merge Patch)](https://tools.ietf.org/html/rfc7396): + + ``` + define MergePatch(Target, Patch): + if Patch is an Object: + if Target is not an Object: + Target = {} // Ignore the contents and set it to an empty Object + for each Name/Value pair in Patch: + if Value is null: + if Name exists in Target: + remove the Name/Value pair from Target + else: + Target[Name] = MergePatch(Target[Name], Value) + return Target + else: + return Patch + ``` + + Thereby, `Target` is the current object; that is, the patch is applied to + the current value. + + @param[in] apply_patch the patch to apply + + @complexity Linear in the lengths of @a patch. + + @liveexample{The following code shows how a JSON Merge Patch is applied to + a JSON document.,merge_patch} + + @sa see @ref patch -- apply a JSON patch + @sa [RFC 7396 (JSON Merge Patch)](https://tools.ietf.org/html/rfc7396) + + @since version 3.0.0 + */ + void merge_patch(const basic_json& apply_patch) + { + if (apply_patch.is_object()) + { + if (!is_object()) + { + *this = object(); + } + for (auto it = apply_patch.begin(); it != apply_patch.end(); ++it) + { + if (it.value().is_null()) + { + erase(it.key()); + } + else + { + operator[](it.key()).merge_patch(it.value()); + } + } + } + else + { + *this = apply_patch; + } + } + + /// @} +}; + +/*! +@brief user-defined to_string function for JSON values + +This function implements a user-defined to_string for JSON objects. + +@param[in] j a JSON object +@return a std::string object +*/ + +NLOHMANN_BASIC_JSON_TPL_DECLARATION +std::string to_string(const NLOHMANN_BASIC_JSON_TPL& j) +{ + return j.dump(); +} +} // namespace nlohmann + +/////////////////////// +// nonmember support // +/////////////////////// + +// specialization of std::swap, and std::hash +namespace std +{ + +/// hash value for JSON objects +template<> +struct hash +{ + /*! + @brief return a hash value for a JSON object + + @since version 1.0.0 + */ + std::size_t operator()(const nlohmann::json& j) const + { + return nlohmann::detail::hash(j); + } +}; + +/// specialization for std::less +/// @note: do not remove the space after '<', +/// see https://github.com/nlohmann/json/pull/679 +template<> +struct less<::nlohmann::detail::value_t> +{ + /*! + @brief compare two value_t enum values + @since version 3.0.0 + */ + bool operator()(nlohmann::detail::value_t lhs, + nlohmann::detail::value_t rhs) const noexcept + { + return nlohmann::detail::operator<(lhs, rhs); + } +}; + +// C++20 prohibit function specialization in the std namespace. +#ifndef JSON_HAS_CPP_20 + +/*! +@brief exchanges the values of two JSON objects + +@since version 1.0.0 +*/ +template<> +inline void swap(nlohmann::json& j1, nlohmann::json& j2) noexcept( // NOLINT(readability-inconsistent-declaration-parameter-name) + is_nothrow_move_constructible::value&& // NOLINT(misc-redundant-expression) + is_nothrow_move_assignable::value + ) +{ + j1.swap(j2); +} + +#endif + +} // namespace std + +/*! 
+@brief user-defined string literal for JSON values + +This operator implements a user-defined string literal for JSON objects. It +can be used by adding `"_json"` to a string literal and returns a JSON object +if no parse error occurred. + +@param[in] s a string representation of a JSON object +@param[in] n the length of string @a s +@return a JSON object + +@since version 1.0.0 +*/ +JSON_HEDLEY_NON_NULL(1) +inline nlohmann::json operator "" _json(const char* s, std::size_t n) +{ + return nlohmann::json::parse(s, s + n); +} + +/*! +@brief user-defined string literal for JSON pointer + +This operator implements a user-defined string literal for JSON Pointers. It +can be used by adding `"_json_pointer"` to a string literal and returns a JSON pointer +object if no parse error occurred. + +@param[in] s a string representation of a JSON Pointer +@param[in] n the length of string @a s +@return a JSON pointer object + +@since version 2.0.0 +*/ +JSON_HEDLEY_NON_NULL(1) +inline nlohmann::json::json_pointer operator "" _json_pointer(const char* s, std::size_t n) +{ + return nlohmann::json::json_pointer(std::string(s, n)); +} + +// #include + + +// restore GCC/clang diagnostic settings +#if defined(__clang__) + #pragma GCC diagnostic pop +#endif + +// clean up +#undef JSON_ASSERT +#undef JSON_INTERNAL_CATCH +#undef JSON_CATCH +#undef JSON_THROW +#undef JSON_TRY +#undef JSON_PRIVATE_UNLESS_TESTED +#undef JSON_HAS_CPP_11 +#undef JSON_HAS_CPP_14 +#undef JSON_HAS_CPP_17 +#undef JSON_HAS_CPP_20 +#undef NLOHMANN_BASIC_JSON_TPL_DECLARATION +#undef NLOHMANN_BASIC_JSON_TPL +#undef JSON_EXPLICIT + +// #include + + +#undef JSON_HEDLEY_ALWAYS_INLINE +#undef JSON_HEDLEY_ARM_VERSION +#undef JSON_HEDLEY_ARM_VERSION_CHECK +#undef JSON_HEDLEY_ARRAY_PARAM +#undef JSON_HEDLEY_ASSUME +#undef JSON_HEDLEY_BEGIN_C_DECLS +#undef JSON_HEDLEY_CLANG_HAS_ATTRIBUTE +#undef JSON_HEDLEY_CLANG_HAS_BUILTIN +#undef JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE +#undef JSON_HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE +#undef JSON_HEDLEY_CLANG_HAS_EXTENSION +#undef JSON_HEDLEY_CLANG_HAS_FEATURE +#undef JSON_HEDLEY_CLANG_HAS_WARNING +#undef JSON_HEDLEY_COMPCERT_VERSION +#undef JSON_HEDLEY_COMPCERT_VERSION_CHECK +#undef JSON_HEDLEY_CONCAT +#undef JSON_HEDLEY_CONCAT3 +#undef JSON_HEDLEY_CONCAT3_EX +#undef JSON_HEDLEY_CONCAT_EX +#undef JSON_HEDLEY_CONST +#undef JSON_HEDLEY_CONSTEXPR +#undef JSON_HEDLEY_CONST_CAST +#undef JSON_HEDLEY_CPP_CAST +#undef JSON_HEDLEY_CRAY_VERSION +#undef JSON_HEDLEY_CRAY_VERSION_CHECK +#undef JSON_HEDLEY_C_DECL +#undef JSON_HEDLEY_DEPRECATED +#undef JSON_HEDLEY_DEPRECATED_FOR +#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL +#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ +#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED +#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES +#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS +#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION +#undef JSON_HEDLEY_DIAGNOSTIC_POP +#undef JSON_HEDLEY_DIAGNOSTIC_PUSH +#undef JSON_HEDLEY_DMC_VERSION +#undef JSON_HEDLEY_DMC_VERSION_CHECK +#undef JSON_HEDLEY_EMPTY_BASES +#undef JSON_HEDLEY_EMSCRIPTEN_VERSION +#undef JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK +#undef JSON_HEDLEY_END_C_DECLS +#undef JSON_HEDLEY_FLAGS +#undef JSON_HEDLEY_FLAGS_CAST +#undef JSON_HEDLEY_GCC_HAS_ATTRIBUTE +#undef JSON_HEDLEY_GCC_HAS_BUILTIN +#undef JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE +#undef JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE +#undef JSON_HEDLEY_GCC_HAS_EXTENSION +#undef JSON_HEDLEY_GCC_HAS_FEATURE +#undef JSON_HEDLEY_GCC_HAS_WARNING +#undef 
JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK +#undef JSON_HEDLEY_GCC_VERSION +#undef JSON_HEDLEY_GCC_VERSION_CHECK +#undef JSON_HEDLEY_GNUC_HAS_ATTRIBUTE +#undef JSON_HEDLEY_GNUC_HAS_BUILTIN +#undef JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE +#undef JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE +#undef JSON_HEDLEY_GNUC_HAS_EXTENSION +#undef JSON_HEDLEY_GNUC_HAS_FEATURE +#undef JSON_HEDLEY_GNUC_HAS_WARNING +#undef JSON_HEDLEY_GNUC_VERSION +#undef JSON_HEDLEY_GNUC_VERSION_CHECK +#undef JSON_HEDLEY_HAS_ATTRIBUTE +#undef JSON_HEDLEY_HAS_BUILTIN +#undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE +#undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS +#undef JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE +#undef JSON_HEDLEY_HAS_EXTENSION +#undef JSON_HEDLEY_HAS_FEATURE +#undef JSON_HEDLEY_HAS_WARNING +#undef JSON_HEDLEY_IAR_VERSION +#undef JSON_HEDLEY_IAR_VERSION_CHECK +#undef JSON_HEDLEY_IBM_VERSION +#undef JSON_HEDLEY_IBM_VERSION_CHECK +#undef JSON_HEDLEY_IMPORT +#undef JSON_HEDLEY_INLINE +#undef JSON_HEDLEY_INTEL_CL_VERSION +#undef JSON_HEDLEY_INTEL_CL_VERSION_CHECK +#undef JSON_HEDLEY_INTEL_VERSION +#undef JSON_HEDLEY_INTEL_VERSION_CHECK +#undef JSON_HEDLEY_IS_CONSTANT +#undef JSON_HEDLEY_IS_CONSTEXPR_ +#undef JSON_HEDLEY_LIKELY +#undef JSON_HEDLEY_MALLOC +#undef JSON_HEDLEY_MCST_LCC_VERSION +#undef JSON_HEDLEY_MCST_LCC_VERSION_CHECK +#undef JSON_HEDLEY_MESSAGE +#undef JSON_HEDLEY_MSVC_VERSION +#undef JSON_HEDLEY_MSVC_VERSION_CHECK +#undef JSON_HEDLEY_NEVER_INLINE +#undef JSON_HEDLEY_NON_NULL +#undef JSON_HEDLEY_NO_ESCAPE +#undef JSON_HEDLEY_NO_RETURN +#undef JSON_HEDLEY_NO_THROW +#undef JSON_HEDLEY_NULL +#undef JSON_HEDLEY_PELLES_VERSION +#undef JSON_HEDLEY_PELLES_VERSION_CHECK +#undef JSON_HEDLEY_PGI_VERSION +#undef JSON_HEDLEY_PGI_VERSION_CHECK +#undef JSON_HEDLEY_PREDICT +#undef JSON_HEDLEY_PRINTF_FORMAT +#undef JSON_HEDLEY_PRIVATE +#undef JSON_HEDLEY_PUBLIC +#undef JSON_HEDLEY_PURE +#undef JSON_HEDLEY_REINTERPRET_CAST +#undef JSON_HEDLEY_REQUIRE +#undef JSON_HEDLEY_REQUIRE_CONSTEXPR +#undef JSON_HEDLEY_REQUIRE_MSG +#undef JSON_HEDLEY_RESTRICT +#undef JSON_HEDLEY_RETURNS_NON_NULL +#undef JSON_HEDLEY_SENTINEL +#undef JSON_HEDLEY_STATIC_ASSERT +#undef JSON_HEDLEY_STATIC_CAST +#undef JSON_HEDLEY_STRINGIFY +#undef JSON_HEDLEY_STRINGIFY_EX +#undef JSON_HEDLEY_SUNPRO_VERSION +#undef JSON_HEDLEY_SUNPRO_VERSION_CHECK +#undef JSON_HEDLEY_TINYC_VERSION +#undef JSON_HEDLEY_TINYC_VERSION_CHECK +#undef JSON_HEDLEY_TI_ARMCL_VERSION +#undef JSON_HEDLEY_TI_ARMCL_VERSION_CHECK +#undef JSON_HEDLEY_TI_CL2000_VERSION +#undef JSON_HEDLEY_TI_CL2000_VERSION_CHECK +#undef JSON_HEDLEY_TI_CL430_VERSION +#undef JSON_HEDLEY_TI_CL430_VERSION_CHECK +#undef JSON_HEDLEY_TI_CL6X_VERSION +#undef JSON_HEDLEY_TI_CL6X_VERSION_CHECK +#undef JSON_HEDLEY_TI_CL7X_VERSION +#undef JSON_HEDLEY_TI_CL7X_VERSION_CHECK +#undef JSON_HEDLEY_TI_CLPRU_VERSION +#undef JSON_HEDLEY_TI_CLPRU_VERSION_CHECK +#undef JSON_HEDLEY_TI_VERSION +#undef JSON_HEDLEY_TI_VERSION_CHECK +#undef JSON_HEDLEY_UNAVAILABLE +#undef JSON_HEDLEY_UNLIKELY +#undef JSON_HEDLEY_UNPREDICTABLE +#undef JSON_HEDLEY_UNREACHABLE +#undef JSON_HEDLEY_UNREACHABLE_RETURN +#undef JSON_HEDLEY_VERSION +#undef JSON_HEDLEY_VERSION_DECODE_MAJOR +#undef JSON_HEDLEY_VERSION_DECODE_MINOR +#undef JSON_HEDLEY_VERSION_DECODE_REVISION +#undef JSON_HEDLEY_VERSION_ENCODE +#undef JSON_HEDLEY_WARNING +#undef JSON_HEDLEY_WARN_UNUSED_RESULT +#undef JSON_HEDLEY_WARN_UNUSED_RESULT_MSG +#undef JSON_HEDLEY_FALL_THROUGH + + + +#endif // INCLUDE_NLOHMANN_JSON_HPP_ diff --git a/include/nlohmann/json_fwd.hpp b/include/nlohmann/json_fwd.hpp new file mode 100644 
index 00000000..332227c1 --- /dev/null +++ b/include/nlohmann/json_fwd.hpp @@ -0,0 +1,78 @@ +#ifndef INCLUDE_NLOHMANN_JSON_FWD_HPP_ +#define INCLUDE_NLOHMANN_JSON_FWD_HPP_ + +#include // int64_t, uint64_t +#include // map +#include // allocator +#include // string +#include // vector + +/*! +@brief namespace for Niels Lohmann +@see https://github.com/nlohmann +@since version 1.0.0 +*/ +namespace nlohmann +{ +/*! +@brief default JSONSerializer template argument + +This serializer ignores the template arguments and uses ADL +([argument-dependent lookup](https://en.cppreference.com/w/cpp/language/adl)) +for serialization. +*/ +template +struct adl_serializer; + +template class ObjectType = + std::map, + template class ArrayType = std::vector, + class StringType = std::string, class BooleanType = bool, + class NumberIntegerType = std::int64_t, + class NumberUnsignedType = std::uint64_t, + class NumberFloatType = double, + template class AllocatorType = std::allocator, + template class JSONSerializer = + adl_serializer, + class BinaryType = std::vector> +class basic_json; + +/*! +@brief JSON Pointer + +A JSON pointer defines a string syntax for identifying a specific value +within a JSON document. It can be used with functions `at` and +`operator[]`. Furthermore, JSON pointers are the base for JSON patches. + +@sa [RFC 6901](https://tools.ietf.org/html/rfc6901) + +@since version 2.0.0 +*/ +template +class json_pointer; + +/*! +@brief default JSON class + +This type is the default specialization of the @ref basic_json class which +uses the standard template types. + +@since version 1.0.0 +*/ +using json = basic_json<>; + +template +struct ordered_map; + +/*! +@brief ordered JSON class + +This type preserves the insertion order of object keys. + +@since version 3.9.0 +*/ +using ordered_json = basic_json; + +} // namespace nlohmann + +#endif // INCLUDE_NLOHMANN_JSON_FWD_HPP_ From aab0c2abb52ee68ad9e5da307f3c6d9d31435cb1 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 7 Jul 2021 18:15:01 -0400 Subject: [PATCH 114/350] Add vertex count interface to cdBg constructor Available for free from the hash table --- include/Read_CdBG_Constructor.hpp | 3 +++ src/Read_CdBG_Constructor.cpp | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/include/Read_CdBG_Constructor.hpp b/include/Read_CdBG_Constructor.hpp index 6b0f87d3..0658b315 100644 --- a/include/Read_CdBG_Constructor.hpp +++ b/include/Read_CdBG_Constructor.hpp @@ -92,6 +92,9 @@ class Read_CdBG_Constructor // Computes the states of the DFA in the de Bruijn graph. void compute_DFA_states(); + // Returns the number of distinct vertices in the underlying graph. + uint64_t vertex_count() const; + // Returns the number of distinct edges in the underlying graph. 
uint64_t edge_count() const; }; diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index dac3a713..bed807f8 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -134,6 +134,13 @@ void Read_CdBG_Constructor::process_edges(Kmer_SPMC_Iterator* const ed } +template +uint64_t Read_CdBG_Constructor::vertex_count() const +{ + return hash_table.size(); +} + + template uint64_t Read_CdBG_Constructor::edge_count() const { From c60dab1272412b7d43899f46c6dc3426f311c019 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 7 Jul 2021 18:38:27 -0400 Subject: [PATCH 115/350] Add basic structural information dump --- include/Read_CdBG.hpp | 8 ++++++++ src/Read_CdBG.cpp | 28 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/include/Read_CdBG.hpp b/include/Read_CdBG.hpp index d707218f..88d4de4f 100644 --- a/include/Read_CdBG.hpp +++ b/include/Read_CdBG.hpp @@ -7,6 +7,8 @@ #include "Build_Params.hpp" #include "Kmer_Hash_Table.hpp" +#include "nlohmann/json.hpp" + // Read de Bruijn graph class to support the compaction algorithm. template @@ -17,6 +19,12 @@ class Read_CdBG const Build_Params params; // Required parameters (wrapped inside). Kmer_Hash_Table hash_table; // Hash table for the vertices (canonical k-mers) of the graph. + nlohmann::ordered_json dBg_info; // JSON object to store structural information over the de Bruijn graph. + + + // Writes the structural information about the de Bruijn graph — obtained from the algorithm + // execution — to disk. + void dump_dBg_info() const; public: diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 66d87b8a..d1466979 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -3,6 +3,8 @@ #include "Read_CdBG_Constructor.hpp" #include "Read_CdBG_Extractor.hpp" +#include + template Read_CdBG::Read_CdBG(const Build_Params& params): @@ -21,6 +23,10 @@ void Read_CdBG::construct() Read_CdBG_Constructor cdBg_constructor(params, hash_table); cdBg_constructor.compute_DFA_states(); + const char* const field_type = "basic info"; + dBg_info[field_type]["vertex count"] = cdBg_constructor.vertex_count(); + dBg_info[field_type]["edge count"] = cdBg_constructor.edge_count(); + std::cout << (!params.extract_cycles() ? "\nExtracting the maximal unitigs.\n": "\nExtracting the detached chordless cycles.\n"); Read_CdBG_Extractor cdBg_extractor(params, hash_table); @@ -29,6 +35,28 @@ void Read_CdBG::construct() cdBg_extractor.extract_detached_cycles(); hash_table.clear(); + + dump_dBg_info(); +} + + +template +void Read_CdBG::dump_dBg_info() const +{ + const std::string info_file_path = params.output_file_path() + ".json"; + + std::ofstream output(info_file_path.c_str()); + output << std::setw(4) << dBg_info << "\n"; + + if(output.fail()) + { + std::cerr << "Error writing to the information file " << info_file_path << ". 
Aborting.\n"; + std::exit(EXIT_FAILURE); + } + + output.close(); + + std::cout << "\nStructural information for the de Bruijn graph is written to " << info_file_path << ".\n"; } From 95625f42ba2b3ef3ef78951610a36fe6e9022248 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 7 Jul 2021 18:47:58 -0400 Subject: [PATCH 116/350] Centralize JSON type --- include/Read_CdBG.hpp | 3 ++- include/globals.hpp | 7 ++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/include/Read_CdBG.hpp b/include/Read_CdBG.hpp index 88d4de4f..1d848f82 100644 --- a/include/Read_CdBG.hpp +++ b/include/Read_CdBG.hpp @@ -4,6 +4,7 @@ +#include "globals.hpp" #include "Build_Params.hpp" #include "Kmer_Hash_Table.hpp" @@ -19,7 +20,7 @@ class Read_CdBG const Build_Params params; // Required parameters (wrapped inside). Kmer_Hash_Table hash_table; // Hash table for the vertices (canonical k-mers) of the graph. - nlohmann::ordered_json dBg_info; // JSON object to store structural information over the de Bruijn graph. + cuttlefish::json_t dBg_info; // JSON object to store structural information over the de Bruijn graph. // Writes the structural information about the de Bruijn graph — obtained from the algorithm diff --git a/include/globals.hpp b/include/globals.hpp index 049f5cea..90378e93 100644 --- a/include/globals.hpp +++ b/include/globals.hpp @@ -6,8 +6,10 @@ #include "Kmer.hpp" -#include #include "boost/preprocessor/repetition/repeat.hpp" +#include "nlohmann/json_fwd.hpp" + +#include // The macro `INSTANCE_COUNT` must be set exactly to `(MAX_K + 1) / 2` for a required maximum k-value. @@ -74,6 +76,9 @@ namespace cuttlefish typedef std::shared_ptr logger_t; + + + typedef nlohmann::ordered_json json_t; } From 58a007b33b4701d4b9ba237700b6fd558dbf587c Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 7 Jul 2021 19:18:01 -0400 Subject: [PATCH 117/350] Dump more structural information Contigs info --- include/Read_CdBG_Extractor.hpp | 4 +++- include/Unipaths_Meta_info.hpp | 6 ++++++ src/Read_CdBG.cpp | 2 +- src/Read_CdBG_Extractor.cpp | 6 ++++-- src/Unipaths_Meta_info.cpp | 19 ++++++++++++++++++- 5 files changed, 32 insertions(+), 5 deletions(-) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index af354ce9..9c7ae458 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -50,6 +50,8 @@ class Read_CdBG_Extractor Unipaths_Meta_info unipaths_meta_info; // Meta-information over the extracted maximal unitigs. + cuttlefish::json_t& dBg_info; // Reference to a JSON object that contains structural information of the graph. + // Distributes the maximal unitigs extraction task — disperses the graph vertices (i.e. k-mers) // parsed by the parser `vertex_parser` to the worker threads in the thread pool `thread_pool`, @@ -168,7 +170,7 @@ class Read_CdBG_Extractor // Constructs a vertex-extractor object for some compacted read de Bruijn graph, with the required // parameters wrapped inside `params`, and uses the Cuttlefish hash table `hash_table`. - Read_CdBG_Extractor(const Build_Params& params, Kmer_Hash_Table& hash_table); + Read_CdBG_Extractor(const Build_Params& params, Kmer_Hash_Table& hash_table, cuttlefish::json_t& dBg_info); // Extracts the maximal unitigs of the de Bruijn graph. 
void extract_maximal_unitigs(); diff --git a/include/Unipaths_Meta_info.hpp b/include/Unipaths_Meta_info.hpp index 32a42ecf..eac2d72a 100644 --- a/include/Unipaths_Meta_info.hpp +++ b/include/Unipaths_Meta_info.hpp @@ -4,6 +4,8 @@ +#include "globals.hpp" + #include #include @@ -36,6 +38,10 @@ class Unipaths_Meta_info // Returns the total number of k-mers in the extracted maximal unitigs. uint64_t kmer_count() const; + // Populates the JSON file `dBg_info` with the tracked information over + // the maximal unitigs. + void populate(cuttlefish::json_t& dBg_info) const; + // Prints the tracked information to the standard output. void print() const; }; diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index d1466979..7f78aeeb 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -29,7 +29,7 @@ void Read_CdBG::construct() std::cout << (!params.extract_cycles() ? "\nExtracting the maximal unitigs.\n": "\nExtracting the detached chordless cycles.\n"); - Read_CdBG_Extractor cdBg_extractor(params, hash_table); + Read_CdBG_Extractor cdBg_extractor(params, hash_table, dBg_info); !params.extract_cycles() ? cdBg_extractor.extract_maximal_unitigs(): cdBg_extractor.extract_detached_cycles(); diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index fdbd6ae9..61e40c04 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -9,9 +9,10 @@ template -Read_CdBG_Extractor::Read_CdBG_Extractor(const Build_Params& params, Kmer_Hash_Table& hash_table): +Read_CdBG_Extractor::Read_CdBG_Extractor(const Build_Params& params, Kmer_Hash_Table& hash_table, cuttlefish::json_t& dBg_info): params(params), - hash_table(hash_table) + hash_table(hash_table), + dBg_info(dBg_info) {} @@ -49,6 +50,7 @@ void Read_CdBG_Extractor::extract_maximal_unitigs() close_output_sink(); std::cout << "Number of scanned vertices: " << vertices_scanned << ".\n"; + unipaths_meta_info.populate(dBg_info); unipaths_meta_info.print(); // Check for the existence of cycle(s). diff --git a/src/Unipaths_Meta_info.cpp b/src/Unipaths_Meta_info.cpp index 81bc4ae8..811a6393 100644 --- a/src/Unipaths_Meta_info.cpp +++ b/src/Unipaths_Meta_info.cpp @@ -1,9 +1,11 @@ #include "Unipaths_Meta_info.hpp" -#include "globals.hpp" + +#include "nlohmann/json.hpp" #include #include +#include template @@ -35,6 +37,21 @@ uint64_t Unipaths_Meta_info::kmer_count() const } +template +void Unipaths_Meta_info::populate(cuttlefish::json_t& dBg_info) const +{ + const char* const field_type = "contigs info"; + + dBg_info[field_type]["maximal unitig count"] = unipath_count_; + dBg_info[field_type]["vertex count in the maximal unitigs"] = kmer_count_; + dBg_info[field_type]["shortest maximal unitig length"] = min_len_; + dBg_info[field_type]["longest maximal unitig length"] = max_len_; + dBg_info[field_type]["sum maximal unitig length"] = sum_len_; + dBg_info[field_type]["avg. 
maximal unitig length"] = static_cast(std::round(static_cast(sum_len_) / unipath_count_)); + dBg_info[field_type]["_comment"] = "lengths are in bases"; +} + + template void Unipaths_Meta_info::print() const { From 52921d896415e744d590157462c470e4cb86cb3f Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 7 Jul 2021 22:21:47 -0400 Subject: [PATCH 118/350] Separate JSON wrapper logic Better design --- include/Read_CdBG.hpp | 9 ++---- include/Read_CdBG_Extractor.hpp | 9 +++--- include/Unipaths_Meta_info.hpp | 17 +++++++++-- include/dBG_Info.hpp | 44 +++++++++++++++++++++++++++++ src/CMakeLists.txt | 1 + src/Read_CdBG.cpp | 31 +++++--------------- src/Read_CdBG_Extractor.cpp | 19 ++++++++----- src/Unipaths_Meta_info.cpp | 40 +++++++++++++++++++------- src/dBG_Info.cpp | 50 +++++++++++++++++++++++++++++++++ 9 files changed, 165 insertions(+), 55 deletions(-) create mode 100644 include/dBG_Info.hpp create mode 100644 src/dBG_Info.cpp diff --git a/include/Read_CdBG.hpp b/include/Read_CdBG.hpp index 1d848f82..1f155bcb 100644 --- a/include/Read_CdBG.hpp +++ b/include/Read_CdBG.hpp @@ -7,8 +7,7 @@ #include "globals.hpp" #include "Build_Params.hpp" #include "Kmer_Hash_Table.hpp" - -#include "nlohmann/json.hpp" +#include "dBG_Info.hpp" // Read de Bruijn graph class to support the compaction algorithm. @@ -20,12 +19,8 @@ class Read_CdBG const Build_Params params; // Required parameters (wrapped inside). Kmer_Hash_Table hash_table; // Hash table for the vertices (canonical k-mers) of the graph. - cuttlefish::json_t dBg_info; // JSON object to store structural information over the de Bruijn graph. - + dBG_Info dbg_info; // Wrapper object for structural information of the graph. - // Writes the structural information about the de Bruijn graph — obtained from the algorithm - // execution — to disk. - void dump_dBg_info() const; public: diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 9c7ae458..6cf15424 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -48,9 +48,7 @@ class Read_CdBG_Extractor mutable uint64_t cycle_count = 0; // Total number of detached chordless cycles. mutable uint64_t cycle_vertex_count = 0; // Total number of vertices present in the detached chordless cycles. - Unipaths_Meta_info unipaths_meta_info; // Meta-information over the extracted maximal unitigs. - - cuttlefish::json_t& dBg_info; // Reference to a JSON object that contains structural information of the graph. + Unipaths_Meta_info unipaths_meta_info_; // Meta-information over the extracted maximal unitigs. // Distributes the maximal unitigs extraction task — disperses the graph vertices (i.e. k-mers) @@ -170,7 +168,7 @@ class Read_CdBG_Extractor // Constructs a vertex-extractor object for some compacted read de Bruijn graph, with the required // parameters wrapped inside `params`, and uses the Cuttlefish hash table `hash_table`. - Read_CdBG_Extractor(const Build_Params& params, Kmer_Hash_Table& hash_table, cuttlefish::json_t& dBg_info); + Read_CdBG_Extractor(const Build_Params& params, Kmer_Hash_Table& hash_table); // Extracts the maximal unitigs of the de Bruijn graph. void extract_maximal_unitigs(); @@ -179,6 +177,9 @@ class Read_CdBG_Extractor // rest of the graph. A precondition for the algorithm is the availability of the maximal unitigs. void extract_detached_cycles(); + // Returns a wrapper over the meta-information of the extracted unitigs. 
+ const Unipaths_Meta_info& unipaths_meta_info() const; + // Returns the number of vertices in the underlying graph. uint64_t vertex_count() const; }; diff --git a/include/Unipaths_Meta_info.hpp b/include/Unipaths_Meta_info.hpp index eac2d72a..dde13d52 100644 --- a/include/Unipaths_Meta_info.hpp +++ b/include/Unipaths_Meta_info.hpp @@ -35,12 +35,23 @@ class Unipaths_Meta_info // Aggregates the information of the tracker `other` to this tracker. void aggregate(const Unipaths_Meta_info& other); + // Returns the total number of maximal unitigs. + uint64_t unipath_count() const; + // Returns the total number of k-mers in the extracted maximal unitigs. uint64_t kmer_count() const; - // Populates the JSON file `dBg_info` with the tracked information over - // the maximal unitigs. - void populate(cuttlefish::json_t& dBg_info) const; + // Returns the length of the longest maximal unitig. + std::size_t max_len() const; + + // Returns the length of the shortest maximal unitig. + std::size_t min_len() const; + + // Returns the sum length of the maximal unitigs. + uint64_t sum_len() const; + + // Returns the average length of the maximal unitigs. + uint64_t avg_len() const; // Prints the tracked information to the standard output. void print() const; diff --git a/include/dBG_Info.hpp b/include/dBG_Info.hpp new file mode 100644 index 00000000..0f921a3b --- /dev/null +++ b/include/dBG_Info.hpp @@ -0,0 +1,44 @@ + +#ifndef DBG_INFO_HPP +#define DBG_INFO_HPP + + + +#include "globals.hpp" + +#include "nlohmann/json.hpp" + + +// Forward declarations. +template class Read_CdBG_Constructor; +template class Read_CdBG_Extractor; + + +// A class to wrap the structural information of a de Bruijn graph and some execution +// information of Cuttlefish over it. +template +class dBG_Info +{ +private: + + cuttlefish::json_t dBg_info; // A JSON object wrapping all the information. + + static constexpr const char* basic_field = "basic info"; // Category header for basic graph information. + static constexpr const char* contigs_field = "contigs info"; // Category header for information about the contigs (maximal unitigs). + + +public: + + // Adds basic graph structural information from `cdbg_constructor`. + void add_basic_info(const Read_CdBG_Constructor& cdbg_constructor); + + // Adds information about the extracted maximal unitigs from `cdbg_extractor`. + void add_unipaths_info(const Read_CdBG_Extractor& cdbg_extractor); + + // Writes the information to a file at path `file_path`. 
+ void dump_info(const std::string& file_path) const; +}; + + + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1527870f..4caaffdd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -30,6 +30,7 @@ set(PROJECT_SRC Unipaths_Meta_info.cpp Detached_Cycles_Extractor.cpp Character_Buffer_Flusher.cpp + dBG_Info.cpp Validator.cpp Validator_Hash_Table.cpp Sequence_Validator.cpp diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 7f78aeeb..38c47c84 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -19,43 +19,26 @@ void Read_CdBG::construct() std::cout << "\nConstructing the minimal perfect hash function (MPHF) over the vertex set.\n"; hash_table.construct(params.thread_count(), params.working_dir_path(), params.mph_file_path()); + std::cout << "\nComputing the DFA states.\n"; Read_CdBG_Constructor cdBg_constructor(params, hash_table); cdBg_constructor.compute_DFA_states(); - const char* const field_type = "basic info"; - dBg_info[field_type]["vertex count"] = cdBg_constructor.vertex_count(); - dBg_info[field_type]["edge count"] = cdBg_constructor.edge_count(); + dbg_info.add_basic_info(cdBg_constructor); + std::cout << (!params.extract_cycles() ? "\nExtracting the maximal unitigs.\n": "\nExtracting the detached chordless cycles.\n"); - Read_CdBG_Extractor cdBg_extractor(params, hash_table, dBg_info); + Read_CdBG_Extractor cdBg_extractor(params, hash_table); !params.extract_cycles() ? - cdBg_extractor.extract_maximal_unitigs(): + cdBg_extractor.extract_maximal_unitigs(), dbg_info.add_unipaths_info(cdBg_extractor): cdBg_extractor.extract_detached_cycles(); - hash_table.clear(); - - dump_dBg_info(); -} + hash_table.clear(); -template -void Read_CdBG::dump_dBg_info() const -{ const std::string info_file_path = params.output_file_path() + ".json"; - - std::ofstream output(info_file_path.c_str()); - output << std::setw(4) << dBg_info << "\n"; - - if(output.fail()) - { - std::cerr << "Error writing to the information file " << info_file_path << ". Aborting.\n"; - std::exit(EXIT_FAILURE); - } - - output.close(); - + dbg_info.dump_info(info_file_path); std::cout << "\nStructural information for the de Bruijn graph is written to " << info_file_path << ".\n"; } diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 61e40c04..ff4d6b5a 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -9,10 +9,9 @@ template -Read_CdBG_Extractor::Read_CdBG_Extractor(const Build_Params& params, Kmer_Hash_Table& hash_table, cuttlefish::json_t& dBg_info): +Read_CdBG_Extractor::Read_CdBG_Extractor(const Build_Params& params, Kmer_Hash_Table& hash_table): params(params), - hash_table(hash_table), - dBg_info(dBg_info) + hash_table(hash_table) {} @@ -50,11 +49,10 @@ void Read_CdBG_Extractor::extract_maximal_unitigs() close_output_sink(); std::cout << "Number of scanned vertices: " << vertices_scanned << ".\n"; - unipaths_meta_info.populate(dBg_info); - unipaths_meta_info.print(); + unipaths_meta_info_.print(); // Check for the existence of cycle(s). - if(unipaths_meta_info.kmer_count() != vertex_container.size()) + if(unipaths_meta_info_.kmer_count() != vertex_container.size()) std::cout << "\nCycles disconnected from the rest of the graph are present." " I.e. the cycles are graph components exclusively on their own.\n\n"; @@ -138,7 +136,7 @@ void Read_CdBG_Extractor::scan_vertices(Kmer_SPMC_Iterator* const vertex_p std::cout << "Thread " << thread_id << " processed " << vertex_count << " vertices.\n"; // TODO: remove. 
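The statements that follow are the usual accumulate-then-merge idiom used by the extractor's worker threads: each thread fills a private tracker and folds it into the shared one inside a single short critical section. A minimal, self-contained sketch of that idiom, using std::mutex and a simplified stand-in tracker (not the project's Spin_Lock or Unipaths_Meta_info), assuming nothing beyond the standard library:

    #include <cstdint>
    #include <mutex>
    #include <thread>
    #include <vector>

    struct Tracker  // Simplified stand-in for a per-thread statistics tracker.
    {
        uint64_t count = 0, sum_len = 0;

        void add(uint64_t len) { count++; sum_len += len; }  // Thread-local update; no locking needed.
        void aggregate(const Tracker& o) { count += o.count; sum_len += o.sum_len; }  // Merge another tracker in.
    };

    int main()
    {
        Tracker global;
        std::mutex mtx;
        std::vector<std::thread> workers;

        for(int t = 0; t < 4; ++t)
            workers.emplace_back([&]()
            {
                Tracker local;  // Private accumulator: updated lock-free in the hot loop.
                for(uint64_t len = 1; len <= 1000; ++len)
                    local.add(len);

                const std::lock_guard<std::mutex> guard(mtx);  // One short critical section per thread.
                global.aggregate(local);
            });

        for(auto& w : workers)
            w.join();

        return 0;
    }

The design point is that the per-item updates stay synchronization-free; contention is paid only once per thread, which is what the lock-guarded aggregate call below does.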
vertices_scanned += vertex_count; - unipaths_meta_info.aggregate(extracted_unipaths_info); + unipaths_meta_info_.aggregate(extracted_unipaths_info); lock.unlock(); } @@ -202,6 +200,13 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const } +template +const Unipaths_Meta_info& Read_CdBG_Extractor::unipaths_meta_info() const +{ + return unipaths_meta_info_; +} + + template uint64_t Read_CdBG_Extractor::vertex_count() const { diff --git a/src/Unipaths_Meta_info.cpp b/src/Unipaths_Meta_info.cpp index 811a6393..a2a08ad7 100644 --- a/src/Unipaths_Meta_info.cpp +++ b/src/Unipaths_Meta_info.cpp @@ -30,6 +30,13 @@ void Unipaths_Meta_info::aggregate(const Unipaths_Meta_info& other) } +template +uint64_t Unipaths_Meta_info::unipath_count() const +{ + return unipath_count_; +} + + template uint64_t Unipaths_Meta_info::kmer_count() const { @@ -38,17 +45,30 @@ uint64_t Unipaths_Meta_info::kmer_count() const template -void Unipaths_Meta_info::populate(cuttlefish::json_t& dBg_info) const +std::size_t Unipaths_Meta_info::max_len() const +{ + return max_len_; +} + + +template +std::size_t Unipaths_Meta_info::min_len() const +{ + return min_len_; +} + + +template +uint64_t Unipaths_Meta_info::sum_len() const +{ + return sum_len_; +} + + +template +uint64_t Unipaths_Meta_info::avg_len() const { - const char* const field_type = "contigs info"; - - dBg_info[field_type]["maximal unitig count"] = unipath_count_; - dBg_info[field_type]["vertex count in the maximal unitigs"] = kmer_count_; - dBg_info[field_type]["shortest maximal unitig length"] = min_len_; - dBg_info[field_type]["longest maximal unitig length"] = max_len_; - dBg_info[field_type]["sum maximal unitig length"] = sum_len_; - dBg_info[field_type]["avg. maximal unitig length"] = static_cast(std::round(static_cast(sum_len_) / unipath_count_)); - dBg_info[field_type]["_comment"] = "lengths are in bases"; + return static_cast(std::round(static_cast(sum_len_) / unipath_count_)); } diff --git a/src/dBG_Info.cpp b/src/dBG_Info.cpp new file mode 100644 index 00000000..78700545 --- /dev/null +++ b/src/dBG_Info.cpp @@ -0,0 +1,50 @@ + +#include "dBG_Info.hpp" +#include "Read_CdBG_Constructor.hpp" +#include "Read_CdBG_Extractor.hpp" + +#include + + +template +void dBG_Info::add_basic_info(const Read_CdBG_Constructor& cdbg_constructor) +{ + dBg_info[basic_field]["vertex count"] = cdbg_constructor.vertex_count(); + dBg_info[basic_field]["edge count"] = cdbg_constructor.edge_count(); +} + + +template +void dBG_Info::add_unipaths_info(const Read_CdBG_Extractor& cdbg_extractor) +{ + const Unipaths_Meta_info& unipaths_info = cdbg_extractor.unipaths_meta_info(); + + dBg_info[contigs_field]["maximal unitig count"] = unipaths_info.unipath_count(); + dBg_info[contigs_field]["vertex count in the maximal unitigs"] = unipaths_info.kmer_count(); + dBg_info[contigs_field]["shortest maximal unitig length"] = unipaths_info.min_len(); + dBg_info[contigs_field]["longest maximal unitig length"] = unipaths_info.max_len(); + dBg_info[contigs_field]["sum maximal unitig length"] = unipaths_info.sum_len(); + dBg_info[contigs_field]["avg. maximal unitig length"] = unipaths_info.avg_len(); + dBg_info[contigs_field]["_comment"] = "lengths are in bases"; +} + + +template +void dBG_Info::dump_info(const std::string& file_path) const +{ + std::ofstream output(file_path.c_str()); + output << std::setw(4) << dBg_info << "\n"; // Pretty-print the JSON wrapper with overloaded `std::setw`. 
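For context on the line above: nlohmann::json hooks the std::setw stream manipulator, so streaming a JSON object after std::setw(4) pretty-prints it with a 4-space indent, the same output as calling dump(4). A small stand-alone illustration; the file name and the numeric values are made up for the example, while the field names mirror the ones written by this patch:

    #include <fstream>
    #include <iomanip>
    #include "nlohmann/json.hpp"

    int main()
    {
        nlohmann::ordered_json info;  // ordered_json preserves the insertion order of keys in the output.
        info["basic info"]["vertex count"] = 42;
        info["basic info"]["edge count"] = 57;

        std::ofstream out("example_info.json");  // Illustrative output path only.
        out << std::setw(4) << info << "\n";     // Same output as: out << info.dump(4) << "\n";

        return 0;
    }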
+ + if(output.fail()) + { + std::cerr << "Error writing to the information file " << file_path << ". Aborting.\n"; + std::exit(EXIT_FAILURE); + } + + output.close(); +} + + + +// Template instantiations for the required instances. +ENUMERATE(INSTANCE_COUNT, INSTANTIATE, dBG_Info) From 22c1208fb63c5c0bc8f819c88555d5072f2a13ee Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 7 Jul 2021 22:25:00 -0400 Subject: [PATCH 119/350] Remove central JSON type No longer required --- include/dBG_Info.hpp | 2 +- include/globals.hpp | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/include/dBG_Info.hpp b/include/dBG_Info.hpp index 0f921a3b..cdcd86ec 100644 --- a/include/dBG_Info.hpp +++ b/include/dBG_Info.hpp @@ -21,7 +21,7 @@ class dBG_Info { private: - cuttlefish::json_t dBg_info; // A JSON object wrapping all the information. + nlohmann::ordered_json dBg_info; // A JSON object wrapping all the information. static constexpr const char* basic_field = "basic info"; // Category header for basic graph information. static constexpr const char* contigs_field = "contigs info"; // Category header for information about the contigs (maximal unitigs). diff --git a/include/globals.hpp b/include/globals.hpp index 90378e93..47d3b211 100644 --- a/include/globals.hpp +++ b/include/globals.hpp @@ -7,7 +7,6 @@ #include "Kmer.hpp" #include "boost/preprocessor/repetition/repeat.hpp" -#include "nlohmann/json_fwd.hpp" #include @@ -76,9 +75,6 @@ namespace cuttlefish typedef std::shared_ptr logger_t; - - - typedef nlohmann::ordered_json json_t; } From ef49ca7c8cee17cf5622b9db7f7f3c481a93ea32 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 8 Jul 2021 11:53:02 -0400 Subject: [PATCH 120/350] Remove fwd header for JSON MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Missing commit — comes with the decentralized type --- include/nlohmann/json_fwd.hpp | 78 ----------------------------------- 1 file changed, 78 deletions(-) delete mode 100644 include/nlohmann/json_fwd.hpp diff --git a/include/nlohmann/json_fwd.hpp b/include/nlohmann/json_fwd.hpp deleted file mode 100644 index 332227c1..00000000 --- a/include/nlohmann/json_fwd.hpp +++ /dev/null @@ -1,78 +0,0 @@ -#ifndef INCLUDE_NLOHMANN_JSON_FWD_HPP_ -#define INCLUDE_NLOHMANN_JSON_FWD_HPP_ - -#include // int64_t, uint64_t -#include // map -#include // allocator -#include // string -#include // vector - -/*! -@brief namespace for Niels Lohmann -@see https://github.com/nlohmann -@since version 1.0.0 -*/ -namespace nlohmann -{ -/*! -@brief default JSONSerializer template argument - -This serializer ignores the template arguments and uses ADL -([argument-dependent lookup](https://en.cppreference.com/w/cpp/language/adl)) -for serialization. -*/ -template -struct adl_serializer; - -template class ObjectType = - std::map, - template class ArrayType = std::vector, - class StringType = std::string, class BooleanType = bool, - class NumberIntegerType = std::int64_t, - class NumberUnsignedType = std::uint64_t, - class NumberFloatType = double, - template class AllocatorType = std::allocator, - template class JSONSerializer = - adl_serializer, - class BinaryType = std::vector> -class basic_json; - -/*! -@brief JSON Pointer - -A JSON pointer defines a string syntax for identifying a specific value -within a JSON document. It can be used with functions `at` and -`operator[]`. Furthermore, JSON pointers are the base for JSON patches. 
- -@sa [RFC 6901](https://tools.ietf.org/html/rfc6901) - -@since version 2.0.0 -*/ -template -class json_pointer; - -/*! -@brief default JSON class - -This type is the default specialization of the @ref basic_json class which -uses the standard template types. - -@since version 1.0.0 -*/ -using json = basic_json<>; - -template -struct ordered_map; - -/*! -@brief ordered JSON class - -This type preserves the insertion order of object keys. - -@since version 3.9.0 -*/ -using ordered_json = basic_json; - -} // namespace nlohmann - -#endif // INCLUDE_NLOHMANN_JSON_FWD_HPP_ From 4fb655d881f1663fc68f103247f1a9b177fbcdfc Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 8 Jul 2021 16:47:29 -0400 Subject: [PATCH 121/350] Better KMC-DB size query Skip loading entire .kmc_pre file for such --- include/kmc_api/kmc_file.h | 5 ++- src/Kmer_Container.cpp | 2 +- src/kmc_api/kmc_file.cpp | 79 +++++++++++++++++++++++++------------- 3 files changed, 58 insertions(+), 28 deletions(-) diff --git a/include/kmc_api/kmc_file.h b/include/kmc_api/kmc_file.h index da857f0f..6e3ee388 100644 --- a/include/kmc_api/kmc_file.h +++ b/include/kmc_api/kmc_file.h @@ -80,7 +80,7 @@ class CKMCFile bool OpenASingleFile(const std::string &file_name, FILE *&file_handler, uint64 &size, char marker[]); // Recognize current parameters. Auxiliary function. - bool ReadParamsFrom_prefix_file_buf(uint64 &size); + bool ReadParamsFrom_prefix_file_buf(uint64 &size, bool load_pref_file = true); // Reload a contents of an array "sufix_file_buf" for listing mode. Auxiliary function. void Reload_sufix_file_buf(); @@ -113,6 +113,9 @@ class CKMCFile // Open files `*kmc_pre` & `*.kmc_suf`, read `*.kmc_pre` to RAM; `*.kmc_suf` is not buffered internally. bool open_for_listing_unbuffered(const std::string& file_name); + // Open files `*kmc_pre` & `*.kmc_suf`, and read KMC DB parameters to RAM. + bool read_parameters(const std::string& file_name); + // Returns the size of a suffix-record in disk (in bytes); i.e. suffix-size plus counter-size. uint32_t suff_record_size() const; diff --git a/src/Kmer_Container.cpp b/src/Kmer_Container.cpp index 9feb2a2d..39da187c 100644 --- a/src/Kmer_Container.cpp +++ b/src/Kmer_Container.cpp @@ -9,7 +9,7 @@ Kmer_Container::Kmer_Container(const std::string& kmc_file_path): kmc_file_path(kmc_file_path) { CKMCFile kmer_database; - if(!kmer_database.open_for_listing_unbuffered(kmc_file_path)) + if(!kmer_database.read_parameters(kmc_file_path)) { std::cout << "Error opening KMC database files with prefix " << kmc_file_path << ". Aborting.\n"; std::exit(EXIT_FAILURE); diff --git a/src/kmc_api/kmc_file.cpp b/src/kmc_api/kmc_file.cpp index d1aa36dc..ab9bbdfc 100644 --- a/src/kmc_api/kmc_file.cpp +++ b/src/kmc_api/kmc_file.cpp @@ -136,22 +136,46 @@ bool CKMCFile::open_for_listing_unbuffered(const std::string& file_name) suffix_file_total_to_read = size; suf_file_left_to_read = suffix_file_total_to_read; + sufix_file_buf = NULL; + is_opened = opened_for_listing; + prefix_index = 0; + sufix_number = 0; + index_in_partial_buf = 0; + return true; +} - sufix_file_buf = NULL; - /* - sufix_file_buf = new uchar[part_size]; +//---------------------------------------------------------------------------------- +// Open files *kmc_pre & *.kmc_suf and reads KMC DB parameters to RAM. 
+// *.kmc_suf is not buffered +// IN : file_name - the name of kmer_counter's output +// RET : true - if successful +//---------------------------------------------------------------------------------- +bool CKMCFile::read_parameters(const std::string& file_name) +{ + uint64 size; - auto to_read = MIN(suf_file_left_to_read, part_size); - auto readed = fread(sufix_file_buf, 1, to_read, file_suf); - if (readed != to_read) - { - std::cerr << "Error: some error while reading suffix file\n"; + if (is_opened) + return false; + + if (file_pre || file_suf) return false; - } - suf_file_left_to_read -= readed; - */ + if (!OpenASingleFile(file_name + ".kmc_pre", file_pre, size, (char *)"KMCP")) + return false; + + ReadParamsFrom_prefix_file_buf(size, false); + fclose(file_pre); + file_pre = NULL; + + end_of_file = total_kmers == 0; + + if (!OpenASingleFile(file_name + ".kmc_suf", file_suf, size, (char *)"KMCS")) + return false; + + suffix_file_total_to_read = size; + suf_file_left_to_read = suffix_file_total_to_read; + sufix_file_buf = NULL; is_opened = opened_for_listing; prefix_index = 0; @@ -236,7 +260,7 @@ bool CKMCFile::OpenASingleFile(const std::string &file_name, FILE *&file_handler // IN : the size of the file *.kmc_pre, without initial and terminal markers // RET : true - if succesfull //---------------------------------------------------------------------------------- -bool CKMCFile::ReadParamsFrom_prefix_file_buf(uint64 &size) +bool CKMCFile::ReadParamsFrom_prefix_file_buf(uint64 &size, const bool load_pref_file) { size_t prev_pos = my_ftell(file_pre); my_fseek(file_pre, -12, SEEK_END); @@ -277,19 +301,22 @@ bool CKMCFile::ReadParamsFrom_prefix_file_buf(uint64 &size) single_LUT_size = 1 << (2 * lut_prefix_length); uint64 last_data_index = lut_area_size_in_bytes / sizeof(uint64); - rewind(file_pre); - my_fseek(file_pre, +4, SEEK_CUR); - prefix_file_buf_size = (lut_area_size_in_bytes + 8) / sizeof(uint64); //reads without 4 bytes of a header_offset (and without markers) - prefix_file_buf = new uint64[prefix_file_buf_size]; - result = fread(prefix_file_buf, 1, (size_t)(lut_area_size_in_bytes + 8), file_pre); - if (result == 0) - return false; - prefix_file_buf[last_data_index] = total_kmers + 1; - - signature_map = new uint32[signature_map_size]; - result = fread(signature_map, 1, signature_map_size * sizeof(uint32), file_pre); - if (result == 0) - return false; + if(load_pref_file) + { + rewind(file_pre); + my_fseek(file_pre, +4, SEEK_CUR); + prefix_file_buf_size = (lut_area_size_in_bytes + 8) / sizeof(uint64); //reads without 4 bytes of a header_offset (and without markers) + prefix_file_buf = new uint64[prefix_file_buf_size]; + result = fread(prefix_file_buf, 1, (size_t)(lut_area_size_in_bytes + 8), file_pre); + if (result == 0) + return false; + prefix_file_buf[last_data_index] = total_kmers + 1; + + signature_map = new uint32[signature_map_size]; + result = fread(signature_map, 1, signature_map_size * sizeof(uint32), file_pre); + if (result == 0) + return false; + } sufix_size = (kmer_length - lut_prefix_length) / 4; @@ -297,7 +324,7 @@ bool CKMCFile::ReadParamsFrom_prefix_file_buf(uint64 &size) return true; } - else if (kmc_version == 0) + else if (kmc_version == 0) // Not used with cuttlefish. 
{ prefix_file_buf_size = (size - 4) / sizeof(uint64); //reads without 4 bytes of a header_offset (and without markers) prefix_file_buf = new uint64[prefix_file_buf_size]; From db35feb714eeb285131616e0ff220839599a28d0 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 8 Jul 2021 17:01:04 -0400 Subject: [PATCH 122/350] Note important TODO --- src/Read_CdBG_Extractor.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index ff4d6b5a..9efd3013 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -180,6 +180,8 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const unipath.emplace_back(Kmer::map_char(b_ext)); if(params.dcc_opt()) path_hashes.emplace_back(v.hash()); + // TODO: write-out to disk in case of the size crossing some threshold, and modify `mark_path` accordingly — + // would prevent unwanted memory blow-up in presence of very large maximal unitigs. } const Directed_Vertex& term_vertex = v; From e6c919b70688fff5430a33bec75d61f8c0f67b6e Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 8 Jul 2021 22:01:47 -0400 Subject: [PATCH 123/350] Add DCC info to JSON & loading JSON --- include/Build_Params.hpp | 10 +++++++ include/Read_CdBG_Extractor.hpp | 2 -- include/Unipaths_Meta_info.hpp | 29 +++++++++++++++++++++ include/dBG_Info.hpp | 18 +++++++++++-- src/Detached_Cycles_Extractor.cpp | 7 +++-- src/Read_CdBG.cpp | 11 ++++---- src/Unipaths_Meta_info.cpp | 26 ++++++++++++++++++- src/dBG_Info.cpp | 43 ++++++++++++++++++++++++++++++- src/main.cpp | 6 ++++- 9 files changed, 135 insertions(+), 17 deletions(-) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index 33261d36..d13310ae 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -30,6 +30,7 @@ class Build_Params const bool remove_kmc_db_; // Option to remove the KMC database, once no longer required. const std::string& mph_file_path_; // Optional path to file storing an MPH over the k-mer set. const std::string& buckets_file_path_; // Optional path to file storing the hash table buckets for the k-mer set. + const std::string& json_file_path_; // Optional path to file storing meta-information about the graph and cuttlefish executions. const bool dcc_opt_; // Option to optimize post-cdBG-construction extraction of DCCs (Detached Chordless Cycles). const bool extract_cycles_; // Option to extract detached chordless cycles from the de Bruijn graph after compaction. @@ -51,6 +52,7 @@ class Build_Params const bool remove_kmc_db, const std::string& mph_file_path, const std::string& buckets_file_path, + const std::string& json_file_path, const bool dcc_opt, const bool extract_cycles): is_read_graph_(is_read_graph), @@ -65,6 +67,7 @@ class Build_Params remove_kmc_db_(remove_kmc_db), mph_file_path_(mph_file_path), buckets_file_path_(buckets_file_path), + json_file_path_(json_file_path), dcc_opt_(dcc_opt), extract_cycles_(extract_cycles) {} @@ -154,6 +157,13 @@ class Build_Params } + // Returns the path to the optional file storing meta-information about the graph and cuttlefish executions. + const std::string& json_file_path() const + { + return json_file_path_; + } + + // Returns whether the option of optimizing post-cdBG-construction extraction of DCCs is specified. 
bool dcc_opt() const { diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 6cf15424..c856d7f8 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -45,8 +45,6 @@ class Read_CdBG_Extractor mutable Spin_Lock lock; // Mutual exclusion lock to access various unique resources by threads spawned off this class' methods. mutable uint64_t vertices_marked = 0; // Total number of vertices marked as present in maximal unitigs; used for the extraction of detached chordless cycle(s), if any. - mutable uint64_t cycle_count = 0; // Total number of detached chordless cycles. - mutable uint64_t cycle_vertex_count = 0; // Total number of vertices present in the detached chordless cycles. Unipaths_Meta_info unipaths_meta_info_; // Meta-information over the extracted maximal unitigs. diff --git a/include/Unipaths_Meta_info.hpp b/include/Unipaths_Meta_info.hpp index dde13d52..df2199ad 100644 --- a/include/Unipaths_Meta_info.hpp +++ b/include/Unipaths_Meta_info.hpp @@ -22,6 +22,10 @@ class Unipaths_Meta_info std::size_t min_len_; // Length of the shortest maximal unitig. uint64_t sum_len_; // Sum length of the maximal unitigs. + uint64_t dcc_count_; // Total number of DCCs (Detached Chordless Cycles). + uint64_t dcc_kmer_count_; // Total number of k-mers in the DCCs. + uint64_t dcc_sum_len_; // Sum length of the DCCs. + public: @@ -32,6 +36,10 @@ class Unipaths_Meta_info template void add_maximal_unitig(const T_container_& unipath); + // Adds information of the DCC (Detached Chordless Cycle) `cycle` to the tracker. + template + void add_DCC(const T_container_& cycle); + // Aggregates the information of the tracker `other` to this tracker. void aggregate(const Unipaths_Meta_info& other); @@ -53,6 +61,15 @@ class Unipaths_Meta_info // Returns the average length of the maximal unitigs. uint64_t avg_len() const; + // Returns the total number of DCCs (Detached Chordless Cycles). + uint64_t dcc_count() const; + + // Returns the total number of k-mers in the DCCs. + uint64_t dcc_kmer_count() const; + + // Returns the sum length of the DCCs. + uint64_t dcc_sum_len() const; + // Prints the tracked information to the standard output. void print() const; }; @@ -76,5 +93,17 @@ inline void Unipaths_Meta_info::add_maximal_unitig(const T_container_& unipat } +template +template +inline void Unipaths_Meta_info::add_DCC(const T_container_& cycle) +{ + dcc_count_++; + + dcc_kmer_count_ += cycle.size() - (k - 1); + + dcc_sum_len_ += cycle.size(); +} + + #endif diff --git a/include/dBG_Info.hpp b/include/dBG_Info.hpp index cdcd86ec..451e1d05 100644 --- a/include/dBG_Info.hpp +++ b/include/dBG_Info.hpp @@ -23,20 +23,34 @@ class dBG_Info nlohmann::ordered_json dBg_info; // A JSON object wrapping all the information. + const std::string file_path; // Path to the disk-file to store the JSON object. + static constexpr const char* basic_field = "basic info"; // Category header for basic graph information. static constexpr const char* contigs_field = "contigs info"; // Category header for information about the contigs (maximal unitigs). + static constexpr const char* dcc_field = "detached chordless cycles (DCC) info"; // Category header for information about the DCCs. + + + // Loads the JSON file from disk, if the corresponding file exists. + void load_from_file(); public: + // Constructs a `dBG_Info` object that would correspond to the file at + // path `file_path`. 
+ dBG_Info(const std::string& file_path); + // Adds basic graph structural information from `cdbg_constructor`. void add_basic_info(const Read_CdBG_Constructor& cdbg_constructor); // Adds information about the extracted maximal unitigs from `cdbg_extractor`. void add_unipaths_info(const Read_CdBG_Extractor& cdbg_extractor); - // Writes the information to a file at path `file_path`. - void dump_info(const std::string& file_path) const; + // Adds information about the extracted DCCs from `cdbg_extractor`. + void add_DCC_info(const Read_CdBG_Extractor& cdbg_extractor); + + // Writes the JSON object to its corresponding disk-file. + void dump_info() const; }; diff --git a/src/Detached_Cycles_Extractor.cpp b/src/Detached_Cycles_Extractor.cpp index 618c2d1d..a09da61c 100644 --- a/src/Detached_Cycles_Extractor.cpp +++ b/src/Detached_Cycles_Extractor.cpp @@ -22,8 +22,8 @@ void Read_CdBG_Extractor::extract_detached_cycles() std::cout << "Extracting the cycles.\n"; extract_detached_chordless_cycles(); - std::cout << "\nNumber of detached chordless cycles: " << cycle_count << ".\n" - "Number of vertices in the cycles: " << cycle_vertex_count << ".\n"; + std::cout << "\nNumber of detached chordless cycles: " << unipaths_meta_info_.dcc_count() << ".\n" + "Number of vertices in the cycles: " << unipaths_meta_info_.dcc_kmer_count() << ".\n"; std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); @@ -198,6 +198,7 @@ void Read_CdBG_Extractor::extract_detached_chordless_cycles(Kmer_SPMC_Iterato { cycles_extracted++; cycle_vertices += cycle.size() - (k - 1); + unipaths_meta_info_.add_DCC(cycle); // cycle.emplace_back('\n'); // output_buffer += FASTA_Record>(id, cycle); @@ -215,8 +216,6 @@ void Read_CdBG_Extractor::extract_detached_chordless_cycles(Kmer_SPMC_Iterato " and extracted " << cycles_extracted << " cycles.\n"; vertices_scanned += vertex_count; - cycle_count += cycles_extracted; - cycle_vertex_count += cycle_vertices; lock.unlock(); } diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 38c47c84..a613269f 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -9,7 +9,8 @@ template Read_CdBG::Read_CdBG(const Build_Params& params): params(params), - hash_table(params.vertex_db_path()) + hash_table(params.vertex_db_path()), + dbg_info(params.json_file_path().empty() ? (params.output_file_path() + ".json") : params.json_file_path()) {} @@ -31,15 +32,13 @@ void Read_CdBG::construct() "\nExtracting the maximal unitigs.\n": "\nExtracting the detached chordless cycles.\n"); Read_CdBG_Extractor cdBg_extractor(params, hash_table); !params.extract_cycles() ?
- cdBg_extractor.extract_maximal_unitigs(), dbg_info.add_unipaths_info(cdBg_extractor): - cdBg_extractor.extract_detached_cycles(); + (cdBg_extractor.extract_maximal_unitigs(), dbg_info.add_unipaths_info(cdBg_extractor)): + (cdBg_extractor.extract_detached_cycles(), dbg_info.add_DCC_info(cdBg_extractor)); hash_table.clear(); - const std::string info_file_path = params.output_file_path() + ".json"; - dbg_info.dump_info(info_file_path); - std::cout << "\nStructural information for the de Bruijn graph is written to " << info_file_path << ".\n"; + dbg_info.dump_info(); } diff --git a/src/Unipaths_Meta_info.cpp b/src/Unipaths_Meta_info.cpp index a2a08ad7..e323c2ce 100644 --- a/src/Unipaths_Meta_info.cpp +++ b/src/Unipaths_Meta_info.cpp @@ -14,7 +14,10 @@ Unipaths_Meta_info::Unipaths_Meta_info(): kmer_count_(0), max_len_(0), min_len_(std::numeric_limits::max()), - sum_len_(0) + sum_len_(0), + dcc_count_(0), + dcc_kmer_count_(0), + dcc_sum_len_(0) {} @@ -72,6 +75,27 @@ uint64_t Unipaths_Meta_info::avg_len() const } +template +uint64_t Unipaths_Meta_info::dcc_count() const +{ + return dcc_count_; +} + + +template +uint64_t Unipaths_Meta_info::dcc_kmer_count() const +{ + return dcc_kmer_count_; +} + + +template +uint64_t Unipaths_Meta_info::dcc_sum_len() const +{ + return dcc_sum_len_; +} + + template void Unipaths_Meta_info::print() const { diff --git a/src/dBG_Info.cpp b/src/dBG_Info.cpp index 78700545..e2bc2795 100644 --- a/src/dBG_Info.cpp +++ b/src/dBG_Info.cpp @@ -2,8 +2,36 @@ #include "dBG_Info.hpp" #include "Read_CdBG_Constructor.hpp" #include "Read_CdBG_Extractor.hpp" +#include "utility.hpp" #include +#include + + +template +dBG_Info::dBG_Info(const std::string& file_path): + file_path(file_path) +{ + if(file_exists(file_path)) + load_from_file(); +} + + +template +void dBG_Info::load_from_file() +{ + std::ifstream input(file_path.c_str()); + + input >> dBg_info; + + if(input.fail()) + { + std::cerr << "Error loading JSON object from file " << file_path << ". Aborting.\n"; + std::exit(EXIT_FAILURE); + } + + input.close(); +} template @@ -30,7 +58,18 @@ void dBG_Info::add_unipaths_info(const Read_CdBG_Extractor& cdbg_extractor template -void dBG_Info::dump_info(const std::string& file_path) const +void dBG_Info::add_DCC_info(const Read_CdBG_Extractor& cdbg_extractor) +{ + const Unipaths_Meta_info& unipaths_info = cdbg_extractor.unipaths_meta_info(); + + dBg_info[dcc_field]["DCC count"] = unipaths_info.dcc_count(); + dBg_info[dcc_field]["vertex count in the DCCs"] = unipaths_info.dcc_kmer_count(); + dBg_info[dcc_field]["sum DCC length (in bases)"] = unipaths_info.dcc_sum_len(); +} + + +template +void dBG_Info::dump_info() const { std::ofstream output(file_path.c_str()); output << std::setw(4) << dBg_info << "\n"; // Pretty-print the JSON wrapper with overloaded `std::setw`. 
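The load_from_file path introduced above is what makes the JSON output cumulative across separate invocations (e.g. the unitig-extraction pass followed by the cycle-extraction pass): an existing file, if any, is re-read and only the current run's fields are added before the whole object is rewritten. A minimal sketch of that read, augment, and rewrite round trip, assuming only nlohmann::json and the standard library; the file path and the count value are made up, while the field names mirror this patch's dcc_field entries:

    #include <fstream>
    #include <iomanip>
    #include <string>
    #include "nlohmann/json.hpp"

    int main()
    {
        const std::string path = "example_info.json";  // Illustrative path only.
        nlohmann::ordered_json info;

        std::ifstream in(path);
        if(in)          // If a previous run already wrote the file, start from its contents.
            in >> info; // Assumes the existing file holds valid JSON.
        in.close();

        info["detached chordless cycles (DCC) info"]["DCC count"] = 3;  // Add only this run's section.

        std::ofstream out(path);
        out << std::setw(4) << info << "\n";  // Rewrite the whole, now augmented, object.

        return 0;
    }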
@@ -42,6 +81,8 @@ void dBG_Info::dump_info(const std::string& file_path) const } output.close(); + + std::cout << "\nStructural information for the de Bruijn graph is written to " << file_path << ".\n"; } diff --git a/src/main.cpp b/src/main.cpp index e8f94ba0..1d5ac7e4 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -36,6 +36,7 @@ void build(int argc, char** argv) ("rm", "remove the KMC database") ("mph", "minimal perfect hash (BBHash) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("buckets", "hash table buckets (cuttlefish) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) + ("json", "meta-info (JSON) file", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("dcc", "turn on optimization for post-construction extraction of DCCs (Detached Chordless Cycles)") ("cycles", "extract the detached chordless cycles of the graph") ("h,help", "print usage"); @@ -63,10 +64,13 @@ void build(int argc, char** argv) const auto working_dir = result["work_dir"].as(); const auto mph_file = result["mph"].as(); const auto buckets_file = result["buckets"].as(); + const auto json_file = result["json"].as(); const auto dcc_opt = result["dcc"].as(); const auto extract_cycles = result["cycles"].as(); - const Build_Params params(is_read_graph, refs, lists, dirs, k, kmer_database, edge_database, thread_count, output_file, format, working_dir, remove_kmc_db, mph_file, buckets_file, dcc_opt, extract_cycles); + const Build_Params params( is_read_graph, refs, lists, dirs, k, kmer_database, edge_database, thread_count, + output_file, format, working_dir, remove_kmc_db, mph_file, buckets_file, json_file, + dcc_opt, extract_cycles); if(!params.is_valid()) { std::cerr << "Invalid input configuration. Aborting.\n"; From 2fb117b135485030ed43ce8813aa54fbf81b4483 Mon Sep 17 00:00:00 2001 From: jamshed Date: Fri, 9 Jul 2021 12:33:04 -0400 Subject: [PATCH 124/350] Invert DCC-opt CLI --- src/main.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 1d5ac7e4..99fa8331 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -37,7 +37,7 @@ void build(int argc, char** argv) ("mph", "minimal perfect hash (BBHash) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("buckets", "hash table buckets (cuttlefish) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("json", "meta-info (JSON) file", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) - ("dcc", "turn on optimization for post-construction extraction of DCCs (Detached Chordless Cycles)") + ("no-dcc", "turn off optimization for post-construction extraction of DCCs (Detached Chordless Cycles)") ("cycles", "extract the detached chordless cycles of the graph") ("h,help", "print usage"); @@ -65,7 +65,7 @@ void build(int argc, char** argv) const auto mph_file = result["mph"].as(); const auto buckets_file = result["buckets"].as(); const auto json_file = result["json"].as(); - const auto dcc_opt = result["dcc"].as(); + const auto dcc_opt = !result["no-dcc"].as(); const auto extract_cycles = result["cycles"].as(); const Build_Params params( is_read_graph, refs, lists, dirs, k, kmer_database, edge_database, thread_count, From d08644e548f4903ba3492d126b1944692637823c Mon Sep 17 00:00:00 2001 From: jamshed Date: Fri, 9 Jul 2021 13:34:52 -0400 Subject: [PATCH 125/350] Lessen CLI mess Still a SNAFU --- src/Read_CdBG_Extractor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 9efd3013..b733454b 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -61,7 +61,7 @@ void Read_CdBG_Extractor::extract_maximal_unitigs() double elapsed_seconds = std::chrono::duration_cast>(t_end - t_start).count(); std::cout << "Done extracting the maximal unitigs. Time taken = " << elapsed_seconds << " seconds.\n"; - if(params.dcc_opt()) // Save the hash table buckets. + if(params.dcc_opt() && !params.buckets_file_path().empty()) // Save the hash table buckets. { // TODO: `params.buckets_file_path()` might be empty. // TODO: Rectify the CLI. From b855378cbbc41703e379ec1f1269f4ef24092a26 Mon Sep 17 00:00:00 2001 From: jamshed Date: Fri, 9 Jul 2021 16:46:12 -0400 Subject: [PATCH 126/350] Inline hash table size --- include/Kmer_Hash_Table.hpp | 7 +++++++ src/Kmer_Hash_Table.cpp | 7 ------- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index c1de1fcf..03cf2037 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -201,5 +201,12 @@ inline void Kmer_Hash_Table::update(const uint64_t bucket_id, c } +template +inline uint64_t Kmer_Hash_Table::size() const +{ + return kmer_count; +} + + #endif diff --git a/src/Kmer_Hash_Table.cpp b/src/Kmer_Hash_Table.cpp index 574b1ac4..9d7d4e89 100644 --- a/src/Kmer_Hash_Table.cpp +++ b/src/Kmer_Hash_Table.cpp @@ -159,13 +159,6 @@ void Kmer_Hash_Table::construct(const uint16_t thread_count, co } -template -uint64_t Kmer_Hash_Table::size() const -{ - return kmer_count; -} - - template void Kmer_Hash_Table::clear() { From 7d44c442669eb9a376b09ab564f0543b3ada3550 Mon Sep 17 00:00:00 2001 From: jamshed Date: Fri, 9 Jul 2021 17:10:40 -0400 Subject: [PATCH 127/350] Add basic progress tracker design --- include/Progress_Tracker.hpp | 67 ++++++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 1 + src/Progress_Tracker.cpp | 16 +++++++++ 3 files changed, 84 insertions(+) create mode 100644 include/Progress_Tracker.hpp create mode 100644 src/Progress_Tracker.cpp diff --git a/include/Progress_Tracker.hpp b/include/Progress_Tracker.hpp new file mode 100644 index 00000000..1f2bff4c --- /dev/null +++ b/include/Progress_Tracker.hpp @@ -0,0 +1,67 @@ + +#ifndef PROGRESS_TRACKER_HPP +#define PROGRESS_TRACKER_HPP + + + +#include "Spin_Lock.hpp" + +#include +#include +#include + + +// A basic class to track and display progress for some work. +class Progress_Tracker +{ +private: + + uint64_t total_work_load; // Total amount of work to be done over time. + uint64_t work_chunk_threshold; // Granularity of the provided work chunk sizes that triggers tracking updates. + + uint64_t total_work_done; // Amount of work done until now. + uint16_t percent_work_done; // Percentage of the completed workload. + std::string log_message; // Message to display at the logs. + + Spin_Lock lock; // Lock to ensure multiple threads can access the tracker safely. + +public: + + // Sets up the tracker for some task with total size `total_work_load`; updates to the tracking are to + // be triggered when some work-chunk of size at least `work_chunk_threshold` is provided to it. The + // log message to be displayed over the course of tracking is `log_message`. + void setup(uint64_t total_work_load, uint64_t work_chunk_threshold, const std::string& log_message); + + // Tracks progress made for a work-chunk of size `work_chunk_size`. 
If an update is made towards progress, + // then the chunk-size is set to 0 to refresh it for the next cycle. + // Note that the chunk-size must be at least `work_chunk_threshold` for any updates to be made towards + // the progress. All smaller chunk update requests are ignored, so repeated invocation is suggested. + void track_work(uint64_t& work_chunk_size); +}; + + +inline void Progress_Tracker::track_work(uint64_t& work_chunk_size) +{ + if(work_chunk_size >= work_chunk_threshold) + { + lock.lock(); + + total_work_done += work_chunk_size; + + const uint16_t new_percent = static_cast(std::round((total_work_done * 100.0) / total_work_load)); + if(percent_work_done < new_percent) + { + percent_work_done = new_percent; + std::cerr << "\r[" << log_message << "]\t" << percent_work_done << "%"; + } + + lock.unlock(); + + + work_chunk_size = 0; + } +} + + + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4caaffdd..35d061f6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -30,6 +30,7 @@ set(PROJECT_SRC Unipaths_Meta_info.cpp Detached_Cycles_Extractor.cpp Character_Buffer_Flusher.cpp + Progress_Tracker.cpp dBG_Info.cpp Validator.cpp Validator_Hash_Table.cpp diff --git a/src/Progress_Tracker.cpp b/src/Progress_Tracker.cpp new file mode 100644 index 00000000..b673d91d --- /dev/null +++ b/src/Progress_Tracker.cpp @@ -0,0 +1,16 @@ + +#include "Progress_Tracker.hpp" + + +void Progress_Tracker::setup(uint64_t total_work_load, uint64_t work_chunk_threshold, const std::string& log_message) +{ + this->total_work_load = total_work_load; + this->work_chunk_threshold = work_chunk_threshold; + + total_work_done = 0; + percent_work_done = 0; + + this->log_message = log_message; + + std::cerr << "\n"; +} From d51d9c54dcec3fe6319ce748ac2b5b6fef6c061e Mon Sep 17 00:00:00 2001 From: jamshed Date: Fri, 9 Jul 2021 17:21:22 -0400 Subject: [PATCH 128/350] Track progress --- include/Read_CdBG_Constructor.hpp | 3 +++ include/Read_CdBG_Extractor.hpp | 3 +++ src/Read_CdBG_Constructor.cpp | 13 ++++++++++--- src/Read_CdBG_Extractor.cpp | 13 +++++++++---- 4 files changed, 25 insertions(+), 7 deletions(-) diff --git a/include/Read_CdBG_Constructor.hpp b/include/Read_CdBG_Constructor.hpp index 0658b315..9f53c18f 100644 --- a/include/Read_CdBG_Constructor.hpp +++ b/include/Read_CdBG_Constructor.hpp @@ -12,6 +12,7 @@ #include "Thread_Pool.hpp" #include "Kmer_Container.hpp" #include "Kmer_SPMC_Iterator.hpp" +#include "Progress_Tracker.hpp" // A class to construct compacted read de Bruijn graphs. @@ -30,6 +31,8 @@ class Read_CdBG_Constructor // Members required to keep track of the total number of edges processed across different threads. mutable Spin_Lock lock; mutable uint64_t edges_processed = 0; + + Progress_Tracker progress_tracker; // Progress tracker for the DFA states computation task. // Distributes the DFA-states computation task — disperses the graph edges (i.e. (k + 1)-mers) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index c856d7f8..96d64c1b 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -12,6 +12,7 @@ #include "Async_Logger_Wrapper.hpp" #include "Output_Sink.hpp" #include "Unipaths_Meta_info.hpp" +#include "Progress_Tracker.hpp" #include #include @@ -48,6 +49,8 @@ class Read_CdBG_Extractor Unipaths_Meta_info unipaths_meta_info_; // Meta-information over the extracted maximal unitigs. + Progress_Tracker progress_tracker; // Progress tracker for the maximal unitigs extraction task. 
+ // Distributes the maximal unitigs extraction task — disperses the graph vertices (i.e. k-mers) // parsed by the parser `vertex_parser` to the worker threads in the thread pool `thread_pool`, diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index bed807f8..2145aa35 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -3,7 +3,8 @@ #include "Edge.hpp" #include "utility.hpp" -#include "chrono" +#include +#include template @@ -43,6 +44,8 @@ void Read_CdBG_Constructor::compute_DFA_states() edge_parser.launch_production(); // Launch (multi-threaded) computation of the states. + const uint64_t thread_load_percentile = static_cast(std::round((edge_count_ / 100.0) / params.thread_count())); + progress_tracker.setup(edge_count_, thread_load_percentile, "Computing DFA states"); distribute_states_computation(&edge_parser, thread_pool); // Wait for the edges to be depleted from the database. @@ -51,7 +54,7 @@ void Read_CdBG_Constructor::compute_DFA_states() // Wait for the consumer threads to finish parsing and processing the edges. thread_pool.close(); - std::cout << "Number of processed edges: " << edges_processed << "\n"; + std::cout << "\nNumber of processed edges: " << edges_processed << "\n"; if(!buckets_file_path.empty() && !params.dcc_opt()) // Save the hash table buckets, if a file path is provided. @@ -92,6 +95,7 @@ void Read_CdBG_Constructor::process_edges(Kmer_SPMC_Iterator* const ed cuttlefish::edge_encoding_t e_v_old, e_v_new; // Edges incident to some particular side of a vertex `v`, before and after the addition of a new edge. uint64_t edge_count = 0; // Number of edges processed by this thread. + uint64_t progress = 0; // Number of edges processed by the thread; is reset at reaching 1% of its approximate workload. while(edge_parser->tasks_expected(thread_id)) if(edge_parser->value_at(thread_id, e.e())) @@ -125,10 +129,13 @@ void Read_CdBG_Constructor::process_edges(Kmer_SPMC_Iterator* const ed } edge_count++; + + + progress_tracker.track_work(++progress); } lock.lock(); - std::cout << "Thread " << thread_id << " processed " << edge_count << " edges.\n"; // Temporary log. TODO: remove. + // std::cout << "Thread " << thread_id << " processed " << edge_count << " edges.\n"; edges_processed += edge_count; lock.unlock(); } diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index b733454b..261e8cc2 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -37,6 +37,8 @@ void Read_CdBG_Extractor::extract_maximal_unitigs() init_output_sink(); // Launch (multi-threaded) extraction of the maximal unitigs. + const uint64_t thread_load_percentile = static_cast(std::round((vertex_count() / 100.0) / params.thread_count())); + progress_tracker.setup(vertex_count(), thread_load_percentile, "Extracting maximal unitigs"); distribute_unipaths_extraction(&vertex_parser, thread_pool); // Wait for the vertices to be depleted from the database. @@ -48,7 +50,7 @@ void Read_CdBG_Extractor::extract_maximal_unitigs() // Close the output sink. close_output_sink(); - std::cout << "Number of scanned vertices: " << vertices_scanned << ".\n"; + std::cout << "\nNumber of scanned vertices: " << vertices_scanned << ".\n"; unipaths_meta_info_.print(); // Check for the existence of cycle(s). @@ -99,6 +101,7 @@ void Read_CdBG_Extractor::scan_vertices(Kmer_SPMC_Iterator* const vertex_p uint64_t vertex_count = 0; // Number of vertices scanned by this thread. 
Unipaths_Meta_info extracted_unipaths_info; // Meta-information over the maximal unitigs extracted by this thread. + uint64_t progress = 0; // Number of vertices scanned by the thread; is reset at reaching 1% of its approximate workload. Character_Buffer output_buffer(output_sink.sink()); // The output buffer for maximal unitigs. unipath.reserve(SEQ_SZ); @@ -127,14 +130,16 @@ void Read_CdBG_Extractor::scan_vertices(Kmer_SPMC_Iterator* const vertex_p } vertex_count++; + + + progress_tracker.track_work(++progress); } // Aggregate the meta-information over the extracted maximal unitigs and the thread-executions. lock.lock(); + // std::cout << "Thread " << thread_id << " processed " << vertex_count << " vertices.\n"; - std::cout << "Thread " << thread_id << " processed " << vertex_count << " vertices.\n"; // TODO: remove. - vertices_scanned += vertex_count; unipaths_meta_info_.aggregate(extracted_unipaths_info); @@ -212,7 +217,7 @@ const Unipaths_Meta_info& Read_CdBG_Extractor::unipaths_meta_info() const template uint64_t Read_CdBG_Extractor::vertex_count() const { - return vertices_scanned; + return hash_table.size(); } From 9d89afd848f85264286179078e4c32d228dc235f Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 21 Jul 2021 23:16:54 -0400 Subject: [PATCH 129/350] Cleanup CLI for DCC extraction mass cleaning --- include/Build_Params.hpp | 24 ++++++--- include/File_Extensions.hpp | 27 ++++++++++ include/Kmer_Hash_Table.hpp | 9 ++++ include/Read_CdBG.hpp | 12 +++++ include/Read_CdBG_Extractor.hpp | 15 +++++- include/dBG_Info.hpp | 14 +++++ src/Build_Params.cpp | 37 ++----------- src/Detached_Cycles_Extractor.cpp | 6 ++- src/Kmer_Hash_Table.cpp | 34 +++++++++--- src/Read_CdBG.cpp | 86 +++++++++++++++++++++++++++---- src/Read_CdBG_Constructor.cpp | 8 --- src/Read_CdBG_Extractor.cpp | 43 ++++++++++------ src/dBG_Info.cpp | 37 +++++++++++++ src/main.cpp | 1 + 14 files changed, 268 insertions(+), 85 deletions(-) create mode 100644 include/File_Extensions.hpp diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index d13310ae..e5afd517 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -7,6 +7,7 @@ #include "globals.hpp" #include "Reference_Input.hpp" #include "Output_Format.hpp" +#include "File_Extensions.hpp" #include #include @@ -115,13 +116,20 @@ class Build_Params } - // Returns the path to the output file. - const std::string& output_file_path() const + // Returns the path prefix for all outputs of the algorithm. + const std::string output_prefix() const { return output_file_path_; } + // Returns the path to the output file. + const std::string output_file_path() const + { + return is_read_graph() ? (output_file_path_ + cuttlefish::file_ext::unipaths_ext) : output_file_path_; + } + + // Returns the output format. cuttlefish::Output_Format output_format() const { @@ -144,23 +152,23 @@ class Build_Params // Returns the path to the optional MPH file. - const std::string& mph_file_path() const + const std::string mph_file_path() const { - return mph_file_path_; + return is_read_graph() ? (output_file_path_ + cuttlefish::file_ext::hash_ext) : mph_file_path_; } // Returns the path to the optional file storing the hash table buckets. - const std::string& buckets_file_path() const + const std::string buckets_file_path() const { - return buckets_file_path_; + return is_read_graph() ? (output_file_path_ + cuttlefish::file_ext::buckets_ext) : buckets_file_path_; } // Returns the path to the optional file storing meta-information about the graph and cuttlefish executions. 
- const std::string& json_file_path() const + const std::string json_file_path() const { - return json_file_path_; + return is_read_graph() ? (output_file_path_ + cuttlefish::file_ext::json_ext) : json_file_path_; } diff --git a/include/File_Extensions.hpp b/include/File_Extensions.hpp new file mode 100644 index 00000000..e1e576b6 --- /dev/null +++ b/include/File_Extensions.hpp @@ -0,0 +1,27 @@ + +#ifndef FILE_EXTENSIONS_HPP +#define FILE_EXTENSIONS_HPP + + + +namespace cuttlefish +{ + // File extensions for the data structures and files output by the algorithm. + namespace file_ext + { + constexpr char hash_ext[] = ".cf_hf"; + constexpr char buckets_ext[] = ".cf_hb"; + constexpr char unipaths_ext[] = ".fa"; + constexpr char json_ext[] = ".json"; + + // For reference dBGs only: + + // TODO: use these to replace the corresponding constants from `CdBG_Writer`. + constexpr char seg_ext[] = ".cf_seg"; + constexpr char seq_ext[] = ".cf_seq"; + } +} + + + +#endif diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index 03cf2037..2951fe53 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -13,6 +13,7 @@ #include "Kmer_Hash_Entry_API.hpp" #include "Spin_Lock.hpp" #include "Sparse_Lock.hpp" +#include "Build_Params.hpp" template @@ -117,6 +118,14 @@ class Kmer_Hash_Table // Loads the hash table buckets `hash_table` from the file at `file_path`. void load_hash_buckets(const std::string& file_path); + // Saves the hash table (i.e. the hash function and the buckets) into file + // paths determined from the parameters collection `params`. + void save(const Build_Params& params) const; + + // Removes the hash table files (if exists) from disk, with the file paths + // being determined from the parameters collection `params`. + void remove(const Build_Params& params) const; + // Destructs the hash table. ~Kmer_Hash_Table(); }; diff --git a/include/Read_CdBG.hpp b/include/Read_CdBG.hpp index 1f155bcb..c64e8334 100644 --- a/include/Read_CdBG.hpp +++ b/include/Read_CdBG.hpp @@ -22,6 +22,13 @@ class Read_CdBG dBG_Info dbg_info; // Wrapper object for structural information of the graph. + // Computes the states of the automata, i.e. the vertices in the graph. + void compute_DFA_states(); + + // Extracts the maximal unitigs from the graph. + void extract_maximal_unitigs(); + + public: // Constructs a `Read_CdBG` object with the parameters required for the construction of @@ -31,6 +38,11 @@ class Read_CdBG // Constructs the compacted read de Bruijn graph, employing the parameters received // with the object-constructor. void construct(); + + // Returns `true` iff the compacted de Bruijn graph to be built from the parameters + // collection `params` had been constructed in an earlier execution. + // NB: only the existence of the output meta-info file is checked for this purpose. + static bool is_constructed(const Build_Params& params); }; diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 96d64c1b..711dd915 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -21,6 +21,7 @@ // Forward declarations. template class Kmer_SPMC_Iterator; template class Thread_Pool; +template class dBG_Info; // A class to extract the vertices from a compacted de Bruin graph — which are the maximal unitigs of some ordinary de Bruijn graph. 
@@ -175,14 +176,24 @@ class Read_CdBG_Extractor void extract_maximal_unitigs(); // Extracts the chordless cycles from the de Bruijn graph that are completely disconnected from the - rest of the graph. A precondition for the algorithm is the availability of the maximal unitigs. - void extract_detached_cycles(); + rest of the graph. `dbg_info` is used to determine whether the compacted graph had been constructed + earlier — in which case some data structures are re-used from the earlier construction. + void extract_detached_cycles(const dBG_Info& dbg_info); + + // Returns the parameters collection for the compacted graph construction. + const Build_Params& get_params() const; // Returns a wrapper over the meta-information of the extracted unitigs. const Unipaths_Meta_info& unipaths_meta_info() const; // Returns the number of vertices in the underlying graph. uint64_t vertex_count() const; + + // Returns `true` iff the de Bruijn graph has DCCs (Detached Chordless Cycles). + bool has_dcc() const; + + // Returns the number of vertices present in maximal unitigs (excluding the DCCs). + uint64_t unipaths_vertex_count() const; }; diff --git a/include/dBG_Info.hpp b/include/dBG_Info.hpp index 451e1d05..3ed79f2d 100644 --- a/include/dBG_Info.hpp +++ b/include/dBG_Info.hpp @@ -12,6 +12,7 @@ // Forward declarations. template class Read_CdBG_Constructor; template class Read_CdBG_Extractor; +class Build_Params; // A class to wrap the structural information of a de Bruijn graph and some execution @@ -28,6 +29,7 @@ class dBG_Info static constexpr const char* basic_field = "basic info"; // Category header for basic graph information. static constexpr const char* contigs_field = "contigs info"; // Category header for information about the contigs (maximal unitigs). static constexpr const char* dcc_field = "detached chordless cycles (DCC) info"; // Category header for information about the DCCs. + static constexpr const char* params_field = "parameters info"; // Category header for the graph build parameters. // Loads the JSON file from disk, if the corresponding file exists. @@ -40,6 +42,9 @@ class dBG_Info // path `file_path`. dBG_Info(const std::string& file_path); + // Adds build parameters information of the Cuttlefish algorithm from `params`. + void add_build_params(const Build_Params& params); + // Adds basic graph structural information from `cdbg_constructor`. void add_basic_info(const Read_CdBG_Constructor& cdbg_constructor); @@ -51,6 +56,15 @@ class dBG_Info // Writes the JSON object to its corresponding disk-file. void dump_info() const; + + // Returns whether the graph has been recorded to contain DCCs. + bool has_dcc() const; + + // Returns whether DCC-extraction optimization has been performed on the graph. + bool dcc_opt_performed() const; + + // Returns whether the DCCs have been extracted from the graph. + bool dcc_extracted() const; }; diff --git a/src/Build_Params.cpp b/src/Build_Params.cpp index 10fe695e..8e7477c3 100644 --- a/src/Build_Params.cpp +++ b/src/Build_Params.cpp @@ -5,6 +5,8 @@ bool Build_Params::is_valid() const { + // TODO: do better — is a mess. + bool valid = true; @@ -17,39 +19,10 @@ bool Build_Params::is_valid() const valid = false; } - if(!extract_cycles_) // Construction of the compacted dBG is requested, not the detached chordless cycles extraction. + if(edge_db_path_.empty()) { - if(edge_db_path_.empty()) - { - std::cout << "The path prefix to the KMC-database for edges (i.e. 
(k + 1)-mers) is required.\n"; - valid = false; - } - } - else // Detached chordless cycles extraction is requested. - { - if(vertex_db_path_.empty()) - { - std::cout << "The path prefix to the KMC-database for vertices (i.e. k-mers) is required for the cycles' extraction.\n"; - valid = false; - } - - if(mph_file_path_.empty() || !file_exists(mph_file_path_)) - { - std::cout << "The Minimal Perfect Hash Function (MPHF) file (*.bbh) is required for the cycles' extraction.\n"; - valid = false; - } - - if(buckets_file_path_.empty() || !file_exists(buckets_file_path_)) - { - std::cout << "The hash table buckets file (*.cf) is required for the cycles' extraction.\n"; - valid = false; - } - - if(output_file_path_.empty() || !file_exists(output_file_path_)) - { - std::cout << "The output maximal unitigs file (*.fasta) is required for the cycles' extraction.\n"; - valid = false; - } + std::cout << "The path prefix to the KMC-database for edges (i.e. (k + 1)-mers) is required.\n"; + valid = false; } } else // Is a reference de Bruijn graph. diff --git a/src/Detached_Cycles_Extractor.cpp b/src/Detached_Cycles_Extractor.cpp index a09da61c..67d04ded 100644 --- a/src/Detached_Cycles_Extractor.cpp +++ b/src/Detached_Cycles_Extractor.cpp @@ -1,18 +1,20 @@ #include "Read_CdBG_Extractor.hpp" +#include "Read_CdBG.hpp" #include "Kmer_SPMC_Iterator.hpp" #include "FASTA_Record.hpp" #include "Character_Buffer.hpp" #include "Thread_Pool.hpp" +#include "dBG_Info.hpp" template -void Read_CdBG_Extractor::extract_detached_cycles() +void Read_CdBG_Extractor::extract_detached_cycles(const dBG_Info& dbg_info) { std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); - if(!params.dcc_opt()) + if(Read_CdBG::is_constructed(params) && !dbg_info.dcc_opt_performed()) { std::cout << "Marking the vertices present in the extracted maximal unitigs.\n"; mark_maximal_unitig_vertices(); diff --git a/src/Kmer_Hash_Table.cpp b/src/Kmer_Hash_Table.cpp index 9d7d4e89..9feef388 100644 --- a/src/Kmer_Hash_Table.cpp +++ b/src/Kmer_Hash_Table.cpp @@ -1,11 +1,13 @@ #include "Kmer_Hash_Table.hpp" #include "Kmer_SPMC_Iterator.hpp" +#include "utility.hpp" #include #include #include #include +#include template @@ -48,14 +50,7 @@ void Kmer_Hash_Table::build_mph_function(const uint16_t thread_ // Save the MPHF if specified. - if(!mph_file_path.empty()) - { - std::cout << "Saving the MPHF in file " << mph_file_path << ".\n"; - - save_mph_function(mph_file_path); - - std::cout << "Saved the MPHF in disk.\n"; - } + // TODO: add `--save-hash` CL-parameter in main, replacing `--mph`. } } @@ -126,6 +121,29 @@ void Kmer_Hash_Table::load_hash_buckets(const std::string& file } +template +void Kmer_Hash_Table::save(const Build_Params& params) const +{ + save_mph_function(params.mph_file_path()); + save_hash_buckets(params.buckets_file_path()); +} + + +template +void Kmer_Hash_Table::remove(const Build_Params& params) const +{ + const std::string mph_file_path = params.mph_file_path(); + const std::string buckets_file_path = params.buckets_file_path(); + + if( (file_exists(mph_file_path) && std::remove(mph_file_path.c_str()) != 0) || + (file_exists(buckets_file_path) && std::remove(buckets_file_path.c_str()) != 0)) + { + std::cerr << "Error removing the hash table files from disk. 
Aborting.\n"; + std::exit(EXIT_FAILURE); + } +} + + template void Kmer_Hash_Table::construct(const uint16_t thread_count, const std::string& working_dir_path, const std::string& mph_file_path) { diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index a613269f..cbddad40 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -10,35 +10,101 @@ template Read_CdBG::Read_CdBG(const Build_Params& params): params(params), hash_table(params.vertex_db_path()), - dbg_info(params.json_file_path().empty() ? (params.output_file_path() + ".json") : params.json_file_path()) + dbg_info(params.json_file_path()) {} template void Read_CdBG::construct() { + if(is_constructed(params) && (!dbg_info.has_dcc() || dbg_info.dcc_extracted())) + { + std::cout << "\nThe compacted de Bruijn graph has already been completely constructed earlier.\n"; + return; + } + + + dbg_info.add_build_params(params); + + std::cout << "\nConstructing the minimal perfect hash function (MPHF) over the vertex set.\n"; hash_table.construct(params.thread_count(), params.working_dir_path(), params.mph_file_path()); - std::cout << "\nComputing the DFA states.\n"; + compute_DFA_states(); + + if(!params.extract_cycles() && !params.dcc_opt()) + hash_table.save(params); + + std::cout << "\nExtracting the maximal unitigs.\n"; + extract_maximal_unitigs(); + + if(!dbg_info.has_dcc() || dbg_info.dcc_extracted()) + hash_table.remove(params); + + + hash_table.clear(); + dbg_info.dump_info(); +} + + +template +void Read_CdBG::compute_DFA_states() +{ Read_CdBG_Constructor cdBg_constructor(params, hash_table); cdBg_constructor.compute_DFA_states(); dbg_info.add_basic_info(cdBg_constructor); +} - std::cout << (!params.extract_cycles() ? - "\nExtracting the maximal unitigs.\n": "\nExtracting the detached chordless cycles.\n"); +template +void Read_CdBG::extract_maximal_unitigs() +{ Read_CdBG_Extractor cdBg_extractor(params, hash_table); - !params.extract_cycles() ? - (cdBg_extractor.extract_maximal_unitigs(), dbg_info.add_unipaths_info(cdBg_extractor)): - (cdBg_extractor.extract_detached_cycles(), dbg_info.add_DCC_info(cdBg_extractor)); - + if(!is_constructed(params)) + { + cdBg_extractor.extract_maximal_unitigs(); + + dbg_info.add_unipaths_info(cdBg_extractor); + + if(cdBg_extractor.has_dcc()) + { + if(params.extract_cycles()) + { + cdBg_extractor.extract_detached_cycles(dbg_info); + + dbg_info.add_DCC_info(cdBg_extractor); + } + else if(params.dcc_opt()) + hash_table.save(params); + } + } + else if(params.extract_cycles()) + { + if(dbg_info.has_dcc()) + { + if(!dbg_info.dcc_extracted()) + { + cdBg_extractor.extract_detached_cycles(dbg_info); + + dbg_info.add_DCC_info(cdBg_extractor); + } + else + std::cout << "\nThe DCCs (Detached Chordless Cycles) have already been extracted earlier.\n"; + } + else + std::cout << "\nThe de Bruijn graph has no DCCs (Detached Chordless Cycles).\n"; + } + else + std::cout << "\nNothing to do.\n"; +} - hash_table.clear(); - dbg_info.dump_info(); +template +bool Read_CdBG::is_constructed(const Build_Params& params) +{ + return file_exists(params.json_file_path()); } diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index 2145aa35..e0627b64 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -55,14 +55,6 @@ void Read_CdBG_Constructor::compute_DFA_states() thread_pool.close(); std::cout << "\nNumber of processed edges: " << edges_processed << "\n"; - - - if(!buckets_file_path.empty() && !params.dcc_opt()) // Save the hash table buckets, if a file path is provided. 
- { - std::cout << "Saving the hash table buckets in file " << buckets_file_path << ".\n"; - hash_table.save_hash_buckets(buckets_file_path); - std::cout << "Saved the buckets in disk.\n"; - } } diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 261e8cc2..228f2761 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -54,7 +54,7 @@ void Read_CdBG_Extractor::extract_maximal_unitigs() unipaths_meta_info_.print(); // Check for the existence of cycle(s). - if(unipaths_meta_info_.kmer_count() != vertex_container.size()) + if(has_dcc()) std::cout << "\nCycles disconnected from the rest of the graph are present." " I.e. the cycles are graph components exclusively on their own.\n\n"; @@ -62,16 +62,6 @@ void Read_CdBG_Extractor::extract_maximal_unitigs() std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); double elapsed_seconds = std::chrono::duration_cast>(t_end - t_start).count(); std::cout << "Done extracting the maximal unitigs. Time taken = " << elapsed_seconds << " seconds.\n"; - - if(params.dcc_opt() && !params.buckets_file_path().empty()) // Save the hash table buckets. - { - // TODO: `params.buckets_file_path()` might be empty. - // TODO: Rectify the CLI. - const std::string buckets_file_path = params.buckets_file_path(); - std::cout << "Saving the hash table buckets in file " << buckets_file_path << ".\n"; - hash_table.save_hash_buckets(buckets_file_path); - std::cout << "Saved the buckets in disk.\n"; - } } @@ -106,7 +96,8 @@ void Read_CdBG_Extractor::scan_vertices(Kmer_SPMC_Iterator* const vertex_p Character_Buffer output_buffer(output_sink.sink()); // The output buffer for maximal unitigs. unipath.reserve(SEQ_SZ); - if(params.dcc_opt()) + const bool mark_unipaths = params.extract_cycles() || params.dcc_opt(); + if(mark_unipaths) path_hashes.reserve(BUFF_SZ); @@ -125,7 +116,7 @@ void Read_CdBG_Extractor::scan_vertices(Kmer_SPMC_Iterator* const vertex_p output_buffer += FASTA_Record>(id, unipath); // unipath.clear(); - if(params.dcc_opt()) + if(mark_unipaths) mark_path(path_hashes); } @@ -156,10 +147,11 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const State_Read_Space state = hash_table[v.hash()].state(); // State of the vertex `v`. cuttlefish::edge_encoding_t e_v; // The next edge from `v` to include into the maximal unitig. cuttlefish::base_t b_ext; // The nucleobase corresponding to the edge `e_v` and the exiting side `s_v` from `v` to add to the literal maximal unitig. + const bool mark_unipaths = params.extract_cycles() || params.dcc_opt(); // Whether to mark the vertices present in the maximal unitigs. const Directed_Vertex init_vertex(v); init_vertex.kmer().get_label(unipath); - if(params.dcc_opt()) + if(mark_unipaths) { path_hashes.clear(); path_hashes.emplace_back(init_vertex.hash()); @@ -183,7 +175,7 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const state = hash_table[v.hash()].state(); unipath.emplace_back(Kmer::map_char(b_ext)); - if(params.dcc_opt()) + if(mark_unipaths) path_hashes.emplace_back(v.hash()); // TODO: write-out to disk in case of the size crossing some threshold, and modify `mark_path` accordingly — // would prevent unwanted memory blow-up in presence of very large maximal unitigs. 
@@ -207,6 +199,13 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const } +template +const Build_Params& Read_CdBG_Extractor::get_params() const +{ + return params; +} + + template const Unipaths_Meta_info& Read_CdBG_Extractor::unipaths_meta_info() const { @@ -221,6 +220,20 @@ uint64_t Read_CdBG_Extractor::vertex_count() const } +template +uint64_t Read_CdBG_Extractor::unipaths_vertex_count() const +{ + return unipaths_meta_info_.kmer_count(); +} + + +template +bool Read_CdBG_Extractor::has_dcc() const +{ + return unipaths_vertex_count() != vertex_count(); +} + + // Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Read_CdBG_Extractor) diff --git a/src/dBG_Info.cpp b/src/dBG_Info.cpp index e2bc2795..99a522f7 100644 --- a/src/dBG_Info.cpp +++ b/src/dBG_Info.cpp @@ -2,6 +2,7 @@ #include "dBG_Info.hpp" #include "Read_CdBG_Constructor.hpp" #include "Read_CdBG_Extractor.hpp" +#include "Build_Params.hpp" #include "utility.hpp" #include @@ -54,6 +55,11 @@ void dBG_Info::add_unipaths_info(const Read_CdBG_Extractor& cdbg_extractor dBg_info[contigs_field]["sum maximal unitig length"] = unipaths_info.sum_len(); dBg_info[contigs_field]["avg. maximal unitig length"] = unipaths_info.avg_len(); dBg_info[contigs_field]["_comment"] = "lengths are in bases"; + + const Build_Params& params = cdbg_extractor.get_params(); + dBg_info[dcc_field]["DCCs present?"] = cdbg_extractor.has_dcc(); + dBg_info[dcc_field]["DCCs extracted?"] = false; + dBg_info[dcc_field]["DCC optimization performed?"] = (params.extract_cycles() || params.dcc_opt()); } @@ -62,12 +68,22 @@ void dBG_Info::add_DCC_info(const Read_CdBG_Extractor& cdbg_extractor) { const Unipaths_Meta_info& unipaths_info = cdbg_extractor.unipaths_meta_info(); + dBg_info[dcc_field]["DCCs extracted?"] = true; dBg_info[dcc_field]["DCC count"] = unipaths_info.dcc_count(); dBg_info[dcc_field]["vertex count in the DCCs"] = unipaths_info.dcc_kmer_count(); dBg_info[dcc_field]["sum DCC length (in bases)"] = unipaths_info.dcc_sum_len(); } +template +void dBG_Info::add_build_params(const Build_Params& params) +{ + // TODO: add input files information — after major generalization of the class `Reference_Input` and KMC library integration. + dBg_info[params_field]["k"] = params.k(); + dBg_info[params_field]["output prefix"] = params.output_prefix(); +} + + template void dBG_Info::dump_info() const { @@ -86,6 +102,27 @@ void dBG_Info::dump_info() const } +template +bool dBG_Info::has_dcc() const +{ + return dBg_info[dcc_field]["DCCs present?"]; +} + + +template +bool dBG_Info::dcc_opt_performed() const +{ + return dBg_info[dcc_field]["DCC optimization performed?"]; +} + + +template +bool dBG_Info::dcc_extracted() const +{ + return dBg_info[dcc_field]["DCCs extracted?"]; +} + + // Template instantiations for the required instances. 
ENUMERATE(INSTANCE_COUNT, INSTANTIATE, dBG_Info) diff --git a/src/main.cpp b/src/main.cpp index 99fa8331..0794442d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -34,6 +34,7 @@ void build(int argc, char** argv) ("f,format", "output format (0: txt, 1: GFA 1.0, 2: GFA 2.0, 3: GFA-reduced)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::OP_FORMAT))) ("w,work_dir", "working directory", cxxopts::value()->default_value(cuttlefish::_default::WORK_DIR)) ("rm", "remove the KMC database") + // TODO: remove the following three options ("mph", "minimal perfect hash (BBHash) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("buckets", "hash table buckets (cuttlefish) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("json", "meta-info (JSON) file", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) From 5fe0ac3d4e2ed101daa6c54b17daa5ec8d8272fc Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 21 Jul 2021 23:40:23 -0400 Subject: [PATCH 130/350] Track DCC extraction progress --- src/Detached_Cycles_Extractor.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/Detached_Cycles_Extractor.cpp b/src/Detached_Cycles_Extractor.cpp index 67d04ded..bfd90101 100644 --- a/src/Detached_Cycles_Extractor.cpp +++ b/src/Detached_Cycles_Extractor.cpp @@ -50,6 +50,8 @@ void Read_CdBG_Extractor::mark_maximal_unitig_vertices() // Launch (multi-threaded) marking of the vertices present in the maximal unitigs. + const uint64_t thread_load_percentile = static_cast(std::round((vertex_count() / 100.0) / params.thread_count())); + progress_tracker.setup(vertex_count(), thread_load_percentile, "Extracting maximal unitigs"); distribute_unipaths_extraction(&vertex_parser, thread_pool); // Wait for the vertices to be depleted from the database. @@ -74,6 +76,7 @@ void Read_CdBG_Extractor::mark_maximal_unitig_vertices(Kmer_SPMC_Iterator* uint64_t vertex_count = 0; // Number of vertices scanned by this thread. uint64_t marked_count = 0; // Number of vertices marked as present in maximal unitigs by this thread. + uint64_t progress = 0; // Number of vertices scanned by the thread; is reset at reaching 1% of its approximate workload. while(vertex_parser->tasks_expected(thread_id)) if(vertex_parser->value_at(thread_id, v)) @@ -84,6 +87,9 @@ void Read_CdBG_Extractor::mark_maximal_unitig_vertices(Kmer_SPMC_Iterator* marked_count += mark_maximal_unitig(v, s_v); vertex_count++; + + + progress_tracker.track_work(++progress); } @@ -160,6 +166,8 @@ void Read_CdBG_Extractor::extract_detached_chordless_cycles() init_output_sink(); // Launch (multi-threaded) marking of the vertices present in the maximal unitigs. + const uint64_t thread_load_percentile = static_cast(std::round((vertex_count() / 100.0) / params.thread_count())); + progress_tracker.setup(vertex_count(), thread_load_percentile, "Extracting maximal unitigs"); distribute_unipaths_extraction(&vertex_parser, thread_pool); // Wait for the vertices to be depleted from the database. @@ -186,6 +194,7 @@ void Read_CdBG_Extractor::extract_detached_chordless_cycles(Kmer_SPMC_Iterato uint64_t vertex_count = 0; // Number of vertices scanned by this thread. uint64_t cycles_extracted = 0; // Number of detached chordless cycles extracted by this thread. uint64_t cycle_vertices = 0; // Number of vertices found to be in detached chordless cycles by this thread. 
+ uint64_t progress = 0; // Number of vertices scanned by the thread; is reset at reaching 1% of its approximate workload. Character_Buffer output_buffer(output_sink.sink()); // The output buffer for the cycles. cycle.reserve(SEQ_SZ); @@ -208,14 +217,14 @@ void Read_CdBG_Extractor::extract_detached_chordless_cycles(Kmer_SPMC_Iterato } vertex_count++; + + + progress_tracker.track_work(++progress); } // Aggregate the meta-information over the marked maximal unitigs and the thread-executions. lock.lock(); - - std::cout << "Thread " << thread_id << " processed " << vertex_count << " vertices," // TODO: remove. - " and extracted " << cycles_extracted << " cycles.\n"; vertices_scanned += vertex_count; From 2885a6eec2ef9b33697e3dbb3e871059590beb20 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 22 Jul 2021 11:05:45 -0400 Subject: [PATCH 131/350] Fix progress message --- src/Detached_Cycles_Extractor.cpp | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/Detached_Cycles_Extractor.cpp b/src/Detached_Cycles_Extractor.cpp index bfd90101..eb069b58 100644 --- a/src/Detached_Cycles_Extractor.cpp +++ b/src/Detached_Cycles_Extractor.cpp @@ -51,7 +51,7 @@ void Read_CdBG_Extractor::mark_maximal_unitig_vertices() // Launch (multi-threaded) marking of the vertices present in the maximal unitigs. const uint64_t thread_load_percentile = static_cast(std::round((vertex_count() / 100.0) / params.thread_count())); - progress_tracker.setup(vertex_count(), thread_load_percentile, "Extracting maximal unitigs"); + progress_tracker.setup(vertex_count(), thread_load_percentile, "Marking maximal unitigs"); distribute_unipaths_extraction(&vertex_parser, thread_pool); // Wait for the vertices to be depleted from the database. @@ -144,13 +144,6 @@ std::size_t Read_CdBG_Extractor::mark_maximal_unitig(const Kmer& v_hat, co template void Read_CdBG_Extractor::extract_detached_chordless_cycles() { - // TODO: put the information for this utility check in a meta JSON file. - // if(vertices_marked == vertices_scanned) - // { - // std::cout << "\nNo detached chordless cycle exists in the de Bruijn graph.\n"; - // return; - // } - // Construct a thread pool. const uint16_t thread_count = params.thread_count(); Thread_Pool thread_pool(thread_count, this, Thread_Pool::Task_Type::extract_cycles); @@ -167,7 +160,7 @@ void Read_CdBG_Extractor::extract_detached_chordless_cycles() // Launch (multi-threaded) marking of the vertices present in the maximal unitigs. const uint64_t thread_load_percentile = static_cast(std::round((vertex_count() / 100.0) / params.thread_count())); - progress_tracker.setup(vertex_count(), thread_load_percentile, "Extracting maximal unitigs"); + progress_tracker.setup(vertex_count(), thread_load_percentile, "Extracting detached chordless cycles"); distribute_unipaths_extraction(&vertex_parser, thread_pool); // Wait for the vertices to be depleted from the database. 
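
The progress-reporting scheme introduced across the preceding patches follows a single contract: the owning stage calls `Progress_Tracker::setup()` once with the total workload, a chunk threshold of roughly 1% of one thread's share of that workload, and a log message; each worker thread then keeps a local counter and repeatedly calls `track_work(++progress)`, which accumulates the chunk into the running total, refreshes the printed percentage, and resets the counter only once the chunk reaches the threshold. The following is a minimal usage sketch, not code from the repository; `run_stage`, `total_items`, `thread_count`, and `process_next_item()` are illustrative placeholders.

    #include "Progress_Tracker.hpp"

    #include <cmath>
    #include <cstdint>

    // Sketch of the intended call pattern (assumed names; not part of the codebase).
    void run_stage(const uint64_t total_items, const uint16_t thread_count)
    {
        Progress_Tracker tracker;

        // Per-thread chunk threshold of ~1% of a thread's share, mirroring the pattern in the patches above.
        const uint64_t threshold = static_cast<uint64_t>(std::round((total_items / 100.0) / thread_count));
        tracker.setup(total_items, threshold, "Processing items");

        // Body executed by each worker thread:
        uint64_t progress = 0;  // work done since the last posted update
        for(uint64_t i = 0; i < total_items / thread_count; ++i)
        {
            // process_next_item();  // hypothetical per-item work

            // Posts an update (and zeroes `progress`) only once the chunk crosses the threshold.
            tracker.track_work(++progress);
        }
    }
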
From 0864a99ca90414dd98e6ca6bdddc5e33b34e86de Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 25 Aug 2021 19:12:31 -0400 Subject: [PATCH 132/350] Add char-buffer tester code --- src/test.cpp | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/src/test.cpp b/src/test.cpp index 3e6e6e28..6ee33f71 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -7,6 +7,9 @@ #include "BBHash/BooPHF.h" #include "Kmer_Hasher.hpp" #include "Validator.hpp" +#include "Character_Buffer.hpp" +#include "Kmer_SPMC_Iterator.hpp" +#include "FASTA_Record.hpp" #include "kseq/kseq.h" #include "spdlog/spdlog.h" #include "spdlog/async.h" @@ -546,6 +549,58 @@ void test_iterator_correctness(const char* const db_path, const size_t consumer_ } +template +void write_kmers(const std::string& kmc_db_path, const uint16_t thread_count, const std::string& output_file_path) +{ + const Kmer_Container kmer_container(kmc_db_path); + Kmer_SPMC_Iterator parser(&kmer_container, thread_count); + + parser.launch_production(); + + std::ofstream output(output_file_path); + + std::vector> T(thread_count); + + for(size_t i = 0; i < thread_count; ++i) + { + const size_t consumer_id = i; + + T[consumer_id].reset( + new std::thread([&parser, consumer_id, &output]() + { + Kmer kmer; + std::vector str; + str.reserve(k + 2); + + uint64_t local_count{0}; + Character_Buffer<10485760, std::ofstream> buffer(output); + + while(parser.tasks_expected(consumer_id)) + if(parser.value_at(consumer_id, kmer)) + { + kmer.get_label(str); + str.emplace_back('\n'); + // buffer += str; + buffer += FASTA_Record>(0, str); + + local_count++; + if(local_count % 10000000 == 0) + std::cout << "Thread " << consumer_id << " parsed " << local_count << " k-mers\n"; + } + } + ) + ); + } + + + parser.seize_production(); + for(std::size_t id = 0; id < thread_count; ++id) + T[id]->join(); + + output.close(); +} + + int main(int argc, char** argv) { (void)argc; @@ -582,6 +637,7 @@ int main(int argc, char** argv) // test_buffered_iterator_performance(argv[1]); // test_SPMC_iterator_performance(argv[1], consumer_count); + // write_kmers<32>(argv[1], std::atoi(argv[2]), argv[3]); return 0; } From 60b437191a0a8bd52cf4e2657aeeb68f1757f138 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 25 Aug 2021 19:33:36 -0400 Subject: [PATCH 133/350] Restore iterator performance tester --- src/test.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/test.cpp b/src/test.cpp index 6ee33f71..8899bf3b 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -419,20 +419,20 @@ void test_SPMC_iterator_performance(const char* const db_path, const size_t cons std::cout << "Launched consumer " << consumer_id << ".\n"; Kmer kmer; Kmer max_kmer; - uint64_t local_count{0}; + // uint64_t local_count{0}; while(it.tasks_expected(consumer_id)) if(it.value_at(consumer_id, kmer)) { max_kmer = std::max(max_kmer, kmer); - local_count++; - if (local_count % 5000000 == 0) { - ctr += local_count; - local_count = 0; - std::cerr << "parsed " << ctr << " k-mers\n"; - } + // local_count++; + // if (local_count % 5000000 == 0) { + // ctr += local_count; + // local_count = 0; + // std::cerr << "parsed " << ctr << " k-mers\n"; + // } } - ctr += local_count; + // ctr += local_count; mk = max_kmer; } ) @@ -631,11 +631,11 @@ int main(int argc, char** argv) // count_kmers_in_unitigs(argv[1], atoi(argv[2])); - // static constexpr uint16_t k = 26; - // static const size_t consumer_count = std::atoi(argv[2]); + static constexpr uint16_t k = 28; + 
static const size_t consumer_count = std::atoi(argv[2]); // test_buffered_iterator_performance(argv[1]); - // test_SPMC_iterator_performance(argv[1], consumer_count); + test_SPMC_iterator_performance(argv[1], consumer_count); // write_kmers<32>(argv[1], std::atoi(argv[2]), argv[3]); From ed949dc9ba1342cc137ca3e020eb6a59f72db3c0 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 25 Aug 2021 19:34:19 -0400 Subject: [PATCH 134/350] Better document k-mer iterator ambiguous `prefix_idx` to `prefix`; and relevent documentation --- include/Kmer_SPMC_Iterator.hpp | 12 ++++++------ include/kmc_api/kmc_file.h | 22 +++++++++++----------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/include/Kmer_SPMC_Iterator.hpp b/include/Kmer_SPMC_Iterator.hpp index 3a3c61c0..a684daea 100644 --- a/include/Kmer_SPMC_Iterator.hpp +++ b/include/Kmer_SPMC_Iterator.hpp @@ -17,10 +17,10 @@ struct Consumer_Data { uint8_t* buffer{nullptr}; // Buffer for the raw binary k-mers. - uint64_t pref_idx; // Index of the prefix (into the in-memory KMC prefix buffer) to start parsing (and using) k-mers from. + uint64_t prefix; // The potential prefix to start parsing (and using) k-mers from (used as index into the in-memory KMC prefix-buffer). uint64_t suff_idx; // Index of the suffix (into the in-disk KMC suffix collection) to start parsing (and using) k-mers from. uint64_t kmers_available; // Number of raw suffixes present in the current buffer. - uint64_t kmers_parsed; // Number of k-mers parsed by from the current buffer. + uint64_t kmers_parsed; // Number of k-mers parsed from the current buffer. uint64_t pad_[3]; // Padding to avoid false-sharing. // TODO: use better soln: https://en.cppreference.com/w/cpp/thread/hardware_destructive_interference_size }; @@ -47,7 +47,7 @@ class Kmer_SPMC_Iterator std::unique_ptr reader{nullptr}; // The thread doing the actual disk-read of the binary data, i.e. the producer thread. - static constexpr size_t BUF_SZ_PER_CONSUMER = (1 << 24); // Size of the consumer-specific buffers (in bytes). + static constexpr size_t BUF_SZ_PER_CONSUMER = (1 << 24); // Size of the consumer-specific buffers (in bytes): 16 MB. std::vector consumer; // Parsing data required for each consumer. 
@@ -212,7 +212,7 @@ inline void Kmer_SPMC_Iterator::launch_production() { auto& consumer_state = consumer[id]; consumer_state.buffer = new uint8_t[BUF_SZ_PER_CONSUMER]; - consumer_state.pref_idx = 0; + consumer_state.prefix = 0; consumer_state.suff_idx = 0; consumer_state.kmers_available = 0; consumer_state.kmers_parsed = 0; @@ -248,7 +248,7 @@ inline void Kmer_SPMC_Iterator::read_raw_kmers() const size_t consumer_id = get_idle_consumer(); auto& consumer_state = consumer[consumer_id]; - consumer_state.pref_idx = kmer_database.curr_prefix_idx(); + consumer_state.prefix = kmer_database.curr_prefix(); consumer_state.suff_idx = kmer_database.curr_suffix_idx(); consumer_state.kmers_available = kmer_database.read_raw_suffixes(consumer_state.buffer, BUF_SZ_PER_CONSUMER); @@ -323,7 +323,7 @@ inline bool Kmer_SPMC_Iterator::value_at(const size_t consumer_id, Kmer& k return false; } - kmer_database.parse_kmer(ts.pref_idx, ts.suff_idx, ts.buffer, + kmer_database.parse_kmer(ts.prefix, ts.suff_idx, ts.buffer, ts.kmers_parsed * kmer_database.suff_record_size(), kmer); ts.kmers_parsed++; diff --git a/include/kmc_api/kmc_file.h b/include/kmc_api/kmc_file.h index 6e3ee388..64be5367 100644 --- a/include/kmc_api/kmc_file.h +++ b/include/kmc_api/kmc_file.h @@ -119,8 +119,8 @@ class CKMCFile // Returns the size of a suffix-record in disk (in bytes); i.e. suffix-size plus counter-size. uint32_t suff_record_size() const; - // Returns the current prefix's index (i.e. the next one to be parsed). - uint64_t curr_prefix_idx() const; + // Returns the current prefix (i.e. the next one to be potentially parsed). + uint64_t curr_prefix() const; // Returns the current suffix's index (i.e. the next one to be parsed). uint64_t curr_suffix_idx() const; @@ -130,10 +130,10 @@ class CKMCFile uint64_t read_raw_suffixes(uint8_t* suff_buf, size_t max_bytes_to_read); // Parses a raw binary k-mer from the `buf_idx`'th byte onwards of the buffer `suff_buf`, into - // the Cuttlefish k-mer object `kmer`. `pref_idx` and `suff_idx` are the indices of the potential - // prefix and the exact suffix record that make up the k-mer to be parsed. The indices are adjusted + // the Cuttlefish k-mer object `kmer`. `prefix` and `suff_idx` are respectively the potential + // prefix and the exact suffix record that make up the k-mer to be parsed. The values are adjusted // accordingly for the next parse operation into the buffer. - template void parse_kmer(uint64_t& pref_idx, uint64_t& suff_idx, const uint8_t* suff_buf, size_t buf_idx, Kmer& kmer) const; + template void parse_kmer(uint64_t& prefix, uint64_t& suff_idx, const uint8_t* suff_buf, size_t buf_idx, Kmer& kmer) const; // Return next kmer in CKmerAPI &kmer. Return its counter in float &count. Return true if not EOF bool ReadNextKmer(CKmerAPI &kmer, float &count); @@ -530,7 +530,7 @@ inline uint32_t CKMCFile::suff_record_size() const } -inline uint64_t CKMCFile::curr_prefix_idx() const +inline uint64_t CKMCFile::curr_prefix() const { return prefix_index; } @@ -543,21 +543,21 @@ inline uint64_t CKMCFile::curr_suffix_idx() const template -inline void CKMCFile::parse_kmer(uint64_t& pref_idx, uint64_t& suff_idx, const uint8_t* const suff_buf, size_t buf_idx, Kmer& kmer) const +inline void CKMCFile::parse_kmer(uint64_t& prefix, uint64_t& suff_idx, const uint8_t* const suff_buf, size_t buf_idx, Kmer& kmer) const { static constexpr uint16_t NUM_INTS = (k + 31) / 32; uint64_t kmc_data[NUM_INTS]{}; // Get the prefix. 
- while(suff_idx == prefix_file_buf[pref_idx + 1]) - pref_idx++; + while(suff_idx == prefix_file_buf[prefix + 1]) + prefix++; // TODO: make some of these constant class-fields, to avoid repeated calculations. const uint64_t prefix_mask = (1 << 2 * lut_prefix_length) - 1; //for kmc2 db constexpr uint8_t byte_alignment = (k % 4 != 0 ? 4 - (k % 4) : 0); - uint32_t off = (sizeof(pref_idx) * 8) - (lut_prefix_length * 2) - byte_alignment * 2; - const uint64_t temp_prefix = (pref_idx & prefix_mask) << off; // shift prefix towards MSD. "& prefix_mask" necessary for kmc2 db format + uint32_t off = (sizeof(prefix) * 8) - (lut_prefix_length * 2) - byte_alignment * 2; + const uint64_t temp_prefix = (prefix & prefix_mask) << off; // shift prefix towards MSD. "& prefix_mask" necessary for kmc2 db format // Store prefix in a KMC alignment (differs in endianness from Cuttlefish's). kmc_data[0] = temp_prefix; From eaba4579666776cc41d31d2bee894fff01e9feba Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Thu, 26 Aug 2021 15:17:23 -0400 Subject: [PATCH 135/350] initial (certainly buggy) impl of improved iterator --- include/Kmer_SPMC_Iterator.hpp | 9 +- include/kmc_api/kmc_file.h | 168 +++++++++++++++++++++++++++++++-- src/kmc_api/kmc_file.cpp | 17 ++-- 3 files changed, 179 insertions(+), 15 deletions(-) diff --git a/include/Kmer_SPMC_Iterator.hpp b/include/Kmer_SPMC_Iterator.hpp index a684daea..d084142c 100644 --- a/include/Kmer_SPMC_Iterator.hpp +++ b/include/Kmer_SPMC_Iterator.hpp @@ -21,6 +21,8 @@ struct Consumer_Data uint64_t suff_idx; // Index of the suffix (into the in-disk KMC suffix collection) to start parsing (and using) k-mers from. uint64_t kmers_available; // Number of raw suffixes present in the current buffer. uint64_t kmers_parsed; // Number of k-mers parsed from the current buffer. + std::vector> prefix_vec; + std::vector>::iterator prefix_iterator; uint64_t pad_[3]; // Padding to avoid false-sharing. // TODO: use better soln: https://en.cppreference.com/w/cpp/thread/hardware_destructive_interference_size }; @@ -251,7 +253,10 @@ inline void Kmer_SPMC_Iterator::read_raw_kmers() consumer_state.prefix = kmer_database.curr_prefix(); consumer_state.suff_idx = kmer_database.curr_suffix_idx(); - consumer_state.kmers_available = kmer_database.read_raw_suffixes(consumer_state.buffer, BUF_SZ_PER_CONSUMER); + consumer_state.kmers_available = kmer_database.read_raw_suffixes( + consumer_state.buffer, consumer_state.prefix_vec, BUF_SZ_PER_CONSUMER); + consumer_state.prefix_iterator = consumer_state.prefix_vec.begin(); + if(!consumer_state.kmers_available) { std::cerr << "Error reading the suffix file. Aborting.\n"; @@ -323,7 +328,7 @@ inline bool Kmer_SPMC_Iterator::value_at(const size_t consumer_id, Kmer& k return false; } - kmer_database.parse_kmer(ts.prefix, ts.suff_idx, ts.buffer, + kmer_database.parse_kmer_buf(ts.prefix_iterator, ts.suff_idx, ts.buffer, ts.kmers_parsed * kmer_database.suff_record_size(), kmer); ts.kmers_parsed++; diff --git a/include/kmc_api/kmc_file.h b/include/kmc_api/kmc_file.h index 64be5367..da113364 100644 --- a/include/kmc_api/kmc_file.h +++ b/include/kmc_api/kmc_file.h @@ -13,9 +13,99 @@ #include "kmer_defs.h" #include "kmer_api.h" +#include #include #include +#include + +class VirtualFileBuffer { + private: + static constexpr uint64 buffelem = 65536;//1048576; + static constexpr uint64 buffsize = buffelem * sizeof(uint64); // 8MB buffer, why not? 
+ uint64 prefix_file_buf_size; + std::array prefix_file_buf; + uint64 last_data_index; + + uint64 lut_area_size_in_bytes; + uint64 prefix_chunk_start_index; // The index where the prefix chunk currently loaded into memory starts + uint64 prefix_chunk_end_index; + + uint64 total_kmers; + FILE* fp = nullptr; + public: + VirtualFileBuffer() { + fp = nullptr; + } + + VirtualFileBuffer(VirtualFileBuffer&& o) { + prefix_file_buf_size = o.prefix_file_buf_size; + prefix_file_buf = std::move(prefix_file_buf); + last_data_index = o.last_data_index; + lut_area_size_in_bytes = o.lut_area_size_in_bytes; + prefix_chunk_start_index = o.prefix_chunk_start_index; + prefix_chunk_end_index = o.prefix_chunk_end_index; + total_kmers = o.total_kmers; + fp = o.fp; + o.fp = nullptr; + } + + VirtualFileBuffer(const VirtualFileBuffer& o) = delete; + VirtualFileBuffer& operator=(const VirtualFileBuffer& o) = delete; + VirtualFileBuffer& operator=(VirtualFileBuffer& o) = delete; + + void init(FILE*& fptr, uint64 lut_area_size_in_bytes_in, uint64 total_kmers_in) { + // here, we *take ownership* of fptr + fp = fptr; + fptr = NULL; + + // skip the first 4 bytes of header to get to the start of + // the prefixes + my_fseek(fp, +4, SEEK_CUR); + lut_area_size_in_bytes = lut_area_size_in_bytes_in; + prefix_file_buf_size = (lut_area_size_in_bytes + 8) / sizeof(uint64); //reads without 4 bytes of a header_offset (and without markers) + last_data_index = (lut_area_size_in_bytes + 8) / sizeof(uint64); + total_kmers = total_kmers_in; + + // read the minimum of whatever is left or our buffersize + auto amount_to_read = std::min((lut_area_size_in_bytes + 8), buffsize); + auto result = fread(prefix_file_buf.data(), 1, static_cast(amount_to_read), fp); + if (result == 0) { + std::cerr << "fread returned 0 in init(), should not happen\n\n"; + } + prefix_chunk_start_index = 0; + prefix_chunk_end_index = amount_to_read / sizeof(uint64); + } + + ~VirtualFileBuffer() { + if (fp) { + fclose(fp); + fp = nullptr; + } + } + + uint64 operator[](size_t index) { + if (index >= prefix_file_buf_size) { + return total_kmers + 1; + } else if (index >= prefix_chunk_end_index) { + auto current_end_byte = prefix_chunk_end_index * sizeof(uint64); + // read the minimum of whatever is left or our buffersize + auto amount_to_read = std::min(((lut_area_size_in_bytes + 8) - current_end_byte), + buffsize); + auto result = fread(prefix_file_buf.data(), 1, + static_cast(amount_to_read), fp); + if (result == 0) { + std::cerr << "fread returned 0, should not happen\n\n"; + return total_kmers + 1; + } + prefix_chunk_start_index = index; + prefix_chunk_end_index = index + (amount_to_read / sizeof(uint64)); + } + size_t rel_index = index - prefix_chunk_start_index; + auto rv = prefix_file_buf[rel_index]; + return rv; + } +}; struct CKMCFileInfo { uint32 kmer_length; @@ -44,6 +134,7 @@ class CKMCFile FILE *file_suf; uint64* prefix_file_buf; + VirtualFileBuffer prefix_virt_buf; uint64 prefix_file_buf_size; uint64 prefix_index; // The current prefix's index in an array "prefix_file_buf", readed from *.kmc_pre uint32 single_LUT_size; // The size of a single LUT (in no. of elements) @@ -127,7 +218,9 @@ class CKMCFile // Reads up-to `max_bytes_to_read` bytes worth of raw suffix records into the buffer `suff_buf`. // Returns the number of suffixes read. `0` is returned if error(s) occurred during the read. 
- uint64_t read_raw_suffixes(uint8_t* suff_buf, size_t max_bytes_to_read); + uint64_t read_raw_suffixes(uint8_t* suff_buf, + std::vector>& prefix_vec, + size_t max_bytes_to_read); // Parses a raw binary k-mer from the `buf_idx`'th byte onwards of the buffer `suff_buf`, into // the Cuttlefish k-mer object `kmer`. `prefix` and `suff_idx` are respectively the potential @@ -135,6 +228,9 @@ class CKMCFile // accordingly for the next parse operation into the buffer. template void parse_kmer(uint64_t& prefix, uint64_t& suff_idx, const uint8_t* suff_buf, size_t buf_idx, Kmer& kmer) const; + template void parse_kmer_buf(std::vector>::iterator& prefix_it, + uint64_t& suff_idx, const uint8_t* const suff_buf, size_t buf_idx, Kmer& kmer) const; + // Return next kmer in CKmerAPI &kmer. Return its counter in float &count. Return true if not EOF bool ReadNextKmer(CKmerAPI &kmer, float &count); @@ -469,27 +565,30 @@ inline bool CKMCFile::ReadNextKmer(CKmerAPI &kmer, uint64 &count) } -inline uint64_t CKMCFile::read_raw_suffixes(uint8_t* const suff_buf, const size_t max_bytes_to_read) + +inline uint64_t CKMCFile::read_raw_suffixes(uint8_t* const suff_buf, + std::vector>& prefix_vec, const size_t max_bytes_to_read) { if(is_opened != opened_for_listing) return 0; - const size_t max_suff_count = max_bytes_to_read / suff_record_size(); uint64_t suff_read_count = 0; // Count of suffixes to be read into the buffer `suff_buf`. + prefix_vec.clear(); while(!end_of_file) { - if(prefix_file_buf[prefix_index] > total_kmers) + if(prefix_virt_buf[prefix_index] > total_kmers) break; // This conditional might be removable, by fixing the last entry of `prefix_file_buf` to `total_kmers` during its initialization. // TODO: Check if setting `prefix_file_buf[last_data_index]` to `total_kmers` instead of `total_kmers + 1` (current scheme) breaks stuffs. - const uint64_t suff_id_next = (prefix_file_buf[prefix_index + 1] > total_kmers ? total_kmers : prefix_file_buf[prefix_index + 1]); + const uint64_t suff_id_next = (prefix_virt_buf[prefix_index + 1] > total_kmers ? total_kmers : prefix_virt_buf[prefix_index + 1]); // const uint64_t suff_id_next = std::min(prefix_file_buf[prefix_index + 1], total_kmers); // There are this many k-mers with the prefix `prefix_index`. 
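	// (The prefix LUT holds cumulative suffix indices, so the difference computed next is the
	// number of suffix records carrying the current prefix.)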
const uint64_t suff_to_read = suff_id_next - sufix_number; + uint64_t prev_sufix_number = sufix_number; if(suff_to_read > 0) { if(suff_read_count + suff_to_read <= max_suff_count) @@ -507,11 +606,13 @@ inline uint64_t CKMCFile::read_raw_suffixes(uint8_t* const suff_buf, const size_ break; } + // number of suffixes this prefix corrresponds to + int64_t num_suf = sufix_number - prev_sufix_number; + prefix_vec.emplace_back(prefix_index, num_suf); } prefix_index++; } - const size_t bytes_to_read = suff_read_count * suff_record_size(); const size_t bytes_read = std::fread(suff_buf, 1, bytes_to_read, file_suf); @@ -591,6 +692,61 @@ inline void CKMCFile::parse_kmer(uint64_t& prefix, uint64_t& suff_idx, const uin kmer.from_KMC_data(kmc_data); } +template +inline void CKMCFile::parse_kmer_buf( + std::vector>::iterator& prefix_it, + uint64_t& suff_idx, const uint8_t* const suff_buf, size_t buf_idx, Kmer& kmer) const +{ + static constexpr uint16_t NUM_INTS = (k + 31) / 32; + uint64_t kmc_data[NUM_INTS]{}; + + // check if we have exhausted the currrent prefix + if (prefix_it->second == 0) { + ++prefix_it; + } + prefix_it->second--; + auto prefix = prefix_it->first; + + //while(suff_idx == prefix_file_buf[prefix + 1]) + // prefix++; + + // TODO: make some of these constant class-fields, to avoid repeated calculations. + const uint64_t prefix_mask = (1 << 2 * lut_prefix_length) - 1; //for kmc2 db + constexpr uint8_t byte_alignment = (k % 4 != 0 ? 4 - (k % 4) : 0); + uint32_t off = (sizeof(prefix) * 8) - (lut_prefix_length * 2) - byte_alignment * 2; + const uint64_t temp_prefix = (prefix & prefix_mask) << off; // shift prefix towards MSD. "& prefix_mask" necessary for kmc2 db format + + // Store prefix in a KMC alignment (differs in endianness from Cuttlefish's). + kmc_data[0] = temp_prefix; + + + // Parse suffix. + uint32_t row_idx{0}; + uint64_t suff{0}; + + off -= 8; + for(uint32 a = 0; a < sufix_size; a++) + { + suff = suff_buf[buf_idx++]; + suff = suff << off; + kmc_data[row_idx] = kmc_data[row_idx] | suff; + + if(off == 0) //the end of a word in kmer_data + { + off = 56; + row_idx++; + } + else + off -= 8; + } + suff_idx++; + + // Skip counter. + // buf_idx += counter_size; + + // Parse KMC raw-binary k-mer data to Cuttlefish's k-mer format. 
+ kmer.from_KMC_data(kmc_data); +} #endif diff --git a/src/kmc_api/kmc_file.cpp b/src/kmc_api/kmc_file.cpp index ab9bbdfc..b576b455 100644 --- a/src/kmc_api/kmc_file.cpp +++ b/src/kmc_api/kmc_file.cpp @@ -36,8 +36,7 @@ bool CKMCFile::OpenForRA(const std::string &file_name) return false; ReadParamsFrom_prefix_file_buf(size); - - fclose(file_pre); + if (file_pre) { fclose(file_pre); } file_pre = NULL; if (!OpenASingleFile(file_name + ".kmc_suf", file_suf, size, (char *)"KMCS")) @@ -77,7 +76,7 @@ bool CKMCFile::OpenForListing(const std::string &file_name) return false; ReadParamsFrom_prefix_file_buf(size); - fclose(file_pre); + if (file_pre) { fclose(file_pre); } file_pre = NULL; end_of_file = total_kmers == 0; @@ -124,9 +123,9 @@ bool CKMCFile::open_for_listing_unbuffered(const std::string& file_name) if (!OpenASingleFile(file_name + ".kmc_pre", file_pre, size, (char *)"KMCP")) return false; - - ReadParamsFrom_prefix_file_buf(size); - fclose(file_pre); + + ReadParamsFrom_prefix_file_buf(size, false); + if (file_pre) { fclose(file_pre); } file_pre = NULL; end_of_file = total_kmers == 0; @@ -165,7 +164,7 @@ bool CKMCFile::read_parameters(const std::string& file_name) return false; ReadParamsFrom_prefix_file_buf(size, false); - fclose(file_pre); + if (file_pre) { fclose(file_pre); } file_pre = NULL; end_of_file = total_kmers == 0; @@ -313,9 +312,13 @@ bool CKMCFile::ReadParamsFrom_prefix_file_buf(uint64 &size, const bool load_pref prefix_file_buf[last_data_index] = total_kmers + 1; signature_map = new uint32[signature_map_size]; + result = fread(signature_map, 1, signature_map_size * sizeof(uint32), file_pre); if (result == 0) return false; + } else { + rewind(file_pre); + prefix_virt_buf.init(file_pre, lut_area_size_in_bytes, total_kmers); } sufix_size = (kmer_length - lut_prefix_length) / 4; From 55901ca36bde17f7c23dd05e4e8f5277b27a744e Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Thu, 26 Aug 2021 23:43:02 -0400 Subject: [PATCH 136/350] passing validation --- include/Kmer_SPMC_Iterator.hpp | 10 +++++++++- include/kmc_api/kmc_file.h | 19 ++++++++++++++++--- src/test.cpp | 11 ++++++++--- 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/include/Kmer_SPMC_Iterator.hpp b/include/Kmer_SPMC_Iterator.hpp index d084142c..81b7c494 100644 --- a/include/Kmer_SPMC_Iterator.hpp +++ b/include/Kmer_SPMC_Iterator.hpp @@ -218,6 +218,8 @@ inline void Kmer_SPMC_Iterator::launch_production() consumer_state.suff_idx = 0; consumer_state.kmers_available = 0; consumer_state.kmers_parsed = 0; + consumer_state.prefix_vec.clear(); + consumer_state.prefix_iterator = consumer_state.prefix_vec.begin(); task_status[id] = Task_Status::pending; } @@ -256,7 +258,7 @@ inline void Kmer_SPMC_Iterator::read_raw_kmers() consumer_state.kmers_available = kmer_database.read_raw_suffixes( consumer_state.buffer, consumer_state.prefix_vec, BUF_SZ_PER_CONSUMER); consumer_state.prefix_iterator = consumer_state.prefix_vec.begin(); - + if(!consumer_state.kmers_available) { std::cerr << "Error reading the suffix file. Aborting.\n"; @@ -324,6 +326,12 @@ inline bool Kmer_SPMC_Iterator::value_at(const size_t consumer_id, Kmer& k auto& ts = consumer[consumer_id]; if(ts.kmers_parsed == ts.kmers_available) { + /* + auto d = std::distance(ts.prefix_iterator, ts.prefix_vec.end()); + std::cerr << "pref. remaining = " + << d << ", " + << "count of last pref = " << ((d > 0) ? 
(ts.prefix_iterator->second) : 0) << "\n"; + */ task_status[consumer_id] = Task_Status::pending; return false; } diff --git a/include/kmc_api/kmc_file.h b/include/kmc_api/kmc_file.h index da113364..123dec4b 100644 --- a/include/kmc_api/kmc_file.h +++ b/include/kmc_api/kmc_file.h @@ -574,6 +574,7 @@ inline uint64_t CKMCFile::read_raw_suffixes(uint8_t* const suff_buf, const size_t max_suff_count = max_bytes_to_read / suff_record_size(); uint64_t suff_read_count = 0; // Count of suffixes to be read into the buffer `suff_buf`. + uint64_t pref_sum = 0; prefix_vec.clear(); while(!end_of_file) @@ -596,6 +597,11 @@ inline uint64_t CKMCFile::read_raw_suffixes(uint8_t* const suff_buf, suff_read_count += suff_to_read; sufix_number += suff_to_read; + // number of suffixes this prefix corrresponds to + int64_t num_suf = sufix_number - prev_sufix_number; + pref_sum += num_suf; + prefix_vec.emplace_back(prefix_index, num_suf); + if(sufix_number == total_kmers) end_of_file = true; } @@ -604,15 +610,22 @@ inline uint64_t CKMCFile::read_raw_suffixes(uint8_t* const suff_buf, sufix_number += (max_suff_count - suff_read_count); suff_read_count = max_suff_count; + // number of suffixes this prefix corrresponds to + int64_t num_suf = sufix_number - prev_sufix_number; + pref_sum += num_suf; + prefix_vec.emplace_back(prefix_index, num_suf); + break; } - // number of suffixes this prefix corrresponds to - int64_t num_suf = sufix_number - prev_sufix_number; - prefix_vec.emplace_back(prefix_index, num_suf); + } prefix_index++; } + if (pref_sum != suff_read_count) { + std::cerr << "pref_sum = " << pref_sum + << ", suff_read_count = " << suff_read_count << "\n"; + } const size_t bytes_to_read = suff_read_count * suff_record_size(); const size_t bytes_read = std::fread(suff_buf, 1, bytes_to_read, file_suf); diff --git a/src/test.cpp b/src/test.cpp index 8899bf3b..c16d13d2 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -540,6 +540,12 @@ void test_iterator_correctness(const char* const db_path, const size_t consumer_ if(!(buf_kmers[i] == spmc_kmers[i])) { // std::cout << "Mismatching k-mers found\n"; + if (mis == 0) { + std::cout << "first mismatching k-mers were:\n"; + std::cout << "buf[" << i << "] = " + << buf_kmers[i].string_label() << " != spmc[" + << i << "] = " << spmc_kmers[i].string_label() << "\n"; + } mis++; } @@ -635,9 +641,8 @@ int main(int argc, char** argv) static const size_t consumer_count = std::atoi(argv[2]); // test_buffered_iterator_performance(argv[1]); - test_SPMC_iterator_performance(argv[1], consumer_count); - + // test_SPMC_iterator_performance(argv[1], consumer_count); + test_iterator_correctness(argv[1], consumer_count); // write_kmers<32>(argv[1], std::atoi(argv[2]), argv[3]); - return 0; } From 5a8819c4a1732bdf126a790adfca68f350669e0e Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 2 Sep 2021 18:25:02 -0400 Subject: [PATCH 137/350] Update k-mer iterator --- include/Kmer_SPMC_Iterator.hpp | 48 ++---- include/kmc_api/kmc_file.h | 304 +++++++++++++++------------------ src/kmc_api/kmc_file.cpp | 55 +++--- src/test.cpp | 25 +-- 4 files changed, 187 insertions(+), 245 deletions(-) diff --git a/include/Kmer_SPMC_Iterator.hpp b/include/Kmer_SPMC_Iterator.hpp index 81b7c494..f625cca9 100644 --- a/include/Kmer_SPMC_Iterator.hpp +++ b/include/Kmer_SPMC_Iterator.hpp @@ -16,14 +16,12 @@ // Data required by the consumers to correctly parse raw binary k-mers. struct Consumer_Data { - uint8_t* buffer{nullptr}; // Buffer for the raw binary k-mers. 
- uint64_t prefix; // The potential prefix to start parsing (and using) k-mers from (used as index into the in-memory KMC prefix-buffer). - uint64_t suff_idx; // Index of the suffix (into the in-disk KMC suffix collection) to start parsing (and using) k-mers from. - uint64_t kmers_available; // Number of raw suffixes present in the current buffer. - uint64_t kmers_parsed; // Number of k-mers parsed from the current buffer. - std::vector> prefix_vec; - std::vector>::iterator prefix_iterator; - uint64_t pad_[3]; // Padding to avoid false-sharing. + uint8_t* suff_buf{nullptr}; // Buffer for the raw binary suffixes of the k-mers. + uint64_t kmers_available; // Number of k-mers present in the current buffer. + uint64_t kmers_parsed; // Number of k-mers parsed from the current buffers. + std::vector> pref_buf; // Buffer for the raw binary prefixes of the k-mers, in the form: + std::vector>::iterator pref_it; // Pointer to the prefix to start parsing k-mers from. + uint64_t pad_[1]; // Padding to avoid false-sharing. // TODO: use better soln: https://en.cppreference.com/w/cpp/thread/hardware_destructive_interference_size }; @@ -168,9 +166,8 @@ inline Kmer_SPMC_Iterator::~Kmer_SPMC_Iterator() { delete[] task_status; - for(size_t id = 0; id < consumer_count; ++id) { - delete[] consumer[id].buffer; - } + for(size_t id = 0; id < consumer_count; ++id) + delete[] consumer[id].suff_buf; std::cerr << "\nCompleted a pass over the k-mer database.\n"; } @@ -180,7 +177,7 @@ inline Kmer_SPMC_Iterator::~Kmer_SPMC_Iterator() template inline void Kmer_SPMC_Iterator::open_kmer_database(const std::string& db_path) { - if(!kmer_database.open_for_listing_unbuffered(db_path)) + if(!kmer_database.open_for_cuttlefish_listing(db_path)) { std::cerr << "Error opening k-mer database with prefix " << db_path << ". Aborting.\n"; std::exit(EXIT_FAILURE); @@ -213,13 +210,11 @@ inline void Kmer_SPMC_Iterator::launch_production() for(size_t id = 0; id < consumer_count; ++id) { auto& consumer_state = consumer[id]; - consumer_state.buffer = new uint8_t[BUF_SZ_PER_CONSUMER]; - consumer_state.prefix = 0; - consumer_state.suff_idx = 0; + consumer_state.suff_buf = new uint8_t[BUF_SZ_PER_CONSUMER]; consumer_state.kmers_available = 0; consumer_state.kmers_parsed = 0; - consumer_state.prefix_vec.clear(); - consumer_state.prefix_iterator = consumer_state.prefix_vec.begin(); + consumer_state.pref_buf.clear(); + consumer_state.pref_it = consumer_state.pref_buf.begin(); task_status[id] = Task_Status::pending; } @@ -250,15 +245,11 @@ inline void Kmer_SPMC_Iterator::read_raw_kmers() while(!kmer_database.Eof()) { const size_t consumer_id = get_idle_consumer(); - auto& consumer_state = consumer[consumer_id]; + Consumer_Data& consumer_state = consumer[consumer_id]; - consumer_state.prefix = kmer_database.curr_prefix(); - consumer_state.suff_idx = kmer_database.curr_suffix_idx(); + consumer_state.kmers_available = kmer_database.read_raw_suffixes(consumer_state.suff_buf, consumer_state.pref_buf, BUF_SZ_PER_CONSUMER); + consumer_state.pref_it = consumer_state.pref_buf.begin(); - consumer_state.kmers_available = kmer_database.read_raw_suffixes( - consumer_state.buffer, consumer_state.prefix_vec, BUF_SZ_PER_CONSUMER); - consumer_state.prefix_iterator = consumer_state.prefix_vec.begin(); - if(!consumer_state.kmers_available) { std::cerr << "Error reading the suffix file. 
Aborting.\n"; @@ -326,18 +317,11 @@ inline bool Kmer_SPMC_Iterator::value_at(const size_t consumer_id, Kmer& k auto& ts = consumer[consumer_id]; if(ts.kmers_parsed == ts.kmers_available) { - /* - auto d = std::distance(ts.prefix_iterator, ts.prefix_vec.end()); - std::cerr << "pref. remaining = " - << d << ", " - << "count of last pref = " << ((d > 0) ? (ts.prefix_iterator->second) : 0) << "\n"; - */ task_status[consumer_id] = Task_Status::pending; return false; } - kmer_database.parse_kmer_buf(ts.prefix_iterator, ts.suff_idx, ts.buffer, - ts.kmers_parsed * kmer_database.suff_record_size(), kmer); + kmer_database.parse_kmer_buf(ts.pref_it, ts.suff_buf, ts.kmers_parsed * kmer_database.suff_record_size(), kmer); ts.kmers_parsed++; return true; diff --git a/include/kmc_api/kmc_file.h b/include/kmc_api/kmc_file.h index 123dec4b..b1db7d59 100644 --- a/include/kmc_api/kmc_file.h +++ b/include/kmc_api/kmc_file.h @@ -3,6 +3,7 @@ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot + Cuttlefish support: Jamshed Khan, Rob Patro Version: 3.1.1 Date : 2019-05-19 @@ -17,95 +18,126 @@ #include #include #include +#include -class VirtualFileBuffer { - private: - static constexpr uint64 buffelem = 65536;//1048576; - static constexpr uint64 buffsize = buffelem * sizeof(uint64); // 8MB buffer, why not? - uint64 prefix_file_buf_size; - std::array prefix_file_buf; - uint64 last_data_index; +// A class to imitate the KMC3 prefix-file access as if it's in memory. +// Note: only linear indexing is supported. +class Virtual_Prefix_File +{ +private: - uint64 lut_area_size_in_bytes; - uint64 prefix_chunk_start_index; // The index where the prefix chunk currently loaded into memory starts - uint64 prefix_chunk_end_index; + static constexpr size_t buffer_elem_count = (1 << 16); // Number of prefixes to be kept in memory buffer at a time. + static constexpr size_t buffer_sz = buffer_elem_count * sizeof(uint64_t); // Size of buffer in bytes: 512KB. TODO: try small benchmarking for this size. + size_t prefix_file_elem_count; // Size of the KMC3 prefix-file (*.kmc_pre) in elements (i.e. 64-bit prefixes). + std::array prefix_file_buf; // The in-memory prefix-file buffer. - uint64 total_kmers; - FILE* fp = nullptr; - public: - VirtualFileBuffer() { - fp = nullptr; - } - - VirtualFileBuffer(VirtualFileBuffer&& o) { - prefix_file_buf_size = o.prefix_file_buf_size; - prefix_file_buf = std::move(prefix_file_buf); - last_data_index = o.last_data_index; - lut_area_size_in_bytes = o.lut_area_size_in_bytes; - prefix_chunk_start_index = o.prefix_chunk_start_index; - prefix_chunk_end_index = o.prefix_chunk_end_index; - total_kmers = o.total_kmers; - fp = o.fp; - o.fp = nullptr; - } - - VirtualFileBuffer(const VirtualFileBuffer& o) = delete; - VirtualFileBuffer& operator=(const VirtualFileBuffer& o) = delete; - VirtualFileBuffer& operator=(VirtualFileBuffer& o) = delete; - - void init(FILE*& fptr, uint64 lut_area_size_in_bytes_in, uint64 total_kmers_in) { - // here, we *take ownership* of fptr + uint64_t lut_area_size_in_bytes; // From KMC3. + size_t prefix_chunk_start_index; // The index into the prefix-file where the prefix chunk currently loaded into memory starts. + size_t prefix_chunk_end_index; // The (non-inclusive) index into the prefix-file where the prefix chunk currently loaded into memory ends. + + uint64_t total_kmers; // Total number of k-mers in the KMC3 database. + FILE* fp; // File handle to the KMC3 prefix-file. 
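	// (Invariant: indices [prefix_chunk_start_index, prefix_chunk_end_index) of the prefix-file are
	// the ones currently resident in `prefix_file_buf`; `operator[]` slides this window forward on
	// demand, which is why only (near-)sequential access is supported.)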
+ + + // Reads in as much data as possible from the prefix-file into the in-memory buffer, + // and returns the number of elements read. + size_t read_prefixes(); + + +public: + + // Constructs an empty virtual file buffer. + Virtual_Prefix_File(); + + // Invalidate move and copy constructors, and copy-assignment operators. + Virtual_Prefix_File(Virtual_Prefix_File&& rhs) = delete; + Virtual_Prefix_File(const Virtual_Prefix_File& rhs) = delete; + Virtual_Prefix_File& operator=(const Virtual_Prefix_File& rhs) = delete; + Virtual_Prefix_File& operator=(Virtual_Prefix_File& rhs) = delete; + + // Destructs the virtual file. + ~Virtual_Prefix_File(); + + // Initializes the file buffer with the file handle `fptr` that is supposed to contain + // `lut_area_bytes` amount of bytes for its prefix-content, and the associated KMC3 + // database has `kmer_count` number of k-mers. + void init(FILE*& fptr, uint64_t lut_area_bytes, uint64_t kmer_count); + + // Returns the data at index `idx` of the prefix-file. + uint64_t operator[](size_t idx); +}; + + +inline Virtual_Prefix_File::Virtual_Prefix_File(): + prefix_file_elem_count(0), + lut_area_size_in_bytes(0), + prefix_chunk_start_index(0), + prefix_chunk_end_index(0), + total_kmers(0), + fp(nullptr) +{} + + +inline Virtual_Prefix_File::~Virtual_Prefix_File() +{ + if(fp) + { + std::fclose(fp); + fp = nullptr; + } +} + + +inline void Virtual_Prefix_File::init(FILE*& fptr, const uint64_t lut_area_bytes, const uint64_t kmer_count) +{ + // *Take ownership* of `fptr`. fp = fptr; fptr = NULL; - // skip the first 4 bytes of header to get to the start of - // the prefixes + // Skip the first 4 bytes of header to get to the start of the prefixes. my_fseek(fp, +4, SEEK_CUR); - lut_area_size_in_bytes = lut_area_size_in_bytes_in; - prefix_file_buf_size = (lut_area_size_in_bytes + 8) / sizeof(uint64); //reads without 4 bytes of a header_offset (and without markers) - last_data_index = (lut_area_size_in_bytes + 8) / sizeof(uint64); - total_kmers = total_kmers_in; - - // read the minimum of whatever is left or our buffersize - auto amount_to_read = std::min((lut_area_size_in_bytes + 8), buffsize); - auto result = fread(prefix_file_buf.data(), 1, static_cast(amount_to_read), fp); - if (result == 0) { - std::cerr << "fread returned 0 in init(), should not happen\n\n"; - } + lut_area_size_in_bytes = lut_area_bytes; + prefix_file_elem_count = (lut_area_size_in_bytes + 8) / sizeof(uint64_t); // What's that extra 1 element for? KMC3 comment: reads without 4 bytes of a header_offset (and without markers) + total_kmers = kmer_count; + + // Read in some prefix-file data, and initialize the virtual indices into the prefix-file. prefix_chunk_start_index = 0; - prefix_chunk_end_index = amount_to_read / sizeof(uint64); - } + prefix_chunk_end_index = read_prefixes(); +} - ~VirtualFileBuffer() { - if (fp) { - fclose(fp); - fp = nullptr; + +inline size_t Virtual_Prefix_File::read_prefixes() +{ + const size_t elems_to_read = std::min(prefix_file_elem_count - prefix_chunk_end_index, buffer_elem_count); + const size_t bytes_to_read = elems_to_read * sizeof(uint64_t); + const size_t bytes_read = std::fread(prefix_file_buf.data(), 1, bytes_to_read, fp); + + if(bytes_read != bytes_to_read) + { + std::cerr << "Error reading the KMC database prefix file. 
Aborting.\n"; + std::exit(EXIT_FAILURE); } - } - - uint64 operator[](size_t index) { - if (index >= prefix_file_buf_size) { - return total_kmers + 1; - } else if (index >= prefix_chunk_end_index) { - auto current_end_byte = prefix_chunk_end_index * sizeof(uint64); - // read the minimum of whatever is left or our buffersize - auto amount_to_read = std::min(((lut_area_size_in_bytes + 8) - current_end_byte), - buffsize); - auto result = fread(prefix_file_buf.data(), 1, - static_cast(amount_to_read), fp); - if (result == 0) { - std::cerr << "fread returned 0, should not happen\n\n"; - return total_kmers + 1; - } - prefix_chunk_start_index = index; - prefix_chunk_end_index = index + (amount_to_read / sizeof(uint64)); + + return elems_to_read; +} + + +inline uint64_t Virtual_Prefix_File::operator[](const size_t idx) +{ + if(idx >= prefix_file_elem_count) + return total_kmers + 1; + + if(idx >= prefix_chunk_end_index) + { + prefix_chunk_start_index = idx; + prefix_chunk_end_index = idx + read_prefixes(); } - size_t rel_index = index - prefix_chunk_start_index; - auto rv = prefix_file_buf[rel_index]; - return rv; - } -}; + + return prefix_file_buf[idx - prefix_chunk_start_index]; +} + + struct CKMCFileInfo { uint32 kmer_length; @@ -134,7 +166,7 @@ class CKMCFile FILE *file_suf; uint64* prefix_file_buf; - VirtualFileBuffer prefix_virt_buf; + Virtual_Prefix_File prefix_virt_buf; // Virtual file to read over the prefix file in a buffered manner; for Cuttlefish. uint64 prefix_file_buf_size; uint64 prefix_index; // The current prefix's index in an array "prefix_file_buf", readed from *.kmc_pre uint32 single_LUT_size; // The size of a single LUT (in no. of elements) @@ -171,7 +203,7 @@ class CKMCFile bool OpenASingleFile(const std::string &file_name, FILE *&file_handler, uint64 &size, char marker[]); // Recognize current parameters. Auxiliary function. - bool ReadParamsFrom_prefix_file_buf(uint64 &size, bool load_pref_file = true); + bool ReadParamsFrom_prefix_file_buf(uint64 &size, bool load_pref_file = true, bool init_pref_buf = true); // Reload a contents of an array "sufix_file_buf" for listing mode. Auxiliary function. void Reload_sufix_file_buf(); @@ -202,7 +234,7 @@ class CKMCFile bool OpenForListing(const std::string& file_name); // Open files `*kmc_pre` & `*.kmc_suf`, read `*.kmc_pre` to RAM; `*.kmc_suf` is not buffered internally. - bool open_for_listing_unbuffered(const std::string& file_name); + bool open_for_cuttlefish_listing(const std::string& file_name); // Open files `*kmc_pre` & `*.kmc_suf`, and read KMC DB parameters to RAM. bool read_parameters(const std::string& file_name); @@ -217,19 +249,16 @@ class CKMCFile uint64_t curr_suffix_idx() const; // Reads up-to `max_bytes_to_read` bytes worth of raw suffix records into the buffer `suff_buf`. - // Returns the number of suffixes read. `0` is returned if error(s) occurred during the read. - uint64_t read_raw_suffixes(uint8_t* suff_buf, - std::vector>& prefix_vec, - size_t max_bytes_to_read); + // The prefixes corresponding to these suffixes are read into `pref_buf`, in the form + // . Returns the number of suffixes read. `0` is returned if + // error(s) occurred during the read. + uint64_t read_raw_suffixes(uint8_t* suff_buf, std::vector>& pref_buf, size_t max_bytes_to_read); - // Parses a raw binary k-mer from the `buf_idx`'th byte onwards of the buffer `suff_buf`, into - // the Cuttlefish k-mer object `kmer`. 
`prefix` and `suff_idx` are respectively the potential - // prefix and the exact suffix record that make up the k-mer to be parsed. The values are adjusted - // accordingly for the next parse operation into the buffer. - template void parse_kmer(uint64_t& prefix, uint64_t& suff_idx, const uint8_t* suff_buf, size_t buf_idx, Kmer& kmer) const; - - template void parse_kmer_buf(std::vector>::iterator& prefix_it, - uint64_t& suff_idx, const uint8_t* const suff_buf, size_t buf_idx, Kmer& kmer) const; + // Parses a raw binary k-mer from the `buf_idx`'th byte onward of the buffer `suff_buf`, into + // the Cuttlefish k-mer object `kmer`. `prefix_it` points to a pair of the form + // where "abundance" is the count of remaining k-mers to be parsed having this "prefix". The + // iterator is adjusted accordingly for the next parse operation from the buffers. + template void parse_kmer_buf(std::vector>::iterator& prefix_it, const uint8_t* suff_buf, size_t buf_idx, Kmer& kmer) const; // Return next kmer in CKmerAPI &kmer. Return its counter in float &count. Return true if not EOF bool ReadNextKmer(CKmerAPI &kmer, float &count); @@ -566,16 +595,14 @@ inline bool CKMCFile::ReadNextKmer(CKmerAPI &kmer, uint64 &count) -inline uint64_t CKMCFile::read_raw_suffixes(uint8_t* const suff_buf, - std::vector>& prefix_vec, const size_t max_bytes_to_read) +inline uint64_t CKMCFile::read_raw_suffixes(uint8_t* const suff_buf, std::vector>& pref_buf, const size_t max_bytes_to_read) { if(is_opened != opened_for_listing) return 0; const size_t max_suff_count = max_bytes_to_read / suff_record_size(); uint64_t suff_read_count = 0; // Count of suffixes to be read into the buffer `suff_buf`. - uint64_t pref_sum = 0; - prefix_vec.clear(); + pref_buf.clear(); while(!end_of_file) { @@ -589,18 +616,15 @@ inline uint64_t CKMCFile::read_raw_suffixes(uint8_t* const suff_buf, // There are this many k-mers with the prefix `prefix_index`. 
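	// (Worked example with a toy LUT [0, 2, 5, 5, 9]: prefix 1 owns suffix records 2, 3 and 4, so
	// with `prefix_index == 1` and `sufix_number == 2`, the difference computed next is 3.)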
const uint64_t suff_to_read = suff_id_next - sufix_number; - uint64_t prev_sufix_number = sufix_number; if(suff_to_read > 0) { + const uint64_t prev_sufix_number = sufix_number; + if(suff_read_count + suff_to_read <= max_suff_count) { suff_read_count += suff_to_read; sufix_number += suff_to_read; - - // number of suffixes this prefix corrresponds to - int64_t num_suf = sufix_number - prev_sufix_number; - pref_sum += num_suf; - prefix_vec.emplace_back(prefix_index, num_suf); + pref_buf.emplace_back(prefix_index, sufix_number - prev_sufix_number); if(sufix_number == total_kmers) end_of_file = true; @@ -609,11 +633,7 @@ inline uint64_t CKMCFile::read_raw_suffixes(uint8_t* const suff_buf, { sufix_number += (max_suff_count - suff_read_count); suff_read_count = max_suff_count; - - // number of suffixes this prefix corrresponds to - int64_t num_suf = sufix_number - prev_sufix_number; - pref_sum += num_suf; - prefix_vec.emplace_back(prefix_index, num_suf); + pref_buf.emplace_back(prefix_index, sufix_number - prev_sufix_number); break; } @@ -622,11 +642,7 @@ inline uint64_t CKMCFile::read_raw_suffixes(uint8_t* const suff_buf, prefix_index++; } - if (pref_sum != suff_read_count) { - std::cerr << "pref_sum = " << pref_sum - << ", suff_read_count = " << suff_read_count << "\n"; - } - + const size_t bytes_to_read = suff_read_count * suff_record_size(); const size_t bytes_read = std::fread(suff_buf, 1, bytes_to_read, file_suf); if(bytes_read != bytes_to_read) @@ -657,71 +673,18 @@ inline uint64_t CKMCFile::curr_suffix_idx() const template -inline void CKMCFile::parse_kmer(uint64_t& prefix, uint64_t& suff_idx, const uint8_t* const suff_buf, size_t buf_idx, Kmer& kmer) const +inline void CKMCFile::parse_kmer_buf(std::vector>::iterator& prefix_it, const uint8_t* const suff_buf, size_t buf_idx, Kmer& kmer) const { static constexpr uint16_t NUM_INTS = (k + 31) / 32; uint64_t kmc_data[NUM_INTS]{}; - // Get the prefix. - - while(suff_idx == prefix_file_buf[prefix + 1]) - prefix++; - - // TODO: make some of these constant class-fields, to avoid repeated calculations. - const uint64_t prefix_mask = (1 << 2 * lut_prefix_length) - 1; //for kmc2 db - constexpr uint8_t byte_alignment = (k % 4 != 0 ? 4 - (k % 4) : 0); - uint32_t off = (sizeof(prefix) * 8) - (lut_prefix_length * 2) - byte_alignment * 2; - const uint64_t temp_prefix = (prefix & prefix_mask) << off; // shift prefix towards MSD. "& prefix_mask" necessary for kmc2 db format - - // Store prefix in a KMC alignment (differs in endianness from Cuttlefish's). - kmc_data[0] = temp_prefix; - - - // Parse suffix. - uint32_t row_idx{0}; - uint64_t suff{0}; - - off -= 8; - for(uint32 a = 0; a < sufix_size; a++) - { - suff = suff_buf[buf_idx++]; - suff = suff << off; - kmc_data[row_idx] = kmc_data[row_idx] | suff; - - if(off == 0) //the end of a word in kmer_data - { - off = 56; - row_idx++; - } - else - off -= 8; - } - suff_idx++; - - // Skip counter. - // buf_idx += counter_size; - - // Parse KMC raw-binary k-mer data to Cuttlefish's k-mer format. - kmer.from_KMC_data(kmc_data); -} - -template -inline void CKMCFile::parse_kmer_buf( - std::vector>::iterator& prefix_it, - uint64_t& suff_idx, const uint8_t* const suff_buf, size_t buf_idx, Kmer& kmer) const -{ - static constexpr uint16_t NUM_INTS = (k + 31) / 32; - uint64_t kmc_data[NUM_INTS]{}; - - // check if we have exhausted the currrent prefix - if (prefix_it->second == 0) { + // Check if we have exhausted the currrent prefix. 
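	// (Each entry of the consumer's `pref_buf` is a (prefix, remaining-count) pair produced by
	// read_raw_suffixes(); the count is decremented once per parsed k-mer, and the iterator moves
	// on to the next pair once a prefix's quota is used up.)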
+ if(prefix_it->second == 0) ++prefix_it; - } + + const uint64_t prefix = prefix_it->first; prefix_it->second--; - auto prefix = prefix_it->first; - //while(suff_idx == prefix_file_buf[prefix + 1]) - // prefix++; // TODO: make some of these constant class-fields, to avoid repeated calculations. const uint64_t prefix_mask = (1 << 2 * lut_prefix_length) - 1; //for kmc2 db @@ -752,7 +715,6 @@ inline void CKMCFile::parse_kmer_buf( else off -= 8; } - suff_idx++; // Skip counter. // buf_idx += counter_size; diff --git a/src/kmc_api/kmc_file.cpp b/src/kmc_api/kmc_file.cpp index b576b455..2cba9531 100644 --- a/src/kmc_api/kmc_file.cpp +++ b/src/kmc_api/kmc_file.cpp @@ -3,6 +3,7 @@ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot + Cuttlefish support: Jamshed Khan, Rob Patro Version: 3.1.1 Date : 2019-05-19 @@ -36,8 +37,7 @@ bool CKMCFile::OpenForRA(const std::string &file_name) return false; ReadParamsFrom_prefix_file_buf(size); - if (file_pre) { fclose(file_pre); } - file_pre = NULL; + fclose(file_pre); if (!OpenASingleFile(file_name + ".kmc_suf", file_suf, size, (char *)"KMCS")) return false; @@ -76,8 +76,7 @@ bool CKMCFile::OpenForListing(const std::string &file_name) return false; ReadParamsFrom_prefix_file_buf(size); - if (file_pre) { fclose(file_pre); } - file_pre = NULL; + fclose(file_pre); end_of_file = total_kmers == 0; @@ -106,31 +105,32 @@ bool CKMCFile::OpenForListing(const std::string &file_name) } //---------------------------------------------------------------------------------- -// Open files *kmc_pre & *.kmc_suf, read *.kmc_pre to RAM, close *kmc.pre -// *.kmc_suf is not buffered +// Opens files *kmc_pre & *.kmc_suf, reads database parameters; +// starts buffering *.kmc_pre, while *.kmc_suf is buffered through the Cuttlefish-iterator's consumers. // IN : file_name - the name of kmer_counter's output // RET : true - if successful //---------------------------------------------------------------------------------- -bool CKMCFile::open_for_listing_unbuffered(const std::string& file_name) +bool CKMCFile::open_for_cuttlefish_listing(const std::string& file_name) { uint64 size; - if (is_opened) + if(is_opened) return false; - if (file_pre || file_suf) + if(file_pre || file_suf) return false; - if (!OpenASingleFile(file_name + ".kmc_pre", file_pre, size, (char *)"KMCP")) + if(!OpenASingleFile(file_name + ".kmc_pre", file_pre, size, (char *)"KMCP")) return false; ReadParamsFrom_prefix_file_buf(size, false); - if (file_pre) { fclose(file_pre); } + if(file_pre) + std::fclose(file_pre); file_pre = NULL; - end_of_file = total_kmers == 0; + end_of_file = (total_kmers == 0); - if (!OpenASingleFile(file_name + ".kmc_suf", file_suf, size, (char *)"KMCS")) + if(!OpenASingleFile(file_name + ".kmc_suf", file_suf, size, (char *)"KMCS")) return false; suffix_file_total_to_read = size; @@ -145,8 +145,8 @@ bool CKMCFile::open_for_listing_unbuffered(const std::string& file_name) } //---------------------------------------------------------------------------------- -// Open files *kmc_pre & *.kmc_suf and reads KMC DB parameters to RAM. -// *.kmc_suf is not buffered +// Opens files *kmc_pre & *.kmc_suf and reads KMC DB parameters to RAM; +// none of the files are buffered. 
// IN : file_name - the name of kmer_counter's output // RET : true - if successful //---------------------------------------------------------------------------------- @@ -154,22 +154,23 @@ bool CKMCFile::read_parameters(const std::string& file_name) { uint64 size; - if (is_opened) + if(is_opened) return false; - if (file_pre || file_suf) + if(file_pre || file_suf) return false; - if (!OpenASingleFile(file_name + ".kmc_pre", file_pre, size, (char *)"KMCP")) + if(!OpenASingleFile(file_name + ".kmc_pre", file_pre, size, (char *)"KMCP")) return false; - ReadParamsFrom_prefix_file_buf(size, false); - if (file_pre) { fclose(file_pre); } + ReadParamsFrom_prefix_file_buf(size, false, false); + if(file_pre) + std::fclose(file_pre); file_pre = NULL; - end_of_file = total_kmers == 0; + end_of_file = (total_kmers == 0); - if (!OpenASingleFile(file_name + ".kmc_suf", file_suf, size, (char *)"KMCS")) + if(!OpenASingleFile(file_name + ".kmc_suf", file_suf, size, (char *)"KMCS")) return false; suffix_file_total_to_read = size; @@ -259,7 +260,7 @@ bool CKMCFile::OpenASingleFile(const std::string &file_name, FILE *&file_handler // IN : the size of the file *.kmc_pre, without initial and terminal markers // RET : true - if succesfull //---------------------------------------------------------------------------------- -bool CKMCFile::ReadParamsFrom_prefix_file_buf(uint64 &size, const bool load_pref_file) +bool CKMCFile::ReadParamsFrom_prefix_file_buf(uint64 &size, const bool load_pref_file, const bool init_pref_buf) { size_t prev_pos = my_ftell(file_pre); my_fseek(file_pre, -12, SEEK_END); @@ -302,7 +303,7 @@ bool CKMCFile::ReadParamsFrom_prefix_file_buf(uint64 &size, const bool load_pref if(load_pref_file) { - rewind(file_pre); + std::rewind(file_pre); my_fseek(file_pre, +4, SEEK_CUR); prefix_file_buf_size = (lut_area_size_in_bytes + 8) / sizeof(uint64); //reads without 4 bytes of a header_offset (and without markers) prefix_file_buf = new uint64[prefix_file_buf_size]; @@ -316,8 +317,10 @@ bool CKMCFile::ReadParamsFrom_prefix_file_buf(uint64 &size, const bool load_pref result = fread(signature_map, 1, signature_map_size * sizeof(uint32), file_pre); if (result == 0) return false; - } else { - rewind(file_pre); + } + else if(init_pref_buf) + { + std::rewind(file_pre); prefix_virt_buf.init(file_pre, lut_area_size_in_bytes, total_kmers); } diff --git a/src/test.cpp b/src/test.cpp index c16d13d2..bb55237a 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -419,20 +419,20 @@ void test_SPMC_iterator_performance(const char* const db_path, const size_t cons std::cout << "Launched consumer " << consumer_id << ".\n"; Kmer kmer; Kmer max_kmer; - // uint64_t local_count{0}; + uint64_t local_count{0}; while(it.tasks_expected(consumer_id)) if(it.value_at(consumer_id, kmer)) { max_kmer = std::max(max_kmer, kmer); - // local_count++; - // if (local_count % 5000000 == 0) { - // ctr += local_count; - // local_count = 0; - // std::cerr << "parsed " << ctr << " k-mers\n"; - // } + local_count++; + if (local_count % 10000000 == 0) { + ctr += local_count; + local_count = 0; + std::cerr << "\rparsed " << ctr << " k-mers"; + } } - // ctr += local_count; + ctr += local_count; mk = max_kmer; } ) @@ -448,7 +448,7 @@ void test_SPMC_iterator_performance(const char* const db_path, const size_t cons //for (size_t i = 0; i < consumer_count; ++i) { // global_max = std::max(global_max, max_kmer[i]); //} - std::cout << "Parsed " << ctr << " k-mers\n"; + std::cout << "\nParsed " << ctr << " k-mers\n"; std::cout << "Max k-mer: " << 
std::max_element(max_kmer.begin(), max_kmer.end())->string_label() << "\n"; } @@ -539,13 +539,6 @@ void test_iterator_correctness(const char* const db_path, const size_t consumer_ for(size_t i = 0; i < buf_kmers.size(); ++i) if(!(buf_kmers[i] == spmc_kmers[i])) { - // std::cout << "Mismatching k-mers found\n"; - if (mis == 0) { - std::cout << "first mismatching k-mers were:\n"; - std::cout << "buf[" << i << "] = " - << buf_kmers[i].string_label() << " != spmc[" - << i << "] = " << spmc_kmers[i].string_label() << "\n"; - } mis++; } From 3b610d7983d967d3ff54e1965f140f011d067fd0 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 2 Sep 2021 19:39:14 -0400 Subject: [PATCH 138/350] Increase prefix-file buffer size --- include/kmc_api/kmc_file.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/kmc_api/kmc_file.h b/include/kmc_api/kmc_file.h index b1db7d59..f3c66c33 100644 --- a/include/kmc_api/kmc_file.h +++ b/include/kmc_api/kmc_file.h @@ -27,10 +27,9 @@ class Virtual_Prefix_File { private: - static constexpr size_t buffer_elem_count = (1 << 16); // Number of prefixes to be kept in memory buffer at a time. - static constexpr size_t buffer_sz = buffer_elem_count * sizeof(uint64_t); // Size of buffer in bytes: 512KB. TODO: try small benchmarking for this size. + static constexpr size_t buffer_elem_count = (1 << 21); // Number of prefixes to be kept in memory buffer at a time. size_t prefix_file_elem_count; // Size of the KMC3 prefix-file (*.kmc_pre) in elements (i.e. 64-bit prefixes). - std::array prefix_file_buf; // The in-memory prefix-file buffer. + std::vector prefix_file_buf; // The in-memory prefix-file buffer. uint64_t lut_area_size_in_bytes; // From KMC3. size_t prefix_chunk_start_index; // The index into the prefix-file where the prefix chunk currently loaded into memory starts. @@ -101,6 +100,9 @@ inline void Virtual_Prefix_File::init(FILE*& fptr, const uint64_t lut_area_bytes prefix_file_elem_count = (lut_area_size_in_bytes + 8) / sizeof(uint64_t); // What's that extra 1 element for? KMC3 comment: reads without 4 bytes of a header_offset (and without markers) total_kmers = kmer_count; + // Allocate the prefix-file buffer. + prefix_file_buf.reserve(buffer_elem_count); + // Read in some prefix-file data, and initialize the virtual indices into the prefix-file. prefix_chunk_start_index = 0; prefix_chunk_end_index = read_prefixes(); From d03de194ce1ba036cc232274ff55d7d286771708 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 2 Sep 2021 19:43:48 -0400 Subject: [PATCH 139/350] Separate virtual file --- include/kmc_api/Virtual_Prefix_File.hpp | 93 ++++++++++++++++++ include/kmc_api/kmc_file.h | 120 +----------------------- src/CMakeLists.txt | 1 + src/kmc_api/Virtual_Prefix_File.cpp | 44 +++++++++ 4 files changed, 139 insertions(+), 119 deletions(-) create mode 100644 include/kmc_api/Virtual_Prefix_File.hpp create mode 100644 src/kmc_api/Virtual_Prefix_File.cpp diff --git a/include/kmc_api/Virtual_Prefix_File.hpp b/include/kmc_api/Virtual_Prefix_File.hpp new file mode 100644 index 00000000..54c2a003 --- /dev/null +++ b/include/kmc_api/Virtual_Prefix_File.hpp @@ -0,0 +1,93 @@ + +#ifndef VIRTUAL_PREFIX_FILE_HPP +#define VIRTUAL_PREFIX_FILE_HPP + + + +#include +#include +#include +#include +#include + + +// A class to imitate the KMC3 prefix-file access as if it's in memory. +// Note: only linear indexing is supported. 
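// Illustrative use, as a sketch only (the variable names below are assumptions, not part of this file):
//
//   std::FILE* f = std::fopen("db.kmc_pre", "rb");     // hypothetical prefix-file handle
//   Virtual_Prefix_File vpf;
//   vpf.init(f, lut_area_size_in_bytes, total_kmers);  // takes ownership of `f` and nulls it
//   for(size_t i = 0; vpf[i] <= total_kmers; ++i)
//       consume_boundary(i, vpf[i]);                   // hypothetical consumer; `i` must be non-decreasing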
+class Virtual_Prefix_File +{ +private: + + static constexpr size_t buffer_elem_count = (1 << 21); // Number of prefixes to be kept in memory buffer at a time. + size_t prefix_file_elem_count; // Size of the KMC3 prefix-file (*.kmc_pre) in elements (i.e. 64-bit prefixes). + std::vector prefix_file_buf; // The in-memory prefix-file buffer. + + uint64_t lut_area_size_in_bytes; // From KMC3. + size_t prefix_chunk_start_index; // The index into the prefix-file where the prefix chunk currently loaded into memory starts. + size_t prefix_chunk_end_index; // The (non-inclusive) index into the prefix-file where the prefix chunk currently loaded into memory ends. + + uint64_t total_kmers; // Total number of k-mers in the KMC3 database. + std::FILE* fp; // File handle to the KMC3 prefix-file. + + + // Reads in as much data as possible from the prefix-file into the in-memory buffer, + // and returns the number of elements read. + size_t read_prefixes(); + + +public: + + // Constructs an empty virtual file buffer. + Virtual_Prefix_File(); + + // Invalidate move and copy constructors, and copy-assignment operators. + Virtual_Prefix_File(Virtual_Prefix_File&& rhs) = delete; + Virtual_Prefix_File(const Virtual_Prefix_File& rhs) = delete; + Virtual_Prefix_File& operator=(const Virtual_Prefix_File& rhs) = delete; + Virtual_Prefix_File& operator=(Virtual_Prefix_File& rhs) = delete; + + // Destructs the virtual file. + ~Virtual_Prefix_File(); + + // Initializes the file buffer with the file handle `fptr` that is supposed to contain + // `lut_area_bytes` amount of bytes for its prefix-content, and the associated KMC3 + // database has `kmer_count` number of k-mers. + void init(std::FILE*& fptr, uint64_t lut_area_bytes, uint64_t kmer_count); + + // Returns the data at index `idx` of the prefix-file. + uint64_t operator[](size_t idx); +}; + + +inline size_t Virtual_Prefix_File::read_prefixes() +{ + const size_t elems_to_read = std::min(prefix_file_elem_count - prefix_chunk_end_index, buffer_elem_count); + const size_t bytes_to_read = elems_to_read * sizeof(uint64_t); + const size_t bytes_read = std::fread(prefix_file_buf.data(), 1, bytes_to_read, fp); + + if(bytes_read != bytes_to_read) + { + std::cerr << "Error reading the KMC database prefix file. Aborting.\n"; + std::exit(EXIT_FAILURE); + } + + return elems_to_read; +} + + +inline uint64_t Virtual_Prefix_File::operator[](const size_t idx) +{ + if(idx >= prefix_file_elem_count) + return total_kmers + 1; + + if(idx >= prefix_chunk_end_index) + { + prefix_chunk_start_index = idx; + prefix_chunk_end_index = idx + read_prefixes(); + } + + return prefix_file_buf[idx - prefix_chunk_start_index]; +} + + + +#endif diff --git a/include/kmc_api/kmc_file.h b/include/kmc_api/kmc_file.h index f3c66c33..a0ad4bcf 100644 --- a/include/kmc_api/kmc_file.h +++ b/include/kmc_api/kmc_file.h @@ -14,6 +14,7 @@ #include "kmer_defs.h" #include "kmer_api.h" +#include "Virtual_Prefix_File.hpp" #include #include #include @@ -21,125 +22,6 @@ #include -// A class to imitate the KMC3 prefix-file access as if it's in memory. -// Note: only linear indexing is supported. -class Virtual_Prefix_File -{ -private: - - static constexpr size_t buffer_elem_count = (1 << 21); // Number of prefixes to be kept in memory buffer at a time. - size_t prefix_file_elem_count; // Size of the KMC3 prefix-file (*.kmc_pre) in elements (i.e. 64-bit prefixes). - std::vector prefix_file_buf; // The in-memory prefix-file buffer. - - uint64_t lut_area_size_in_bytes; // From KMC3. 
- size_t prefix_chunk_start_index; // The index into the prefix-file where the prefix chunk currently loaded into memory starts. - size_t prefix_chunk_end_index; // The (non-inclusive) index into the prefix-file where the prefix chunk currently loaded into memory ends. - - uint64_t total_kmers; // Total number of k-mers in the KMC3 database. - FILE* fp; // File handle to the KMC3 prefix-file. - - - // Reads in as much data as possible from the prefix-file into the in-memory buffer, - // and returns the number of elements read. - size_t read_prefixes(); - - -public: - - // Constructs an empty virtual file buffer. - Virtual_Prefix_File(); - - // Invalidate move and copy constructors, and copy-assignment operators. - Virtual_Prefix_File(Virtual_Prefix_File&& rhs) = delete; - Virtual_Prefix_File(const Virtual_Prefix_File& rhs) = delete; - Virtual_Prefix_File& operator=(const Virtual_Prefix_File& rhs) = delete; - Virtual_Prefix_File& operator=(Virtual_Prefix_File& rhs) = delete; - - // Destructs the virtual file. - ~Virtual_Prefix_File(); - - // Initializes the file buffer with the file handle `fptr` that is supposed to contain - // `lut_area_bytes` amount of bytes for its prefix-content, and the associated KMC3 - // database has `kmer_count` number of k-mers. - void init(FILE*& fptr, uint64_t lut_area_bytes, uint64_t kmer_count); - - // Returns the data at index `idx` of the prefix-file. - uint64_t operator[](size_t idx); -}; - - -inline Virtual_Prefix_File::Virtual_Prefix_File(): - prefix_file_elem_count(0), - lut_area_size_in_bytes(0), - prefix_chunk_start_index(0), - prefix_chunk_end_index(0), - total_kmers(0), - fp(nullptr) -{} - - -inline Virtual_Prefix_File::~Virtual_Prefix_File() -{ - if(fp) - { - std::fclose(fp); - fp = nullptr; - } -} - - -inline void Virtual_Prefix_File::init(FILE*& fptr, const uint64_t lut_area_bytes, const uint64_t kmer_count) -{ - // *Take ownership* of `fptr`. - fp = fptr; - fptr = NULL; - - // Skip the first 4 bytes of header to get to the start of the prefixes. - my_fseek(fp, +4, SEEK_CUR); - lut_area_size_in_bytes = lut_area_bytes; - prefix_file_elem_count = (lut_area_size_in_bytes + 8) / sizeof(uint64_t); // What's that extra 1 element for? KMC3 comment: reads without 4 bytes of a header_offset (and without markers) - total_kmers = kmer_count; - - // Allocate the prefix-file buffer. - prefix_file_buf.reserve(buffer_elem_count); - - // Read in some prefix-file data, and initialize the virtual indices into the prefix-file. - prefix_chunk_start_index = 0; - prefix_chunk_end_index = read_prefixes(); -} - - -inline size_t Virtual_Prefix_File::read_prefixes() -{ - const size_t elems_to_read = std::min(prefix_file_elem_count - prefix_chunk_end_index, buffer_elem_count); - const size_t bytes_to_read = elems_to_read * sizeof(uint64_t); - const size_t bytes_read = std::fread(prefix_file_buf.data(), 1, bytes_to_read, fp); - - if(bytes_read != bytes_to_read) - { - std::cerr << "Error reading the KMC database prefix file. 
Aborting.\n"; - std::exit(EXIT_FAILURE); - } - - return elems_to_read; -} - - -inline uint64_t Virtual_Prefix_File::operator[](const size_t idx) -{ - if(idx >= prefix_file_elem_count) - return total_kmers + 1; - - if(idx >= prefix_chunk_end_index) - { - prefix_chunk_start_index = idx; - prefix_chunk_end_index = idx + read_prefixes(); - } - - return prefix_file_buf[idx - prefix_chunk_start_index]; -} - - struct CKMCFileInfo { uint32 kmer_length; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 35d061f6..20892383 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -4,6 +4,7 @@ set(PROJECT_SRC kmc_api/kmc_file.cpp kmc_api/kmer_api.cpp kmc_api/mmer.cpp + kmc_api/Virtual_Prefix_File.cpp xxHash/xxhash.c xxHash/xxhsum.c Build_Params.cpp diff --git a/src/kmc_api/Virtual_Prefix_File.cpp b/src/kmc_api/Virtual_Prefix_File.cpp new file mode 100644 index 00000000..2a7d75da --- /dev/null +++ b/src/kmc_api/Virtual_Prefix_File.cpp @@ -0,0 +1,44 @@ + +#include "kmc_api/Virtual_Prefix_File.hpp" +#include "kmc_api/kmer_defs.h" + + +Virtual_Prefix_File::Virtual_Prefix_File(): + prefix_file_elem_count(0), + lut_area_size_in_bytes(0), + prefix_chunk_start_index(0), + prefix_chunk_end_index(0), + total_kmers(0), + fp(nullptr) +{} + + +Virtual_Prefix_File::~Virtual_Prefix_File() +{ + if(fp) + { + std::fclose(fp); + fp = nullptr; + } +} + + +void Virtual_Prefix_File::init(std::FILE*& fptr, const uint64_t lut_area_bytes, const uint64_t kmer_count) +{ + // *Take ownership* of `fptr`. + fp = fptr; + fptr = NULL; + + // Skip the first 4 bytes of header to get to the start of the prefixes. + my_fseek(fp, +4, SEEK_CUR); + lut_area_size_in_bytes = lut_area_bytes; + prefix_file_elem_count = (lut_area_size_in_bytes + 8) / sizeof(uint64_t); // What's that extra 1 element for? KMC3 comment: reads without 4 bytes of a header_offset (and without markers) + total_kmers = kmer_count; + + // Allocate the prefix-file buffer. + prefix_file_buf.reserve(buffer_elem_count); + + // Read in some prefix-file data, and initialize the virtual indices into the prefix-file. + prefix_chunk_start_index = 0; + prefix_chunk_end_index = read_prefixes(); +} From da4e9b16aa634a3d39b93cc76513dcd2728c0361 Mon Sep 17 00:00:00 2001 From: jamshed Date: Fri, 3 Sep 2021 13:40:07 -0400 Subject: [PATCH 140/350] Better jemalloc linking --- CMakeLists.txt | 49 +++++++++++++++++++++++----------------------- src/CMakeLists.txt | 2 +- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8621aa15..600b7259 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,32 +53,33 @@ set(THREADS_PREFER_PTHREAD_FLAG TRUE) # The BBHash library uses `pthread`. # the `kseq` library to gzipped compressed files. find_package(ZLIB REQUIRED) + +# Module required to download and install external projects. +include(ExternalProject) +set(EXT_LIB ${CMAKE_SOURCE_DIR}/external/lib/) +set(EXT_INCLUDE ${CMAKE_SOURCE_DIR}/external/include/) + + # Search and load setting for the `jemalloc` library. It provides scalable concurrency support # and better avoidance of fragmentation. 
-set(FAST_MALLOC_LIB "") -find_package(jemalloc) -if(jemalloc_FOUND) - message("Found the Jemalloc library") - set(FAST_MALLOC_LIB ${JEMALLOC_LIBRARIES}) -else() - message("Build system is fetching and installing jemalloc") - - include(ExternalProject) - - ExternalProject_Add(libjemalloc - DOWNLOAD_DIR ${CMAKE_SOURCE_DIR}/external - DOWNLOAD_COMMAND curl -k -L https://github.com/jemalloc/jemalloc/archive/5.2.1.tar.gz -o jemalloc-5.2.1.tar.gz - && tar -xzf jemalloc-5.2.1.tar.gz - - SOURCE_DIR ${CMAKE_SOURCE_DIR}/external/jemalloc-5.2.1 - BUILD_IN_SOURCE TRUE - INSTALL_DIR ${CMAKE_SOURCE_DIR}/external/install - CONFIGURE_COMMAND sh -c "CC=${CMAKE_C_COMPILER} ./autogen.sh --disable-debug --enable-static --prefix= --silent" - INSTALL_COMMAND cp -r lib / && cp -r include / - ) - - set(FAST_MALLOC_LIB ${CMAKE_SOURCE_DIR}/external/install/lib/libjemalloc.a) -endif() +message("Build system will fetch and install jemalloc") +ExternalProject_Add(prj_jemalloc + DOWNLOAD_DIR ${CMAKE_SOURCE_DIR}/external + DOWNLOAD_COMMAND curl -k -L https://github.com/jemalloc/jemalloc/archive/5.2.1.tar.gz -o jemalloc-5.2.1.tar.gz + && tar -xzf jemalloc-5.2.1.tar.gz + && rm jemalloc-5.2.1.tar.gz + + SOURCE_DIR ${CMAKE_SOURCE_DIR}/external/jemalloc-5.2.1 + BUILD_IN_SOURCE TRUE + INSTALL_DIR ${CMAKE_SOURCE_DIR}/external + CONFIGURE_COMMAND sh -c "CC=${CMAKE_C_COMPILER} ./autogen.sh --disable-debug --enable-static --prefix= --silent" + INSTALL_COMMAND cp -r lib ${EXT_LIB} && + cp -r include ${EXT_INCLUDE} +) + +add_library(jemalloc STATIC IMPORTED) +set_target_properties(jemalloc PROPERTIES IMPORTED_LOCATION ${EXT_LIB}/libjemalloc.a) +add_dependencies(jemalloc prj_jemalloc) # The `Debug` configuration optimizes the program for debugging and enables full debug information. diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 20892383..37cf93a1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -65,7 +65,7 @@ add_executable(${PROJECT_NAME} main.cpp) # Link the core library to the `jemalloc` library, for better `malloc` support. -target_link_libraries(core PRIVATE ${FAST_MALLOC_LIB}) +target_link_libraries(core PRIVATE jemalloc) # Link the core library to the `dl` library, required in using dynamic shared object. # Needed by `jemalloc`. 
From 060ec8ecf777a445542aa62106868384d1acdce9 Mon Sep 17 00:00:00 2001 From: jamshed Date: Fri, 3 Sep 2021 14:42:10 -0400 Subject: [PATCH 141/350] Match (my) practice --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 600b7259..00cf60a0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,9 +65,9 @@ set(EXT_INCLUDE ${CMAKE_SOURCE_DIR}/external/include/) message("Build system will fetch and install jemalloc") ExternalProject_Add(prj_jemalloc DOWNLOAD_DIR ${CMAKE_SOURCE_DIR}/external - DOWNLOAD_COMMAND curl -k -L https://github.com/jemalloc/jemalloc/archive/5.2.1.tar.gz -o jemalloc-5.2.1.tar.gz - && tar -xzf jemalloc-5.2.1.tar.gz - && rm jemalloc-5.2.1.tar.gz + DOWNLOAD_COMMAND curl -k -L https://github.com/jemalloc/jemalloc/archive/5.2.1.tar.gz -o jemalloc-5.2.1.tar.gz && + tar -xzf jemalloc-5.2.1.tar.gz && + rm jemalloc-5.2.1.tar.gz SOURCE_DIR ${CMAKE_SOURCE_DIR}/external/jemalloc-5.2.1 BUILD_IN_SOURCE TRUE From 0fa7147cd0a7a41abea8ed4d212cc281db6cdbd6 Mon Sep 17 00:00:00 2001 From: jamshed Date: Fri, 3 Sep 2021 14:50:55 -0400 Subject: [PATCH 142/350] Better indent ExtPrj_Add --- CMakeLists.txt | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 00cf60a0..d1a469b0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,21 +60,21 @@ set(EXT_LIB ${CMAKE_SOURCE_DIR}/external/lib/) set(EXT_INCLUDE ${CMAKE_SOURCE_DIR}/external/include/) -# Search and load setting for the `jemalloc` library. It provides scalable concurrency support -# and better avoidance of fragmentation. +# Prepare the `jemalloc` library. It provides scalable concurrency support and better avoidance +# of fragmentation. message("Build system will fetch and install jemalloc") ExternalProject_Add(prj_jemalloc - DOWNLOAD_DIR ${CMAKE_SOURCE_DIR}/external + DOWNLOAD_DIR ${CMAKE_SOURCE_DIR}/external DOWNLOAD_COMMAND curl -k -L https://github.com/jemalloc/jemalloc/archive/5.2.1.tar.gz -o jemalloc-5.2.1.tar.gz && tar -xzf jemalloc-5.2.1.tar.gz && rm jemalloc-5.2.1.tar.gz - SOURCE_DIR ${CMAKE_SOURCE_DIR}/external/jemalloc-5.2.1 - BUILD_IN_SOURCE TRUE - INSTALL_DIR ${CMAKE_SOURCE_DIR}/external - CONFIGURE_COMMAND sh -c "CC=${CMAKE_C_COMPILER} ./autogen.sh --disable-debug --enable-static --prefix= --silent" - INSTALL_COMMAND cp -r lib ${EXT_LIB} && - cp -r include ${EXT_INCLUDE} + SOURCE_DIR ${CMAKE_SOURCE_DIR}/external/jemalloc-5.2.1 + BUILD_IN_SOURCE TRUE + INSTALL_DIR ${CMAKE_SOURCE_DIR}/external + CONFIGURE_COMMAND sh -c "CC=${CMAKE_C_COMPILER} ./autogen.sh --disable-debug --enable-static --prefix= --silent" + INSTALL_COMMAND cp -r lib ${EXT_LIB} && + cp -r include ${EXT_INCLUDE} ) add_library(jemalloc STATIC IMPORTED) From df0f26627f3f310d0b08aa5b68e940a66a457ff0 Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 6 Sep 2021 23:51:42 -0400 Subject: [PATCH 143/350] Draft link kmc lib --- CMakeLists.txt | 40 ++++++++++++++++++++++++++++++++++++++-- src/CMakeLists.txt | 14 ++++++++++---- 2 files changed, 48 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d1a469b0..27b98065 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,8 +50,18 @@ find_package(Threads REQUIRED) # The threads package is required for the BBHash set(THREADS_PREFER_PTHREAD_FLAG TRUE) # The BBHash library uses `pthread`. # Search and load setting for the `zlib` library. The library is required to seamlessly adapt -# the `kseq` library to gzipped compressed files. 
-find_package(ZLIB REQUIRED) +# the `kseq` and the `kmc` libraries to gzip-compressed files. +include(FindZLIB) +if(NOT ZLIB_FOUND) + message(FATAL_ERROR "zlib (https://zlib.net/) is required. Aborting.") +endif() + +# Search and load setting for the `bzip2` library. It is required to seamlessly adapt the +# `kmc` library to bzip-compressed files. +include(FindBZip2) +if(NOT BZIP2_FOUND) + message(FATAL_ERROR "bzip2 (https://sourceware.org/bzip2/) is required. Aborting.") +endif() # Module required to download and install external projects. @@ -82,6 +92,32 @@ set_target_properties(jemalloc PROPERTIES IMPORTED_LOCATION ${EXT_LIB}/libjemall add_dependencies(jemalloc prj_jemalloc) +# Prepare the `kmc` library — required by the Cuttlefish algorithm implementation. +message("Build system will fetch and install KMC3") +ExternalProject_Add(prj_kmc + DOWNLOAD_DIR ${CMAKE_SOURCE_DIR}/external + # DOWNLOAD_COMMAND curl -k -L https://github.com/refresh-bio/KMC/archive/refs/heads/master.zip -o KMC-master.zip && + # unzip -qq KMC-master.zip && + # rm KMC-master.zip + DOWNLOAD_COMMAND git clone https://github.com/refresh-bio/kmc && + cd kmc && + git checkout f36cf7ca452c08e5fbb694bde3ee7d430f7bdb7a + + # SOURCE_DIR ${CMAKE_SOURCE_DIR}/external/KMC-master + SOURCE_DIR ${CMAKE_SOURCE_DIR}/external/kmc + BUILD_IN_SOURCE TRUE + INSTALL_DIR ${CMAKE_SOURCE_DIR}/external/ + CONFIGURE_COMMAND "" + BUILD_COMMAND make -j16 + INSTALL_COMMAND cp bin/libkmc_core.a ${EXT_LIB} && + cp include/kmc_runner.h ${EXT_INCLUDE} +) + +add_library(kmc STATIC IMPORTED) +set_target_properties(kmc PROPERTIES IMPORTED_LOCATION ${EXT_LIB}/libkmc_core.a) +add_dependencies(kmc prj_kmc) + + # The `Debug` configuration optimizes the program for debugging and enables full debug information. # The `Release` configuration enables most compiler optimizations for speed and defines `NDEBUG` # (No Debug) which will remove all traces of the standard library assert calls. diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 37cf93a1..7a69e7b0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -50,8 +50,8 @@ add_library(core STATIC ${PROJECT_SRC}) # This controls the `-I` (include paths) switch of the compiler invocation. # Since `include` is declared `PUBLIC` for the library, it is propagated to all the # consumers of the library. -set(INCLUDE_DIR include) -target_include_directories(core PUBLIC ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}) +set(INCLUDE_DIR ${CMAKE_SOURCE_DIR}/include) +target_include_directories(core PUBLIC ${INCLUDE_DIR} ${EXT_INCLUDE}) # Specify the warnings and the extra optimization flags to the compiler for the target library. @@ -71,12 +71,18 @@ target_link_libraries(core PRIVATE jemalloc) # Needed by `jemalloc`. target_link_libraries(core PRIVATE ${CMAKE_DL_LIBS}) -# Link the core library to the threads package in the platform. -target_link_libraries(core PRIVATE ${CMAKE_THREAD_LIBS_INIT}) +# Link the core library to the `kmc` library. +target_link_libraries(core PRIVATE kmc) # Link the core library to the `zlib` library. target_link_libraries(core PRIVATE ZLIB::ZLIB) +# Link the core library ti the `bzip2` library. +target_link_libraries(core PRIVATE BZip2::BZip2) + +# Link the core library to the threads package in the platform. +target_link_libraries(core PRIVATE ${CMAKE_THREAD_LIBS_INIT}) + # Link the executable to the required libraries. 
target_link_libraries(${PROJECT_NAME} PRIVATE core) From 31b0e5aca187ad5471dbc4c1001e19f691799b52 Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 7 Sep 2021 18:22:55 -0400 Subject: [PATCH 144/350] Fetch latest KMC --- CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 27b98065..6216de42 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -100,8 +100,7 @@ ExternalProject_Add(prj_kmc # unzip -qq KMC-master.zip && # rm KMC-master.zip DOWNLOAD_COMMAND git clone https://github.com/refresh-bio/kmc && - cd kmc && - git checkout f36cf7ca452c08e5fbb694bde3ee7d430f7bdb7a + cd kmc # SOURCE_DIR ${CMAKE_SOURCE_DIR}/external/KMC-master SOURCE_DIR ${CMAKE_SOURCE_DIR}/external/kmc From 25bc1535e9d0d4ffdbecfb188944d6fc85d7a44c Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 8 Sep 2021 17:05:28 -0400 Subject: [PATCH 145/350] Rename stuff for general purpose usage reference -> sequence --- include/Build_Params.hpp | 18 ++++---- include/{Parser.hpp => Ref_Parser.hpp} | 13 ++++-- include/Reference_Input.hpp | 62 ------------------------- include/Sequence_Input.hpp | 63 ++++++++++++++++++++++++++ include/Validation_Params.hpp | 6 +-- src/Build_Params.cpp | 12 ++--- src/CMakeLists.txt | 2 +- src/CdBG_Builder.cpp | 4 +- src/CdBG_Writer.cpp | 14 +++--- src/{Parser.cpp => Ref_Parser.cpp} | 37 +++++++++------ src/Sequence_Validator.cpp | 4 +- 11 files changed, 124 insertions(+), 111 deletions(-) rename include/{Parser.hpp => Ref_Parser.hpp} (88%) delete mode 100644 include/Reference_Input.hpp create mode 100644 include/Sequence_Input.hpp rename src/{Parser.cpp => Ref_Parser.cpp} (78%) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index e5afd517..941104d5 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -5,7 +5,7 @@ #include "globals.hpp" -#include "Reference_Input.hpp" +#include "Sequence_Input.hpp" #include "Output_Format.hpp" #include "File_Extensions.hpp" @@ -19,8 +19,8 @@ class Build_Params { private: - const bool is_read_graph_; // Whether to build a read- or a reference-compacted de Bruijn graph. - const Reference_Input reference_input_; // Collection of the input references. + const bool is_read_graph_; // Whether to build a compacted read or reference de Bruijn graph. + const Sequence_Input seq_input_; // Collection of the input sequences. const uint16_t k_; // The k parameter for the edge-centric de Bruijn graph to be compacted. const std::string vertex_db_path_; // Path to the KMC database containing the vertices (canonical k-mers). const std::string edge_db_path_; // Path to the KMC database containing the edges (canonical (k + 1)-mers). @@ -40,7 +40,7 @@ class Build_Params // Constructs a parameters wrapper object with the self-explanatory parameters. Build_Params( const bool is_read_graph, - const std::vector& ref_paths, + const std::vector& seq_paths, const std::vector& list_paths, const std::vector& dir_paths, const uint16_t k, @@ -57,7 +57,7 @@ class Build_Params const bool dcc_opt, const bool extract_cycles): is_read_graph_(is_read_graph), - reference_input_(ref_paths, list_paths, dir_paths), + seq_input_(seq_paths, list_paths, dir_paths), k_(k), vertex_db_path_(vertex_db_path), edge_db_path_(edge_db_path), @@ -74,17 +74,17 @@ class Build_Params {} - // Returns the boolean flag to whether to build a read- or a reference-compacted de Bruijn graph. + // Returns the boolean flag to whether to build a compacted read or reference de Bruijn graph. 
bool is_read_graph() const { return is_read_graph_; } - // Returns the reference input collections. - const Reference_Input& reference_input() const + // Returns the sequence input collection. + const Sequence_Input& sequence_input() const { - return reference_input_; + return seq_input_; } diff --git a/include/Parser.hpp b/include/Ref_Parser.hpp similarity index 88% rename from include/Parser.hpp rename to include/Ref_Parser.hpp index 9053dd4f..29dad0ed 100644 --- a/include/Parser.hpp +++ b/include/Ref_Parser.hpp @@ -1,10 +1,10 @@ -#ifndef PARSER_HPP -#define PARSER_HPP +#ifndef REF_PARSER_HPP +#define REF_PARSER_HPP -#include "Reference_Input.hpp" +#include "Sequence_Input.hpp" #include #include @@ -15,7 +15,7 @@ struct _KSEQ_DATA; // Forward declaration for `kseq`'s sequence-data format. // Wrapper class to parse FASTA/FASTQ files using the `kseq` library. -class Parser +class Ref_Parser { typedef _KSEQ_DATA kseq_t; @@ -39,8 +39,11 @@ class Parser public: + // Constructs a parser for the file at path `file_path`. + Ref_Parser(const std::string& file_path); + // Constructs a parser for the reference input collection present at `ref_input`. - Parser(const Reference_Input& ref_input); + Ref_Parser(const Sequence_Input& ref_input); // Returns the path to the reference currently being parsed. const std::string& curr_ref() const; diff --git a/include/Reference_Input.hpp b/include/Reference_Input.hpp deleted file mode 100644 index 970b8eb4..00000000 --- a/include/Reference_Input.hpp +++ /dev/null @@ -1,62 +0,0 @@ - -#ifndef REFERENCE_INPUT_HPP -#define REFERENCE_INPUT_HPP - - - -#include -#include - - -class Reference_Input -{ -private: - - const std::vector ref_paths_; // Collection of paths to raw references. - const std::vector list_paths_; // Collection of paths to lists containing reference file paths. - const std::vector dir_paths_; // Collection of paths to directories containing reference files. - - -public: - - // Constructs a collection of input references. - Reference_Input(const std::vector& refs, - const std::vector& lists, - const std::vector& dirs): - ref_paths_(refs), - list_paths_(lists), - dir_paths_(dirs) - {} - - - // Returns the collection of paths to raw references. - const std::vector& ref_paths() const - { - return ref_paths_; - } - - - // Returns the collection of paths to lists containing reference file paths. - const std::vector& list_paths() const - { - return list_paths_; - } - - - // Returns the collection of paths to directories containing reference files. - const std::vector& dir_paths() const - { - return dir_paths_; - } - - - // Returns whether the reference collection is empty or not. - bool empty() const - { - return ref_paths_.empty() && list_paths_.empty() && dir_paths_.empty(); - } -}; - - - -#endif diff --git a/include/Sequence_Input.hpp b/include/Sequence_Input.hpp new file mode 100644 index 00000000..e0c4fbe0 --- /dev/null +++ b/include/Sequence_Input.hpp @@ -0,0 +1,63 @@ + +#ifndef SEQUENCE_INPUT_HPP +#define SEQUENCE_INPUT_HPP + + + +#include +#include + + +// A class to pack the input sequences. +class Sequence_Input +{ +private: + + const std::vector seq_paths_; // Collection of paths to raw sequences. + const std::vector list_paths_; // Collection of paths to lists containing sequence file paths. + const std::vector dir_paths_; // Collection of paths to directories containing sequence files. + + +public: + + // Constructs a collection of input sequences. 
+ Sequence_Input( const std::vector& seqs, + const std::vector& lists, + const std::vector& dirs): + seq_paths_(seqs), + list_paths_(lists), + dir_paths_(dirs) + {} + + + // Returns the collection of paths to raw sequences. + const std::vector& seq_paths() const + { + return seq_paths_; + } + + + // Returns the collection of paths to lists containing sequence file paths. + const std::vector& list_paths() const + { + return list_paths_; + } + + + // Returns the collection of paths to directories containing sequence files. + const std::vector& dir_paths() const + { + return dir_paths_; + } + + + // Returns whether the sequence collection is empty or not. + bool empty() const + { + return seq_paths_.empty() && list_paths_.empty() && dir_paths_.empty(); + } +}; + + + +#endif diff --git a/include/Validation_Params.hpp b/include/Validation_Params.hpp index 28423473..06c59d12 100644 --- a/include/Validation_Params.hpp +++ b/include/Validation_Params.hpp @@ -4,7 +4,7 @@ -#include "Reference_Input.hpp" +#include "Sequence_Input.hpp" #include #include @@ -14,7 +14,7 @@ class Validation_Params { private: - const Reference_Input reference_input_; // Collection of the input references. + const Sequence_Input reference_input_; // Collection of the input references. const uint16_t k_; // The k-parameter of the compacted edge-centric de Bruijn graph. const std::string kmc_db_path_; // Prefix of the KMC database of the k-mer set of the reference. const std::string cdbg_file_path_; // Path to the file containing the maximal unitigs. @@ -46,7 +46,7 @@ class Validation_Params // Returns the reference input collections. - const Reference_Input& reference_input() const + const Sequence_Input& reference_input() const { return reference_input_; } diff --git a/src/Build_Params.cpp b/src/Build_Params.cpp index 8e7477c3..a016ba62 100644 --- a/src/Build_Params.cpp +++ b/src/Build_Params.cpp @@ -13,7 +13,7 @@ bool Build_Params::is_valid() const // Check if read and reference de Bruijn graph parameters are being mixed with. if(is_read_graph_) // Is a read de Bruijn graph. { - if(!reference_input_.empty()) + if(!seq_input_.empty()) { std::cout << "No reference is to be provided for a compacted read de Bruijn graph construction.\n"; valid = false; @@ -33,11 +33,11 @@ bool Build_Params::is_valid() const valid = false; } - if(dcc_opt_ || extract_cycles_) - { - std::cout << "Existence of detached chordless cycles are impossible for reference de Bruijn graphs by definition.\n"; - valid = false; - } + // if(dcc_opt_ || extract_cycles_) + // { + // std::cout << "Existence of detached chordless cycles are impossible for reference de Bruijn graphs by definition.\n"; + // valid = false; + // } } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7a69e7b0..cbc2fc89 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -9,7 +9,7 @@ set(PROJECT_SRC xxHash/xxhsum.c Build_Params.cpp Application.cpp - Parser.cpp + Ref_Parser.cpp Async_Logger_Wrapper.cpp Thread_Pool.cpp DNA_Utility.cpp diff --git a/src/CdBG_Builder.cpp b/src/CdBG_Builder.cpp index 8d1cfa2e..33a4af2a 100644 --- a/src/CdBG_Builder.cpp +++ b/src/CdBG_Builder.cpp @@ -1,7 +1,7 @@ #include "CdBG.hpp" #include "Kmer_Iterator.hpp" -#include "Parser.hpp" +#include "Ref_Parser.hpp" #include #include @@ -30,7 +30,7 @@ void CdBG::classify_vertices() else // No buckets file name provided, or does not exist. Build and save (if specified) one now. { // Open a parser for the FASTA / FASTQ file containing the reference. 
- Parser parser(params.reference_input()); + Ref_Parser parser(params.sequence_input()); // Construct a thread pool. diff --git a/src/CdBG_Writer.cpp b/src/CdBG_Writer.cpp index bb61698d..bf56c6b5 100644 --- a/src/CdBG_Writer.cpp +++ b/src/CdBG_Writer.cpp @@ -1,6 +1,6 @@ #include "CdBG.hpp" -#include "Parser.hpp" +#include "Ref_Parser.hpp" #include "Output_Format.hpp" #include "utility.hpp" #include "spdlog/spdlog.h" @@ -36,11 +36,11 @@ void CdBG::output_maximal_unitigs_plain() std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); - const Reference_Input& reference_input = params.reference_input(); + const Sequence_Input& reference_input = params.sequence_input(); const uint16_t thread_count = params.thread_count(); // Open a parser for the FASTA / FASTQ file containing the reference. - Parser parser(reference_input); + Ref_Parser parser(reference_input); // Clear the output file and initialize the output loggers. @@ -149,7 +149,7 @@ void CdBG::output_maximal_unitigs_gfa() std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); - const Reference_Input& reference_input = params.reference_input(); + const Sequence_Input& reference_input = params.sequence_input(); const uint16_t thread_count = params.thread_count(); const std::string& working_dir_path = params.working_dir_path(); @@ -181,7 +181,7 @@ void CdBG::output_maximal_unitigs_gfa() // Open a parser for the FASTA / FASTQ file containing the reference. - Parser parser(reference_input); + Ref_Parser parser(reference_input); // Track the maximum sequence buffer size used and the total length of the references. size_t max_buf_sz = 0; @@ -313,7 +313,7 @@ void CdBG::output_maximal_unitigs_gfa_reduced() std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); - const Reference_Input& reference_input = params.reference_input(); + const Sequence_Input& reference_input = params.sequence_input(); const uint16_t thread_count = params.thread_count(); const std::string& working_dir_path = params.working_dir_path(); @@ -356,7 +356,7 @@ void CdBG::output_maximal_unitigs_gfa_reduced() // Open a parser for the FASTA / FASTQ file containing the reference. - Parser parser(reference_input); + Ref_Parser parser(reference_input); // Track the maximum sequence buffer size used and the total length of the references. size_t max_buf_sz = 0; diff --git a/src/Parser.cpp b/src/Ref_Parser.cpp similarity index 78% rename from src/Parser.cpp rename to src/Ref_Parser.cpp index 65fd865e..0ef83e35 100644 --- a/src/Parser.cpp +++ b/src/Ref_Parser.cpp @@ -1,5 +1,5 @@ -#include "Parser.hpp" +#include "Ref_Parser.hpp" #include "kseq/kseq.h" #include "ghc/filesystem.hpp" @@ -12,10 +12,19 @@ KSEQ_INIT(gzFile, gzread); -Parser::Parser(const Reference_Input& ref_input) +Ref_Parser::Ref_Parser(const std::string& file_path) +{ + ref_paths.push(file_path); + + // Open the first reference for subsequent parsing. + open_next_reference(); +} + + +Ref_Parser::Ref_Parser(const Sequence_Input& ref_input) { // Collect references from the raw reference paths provided. 
- for(const std::string& ref_path: ref_input.ref_paths()) + for(const std::string& ref_path: ref_input.seq_paths()) ref_paths.push(ref_path); @@ -48,7 +57,7 @@ Parser::Parser(const Reference_Input& ref_input) } -void Parser::open_reference(const std::string& reference_path) +void Ref_Parser::open_reference(const std::string& reference_path) { file_ptr = gzopen(reference_path.c_str(), "r"); // Open the file handler. if(file_ptr == nullptr) @@ -67,7 +76,7 @@ void Parser::open_reference(const std::string& reference_path) } -bool Parser::open_next_reference() +bool Ref_Parser::open_next_reference() { if(ref_paths.empty()) return false; @@ -80,13 +89,13 @@ bool Parser::open_next_reference() } -const std::string& Parser::curr_ref() const +const std::string& Ref_Parser::curr_ref() const { return curr_ref_path; } -bool Parser::read_next_seq() +bool Ref_Parser::read_next_seq() { // Sequences still remain at the current reference being parsed. if(parser != nullptr && kseq_read(parser) >= 0) @@ -106,43 +115,43 @@ bool Parser::read_next_seq() } -const char* Parser::seq() const +const char* Ref_Parser::seq() const { return parser->seq.s; } -size_t Parser::seq_len() const +size_t Ref_Parser::seq_len() const { return parser->seq.l; } -size_t Parser::buff_sz() const +size_t Ref_Parser::buff_sz() const { return parser->seq.m; } -uint64_t Parser::ref_id() const +uint64_t Ref_Parser::ref_id() const { return ref_count; } -uint64_t Parser::seq_id() const +uint64_t Ref_Parser::seq_id() const { return seq_id_; } -const char* Parser::seq_name() const +const char* Ref_Parser::seq_name() const { return parser->name.s; } -void Parser::close() +void Ref_Parser::close() { if(file_ptr != nullptr) { diff --git a/src/Sequence_Validator.cpp b/src/Sequence_Validator.cpp index 575ae87d..0e9498b3 100644 --- a/src/Sequence_Validator.cpp +++ b/src/Sequence_Validator.cpp @@ -1,6 +1,6 @@ #include "Validator.hpp" -#include "Parser.hpp" +#include "Ref_Parser.hpp" #include "Directed_Kmer.hpp" #include "Kmer_Container.hpp" #include "BBHash/BooPHF.h" @@ -37,7 +37,7 @@ void Validator::validate_sequence_completion(bool& result) // Open a parser for the FASTA / FASTQ file containing the reference. - Parser parser(params.reference_input()); + Ref_Parser parser(params.reference_input()); std::vector th(thread_count); // Thread-pool (round-robin) to validate the sequences parallelly. From f5312ff59d177135c90870203f5dabe255da4ba4 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 8 Sep 2021 17:13:02 -0400 Subject: [PATCH 146/350] Shorten class name --- include/Build_Params.hpp | 6 +++--- include/Ref_Parser.hpp | 4 ++-- include/{Sequence_Input.hpp => Seq_Input.hpp} | 8 ++++---- include/Validation_Params.hpp | 6 +++--- src/CdBG_Writer.cpp | 6 +++--- src/Ref_Parser.cpp | 2 +- 6 files changed, 16 insertions(+), 16 deletions(-) rename include/{Sequence_Input.hpp => Seq_Input.hpp} (86%) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index 941104d5..b75ef5b2 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -5,7 +5,7 @@ #include "globals.hpp" -#include "Sequence_Input.hpp" +#include "Seq_Input.hpp" #include "Output_Format.hpp" #include "File_Extensions.hpp" @@ -20,7 +20,7 @@ class Build_Params private: const bool is_read_graph_; // Whether to build a compacted read or reference de Bruijn graph. - const Sequence_Input seq_input_; // Collection of the input sequences. + const Seq_Input seq_input_; // Collection of the input sequences. 
const uint16_t k_; // The k parameter for the edge-centric de Bruijn graph to be compacted. const std::string vertex_db_path_; // Path to the KMC database containing the vertices (canonical k-mers). const std::string edge_db_path_; // Path to the KMC database containing the edges (canonical (k + 1)-mers). @@ -82,7 +82,7 @@ class Build_Params // Returns the sequence input collection. - const Sequence_Input& sequence_input() const + const Seq_Input& sequence_input() const { return seq_input_; } diff --git a/include/Ref_Parser.hpp b/include/Ref_Parser.hpp index 29dad0ed..f9280486 100644 --- a/include/Ref_Parser.hpp +++ b/include/Ref_Parser.hpp @@ -4,7 +4,7 @@ -#include "Sequence_Input.hpp" +#include "Seq_Input.hpp" #include #include @@ -43,7 +43,7 @@ class Ref_Parser Ref_Parser(const std::string& file_path); // Constructs a parser for the reference input collection present at `ref_input`. - Ref_Parser(const Sequence_Input& ref_input); + Ref_Parser(const Seq_Input& ref_input); // Returns the path to the reference currently being parsed. const std::string& curr_ref() const; diff --git a/include/Sequence_Input.hpp b/include/Seq_Input.hpp similarity index 86% rename from include/Sequence_Input.hpp rename to include/Seq_Input.hpp index e0c4fbe0..c25932cf 100644 --- a/include/Sequence_Input.hpp +++ b/include/Seq_Input.hpp @@ -9,7 +9,7 @@ // A class to pack the input sequences. -class Sequence_Input +class Seq_Input { private: @@ -21,9 +21,9 @@ class Sequence_Input public: // Constructs a collection of input sequences. - Sequence_Input( const std::vector& seqs, - const std::vector& lists, - const std::vector& dirs): + Seq_Input( const std::vector& seqs, + const std::vector& lists, + const std::vector& dirs): seq_paths_(seqs), list_paths_(lists), dir_paths_(dirs) diff --git a/include/Validation_Params.hpp b/include/Validation_Params.hpp index 06c59d12..2bc08d06 100644 --- a/include/Validation_Params.hpp +++ b/include/Validation_Params.hpp @@ -4,7 +4,7 @@ -#include "Sequence_Input.hpp" +#include "Seq_Input.hpp" #include #include @@ -14,7 +14,7 @@ class Validation_Params { private: - const Sequence_Input reference_input_; // Collection of the input references. + const Seq_Input reference_input_; // Collection of the input references. const uint16_t k_; // The k-parameter of the compacted edge-centric de Bruijn graph. const std::string kmc_db_path_; // Prefix of the KMC database of the k-mer set of the reference. const std::string cdbg_file_path_; // Path to the file containing the maximal unitigs. @@ -46,7 +46,7 @@ class Validation_Params // Returns the reference input collections. - const Sequence_Input& reference_input() const + const Seq_Input& reference_input() const { return reference_input_; } diff --git a/src/CdBG_Writer.cpp b/src/CdBG_Writer.cpp index bf56c6b5..bed78766 100644 --- a/src/CdBG_Writer.cpp +++ b/src/CdBG_Writer.cpp @@ -36,7 +36,7 @@ void CdBG::output_maximal_unitigs_plain() std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); - const Sequence_Input& reference_input = params.sequence_input(); + const Seq_Input& reference_input = params.sequence_input(); const uint16_t thread_count = params.thread_count(); // Open a parser for the FASTA / FASTQ file containing the reference. 
@@ -149,7 +149,7 @@ void CdBG::output_maximal_unitigs_gfa() std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); - const Sequence_Input& reference_input = params.sequence_input(); + const Seq_Input& reference_input = params.sequence_input(); const uint16_t thread_count = params.thread_count(); const std::string& working_dir_path = params.working_dir_path(); @@ -313,7 +313,7 @@ void CdBG::output_maximal_unitigs_gfa_reduced() std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); - const Sequence_Input& reference_input = params.sequence_input(); + const Seq_Input& reference_input = params.sequence_input(); const uint16_t thread_count = params.thread_count(); const std::string& working_dir_path = params.working_dir_path(); diff --git a/src/Ref_Parser.cpp b/src/Ref_Parser.cpp index 0ef83e35..7f904501 100644 --- a/src/Ref_Parser.cpp +++ b/src/Ref_Parser.cpp @@ -21,7 +21,7 @@ Ref_Parser::Ref_Parser(const std::string& file_path) } -Ref_Parser::Ref_Parser(const Sequence_Input& ref_input) +Ref_Parser::Ref_Parser(const Seq_Input& ref_input) { // Collect references from the raw reference paths provided. for(const std::string& ref_path: ref_input.seq_paths()) From 0805cdb56f64f8a5463fc6ccffa53e287eca81b8 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 8 Sep 2021 17:43:55 -0400 Subject: [PATCH 147/350] Reduce header pollution --- include/Seq_Input.hpp | 36 +++++++----------------------------- src/CMakeLists.txt | 1 + src/Seq_Input.cpp | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 29 deletions(-) create mode 100644 src/Seq_Input.cpp diff --git a/include/Seq_Input.hpp b/include/Seq_Input.hpp index c25932cf..6c4916bc 100644 --- a/include/Seq_Input.hpp +++ b/include/Seq_Input.hpp @@ -1,6 +1,6 @@ -#ifndef SEQUENCE_INPUT_HPP -#define SEQUENCE_INPUT_HPP +#ifndef SEQ_INPUT_HPP +#define SEQ_INPUT_HPP @@ -21,41 +21,19 @@ class Seq_Input public: // Constructs a collection of input sequences. - Seq_Input( const std::vector& seqs, - const std::vector& lists, - const std::vector& dirs): - seq_paths_(seqs), - list_paths_(lists), - dir_paths_(dirs) - {} - + Seq_Input(const std::vector& seqs, const std::vector& lists, const std::vector& dirs); // Returns the collection of paths to raw sequences. - const std::vector& seq_paths() const - { - return seq_paths_; - } - + const std::vector& seq_paths() const; // Returns the collection of paths to lists containing sequence file paths. - const std::vector& list_paths() const - { - return list_paths_; - } - + const std::vector& list_paths() const; // Returns the collection of paths to directories containing sequence files. - const std::vector& dir_paths() const - { - return dir_paths_; - } - + const std::vector& dir_paths() const; // Returns whether the sequence collection is empty or not. 
- bool empty() const - { - return seq_paths_.empty() && list_paths_.empty() && dir_paths_.empty(); - } + bool empty() const; }; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cbc2fc89..6dc69a58 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -9,6 +9,7 @@ set(PROJECT_SRC xxHash/xxhsum.c Build_Params.cpp Application.cpp + Seq_Input.cpp Ref_Parser.cpp Async_Logger_Wrapper.cpp Thread_Pool.cpp diff --git a/src/Seq_Input.cpp b/src/Seq_Input.cpp new file mode 100644 index 00000000..504fd93b --- /dev/null +++ b/src/Seq_Input.cpp @@ -0,0 +1,35 @@ + +#include "Seq_Input.hpp" + + +Seq_Input::Seq_Input( const std::vector& seqs, + const std::vector& lists, + const std::vector& dirs): + seq_paths_(seqs), + list_paths_(lists), + dir_paths_(dirs) +{} + + +const std::vector& Seq_Input::seq_paths() const +{ + return seq_paths_; +} + + +const std::vector& Seq_Input::list_paths() const +{ + return list_paths_; +} + + +const std::vector& Seq_Input::dir_paths() const +{ + return dir_paths_; +} + + +bool Seq_Input::empty() const +{ + return seq_paths_.empty() && list_paths_.empty() && dir_paths_.empty(); +} From 304b5ea49eb91ed64eb6d6a243c89c05b36473c6 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 9 Sep 2021 16:28:23 -0400 Subject: [PATCH 148/350] Better practice --- include/kmc_api/Virtual_Prefix_File.hpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/include/kmc_api/Virtual_Prefix_File.hpp b/include/kmc_api/Virtual_Prefix_File.hpp index 54c2a003..ddaffef6 100644 --- a/include/kmc_api/Virtual_Prefix_File.hpp +++ b/include/kmc_api/Virtual_Prefix_File.hpp @@ -17,13 +17,13 @@ class Virtual_Prefix_File { private: - static constexpr size_t buffer_elem_count = (1 << 21); // Number of prefixes to be kept in memory buffer at a time. - size_t prefix_file_elem_count; // Size of the KMC3 prefix-file (*.kmc_pre) in elements (i.e. 64-bit prefixes). + static constexpr std::size_t buffer_elem_count = (1 << 21); // Number of prefixes to be kept in memory buffer at a time. + std::size_t prefix_file_elem_count; // Size of the KMC3 prefix-file (*.kmc_pre) in elements (i.e. 64-bit prefixes). std::vector prefix_file_buf; // The in-memory prefix-file buffer. uint64_t lut_area_size_in_bytes; // From KMC3. - size_t prefix_chunk_start_index; // The index into the prefix-file where the prefix chunk currently loaded into memory starts. - size_t prefix_chunk_end_index; // The (non-inclusive) index into the prefix-file where the prefix chunk currently loaded into memory ends. + std::size_t prefix_chunk_start_index; // The index into the prefix-file where the prefix chunk currently loaded into memory starts. + std::size_t prefix_chunk_end_index; // The (non-inclusive) index into the prefix-file where the prefix chunk currently loaded into memory ends. uint64_t total_kmers; // Total number of k-mers in the KMC3 database. std::FILE* fp; // File handle to the KMC3 prefix-file. @@ -31,7 +31,7 @@ class Virtual_Prefix_File // Reads in as much data as possible from the prefix-file into the in-memory buffer, // and returns the number of elements read. - size_t read_prefixes(); + std::size_t read_prefixes(); public: @@ -54,15 +54,15 @@ class Virtual_Prefix_File void init(std::FILE*& fptr, uint64_t lut_area_bytes, uint64_t kmer_count); // Returns the data at index `idx` of the prefix-file. 
- uint64_t operator[](size_t idx); + uint64_t operator[](std::size_t idx); }; -inline size_t Virtual_Prefix_File::read_prefixes() +inline std::size_t Virtual_Prefix_File::read_prefixes() { - const size_t elems_to_read = std::min(prefix_file_elem_count - prefix_chunk_end_index, buffer_elem_count); - const size_t bytes_to_read = elems_to_read * sizeof(uint64_t); - const size_t bytes_read = std::fread(prefix_file_buf.data(), 1, bytes_to_read, fp); + const std::size_t elems_to_read = std::min(prefix_file_elem_count - prefix_chunk_end_index, buffer_elem_count); + const std::size_t bytes_to_read = elems_to_read * sizeof(uint64_t); + const std::size_t bytes_read = std::fread(prefix_file_buf.data(), 1, bytes_to_read, fp); if(bytes_read != bytes_to_read) { @@ -74,7 +74,7 @@ inline size_t Virtual_Prefix_File::read_prefixes() } -inline uint64_t Virtual_Prefix_File::operator[](const size_t idx) +inline uint64_t Virtual_Prefix_File::operator[](const std::size_t idx) { if(idx >= prefix_file_elem_count) return total_kmers + 1; From 343e89901d0ff6541fcfabf261888f73b3084cb5 Mon Sep 17 00:00:00 2001 From: jamshed Date: Fri, 10 Sep 2021 00:20:50 -0400 Subject: [PATCH 149/350] Rename symbol conflicting across libs --- include/Kmer_Iterator.hpp | 2 +- include/Kmer_SPMC_Iterator.hpp | 2 +- include/kmc_api/kmc_file.h | 22 +++++----- include/kmc_api/kmer_api.h | 4 +- src/Kmer_Container.cpp | 2 +- src/kmc_api/kmc_file.cpp | 76 +++++++++++++++++----------------- 6 files changed, 54 insertions(+), 54 deletions(-) diff --git a/include/Kmer_Iterator.hpp b/include/Kmer_Iterator.hpp index 2212041c..de6e1683 100644 --- a/include/Kmer_Iterator.hpp +++ b/include/Kmer_Iterator.hpp @@ -34,7 +34,7 @@ class Kmer_Iterator private: const Kmer_Container* kmer_container; // The associated k-mer container on which to iterate on. - CKMCFile kmer_database_input; // The input reader object (from KMC databases). + CKMC_DB kmer_database_input; // The input reader object (from KMC databases). CKmerAPI kmer_object; // Current KMC k-mer object that this iterator is holding. Kmer kmer; // K-mer present inside the `kmer_object` api. bool at_begin; // Whether this iterator points to the beginning of the KMC database or not. diff --git a/include/Kmer_SPMC_Iterator.hpp b/include/Kmer_SPMC_Iterator.hpp index f625cca9..e2d019a1 100644 --- a/include/Kmer_SPMC_Iterator.hpp +++ b/include/Kmer_SPMC_Iterator.hpp @@ -38,7 +38,7 @@ class Kmer_SPMC_Iterator private: const Kmer_Container* const kmer_container; // The associated k-mer container over which to iterate. - CKMCFile kmer_database; // The k-mer database object. + CKMC_DB kmer_database; // The k-mer database object. const uint64_t kmer_count; // Number of k-mers present in the underlying database. const size_t consumer_count; // Total number of consumer threads of the iterator. diff --git a/include/kmc_api/kmc_file.h b/include/kmc_api/kmc_file.h index a0ad4bcf..48958d0e 100644 --- a/include/kmc_api/kmc_file.h +++ b/include/kmc_api/kmc_file.h @@ -38,7 +38,7 @@ struct CKMCFileInfo // Forward declare Cuttlefish's k-mer class; required to parse KMC raw binary k-mers to Cuttlefish format. template class Kmer; -class CKMCFile +class CKMC_DB { enum open_mode {closed, opened_for_RA, opened_for_listing}; open_mode is_opened; @@ -108,8 +108,8 @@ class CKMCFile bool GetCountersForRead_kmc2(const std::string& read, std::vector& counters); public: - CKMCFile(); - ~CKMCFile(); + CKMC_DB(); + ~CKMC_DB(); // Open files *.kmc_pre & *.kmc_suf, read them to RAM, close files. 
*.kmc_suf is opened for random access bool OpenForRA(const std::string &file_name); @@ -215,7 +215,7 @@ class CKMCFile // OUT: count - kmer's counter // RET: true - if not EOF //----------------------------------------------------------------------------------------------- -inline bool CKMCFile::ReadNextKmer(CKmerAPI &kmer) +inline bool CKMC_DB::ReadNextKmer(CKmerAPI &kmer) { uint64 prefix_mask = (1 << 2 * lut_prefix_length) - 1; //for kmc2 db @@ -304,7 +304,7 @@ inline bool CKMCFile::ReadNextKmer(CKmerAPI &kmer) // OUT: count - kmer's counter // RET: true - if not EOF //----------------------------------------------------------------------------------------------- -bool CKMCFile::ReadNextKmer(CKmerAPI &kmer, uint32 &count) +bool CKMC_DB::ReadNextKmer(CKmerAPI &kmer, uint32 &count) { uint64 prefix_mask = (1 << 2 * lut_prefix_length) - 1; //for kmc2 db @@ -399,7 +399,7 @@ bool CKMCFile::ReadNextKmer(CKmerAPI &kmer, uint32 &count) // OUT: count - kmer's counter // RET: true - if not EOF //----------------------------------------------------------------------------------------------- -inline bool CKMCFile::ReadNextKmer(CKmerAPI &kmer, uint64 &count) +inline bool CKMC_DB::ReadNextKmer(CKmerAPI &kmer, uint64 &count) { uint64 prefix_mask = (1 << 2 * lut_prefix_length) - 1; //for kmc2 db @@ -479,7 +479,7 @@ inline bool CKMCFile::ReadNextKmer(CKmerAPI &kmer, uint64 &count) -inline uint64_t CKMCFile::read_raw_suffixes(uint8_t* const suff_buf, std::vector>& pref_buf, const size_t max_bytes_to_read) +inline uint64_t CKMC_DB::read_raw_suffixes(uint8_t* const suff_buf, std::vector>& pref_buf, const size_t max_bytes_to_read) { if(is_opened != opened_for_listing) return 0; @@ -538,26 +538,26 @@ inline uint64_t CKMCFile::read_raw_suffixes(uint8_t* const suff_buf, std::vector } -inline uint32_t CKMCFile::suff_record_size() const +inline uint32_t CKMC_DB::suff_record_size() const { return sufix_rec_size; } -inline uint64_t CKMCFile::curr_prefix() const +inline uint64_t CKMC_DB::curr_prefix() const { return prefix_index; } -inline uint64_t CKMCFile::curr_suffix_idx() const +inline uint64_t CKMC_DB::curr_suffix_idx() const { return sufix_number; } template -inline void CKMCFile::parse_kmer_buf(std::vector>::iterator& prefix_it, const uint8_t* const suff_buf, size_t buf_idx, Kmer& kmer) const +inline void CKMC_DB::parse_kmer_buf(std::vector>::iterator& prefix_it, const uint8_t* const suff_buf, size_t buf_idx, Kmer& kmer) const { static constexpr uint16_t NUM_INTS = (k + 31) / 32; uint64_t kmc_data[NUM_INTS]{}; diff --git a/include/kmc_api/kmer_api.h b/include/kmc_api/kmer_api.h index 1d2ddb49..2e5cb891 100644 --- a/include/kmc_api/kmer_api.h +++ b/include/kmc_api/kmer_api.h @@ -17,7 +17,7 @@ Date : 2019-05-19 #include #include #include "mmer.h" -class CKMCFile; +class CKMC_DB; class CKmerAPI { @@ -32,7 +32,7 @@ class CKmerAPI uint32 no_of_rows; // A number of 64-bits words allocated for kmer_data - friend class CKMCFile; + friend class CKMC_DB; //---------------------------------------------------------------------------------- inline void clear() diff --git a/src/Kmer_Container.cpp b/src/Kmer_Container.cpp index 92907439..e5b82eec 100644 --- a/src/Kmer_Container.cpp +++ b/src/Kmer_Container.cpp @@ -6,7 +6,7 @@ template Kmer_Container::Kmer_Container(const std::string& kmc_file_path): kmc_file_path(kmc_file_path) { - CKMCFile kmer_database; + CKMC_DB kmer_database; if(!kmer_database.read_parameters(kmc_file_path)) { std::cout << "Error opening KMC database files with prefix " << kmc_file_path << ". 
Aborting.\n"; diff --git a/src/kmc_api/kmc_file.cpp b/src/kmc_api/kmc_file.cpp index 2cba9531..3b2aacba 100644 --- a/src/kmc_api/kmc_file.cpp +++ b/src/kmc_api/kmc_file.cpp @@ -16,7 +16,7 @@ #include -uint64 CKMCFile::part_size = 1 << 28; +uint64 CKMC_DB::part_size = 1 << 28; // ---------------------------------------------------------------------------------- @@ -25,7 +25,7 @@ uint64 CKMCFile::part_size = 1 << 28; // IN : file_name - the name of kmer_counter's output // RET : true - if successful // ---------------------------------------------------------------------------------- -bool CKMCFile::OpenForRA(const std::string &file_name) +bool CKMC_DB::OpenForRA(const std::string &file_name) { uint64 size; size_t result; @@ -62,7 +62,7 @@ bool CKMCFile::OpenForRA(const std::string &file_name) // IN : file_name - the name of kmer_counter's output // RET : true - if successful //---------------------------------------------------------------------------------- -bool CKMCFile::OpenForListing(const std::string &file_name) +bool CKMC_DB::OpenForListing(const std::string &file_name) { uint64 size; @@ -110,7 +110,7 @@ bool CKMCFile::OpenForListing(const std::string &file_name) // IN : file_name - the name of kmer_counter's output // RET : true - if successful //---------------------------------------------------------------------------------- -bool CKMCFile::open_for_cuttlefish_listing(const std::string& file_name) +bool CKMC_DB::open_for_cuttlefish_listing(const std::string& file_name) { uint64 size; @@ -150,7 +150,7 @@ bool CKMCFile::open_for_cuttlefish_listing(const std::string& file_name) // IN : file_name - the name of kmer_counter's output // RET : true - if successful //---------------------------------------------------------------------------------- -bool CKMCFile::read_parameters(const std::string& file_name) +bool CKMC_DB::read_parameters(const std::string& file_name) { uint64 size; @@ -184,7 +184,7 @@ bool CKMCFile::read_parameters(const std::string& file_name) return true; } //---------------------------------------------------------------------------------- -CKMCFile::CKMCFile() +CKMC_DB::CKMC_DB() { file_pre = NULL; file_suf = NULL; @@ -197,7 +197,7 @@ CKMCFile::CKMCFile() end_of_file = false; } //---------------------------------------------------------------------------------- -CKMCFile::~CKMCFile() +CKMC_DB::~CKMC_DB() { if (file_pre) fclose(file_pre); @@ -215,7 +215,7 @@ CKMCFile::~CKMCFile() // IN : file_name - the name of a file to open // RET : true - if successful //---------------------------------------------------------------------------------- -bool CKMCFile::OpenASingleFile(const std::string &file_name, FILE *&file_handler, uint64 &size, char marker[]) +bool CKMC_DB::OpenASingleFile(const std::string &file_name, FILE *&file_handler, uint64 &size, char marker[]) { char _marker[4]; size_t result; @@ -260,7 +260,7 @@ bool CKMCFile::OpenASingleFile(const std::string &file_name, FILE *&file_handler // IN : the size of the file *.kmc_pre, without initial and terminal markers // RET : true - if succesfull //---------------------------------------------------------------------------------- -bool CKMCFile::ReadParamsFrom_prefix_file_buf(uint64 &size, const bool load_pref_file, const bool init_pref_buf) +bool CKMC_DB::ReadParamsFrom_prefix_file_buf(uint64 &size, const bool load_pref_file, const bool init_pref_buf) { size_t prev_pos = my_ftell(file_pre); my_fseek(file_pre, -12, SEEK_END); @@ -394,7 +394,7 @@ bool CKMCFile::ReadParamsFrom_prefix_file_buf(uint64 &size, const bool 
load_pref // OUT: count - kmer's counter if kmer exists // RET: true - if kmer exists //------------------------------------------------------------------------------------------ -bool CKMCFile::CheckKmer(CKmerAPI &kmer, float &count) +bool CKMC_DB::CheckKmer(CKmerAPI &kmer, float &count) { uint32 int_counter; if (CheckKmer(kmer, int_counter)) @@ -414,7 +414,7 @@ bool CKMCFile::CheckKmer(CKmerAPI &kmer, float &count) // OUT: count - kmer's counter if kmer exists // RET: true - if kmer exists //------------------------------------------------------------------------------------------ -bool CKMCFile::CheckKmer(CKmerAPI &kmer, uint32 &count) +bool CKMC_DB::CheckKmer(CKmerAPI &kmer, uint32 &count) { if(is_opened != opened_for_RA) return false; @@ -458,7 +458,7 @@ bool CKMCFile::CheckKmer(CKmerAPI &kmer, uint32 &count) // OUT: count - kmer's counter if kmer exists // RET: true - if kmer exists //------------------------------------------------------------------------------------------ -bool CKMCFile::CheckKmer(CKmerAPI &kmer, uint64 &count) +bool CKMC_DB::CheckKmer(CKmerAPI &kmer, uint64 &count) { if (is_opened != opened_for_RA) return false; @@ -497,12 +497,12 @@ bool CKMCFile::CheckKmer(CKmerAPI &kmer, uint64 &count) // Check if end of file // RET: true - all kmers are listed //----------------------------------------------------------------------------------------------- -bool CKMCFile::Eof(void) +bool CKMC_DB::Eof(void) { return end_of_file; } -bool CKMCFile::ReadNextKmer(CKmerAPI &kmer, float &count) +bool CKMC_DB::ReadNextKmer(CKmerAPI &kmer, float &count) { uint32 int_counter; if (ReadNextKmer(kmer, int_counter)) @@ -520,7 +520,7 @@ bool CKMCFile::ReadNextKmer(CKmerAPI &kmer, float &count) //------------------------------------------------------------------------------- // Reload a contents of an array "sufix_file_buf" for listing mode. Auxiliary function. //------------------------------------------------------------------------------- -void CKMCFile::Reload_sufix_file_buf() +void CKMC_DB::Reload_sufix_file_buf() { auto to_read = MIN(suf_file_left_to_read, part_size); auto readed = fread(sufix_file_buf, 1, (size_t)to_read, file_suf); @@ -536,7 +536,7 @@ void CKMCFile::Reload_sufix_file_buf() // Release memory and close files in case they were opened // RET: true - if files have been readed //------------------------------------------------------------------------------- -bool CKMCFile::Close() +bool CKMC_DB::Close() { if(is_opened) { @@ -570,7 +570,7 @@ bool CKMCFile::Close() // Set initial values to enable listing kmers from the begining. Only in listing mode // RET: true - if a file has been opened for listing //---------------------------------------------------------------------------------- -bool CKMCFile::RestartListing(void) +bool CKMC_DB::RestartListing(void) { if(is_opened == opened_for_listing) { @@ -601,7 +601,7 @@ bool CKMCFile::RestartListing(void) // IN : x - minimal value for a counter // RET : true - if successful //---------------------------------------------------------------------------------------- -bool CKMCFile::SetMinCount(uint32 x) +bool CKMC_DB::SetMinCount(uint32 x) { if((original_min_count <= x) && (x <= max_count)) { @@ -616,7 +616,7 @@ bool CKMCFile::SetMinCount(uint32 x) // Return a value of min_count. 
Kmers with counters below this theshold are ignored // RET : a value of min_count //---------------------------------------------------------------------------------------- -uint32 CKMCFile::GetMinCount(void) +uint32 CKMC_DB::GetMinCount(void) { return min_count; } @@ -626,7 +626,7 @@ uint32 CKMCFile::GetMinCount(void) // IN : x - maximal value for a counter // RET : true - if successful //---------------------------------------------------------------------------------------- -bool CKMCFile::SetMaxCount(uint32 x) +bool CKMC_DB::SetMaxCount(uint32 x) { if((original_max_count >= x) && (x >= min_count)) { @@ -642,7 +642,7 @@ bool CKMCFile::SetMaxCount(uint32 x) // Return a value of max_count. Kmers with counters above this theshold are ignored // RET : a value of max_count //---------------------------------------------------------------------------------------- -uint64 CKMCFile::GetMaxCount(void) +uint64 CKMC_DB::GetMaxCount(void) { return max_count; } @@ -651,7 +651,7 @@ uint64 CKMCFile::GetMaxCount(void) // Return true if KMC was run without -b switch // RET : a value of both_strands //---------------------------------------------------------------------------------------- -bool CKMCFile::GetBothStrands(void) +bool CKMC_DB::GetBothStrands(void) { return both_strands; } @@ -661,7 +661,7 @@ bool CKMCFile::GetBothStrands(void) //---------------------------------------------------------------------------------------- // Set original (readed from *.kmer_pre) values for min_count and max_count //---------------------------------------------------------------------------------------- -void CKMCFile::ResetMinMaxCounts(void) +void CKMC_DB::ResetMinMaxCounts(void) { min_count = original_min_count; max_count = original_max_count; @@ -671,7 +671,7 @@ void CKMCFile::ResetMinMaxCounts(void) // Return the length of kmers // RET : the length of kmers //---------------------------------------------------------------------------------------- -uint32 CKMCFile::KmerLength(void) +uint32 CKMC_DB::KmerLength(void) { return kmer_length; } @@ -681,7 +681,7 @@ uint32 CKMCFile::KmerLength(void) // IN : kmer - kmer // RET : true if kmer exists //---------------------------------------------------------------------------------------- -bool CKMCFile::IsKmer(CKmerAPI &kmer) +bool CKMC_DB::IsKmer(CKmerAPI &kmer) { uint32 _count; if(CheckKmer(kmer, _count)) @@ -694,7 +694,7 @@ bool CKMCFile::IsKmer(CKmerAPI &kmer) // Check the total number of kmers between current min_count and max_count // RET : total number of kmers or 0 if a database has not been opened //----------------------------------------------------------------------------------------- -uint64 CKMCFile::KmerCount(void) +uint64 CKMC_DB::KmerCount(void) { if(is_opened) if((min_count == original_min_count) && (max_count == original_max_count)) @@ -761,7 +761,7 @@ uint64 CKMCFile::KmerCount(void) // _total_kmers - the total number of kmers // RET : true if kmer_database has been opened //--------------------------------------------------------------------------------- -bool CKMCFile::Info(uint32 &_kmer_length, uint32 &_mode, uint32 &_counter_size, uint32 &_lut_prefix_length, uint32 &_signature_len, uint32 &_min_count, uint64 &_max_count, uint64 &_total_kmers) +bool CKMC_DB::Info(uint32 &_kmer_length, uint32 &_mode, uint32 &_counter_size, uint32 &_lut_prefix_length, uint32 &_signature_len, uint32 &_min_count, uint64 &_max_count, uint64 &_total_kmers) { if(is_opened) { @@ -782,7 +782,7 @@ bool CKMCFile::Info(uint32 &_kmer_length, uint32 &_mode, uint32 
&_counter_size, } // Get current parameters from kmer_database -bool CKMCFile::Info(CKMCFileInfo& info) const +bool CKMC_DB::Info(CKMCFileInfo& info) const { if (is_opened) { @@ -810,7 +810,7 @@ bool CKMCFile::Info(CKMCFileInfo& info) const // IN : read - // RET : true if success, false if k > read length or some failure //--------------------------------------------------------------------------------- -bool CKMCFile::GetCountersForRead(const std::string& read, std::vector& counters) +bool CKMC_DB::GetCountersForRead(const std::string& read, std::vector& counters) { if (is_opened != opened_for_RA) return false; @@ -845,7 +845,7 @@ bool CKMCFile::GetCountersForRead(const std::string& read, std::vector& // IN : read - // RET : true if success //--------------------------------------------------------------------------------- -bool CKMCFile::GetCountersForRead(const std::string& read, std::vector& counters) +bool CKMC_DB::GetCountersForRead(const std::string& read, std::vector& counters) { if (is_opened != opened_for_RA) return false; @@ -873,7 +873,7 @@ bool CKMCFile::GetCountersForRead(const std::string& read, std::vector& c //--------------------------------------------------------------------------------- // Auxiliary function. //--------------------------------------------------------------------------------- -uint32 CKMCFile::count_for_kmer_kmc1(CKmerAPI& kmer) +uint32 CKMC_DB::count_for_kmer_kmc1(CKmerAPI& kmer) { //recognize a prefix: @@ -898,7 +898,7 @@ uint32 CKMCFile::count_for_kmer_kmc1(CKmerAPI& kmer) //--------------------------------------------------------------------------------- // Auxiliary function. //--------------------------------------------------------------------------------- -uint32 CKMCFile::count_for_kmer_kmc2(CKmerAPI& kmer, uint32 bin_start_pos) +uint32 CKMC_DB::count_for_kmer_kmc2(CKmerAPI& kmer, uint32 bin_start_pos) { //recognize a prefix: uint64 pattern_prefix_value = kmer.kmer_data[0]; @@ -922,7 +922,7 @@ uint32 CKMCFile::count_for_kmer_kmc2(CKmerAPI& kmer, uint32 bin_start_pos) //--------------------------------------------------------------------------------- // Auxiliary function. //--------------------------------------------------------------------------------- -bool CKMCFile::GetCountersForRead_kmc1_both_strands(const std::string& read, std::vector& counters) +bool CKMC_DB::GetCountersForRead_kmc1_both_strands(const std::string& read, std::vector& counters) { uint32 read_len = static_cast(read.length()); counters.resize(read.length() - kmer_length + 1); @@ -1000,7 +1000,7 @@ bool CKMCFile::GetCountersForRead_kmc1_both_strands(const std::string& read, std //--------------------------------------------------------------------------------- // Auxiliary function. 
//--------------------------------------------------------------------------------- -bool CKMCFile::GetCountersForRead_kmc1(const std::string& read, std::vector& counters) +bool CKMC_DB::GetCountersForRead_kmc1(const std::string& read, std::vector& counters) { uint32 read_len = static_cast(read.length()); counters.resize(read.length() - kmer_length + 1); @@ -1064,7 +1064,7 @@ bool CKMCFile::GetCountersForRead_kmc1(const std::string& read, std::vector& counters) +bool CKMC_DB::GetCountersForRead_kmc2_both_strands(const std::string& read, std::vector& counters) { counters.resize(read.length() - kmer_length + 1); std::string transformed_read = read; @@ -1225,7 +1225,7 @@ bool CKMCFile::GetCountersForRead_kmc2_both_strands(const std::string& read, std //--------------------------------------------------------------------------------- // Auxiliary function. //--------------------------------------------------------------------------------- -bool CKMCFile::GetCountersForRead_kmc2(const std::string& read, std::vector& counters) +bool CKMC_DB::GetCountersForRead_kmc2(const std::string& read, std::vector& counters) { counters.resize(read.length() - kmer_length + 1); std::string transformed_read = read; @@ -1289,7 +1289,7 @@ bool CKMCFile::GetCountersForRead_kmc2(const std::string& read, std::vector= static_cast(total_kmers)) return false; From 9edea16d143ad9f2214f3dc9b7915ee55cf0d002 Mon Sep 17 00:00:00 2001 From: jamshed Date: Fri, 10 Sep 2021 14:55:32 -0400 Subject: [PATCH 150/350] Detach hash table ctr from CdBG ctr --- include/Read_CdBG.hpp | 4 +++- src/Read_CdBG.cpp | 16 ++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/include/Read_CdBG.hpp b/include/Read_CdBG.hpp index c64e8334..babbbda0 100644 --- a/include/Read_CdBG.hpp +++ b/include/Read_CdBG.hpp @@ -9,6 +9,8 @@ #include "Kmer_Hash_Table.hpp" #include "dBG_Info.hpp" +#include + // Read de Bruijn graph class to support the compaction algorithm. template @@ -17,7 +19,7 @@ class Read_CdBG private: const Build_Params params; // Required parameters (wrapped inside). - Kmer_Hash_Table hash_table; // Hash table for the vertices (canonical k-mers) of the graph. + std::unique_ptr> hash_table; // Hash table for the vertices (canonical k-mers) of the graph. dBG_Info dbg_info; // Wrapper object for structural information of the graph. 
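+
+    // Note: the hash table above is now held through a `std::unique_ptr` so that its
+    // construction (and the associated MPHF build) can be deferred to `construct()`,
+    // as done in `src/Read_CdBG.cpp` below, instead of happening in the `Read_CdBG`
+    // constructor itself.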
diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index cbddad40..834b6056 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -9,7 +9,6 @@ template Read_CdBG::Read_CdBG(const Build_Params& params): params(params), - hash_table(params.vertex_db_path()), dbg_info(params.json_file_path()) {} @@ -28,22 +27,23 @@ void Read_CdBG::construct() std::cout << "\nConstructing the minimal perfect hash function (MPHF) over the vertex set.\n"; - hash_table.construct(params.thread_count(), params.working_dir_path(), params.mph_file_path()); + hash_table = std::make_unique>(params.vertex_db_path()); + hash_table->construct(params.thread_count(), params.working_dir_path(), params.mph_file_path()); std::cout << "\nComputing the DFA states.\n"; compute_DFA_states(); if(!params.extract_cycles() && !params.dcc_opt()) - hash_table.save(params); + hash_table->save(params); std::cout << "\nExtracting the maximal unitigs.\n"; extract_maximal_unitigs(); if(!dbg_info.has_dcc() || dbg_info.dcc_extracted()) - hash_table.remove(params); + hash_table->remove(params); - hash_table.clear(); + hash_table->clear(); dbg_info.dump_info(); } @@ -51,7 +51,7 @@ void Read_CdBG::construct() template void Read_CdBG::compute_DFA_states() { - Read_CdBG_Constructor cdBg_constructor(params, hash_table); + Read_CdBG_Constructor cdBg_constructor(params, *hash_table); cdBg_constructor.compute_DFA_states(); dbg_info.add_basic_info(cdBg_constructor); @@ -61,7 +61,7 @@ void Read_CdBG::compute_DFA_states() template void Read_CdBG::extract_maximal_unitigs() { - Read_CdBG_Extractor cdBg_extractor(params, hash_table); + Read_CdBG_Extractor cdBg_extractor(params, *hash_table); if(!is_constructed(params)) { cdBg_extractor.extract_maximal_unitigs(); @@ -77,7 +77,7 @@ void Read_CdBG::extract_maximal_unitigs() dbg_info.add_DCC_info(cdBg_extractor); } else if(params.dcc_opt()) - hash_table.save(params); + hash_table->save(params); } } else if(params.extract_cycles()) From 1e40cf911d5ee8b7872a903c0ed6ac7525584343 Mon Sep 17 00:00:00 2001 From: jamshed Date: Fri, 10 Sep 2021 21:16:33 -0400 Subject: [PATCH 151/350] Add k-mer enumerator --- include/Build_Params.hpp | 30 +++++++++++ include/Input_Defaults.hpp | 2 + include/kmer_Enumerator.hpp | 103 ++++++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 1 + src/kmer_Enumerator.cpp | 75 ++++++++++++++++++++++++++ src/main.cpp | 14 ++++- 6 files changed, 223 insertions(+), 2 deletions(-) create mode 100644 include/kmer_Enumerator.hpp create mode 100644 src/kmer_Enumerator.cpp diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index b75ef5b2..4a62404f 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -22,9 +22,12 @@ class Build_Params const bool is_read_graph_; // Whether to build a compacted read or reference de Bruijn graph. const Seq_Input seq_input_; // Collection of the input sequences. const uint16_t k_; // The k parameter for the edge-centric de Bruijn graph to be compacted. + const uint32_t cutoff_; // Frequency cutoff for the (k + 1)-mers (for short-read set input). const std::string vertex_db_path_; // Path to the KMC database containing the vertices (canonical k-mers). const std::string edge_db_path_; // Path to the KMC database containing the edges (canonical (k + 1)-mers). const uint16_t thread_count_; // Number of threads to work with. + const std::size_t max_memory_; // Soft maximum memory limit. + const bool strict_memory_; // Whether strict memory limit restriction is specifiied. const std::string& output_file_path_; // Path to the output file. 
const cuttlefish::Output_Format output_format_; // Output format (0: txt, 1: GFAv1, 2: GFAv2). const std::string& working_dir_path_; // Path to the working directory (for temporary files). @@ -44,9 +47,12 @@ class Build_Params const std::vector& list_paths, const std::vector& dir_paths, const uint16_t k, + const uint32_t cutoff, const std::string& vertex_db_path, const std::string& edge_db_path, const uint16_t thread_count, + const std::size_t max_memory, + const bool strict_memory, const std::string& output_file_path, const uint8_t output_format, const std::string& working_dir_path, @@ -59,9 +65,12 @@ class Build_Params is_read_graph_(is_read_graph), seq_input_(seq_paths, list_paths, dir_paths), k_(k), + cutoff_(cutoff), vertex_db_path_(vertex_db_path), edge_db_path_(edge_db_path), thread_count_(thread_count), + max_memory_(max_memory), + strict_memory_(strict_memory), output_file_path_(output_file_path), output_format_(cuttlefish::Output_Format(output_format)), working_dir_path_(working_dir_path), @@ -95,6 +104,13 @@ class Build_Params } + // Returns the frequency cutoff for the (k + 1)-mers (for short-reads set input). + uint32_t cutoff() const + { + return cutoff_; + } + + // Returns the path to the vertex database. const std::string& vertex_db_path() const { @@ -116,6 +132,20 @@ class Build_Params } + // Returns the soft maximum memory limit. + std::size_t max_memory() const + { + return max_memory_; + } + + + // Returns whether strict memory limit restriction is specifiied. + bool strict_memory() const + { + return strict_memory_; + } + + // Returns the path prefix for all outputs of the algorithm. const std::string output_prefix() const { diff --git a/include/Input_Defaults.hpp b/include/Input_Defaults.hpp index 4d29b08b..f571981d 100644 --- a/include/Input_Defaults.hpp +++ b/include/Input_Defaults.hpp @@ -14,7 +14,9 @@ namespace cuttlefish { constexpr char EMPTY[] = ""; constexpr uint16_t K = 25; // Set as per the KMC3 default. + constexpr uint32_t CUTOFF_FREQ = 2; // Typical practice constexpr uint16_t THREAD_COUNT = 1; + constexpr std::size_t MAX_MEMORY = 2; // Set as per KMC3 library requirement. constexpr uint16_t OP_FORMAT = Output_Format::txt; constexpr char WORK_DIR[] = "."; } diff --git a/include/kmer_Enumerator.hpp b/include/kmer_Enumerator.hpp new file mode 100644 index 00000000..85d493d5 --- /dev/null +++ b/include/kmer_Enumerator.hpp @@ -0,0 +1,103 @@ + +#ifndef KMER_ENUMERATOR_HPP +#define KMER_ENUMERATOR_HPP + + + +#include "Build_Params.hpp" +#include "kmc_runner.h" + + +class kmer_Enumeration_Stats; + + +// Class to enumerate all the k-mers for some provided input collection. +template +class kmer_Enumerator +{ +private: + + static constexpr std::size_t min_memory = 2; // In GB; set as per the KMC3 library requirement. + static constexpr uint16_t bin_count = 2000; + static constexpr uint16_t signature_len = 11; + static constexpr double bits_per_kmer = 9.71; + static constexpr uint64_t counter_max = 1; // The `-cs` argument for KMC3; we're not interested in the counts and `cs = 1` will trigger skipping the counts. + + KMC::Stage1Params stage1_params; // Parameters collection for the k-mer statistics approximation step of KMC3. + KMC::Stage1Results stage1_results; // Results of the k-mer statistics approximation. + KMC::Stage2Params stage2_params; // Parameters collection for the actual KMC3 execution (some execution parameters are absent and is present in `stage1_params`). + KMC::Stage2Results stage2_results; // Results of the actual k-mer set enumeration. 
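+
+    // Note: KMC3 operates in two stages. Stage 1 scans the input and can estimate the
+    // k-mer frequency histogram; Stage 2 performs the actual counting. `enumerate()` uses
+    // the Stage 1 estimate, through `solid_kmer_count_approx()` and `memory_limit()`, to
+    // raise the Stage 2 memory bound when the downstream Cuttlefish stages (at roughly
+    // `bits_per_kmer` bits per distinct k-mer) are expected to need more than `max_memory`.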
+
+    KMC::Runner kmc;    // The KMC3 executor.
+
+
+    // Returns the count of solid k-mers, i.e. k-mers occurring at least `cutoff` number of times,
+    // estimated through KMC3's approximation step.
+    uint64_t solid_kmer_count_approx(uint16_t cutoff) const;
+
+    // Returns the strict memory limit for the actual KMC3 execution, based on the number of
+    // unique k-mers `unique_kmer_count` (typically approximated earlier).
+    std::size_t memory_limit(uint64_t unique_kmer_count) const;
+
+
+public:
+
+    // Enumerates the k-mers from the sequences (of type `input_file_type`) present in `seqs`, that
+    // are present at least `cutoff` times. Employs `thread_count` number of processor threads and
+    // uses a soft memory-cap of `max_memory`. If `strict_memory` is `true`, then the memory usage
+    // is attempted to be kept within a limit—the max of `max_memory` and the estimated memory to
+    // be used by the downstream stages of Cuttlefish. This memory estimation is made only if
+    // `estimate_mem_usage` is `true`, otherwise `max_memory` is the limit. Temporary files are
+    // written to `working_dir_path`. The output database is stored at path prefix `output_db_path`.
+    // Returns summary statistics of the enumeration.
+    kmer_Enumeration_Stats enumerate(
+        KMC::InputFileType input_file_type, const std::vector<std::string>& seqs, uint32_t cutoff,
+        uint16_t thread_count, std::size_t max_memory, bool strict_memory, bool estimate_mem_usage,
+        const std::string& working_dir_path, const std::string& output_db_path);
+};
+
+
+// A class to wrap summary statistics of k-mer enumeration by `kmer_Enumerator`.
+class kmer_Enumeration_Stats
+{
+private:
+
+    uint64_t kmer_count;
+    std::size_t max_memory;
+    std::size_t temp_disk_usage;
+
+
+public:
+
+    kmer_Enumeration_Stats(const uint64_t kmer_count, const std::size_t max_memory, const std::size_t temp_disk_usage):
+        kmer_count(kmer_count),
+        max_memory(max_memory),
+        temp_disk_usage(temp_disk_usage)
+    {}
+};
+
+
+// A class to display progress of the k-mer enumeration execution. 
+class FunnyProgress: public KMC::IPercentProgressObserver +{ + std::string funnChars = "/-\\|"; + int current = 0; + + void SetLabel(const std::string& label) override + { + //ignore + (void)label; + } + void ProgressChanged(int newValue) override + { + if(newValue == 100) + std::cerr << "\rDone.\n"; + else + std::cerr << "\r" << funnChars[current++ % funnChars.size()]; + + } +}; + + + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6dc69a58..432d59f8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -25,6 +25,7 @@ set(PROJECT_SRC CdBG_Plain_Writer.cpp CdBG_GFA_Writer.cpp CdBG_GFA_Reduced_Writer.cpp + kmer_Enumerator.cpp State_Read_Space.cpp Read_CdBG.cpp Read_CdBG_Constructor.cpp diff --git a/src/kmer_Enumerator.cpp b/src/kmer_Enumerator.cpp new file mode 100644 index 00000000..d99557a4 --- /dev/null +++ b/src/kmer_Enumerator.cpp @@ -0,0 +1,75 @@ + +#include "kmer_Enumerator.hpp" +#include "globals.hpp" + + +template +kmer_Enumeration_Stats kmer_Enumerator::enumerate( + const KMC::InputFileType input_file_type, const std::vector& seqs, const uint32_t cutoff, + const uint16_t thread_count, const std::size_t max_memory, const bool strict_memory, const bool estimate_mem_usage, + const std::string& working_dir_path, const std::string& output_db_path) +{ + FunnyProgress progress; + + std::size_t memory = std::max(max_memory, min_memory); + stage1_params + .SetInputFileType(input_file_type) + .SetInputFiles(seqs) + .SetKmerLen(k) + .SetNThreads(thread_count) + .SetMaxRamGB(memory) + .SetSignatureLen(signature_len) + .SetNBins(bin_count) + .SetTmpPath(working_dir_path) + .SetEstimateHistogramCfg(estimate_mem_usage ? KMC::EstimateHistogramCfg::ESTIMATE_AND_COUNT_KMERS : KMC::EstimateHistogramCfg::DONT_ESTIMATE) + .SetPercentProgressObserver(&progress) + ; + + stage1_results = kmc.RunStage1(stage1_params); + + + memory = std::max( + (estimate_mem_usage ? std::max(memory_limit(solid_kmer_count_approx(cutoff)), max_memory) : max_memory), + min_memory); + stage2_params + .SetCutoffMin(cutoff) + .SetNThreads(thread_count) + .SetMaxRamGB(memory) + .SetStrictMemoryMode(strict_memory) + .SetCounterMax(counter_max) + .SetOutputFileName(output_db_path) + ; + + stage2_results = kmc.RunStage2(stage2_params); + + + const uint64_t kmer_count = stage2_results.nUniqueKmers - stage2_results.nBelowCutoffMin - stage2_results.nAboveCutoffMax; + return kmer_Enumeration_Stats(kmer_count, memory, stage2_results.maxDiskUsage); +} + + +template +uint64_t kmer_Enumerator::solid_kmer_count_approx(const uint16_t cutoff) const +{ + uint64_t solid_kmer_count = 0; + for (std::size_t freq = cutoff; freq < stage1_results.estimatedHistogram.size(); ++freq) + solid_kmer_count += stage1_results.estimatedHistogram[freq]; + + return solid_kmer_count; +} + + +template +std::size_t kmer_Enumerator::memory_limit(const uint64_t unique_kmer_count) const +{ + const double memory_in_bits = bits_per_kmer * unique_kmer_count; + const double memory_in_bytes = memory_in_bits / 8.0; + std::size_t memory_in_gb = static_cast(memory_in_bytes / (1024 * 1024 * 1024)); + + return memory_in_gb; +} + + + +// Template instantiations for the required instances. 
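// (The `ENUMERATE`/`INSTANTIATE_ALL` macro invocation below is assumed to expand to
// explicit template instantiations of `kmer_Enumerator` for every supported k value.)
//
// A minimal usage sketch of `enumerate` above, as an editorial illustration only; the
// file names, paths, and numeric argument values are hypothetical placeholders:
//
//   kmer_Enumerator<26> edge_enumerator;              // 26 = k + 1, the edge length for k = 25
//   const kmer_Enumeration_Stats stats = edge_enumerator.enumerate(
//       KMC::InputFileType::FASTQ,                    // raw short-read input
//       std::vector<std::string>{"reads_1.fq", "reads_2.fq"},
//       2,                                            // frequency cutoff
//       8,                                            // worker threads
//       16,                                           // soft memory cap, in GB
//       true,                                         // strict memory mode
//       true,                                         // estimate downstream memory usage
//       "/tmp/cf_work/",                              // directory for temporary files
//       "/tmp/cf_work/out.cf_edges");                 // output KMC database path prefix
//
// For a sense of scale of `memory_limit` above: at 9.71 bits per unique k-mer, one
// billion unique k-mers amount to roughly 1.2e9 bytes, i.e. just over 1 GiB, so the
// function returns 1. When histogram estimation is enabled, `enumerate` then uses the
// maximum of this estimate, the user-supplied soft cap, and `min_memory` for stage 2.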
+ENUMERATE(INSTANCE_COUNT, INSTANTIATE_ALL, kmer_Enumerator) diff --git a/src/main.cpp b/src/main.cpp index 0794442d..0d7f545d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -22,19 +22,24 @@ void build(int argc, char** argv) { cxxopts::Options options("cuttlefish build", "Efficiently construct the compacted de Bruijn graph from references or reads"); options.add_options() + // TODO: replace CLI underscores with hyphens + // TODO: better indent the following wall of text ("read", "construct a compacted read de Bruijn graph") ("r,refs", "reference files", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) ("l,lists", "reference file lists", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) ("d,dirs", "reference file directories", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) ("k,kmer_len", "k-mer length", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::K))) + ("c,cutoff", "frequency cutoff for (k + 1)-mers (inapplicable for references)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::CUTOFF_FREQ))) ("s,kmc_db", "set of vertices, i.e. k-mers (KMC database) prefix", cxxopts::value()) ("e,edge_db", "set of edges, i.e. (k + 1)-mers (KMC database) prefix", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("t,threads", "number of threads to use", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::THREAD_COUNT))) + ("m,max-memory", "soft maximum memory limit (in GB)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::MAX_MEMORY))) + ("unrestrict-memory", "do not impose memory usage restriction") ("o,output", "output file", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("f,format", "output format (0: txt, 1: GFA 1.0, 2: GFA 2.0, 3: GFA-reduced)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::OP_FORMAT))) ("w,work_dir", "working directory", cxxopts::value()->default_value(cuttlefish::_default::WORK_DIR)) ("rm", "remove the KMC database") - // TODO: remove the following three options + // TODO: repurpose the following two options ("mph", "minimal perfect hash (BBHash) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("buckets", "hash table buckets (cuttlefish) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("json", "meta-info (JSON) file", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) @@ -56,9 +61,12 @@ void build(int argc, char** argv) const auto lists = result["lists"].as>(); const auto dirs = result["dirs"].as>(); const auto k = result["kmer_len"].as(); + const auto cutoff = result["cutoff"].as(); const auto kmer_database = result["kmc_db"].as(); const auto edge_database = result["edge_db"].as(); const auto thread_count = result["threads"].as(); + const auto max_memory = result["max-memory"].as(); + const auto strict_memory = !result["unrestrict-memory"].as(); const auto output_file = result["output"].as(); const auto format = result["format"].as(); const auto remove_kmc_db = result["rm"].as(); @@ -69,7 +77,9 @@ void build(int argc, char** argv) const auto dcc_opt = !result["no-dcc"].as(); const auto extract_cycles = result["cycles"].as(); - const Build_Params params( is_read_graph, refs, lists, dirs, k, kmer_database, edge_database, thread_count, + const Build_Params params( is_read_graph, + refs, lists, dirs, + k, cutoff, kmer_database, edge_database, thread_count, max_memory, strict_memory, output_file, format, working_dir, remove_kmc_db, 
mph_file, buckets_file, json_file, dcc_opt, extract_cycles); if(!params.is_valid()) From 5189004c1ef4f674b3c54e38f6dca5484a254198 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 12 Sep 2021 17:31:51 -0400 Subject: [PATCH 152/350] Centralize seq-collection builder --- include/Ref_Parser.hpp | 3 +++ include/Seq_Input.hpp | 3 +++ src/Ref_Parser.cpp | 36 ++++++------------------------------ src/Seq_Input.cpp | 40 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 52 insertions(+), 30 deletions(-) diff --git a/include/Ref_Parser.hpp b/include/Ref_Parser.hpp index f9280486..9e62bf96 100644 --- a/include/Ref_Parser.hpp +++ b/include/Ref_Parser.hpp @@ -30,6 +30,9 @@ class Ref_Parser uint64_t seq_id_; // Number of the current sequence (in the current reference). + // Constructs a parser for the reference input collection `refs`. + Ref_Parser(const std::vector& refs); + // Opens the reference at path `reference_path`. void open_reference(const std::string& reference_path); diff --git a/include/Seq_Input.hpp b/include/Seq_Input.hpp index 6c4916bc..aa5976c7 100644 --- a/include/Seq_Input.hpp +++ b/include/Seq_Input.hpp @@ -32,6 +32,9 @@ class Seq_Input // Returns the collection of paths to directories containing sequence files. const std::vector& dir_paths() const; + // Returns the collection of all the input sequences. + const std::vector seqs() const; + // Returns whether the sequence collection is empty or not. bool empty() const; }; diff --git a/src/Ref_Parser.cpp b/src/Ref_Parser.cpp index 7f904501..bfa23818 100644 --- a/src/Ref_Parser.cpp +++ b/src/Ref_Parser.cpp @@ -21,42 +21,18 @@ Ref_Parser::Ref_Parser(const std::string& file_path) } -Ref_Parser::Ref_Parser(const Seq_Input& ref_input) +Ref_Parser::Ref_Parser(const Seq_Input& ref_input): Ref_Parser(ref_input.seqs()) { - // Collect references from the raw reference paths provided. - for(const std::string& ref_path: ref_input.seq_paths()) - ref_paths.push(ref_path); - - - // Collect references from the provided reference lists. - for(const std::string& list_path: ref_input.list_paths()) - { - std::ifstream input(list_path.c_str(), std::ifstream::in); - if(input.fail()) - { - std::cerr << "Error opening reference list file " << list_path << ". Aborting.\n"; - std::exit(EXIT_FAILURE); - } - - std::string ref_path; - while(input >> ref_path) - ref_paths.push(ref_path); - - input.close(); - } - - - // Collect references from the provided reference directories. - for(const std::string& dir_path: ref_input.dir_paths()) - for(const auto& entry: ghc::filesystem::directory_iterator(dir_path)) - ref_paths.push(entry.path()); - - // Open the first reference for subsequent parsing. open_next_reference(); } +Ref_Parser::Ref_Parser(const std::vector& refs): + ref_paths(std::deque(refs.begin(), refs.end())) +{} + + void Ref_Parser::open_reference(const std::string& reference_path) { file_ptr = gzopen(reference_path.c_str(), "r"); // Open the file handler. diff --git a/src/Seq_Input.cpp b/src/Seq_Input.cpp index 504fd93b..4e57496c 100644 --- a/src/Seq_Input.cpp +++ b/src/Seq_Input.cpp @@ -1,5 +1,9 @@ #include "Seq_Input.hpp" +#include "ghc/filesystem.hpp" + +#include +#include Seq_Input::Seq_Input( const std::vector& seqs, @@ -29,6 +33,42 @@ const std::vector& Seq_Input::dir_paths() const } +const std::vector Seq_Input::seqs() const +{ + std::vector seqs; + + // Collect sequences from the raw sequence paths provided. + seqs.insert(seqs.end(), seq_paths_.begin(), seq_paths_.end()); + + + // Collect sequences from the provided sequence lists. 
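// (Each list file is read token by token below, so it is expected to contain one
// sequence-file path per whitespace-separated token, typically one path per line.)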
+ for(const std::string& list_path: list_paths_) + { + std::ifstream input(list_path.c_str(), std::ifstream::in); + if(input.fail()) + { + std::cerr << "Error opening list file " << list_path << ". Aborting.\n"; + std::exit(EXIT_FAILURE); + } + + std::string seq_path; + while(input >> seq_path) + seqs.emplace_back(seq_path); + + input.close(); + } + + + // Collect sequences from the provided sequence directories. + for(const std::string& dir_path: dir_paths_) + for(const auto& entry: ghc::filesystem::directory_iterator(dir_path)) + seqs.emplace_back(entry.path()); + + + return seqs; +} + + bool Seq_Input::empty() const { return seq_paths_.empty() && list_paths_.empty() && dir_paths_.empty(); From a13e4f345c99a5587058604e2419d8081790bfa4 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 12 Sep 2021 19:20:44 -0400 Subject: [PATCH 153/350] Add hash table constructor with key count --- include/Kmer_Hash_Table.hpp | 10 +++++++--- src/Kmer_Hash_Table.cpp | 8 ++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index 2951fe53..248e940d 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -66,9 +66,13 @@ class Kmer_Hash_Table public: - // Constructs a `Kmer_Hash_Table` object, where the hash table is to be built - // over the KMC database with path prefix `kmc_db_path`. - Kmer_Hash_Table(const std::string& kmc_db_path); + // Constructs a k-mer hash table where the table is to be built over the k-mer + // database with path prefix `kmer_db_path`. + Kmer_Hash_Table(const std::string& kmer_db_path); + + // Constructs a k-mer hash table where the table is to be built over the k-mer + // database having path prefix `kmer_db_path` and `kmer_count` distinct k-mers. 
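// (The distinct k-mer count is presumably supplied from the k-mer enumeration
// statistics, so the database itself need not be queried for its size when the
// table's internal structures are allocated.)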
+ Kmer_Hash_Table(const std::string& kmc_db_path, uint64_t kmer_count); // Constructs a minimal perfect hash function (specifically, the BBHash) for // the collection of k-mers present at the KMC database at path `kmc_db_path`, diff --git a/src/Kmer_Hash_Table.cpp b/src/Kmer_Hash_Table.cpp index 9feef388..f9725db5 100644 --- a/src/Kmer_Hash_Table.cpp +++ b/src/Kmer_Hash_Table.cpp @@ -18,6 +18,14 @@ Kmer_Hash_Table::Kmer_Hash_Table(const std::string& kmc_db_path {} +template +Kmer_Hash_Table::Kmer_Hash_Table(const std::string& kmc_db_path, const uint64_t kmer_count): + kmc_db_path(kmc_db_path), + kmer_count(kmer_count), + sparse_lock(kmer_count, lock_count) +{} + + template void Kmer_Hash_Table::build_mph_function(const uint16_t thread_count, const std::string& working_dir_path, const std::string& mph_file_path) { From 9a2006c82fbfe7c89c8c6962723ea3623c9e2392 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 12 Sep 2021 19:21:38 -0400 Subject: [PATCH 154/350] Add getters for k-mer enumeration stats --- include/kmer_Enumerator.hpp | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/include/kmer_Enumerator.hpp b/include/kmer_Enumerator.hpp index 85d493d5..bcfae1ae 100644 --- a/include/kmer_Enumerator.hpp +++ b/include/kmer_Enumerator.hpp @@ -62,18 +62,36 @@ class kmer_Enumeration_Stats { private: - uint64_t kmer_count; - std::size_t max_memory; - std::size_t temp_disk_usage; + uint64_t kmer_count_; + std::size_t max_memory_; + std::size_t temp_disk_usage_; public: kmer_Enumeration_Stats(const uint64_t kmer_count, const std::size_t max_memory, const std::size_t temp_disk_usage): - kmer_count(kmer_count), - max_memory(max_memory), - temp_disk_usage(temp_disk_usage) + kmer_count_(kmer_count), + max_memory_(max_memory), + temp_disk_usage_(temp_disk_usage) {} + + + uint64_t kmer_count() const + { + return kmer_count_; + } + + + std::size_t max_memory() const + { + return max_memory_; + } + + + std::size_t temp_disk_usage() const + { + return temp_disk_usage_; + } }; From 129e04fd0d50e3d41243794b15a850dececc8795 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 12 Sep 2021 20:53:12 -0400 Subject: [PATCH 155/350] Integrate edge- and vertex-enumeration in pipeline DCC extraction needs update --- include/File_Extensions.hpp | 2 ++ include/Read_CdBG.hpp | 10 +++++---- include/Read_CdBG_Constructor.hpp | 4 ++-- include/Read_CdBG_Extractor.hpp | 4 ++-- src/Build_Params.cpp | 17 ++++++++------- src/Read_CdBG.cpp | 36 +++++++++++++++++++++++++------ src/Read_CdBG_Constructor.cpp | 4 ++-- src/Read_CdBG_Extractor.cpp | 4 ++-- src/main.cpp | 2 +- 9 files changed, 55 insertions(+), 28 deletions(-) diff --git a/include/File_Extensions.hpp b/include/File_Extensions.hpp index e1e576b6..52a2c06f 100644 --- a/include/File_Extensions.hpp +++ b/include/File_Extensions.hpp @@ -9,6 +9,8 @@ namespace cuttlefish // File extensions for the data structures and files output by the algorithm. namespace file_ext { + constexpr char edges_ext[] = ".edges"; + constexpr char vertices_ext[] = ".vertices"; constexpr char hash_ext[] = ".cf_hf"; constexpr char buckets_ext[] = ".cf_hb"; constexpr char unipaths_ext[] = ".fa"; diff --git a/include/Read_CdBG.hpp b/include/Read_CdBG.hpp index babbbda0..85098343 100644 --- a/include/Read_CdBG.hpp +++ b/include/Read_CdBG.hpp @@ -24,11 +24,13 @@ class Read_CdBG dBG_Info dbg_info; // Wrapper object for structural information of the graph. - // Computes the states of the automata, i.e. the vertices in the graph. 
- void compute_DFA_states(); + // Computes the states of the automata, i.e. the vertices of the graph having it edge + // set present at the path prefix `edge_db_path`. + void compute_DFA_states(const std::string& edge_db_path); - // Extracts the maximal unitigs from the graph. - void extract_maximal_unitigs(); + // Extracts the maximal unitigs from the graph gaving its vertex set present at the + // path prefix `vertex_db_path`. + void extract_maximal_unitigs(const std::string& vertex_db_path); public: diff --git a/include/Read_CdBG_Constructor.hpp b/include/Read_CdBG_Constructor.hpp index 9f53c18f..8cef4d9b 100644 --- a/include/Read_CdBG_Constructor.hpp +++ b/include/Read_CdBG_Constructor.hpp @@ -92,8 +92,8 @@ class Read_CdBG_Constructor // the Cuttlefish hash table `hash_table`. Read_CdBG_Constructor(const Build_Params& params, Kmer_Hash_Table& hash_table); - // Computes the states of the DFA in the de Bruijn graph. - void compute_DFA_states(); + // Computes the states of the DFA in the de Bruijn graph with the edge set at path prefix `edge_db_path`. + void compute_DFA_states(const std::string& edge_db_path); // Returns the number of distinct vertices in the underlying graph. uint64_t vertex_count() const; diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 711dd915..0781086c 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -172,8 +172,8 @@ class Read_CdBG_Extractor // parameters wrapped inside `params`, and uses the Cuttlefish hash table `hash_table`. Read_CdBG_Extractor(const Build_Params& params, Kmer_Hash_Table& hash_table); - // Extracts the maximal unitigs of the de Bruijn graph. - void extract_maximal_unitigs(); + // Extracts the maximal unitigs of the de Bruijn graph with the vertex set at path prefix `vertex_db_path`. + void extract_maximal_unitigs(const std::string& vertex_db_path); // Extracts the chordless cycles from the de Bruijn graph that are completely disconnected from the // rest of the graph. `dbg_info` is used to determine whether the compacted graph had been constructed diff --git a/src/Build_Params.cpp b/src/Build_Params.cpp index a016ba62..0db24201 100644 --- a/src/Build_Params.cpp +++ b/src/Build_Params.cpp @@ -10,18 +10,19 @@ bool Build_Params::is_valid() const bool valid = true; + if(seq_input_.empty()) + { + std::cout << "No sequence input provided for compacted de Bruijn graph construction.\n"; + valid = false; + } + + // Check if read and reference de Bruijn graph parameters are being mixed with. if(is_read_graph_) // Is a read de Bruijn graph. { - if(!seq_input_.empty()) - { - std::cout << "No reference is to be provided for a compacted read de Bruijn graph construction.\n"; - valid = false; - } - - if(edge_db_path_.empty()) + if(output_format_ != cuttlefish::Output_Format::txt) { - std::cout << "The path prefix to the KMC-database for edges (i.e. 
(k + 1)-mers) is required.\n"; + std::cout << "(Currently) Unsupported output file format requested for the compacted read de Bruijn graph.\n"; valid = false; } } diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 834b6056..e9a3f0cc 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -1,7 +1,10 @@ #include "Read_CdBG.hpp" +#include "kmer_Enumerator.hpp" #include "Read_CdBG_Constructor.hpp" #include "Read_CdBG_Extractor.hpp" +#include "File_Extensions.hpp" +#include "kmc_runner.h" #include @@ -26,18 +29,37 @@ void Read_CdBG::construct() dbg_info.add_build_params(params); + std::cout << "\nEnumerating the edges of the de Bruijn graph.\n"; + const std::string edge_db_path = params.output_prefix() + cuttlefish::file_ext::edges_ext; + kmer_Enumerator edge_enumerator; + kmer_Enumeration_Stats edge_stats = edge_enumerator.enumerate( + KMC::InputFileType::FASTQ, params.sequence_input().seqs(), params.cutoff(), + params.thread_count(), params.max_memory(), params.strict_memory(), true, + params.working_dir_path(), edge_db_path); + + std::cout << "\nEnumerating the vertices of the de Bruijn graph.\n"; + kmer_Enumerator vertex_enumerator; + const std::string vertex_db_path = params.output_prefix() + cuttlefish::file_ext::vertices_ext; + kmer_Enumeration_Stats vertex_stats = vertex_enumerator.enumerate( + KMC::InputFileType::KMC, std::vector(1, edge_db_path), 1, + params.thread_count(), edge_stats.max_memory(), params.strict_memory(), false, + params.working_dir_path(), vertex_db_path); + + std::cout << "Number of edges: " << edge_stats.kmer_count() << ".\n"; + std::cout << "Number of vertices: " << vertex_stats.kmer_count() << ".\n"; + std::cout << "\nConstructing the minimal perfect hash function (MPHF) over the vertex set.\n"; - hash_table = std::make_unique>(params.vertex_db_path()); + hash_table = std::make_unique>(vertex_db_path, vertex_stats.kmer_count()); hash_table->construct(params.thread_count(), params.working_dir_path(), params.mph_file_path()); std::cout << "\nComputing the DFA states.\n"; - compute_DFA_states(); + compute_DFA_states(edge_db_path); if(!params.extract_cycles() && !params.dcc_opt()) hash_table->save(params); std::cout << "\nExtracting the maximal unitigs.\n"; - extract_maximal_unitigs(); + extract_maximal_unitigs(vertex_db_path); if(!dbg_info.has_dcc() || dbg_info.dcc_extracted()) hash_table->remove(params); @@ -49,22 +71,22 @@ void Read_CdBG::construct() template -void Read_CdBG::compute_DFA_states() +void Read_CdBG::compute_DFA_states(const std::string& edge_db_path) { Read_CdBG_Constructor cdBg_constructor(params, *hash_table); - cdBg_constructor.compute_DFA_states(); + cdBg_constructor.compute_DFA_states(edge_db_path); dbg_info.add_basic_info(cdBg_constructor); } template -void Read_CdBG::extract_maximal_unitigs() +void Read_CdBG::extract_maximal_unitigs(const std::string& vertex_db_path) { Read_CdBG_Extractor cdBg_extractor(params, *hash_table); if(!is_constructed(params)) { - cdBg_extractor.extract_maximal_unitigs(); + cdBg_extractor.extract_maximal_unitigs(vertex_db_path); dbg_info.add_unipaths_info(cdBg_extractor); diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index e0627b64..e7039818 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -15,12 +15,12 @@ Read_CdBG_Constructor::Read_CdBG_Constructor(const Build_Params& params, Kmer template -void Read_CdBG_Constructor::compute_DFA_states() +void Read_CdBG_Constructor::compute_DFA_states(const std::string& edge_db_path) { 
std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); - const Kmer_Container edge_container(params.edge_db_path()); // Wrapper container for the edge-database. + const Kmer_Container edge_container(edge_db_path); // Wrapper container for the edge-database. Kmer_SPMC_Iterator edge_parser(&edge_container, params.thread_count()); // Parser for the edges from the edge-database. edge_count_ = edge_container.size(); std::cout << "Total number of distinct edges: " << edge_count_ << ".\n"; diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 228f2761..2e8b91dc 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -16,7 +16,7 @@ Read_CdBG_Extractor::Read_CdBG_Extractor(const Build_Params& params, Kmer_Has template -void Read_CdBG_Extractor::extract_maximal_unitigs() +void Read_CdBG_Extractor::extract_maximal_unitigs(const std::string& vertex_db_path) { std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); @@ -26,7 +26,7 @@ void Read_CdBG_Extractor::extract_maximal_unitigs() Thread_Pool thread_pool(thread_count, this, Thread_Pool::Task_Type::extract_unipaths_read_space); // Launch the reading (and parsing per demand) of the vertices from disk. - const Kmer_Container vertex_container(params.vertex_db_path()); // Wrapper container for the vertex-database. + const Kmer_Container vertex_container(vertex_db_path); // Wrapper container for the vertex-database. Kmer_SPMC_Iterator vertex_parser(&vertex_container, params.thread_count()); // Parser for the vertices from the vertex-database. std::cout << "Number of distinct vertices: " << vertex_container.size() << ".\n"; diff --git a/src/main.cpp b/src/main.cpp index 0d7f545d..5c2fb588 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -30,7 +30,7 @@ void build(int argc, char** argv) ("d,dirs", "reference file directories", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) ("k,kmer_len", "k-mer length", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::K))) ("c,cutoff", "frequency cutoff for (k + 1)-mers (inapplicable for references)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::CUTOFF_FREQ))) - ("s,kmc_db", "set of vertices, i.e. k-mers (KMC database) prefix", cxxopts::value()) + ("s,kmc_db", "set of vertices, i.e. k-mers (KMC database) prefix", cxxopts::value()->default_value(cuttlefish::_default::WORK_DIR)) ("e,edge_db", "set of edges, i.e. 
(k + 1)-mers (KMC database) prefix", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("t,threads", "number of threads to use", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::THREAD_COUNT))) ("m,max-memory", "soft maximum memory limit (in GB)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::MAX_MEMORY))) From 55899e3aeb8e0fdeec43449fcd209ae3770b00b4 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 12 Sep 2021 21:31:13 -0400 Subject: [PATCH 156/350] Make DCC extraction work with KMC intergated --- include/Read_CdBG_Extractor.hpp | 17 ++++++++++------- src/Detached_Cycles_Extractor.cpp | 14 +++++++------- src/Read_CdBG.cpp | 4 ++-- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 0781086c..12304271 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -76,8 +76,9 @@ class Read_CdBG_Extractor // Marks all the vertices which have their hashes present in `path_hashes` as outputted. void mark_path(const std::vector& path_hashes); - // Marks all the vertices that are present in the maximal unitigs of the graph. - void mark_maximal_unitig_vertices(); + // Marks all the vertices that are present in the maximal unitigs of the graph with its vertex + // set being present at the path prefix `vertex_db_path`. + void mark_maximal_unitig_vertices(const std::string& vertex_db_path); // Scans the vertices provided to the thread with id `thread_id` from the parser `vertex_parser` // for potential unipath-flanking vertices. If a vertex `v` is found to be a flanking one, then @@ -94,8 +95,9 @@ class Read_CdBG_Extractor // number of vertices marked in this execution. std::size_t mark_maximal_unitig(const Kmer& v_hat, cuttlefish::side_t s_v_hat); - // Extracts all the detached chordless cycles present in the graph. - void extract_detached_chordless_cycles(); + // Extracts all the detached chordless cycles present in the graph with its vertex set being + // present at the path prefix `vertex_db_path`. + void extract_detached_chordless_cycles(const std::string& vertex_db_path); // Scans the vertices provided to the thread with id `thread_id` from the parser `vertex_parser` // for potential detached chordless cycles. If a vertex `v` is found to be not marked as present @@ -176,9 +178,10 @@ class Read_CdBG_Extractor void extract_maximal_unitigs(const std::string& vertex_db_path); // Extracts the chordless cycles from the de Bruijn graph that are completely disconnected from the - // rest of the graph. `dbg_info` is used to determine whether the compacted graph had been constructed - // earlier — in which case some data structures are re-used from the earlier construction. - void extract_detached_cycles(const dBG_Info& dbg_info); + // rest of the graph. The graph is to contain its vertex set at the path prefix `vertex_db_path`. + // `dbg_info` is used to determine whether the compacted graph had been constructed earlier — in + // which case some data structures are re-used from the earlier construction. + void extract_detached_cycles(const std::string& vertex_db_path, const dBG_Info& dbg_info); // Returns the parameters collection for the compacted graph construction. 
const Build_Params& get_params() const; diff --git a/src/Detached_Cycles_Extractor.cpp b/src/Detached_Cycles_Extractor.cpp index eb069b58..3bde7f48 100644 --- a/src/Detached_Cycles_Extractor.cpp +++ b/src/Detached_Cycles_Extractor.cpp @@ -9,7 +9,7 @@ template -void Read_CdBG_Extractor::extract_detached_cycles(const dBG_Info& dbg_info) +void Read_CdBG_Extractor::extract_detached_cycles(const std::string& vertex_db_path, const dBG_Info& dbg_info) { std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); @@ -17,12 +17,12 @@ void Read_CdBG_Extractor::extract_detached_cycles(const dBG_Info& dbg_info if(Read_CdBG::is_constructed(params) && !dbg_info.dcc_opt_performed()) { std::cout << "Marking the vertices present in the extracted maximal unitigs.\n"; - mark_maximal_unitig_vertices(); + mark_maximal_unitig_vertices(vertex_db_path); std::cout << "Done marking the vertices.\n"; } std::cout << "Extracting the cycles.\n"; - extract_detached_chordless_cycles(); + extract_detached_chordless_cycles(vertex_db_path); std::cout << "\nNumber of detached chordless cycles: " << unipaths_meta_info_.dcc_count() << ".\n" "Number of vertices in the cycles: " << unipaths_meta_info_.dcc_kmer_count() << ".\n"; @@ -35,14 +35,14 @@ void Read_CdBG_Extractor::extract_detached_cycles(const dBG_Info& dbg_info template -void Read_CdBG_Extractor::mark_maximal_unitig_vertices() +void Read_CdBG_Extractor::mark_maximal_unitig_vertices(const std::string& vertex_db_path) { // Construct a thread pool. const uint16_t thread_count = params.thread_count(); Thread_Pool thread_pool(thread_count, this, Thread_Pool::Task_Type::mark_unipath_vertices); // Launch the reading (and parsing per demand) of the vertices from disk. - const Kmer_Container vertex_container(params.vertex_db_path()); // Wrapper container for the vertex-database. + const Kmer_Container vertex_container(vertex_db_path); // Wrapper container for the vertex-database. Kmer_SPMC_Iterator vertex_parser(&vertex_container, params.thread_count()); // Parser for the vertices from the vertex-database. std::cout << "Number of distinct vertices: " << vertex_container.size() << ".\n"; @@ -142,14 +142,14 @@ std::size_t Read_CdBG_Extractor::mark_maximal_unitig(const Kmer& v_hat, co template -void Read_CdBG_Extractor::extract_detached_chordless_cycles() +void Read_CdBG_Extractor::extract_detached_chordless_cycles(const std::string& vertex_db_path) { // Construct a thread pool. const uint16_t thread_count = params.thread_count(); Thread_Pool thread_pool(thread_count, this, Thread_Pool::Task_Type::extract_cycles); // Launch the reading (and parsing per demand) of the vertices from disk. - const Kmer_Container vertex_container(params.vertex_db_path()); // Wrapper container for the vertex-database. + const Kmer_Container vertex_container(vertex_db_path); // Wrapper container for the vertex-database. Kmer_SPMC_Iterator vertex_parser(&vertex_container, params.thread_count()); // Parser for the vertices from the vertex-database. 
std::cout << "Number of distinct vertices: " << vertex_container.size() << ".\n"; diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index e9a3f0cc..7979fc06 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -94,7 +94,7 @@ void Read_CdBG::extract_maximal_unitigs(const std::string& vertex_db_path) { if(params.extract_cycles()) { - cdBg_extractor.extract_detached_cycles(dbg_info); + cdBg_extractor.extract_detached_cycles(vertex_db_path, dbg_info); dbg_info.add_DCC_info(cdBg_extractor); } @@ -108,7 +108,7 @@ void Read_CdBG::extract_maximal_unitigs(const std::string& vertex_db_path) { if(!dbg_info.dcc_extracted()) { - cdBg_extractor.extract_detached_cycles(dbg_info); + cdBg_extractor.extract_detached_cycles(vertex_db_path, dbg_info); dbg_info.add_DCC_info(cdBg_extractor); } From d95180eeb5e8a2666ac1d8013ddb88ff1753cd04 Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 13 Sep 2021 16:42:30 -0400 Subject: [PATCH 157/350] Modularize high-level construction --- include/Read_CdBG.hpp | 16 ++++++++++++++++ src/Read_CdBG.cpp | 43 +++++++++++++++++++++++++++++++------------ 2 files changed, 47 insertions(+), 12 deletions(-) diff --git a/include/Read_CdBG.hpp b/include/Read_CdBG.hpp index 85098343..ac226cb5 100644 --- a/include/Read_CdBG.hpp +++ b/include/Read_CdBG.hpp @@ -12,6 +12,9 @@ #include +class kmer_Enumeration_Stats; + + // Read de Bruijn graph class to support the compaction algorithm. template class Read_CdBG @@ -24,6 +27,19 @@ class Read_CdBG dBG_Info dbg_info; // Wrapper object for structural information of the graph. + // Enumerates the edges of the de Bruijn graph in a database at path `edge_db_path`, + // and returns summary statistics of the enumearation. + kmer_Enumeration_Stats enumerate_edges(const std::string& edge_db_path); + + // Enumerates the vertices of the de Bruijn graph in a database at path `vertex_db_path`, + // from the edge database present at `edge_db_path`, using at most `max_memory` amount of + // memory. Returns summary statistics of the enumeration. + kmer_Enumeration_Stats enumerate_vertices(const std::string& edge_db_path, const std::string& vertex_db_path, std::size_t max_memory); + + // Constructs the Cuttlefish hash table for the `vertex_count` vertices in the database + // at path `vertex_db_path`. + void construct_hash_table(const std::string& vertex_db_path, uint64_t vertex_count); + // Computes the states of the automata, i.e. the vertices of the graph having it edge // set present at the path prefix `edge_db_path`. 
void compute_DFA_states(const std::string& edge_db_path); diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 7979fc06..2c8516d6 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -31,26 +31,17 @@ void Read_CdBG::construct() std::cout << "\nEnumerating the edges of the de Bruijn graph.\n"; const std::string edge_db_path = params.output_prefix() + cuttlefish::file_ext::edges_ext; - kmer_Enumerator edge_enumerator; - kmer_Enumeration_Stats edge_stats = edge_enumerator.enumerate( - KMC::InputFileType::FASTQ, params.sequence_input().seqs(), params.cutoff(), - params.thread_count(), params.max_memory(), params.strict_memory(), true, - params.working_dir_path(), edge_db_path); + kmer_Enumeration_Stats edge_stats = enumerate_edges(edge_db_path); std::cout << "\nEnumerating the vertices of the de Bruijn graph.\n"; - kmer_Enumerator vertex_enumerator; const std::string vertex_db_path = params.output_prefix() + cuttlefish::file_ext::vertices_ext; - kmer_Enumeration_Stats vertex_stats = vertex_enumerator.enumerate( - KMC::InputFileType::KMC, std::vector(1, edge_db_path), 1, - params.thread_count(), edge_stats.max_memory(), params.strict_memory(), false, - params.working_dir_path(), vertex_db_path); + kmer_Enumeration_Stats vertex_stats = enumerate_vertices(edge_db_path, vertex_db_path, edge_stats.max_memory()); std::cout << "Number of edges: " << edge_stats.kmer_count() << ".\n"; std::cout << "Number of vertices: " << vertex_stats.kmer_count() << ".\n"; std::cout << "\nConstructing the minimal perfect hash function (MPHF) over the vertex set.\n"; - hash_table = std::make_unique>(vertex_db_path, vertex_stats.kmer_count()); - hash_table->construct(params.thread_count(), params.working_dir_path(), params.mph_file_path()); + construct_hash_table(vertex_db_path, vertex_stats.kmer_count()); std::cout << "\nComputing the DFA states.\n"; compute_DFA_states(edge_db_path); @@ -70,6 +61,34 @@ void Read_CdBG::construct() } +template +kmer_Enumeration_Stats Read_CdBG::enumerate_edges(const std::string& edge_db_path) +{ + return kmer_Enumerator().enumerate( + KMC::InputFileType::FASTQ, params.sequence_input().seqs(), params.cutoff(), + params.thread_count(), params.max_memory(), params.strict_memory(), true, + params.working_dir_path(), edge_db_path); +} + + +template +kmer_Enumeration_Stats Read_CdBG::enumerate_vertices(const std::string& edge_db_path, const std::string& vertex_db_path, const std::size_t max_memory) +{ + return kmer_Enumerator().enumerate( + KMC::InputFileType::KMC, std::vector(1, edge_db_path), 1, + params.thread_count(), max_memory, params.strict_memory(), false, + params.working_dir_path(), vertex_db_path); +} + + +template +void Read_CdBG::construct_hash_table(const std::string& vertex_db_path, const uint64_t vertex_count) +{ + hash_table = std::make_unique>(vertex_db_path, vertex_count); + hash_table->construct(params.thread_count(), params.working_dir_path(), params.mph_file_path()); +} + + template void Read_CdBG::compute_DFA_states(const std::string& edge_db_path) { From a9005a01620dff16826704f1373e2957cca162ae Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 13 Sep 2021 23:04:52 -0400 Subject: [PATCH 158/350] Time modules --- src/Read_CdBG.cpp | 22 +++++++++++++++++++++- src/Read_CdBG_Constructor.cpp | 8 ++++---- src/Read_CdBG_Extractor.cpp | 8 ++++---- 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 2c8516d6..ecbb13ce 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -25,29 +25,46 @@ void 
Read_CdBG::construct() return; } - dbg_info.add_build_params(params); + std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); + std::cout << "\nEnumerating the edges of the de Bruijn graph.\n"; const std::string edge_db_path = params.output_prefix() + cuttlefish::file_ext::edges_ext; kmer_Enumeration_Stats edge_stats = enumerate_edges(edge_db_path); + std::chrono::high_resolution_clock::time_point t_edges = std::chrono::high_resolution_clock::now(); + std::cout << "Enumerated the edge set of the graph. Time taken = " << std::chrono::duration_cast>(t_edges - t_start).count() << " seconds.\n"; + + std::cout << "\nEnumerating the vertices of the de Bruijn graph.\n"; const std::string vertex_db_path = params.output_prefix() + cuttlefish::file_ext::vertices_ext; kmer_Enumeration_Stats vertex_stats = enumerate_vertices(edge_db_path, vertex_db_path, edge_stats.max_memory()); + std::chrono::high_resolution_clock::time_point t_vertices = std::chrono::high_resolution_clock::now(); + std::cout << "Enumerated the vertex set of the graph. Time taken = " << std::chrono::duration_cast>(t_vertices - t_edges).count() << " seconds.\n"; + std::cout << "Number of edges: " << edge_stats.kmer_count() << ".\n"; std::cout << "Number of vertices: " << vertex_stats.kmer_count() << ".\n"; + std::cout << "\nConstructing the minimal perfect hash function (MPHF) over the vertex set.\n"; construct_hash_table(vertex_db_path, vertex_stats.kmer_count()); + std::chrono::high_resolution_clock::time_point t_mphf = std::chrono::high_resolution_clock::now(); + std::cout << "Constructed the minimal perfect hash function for the vertices. Time taken = " << std::chrono::duration_cast>(t_mphf - t_vertices).count() << " seconds.\n"; + + std::cout << "\nComputing the DFA states.\n"; compute_DFA_states(edge_db_path); if(!params.extract_cycles() && !params.dcc_opt()) hash_table->save(params); + + std::chrono::high_resolution_clock::time_point t_dfa = std::chrono::high_resolution_clock::now(); + std::cout << "Computed the states of the automata. Time taken = " << std::chrono::duration_cast>(t_dfa - t_mphf).count() << " seconds.\n"; + std::cout << "\nExtracting the maximal unitigs.\n"; extract_maximal_unitigs(vertex_db_path); @@ -55,6 +72,9 @@ void Read_CdBG::construct() if(!dbg_info.has_dcc() || dbg_info.dcc_extracted()) hash_table->remove(params); + std::chrono::high_resolution_clock::time_point t_extract = std::chrono::high_resolution_clock::now(); + std::cout << "Extracted the maximal unitigs. Time taken = " << std::chrono::duration_cast>(t_extract - t_dfa).count() << " seconds.\n"; + hash_table->clear(); dbg_info.dump_info(); diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index e7039818..fb19e003 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -17,7 +17,7 @@ Read_CdBG_Constructor::Read_CdBG_Constructor(const Build_Params& params, Kmer template void Read_CdBG_Constructor::compute_DFA_states(const std::string& edge_db_path) { - std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); + // std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); const Kmer_Container edge_container(edge_db_path); // Wrapper container for the edge-database. 
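// (The per-function timer lines here are commented out rather than deleted,
// presumably because the same stage timings are now taken at the orchestration level
// in Read_CdBG::construct, as added earlier in this commit.)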
@@ -58,9 +58,9 @@ void Read_CdBG_Constructor::compute_DFA_states(const std::string& edge_db_pat } - std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); - double elapsed_seconds = std::chrono::duration_cast>(t_end - t_start).count(); - std::cout << "Done computing the DFA states. Time taken = " << elapsed_seconds << " seconds.\n"; + // std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); + // double elapsed_seconds = std::chrono::duration_cast>(t_end - t_start).count(); + // std::cout << "Done computing the DFA states. Time taken = " << elapsed_seconds << " seconds.\n"; } diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 2e8b91dc..feb045d8 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -18,7 +18,7 @@ Read_CdBG_Extractor::Read_CdBG_Extractor(const Build_Params& params, Kmer_Has template void Read_CdBG_Extractor::extract_maximal_unitigs(const std::string& vertex_db_path) { - std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); + // std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); // Construct a thread pool. @@ -59,9 +59,9 @@ void Read_CdBG_Extractor::extract_maximal_unitigs(const std::string& vertex_d " I.e. the cycles are graph components exclusively on their own.\n\n"; - std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); - double elapsed_seconds = std::chrono::duration_cast>(t_end - t_start).count(); - std::cout << "Done extracting the maximal unitigs. Time taken = " << elapsed_seconds << " seconds.\n"; + // std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); + // double elapsed_seconds = std::chrono::duration_cast>(t_end - t_start).count(); + // std::cout << "Done extracting the maximal unitigs. Time taken = " << elapsed_seconds << " seconds.\n"; } From 7ea34505a23cf1e21ffe1129b783529dd46a5a56 Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 14 Sep 2021 11:52:04 -0400 Subject: [PATCH 159/350] Relocate output sink opener and closer --- src/Detached_Cycles_Extractor.cpp | 14 -------------- src/Read_CdBG_Extractor.cpp | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/Detached_Cycles_Extractor.cpp b/src/Detached_Cycles_Extractor.cpp index 3bde7f48..8f7526d2 100644 --- a/src/Detached_Cycles_Extractor.cpp +++ b/src/Detached_Cycles_Extractor.cpp @@ -283,20 +283,6 @@ bool Read_CdBG_Extractor::extract_cycle(const Kmer& v_hat, uint64_t& id, s } -template -void Read_CdBG_Extractor::init_output_sink() -{ - output_sink.init_sink(params.output_file_path()); -} - - -template -void Read_CdBG_Extractor::close_output_sink() -{ - output_sink.close_sink(); -} - - // Template instantiations for the required instances. 
ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Read_CdBG_Extractor) diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index feb045d8..b81af5c5 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -199,6 +199,20 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const } +template +void Read_CdBG_Extractor::init_output_sink() +{ + output_sink.init_sink(params.output_file_path()); +} + + +template +void Read_CdBG_Extractor::close_output_sink() +{ + output_sink.close_sink(); +} + + template const Build_Params& Read_CdBG_Extractor::get_params() const { From 847102d7f6914d5d11cf4d3ad267c779840a0222 Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 14 Sep 2021 11:58:00 -0400 Subject: [PATCH 160/350] Generalize output sink init --- include/Read_CdBG_Extractor.hpp | 4 ++-- src/Detached_Cycles_Extractor.cpp | 2 +- src/Read_CdBG_Extractor.cpp | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 12304271..aa42cc0c 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -125,8 +125,8 @@ class Read_CdBG_Extractor // successful. bool mark_flanking_vertices(const Directed_Vertex& sign_vertex, const Directed_Vertex& cosign_vertex); - // Initializes the output sink. - void init_output_sink(); + // Initializes the output sink, corresponding to the file `output_file_path`. + void init_output_sink(const std::string& output_file_path); // Closes the output sink. void close_output_sink(); diff --git a/src/Detached_Cycles_Extractor.cpp b/src/Detached_Cycles_Extractor.cpp index 8f7526d2..444d7b9d 100644 --- a/src/Detached_Cycles_Extractor.cpp +++ b/src/Detached_Cycles_Extractor.cpp @@ -156,7 +156,7 @@ void Read_CdBG_Extractor::extract_detached_chordless_cycles(const std::string vertex_parser.launch_production(); // Initialize the output sink. - init_output_sink(); + init_output_sink(params.output_file_path()); // Launch (multi-threaded) marking of the vertices present in the maximal unitigs. const uint64_t thread_load_percentile = static_cast(std::round((vertex_count() / 100.0) / params.thread_count())); diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index b81af5c5..c61fe42c 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -34,7 +34,7 @@ void Read_CdBG_Extractor::extract_maximal_unitigs(const std::string& vertex_d // Clear the output file and initialize the output sink. clear_file(params.output_file_path()); - init_output_sink(); + init_output_sink(params.output_file_path()); // Launch (multi-threaded) extraction of the maximal unitigs. 
const uint64_t thread_load_percentile = static_cast(std::round((vertex_count() / 100.0) / params.thread_count())); @@ -200,9 +200,9 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const template -void Read_CdBG_Extractor::init_output_sink() +void Read_CdBG_Extractor::init_output_sink(const std::string& output_file_path) { - output_sink.init_sink(params.output_file_path()); + output_sink.init_sink(output_file_path); } From 2c564aabbb13ddfb3e17c0e6cd7b95a0a16096a7 Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 14 Sep 2021 12:23:44 -0400 Subject: [PATCH 161/350] Generalize output path --- include/Read_CdBG.hpp | 6 +++--- include/Read_CdBG_Extractor.hpp | 18 ++++++++++-------- src/Detached_Cycles_Extractor.cpp | 8 ++++---- src/Read_CdBG.cpp | 10 +++++----- src/Read_CdBG_Extractor.cpp | 6 +++--- 5 files changed, 25 insertions(+), 23 deletions(-) diff --git a/include/Read_CdBG.hpp b/include/Read_CdBG.hpp index ac226cb5..62aeef3e 100644 --- a/include/Read_CdBG.hpp +++ b/include/Read_CdBG.hpp @@ -44,9 +44,9 @@ class Read_CdBG // set present at the path prefix `edge_db_path`. void compute_DFA_states(const std::string& edge_db_path); - // Extracts the maximal unitigs from the graph gaving its vertex set present at the - // path prefix `vertex_db_path`. - void extract_maximal_unitigs(const std::string& vertex_db_path); + // Extracts the maximal unitigs from the graph having its vertex set present at the + // path prefix `vertex_db_path`, into the output file at `output_file_path`. + void extract_maximal_unitigs(const std::string& vertex_db_path, const std::string& output_file_path); public: diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index aa42cc0c..52e67e04 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -96,8 +96,8 @@ class Read_CdBG_Extractor std::size_t mark_maximal_unitig(const Kmer& v_hat, cuttlefish::side_t s_v_hat); // Extracts all the detached chordless cycles present in the graph with its vertex set being - // present at the path prefix `vertex_db_path`. - void extract_detached_chordless_cycles(const std::string& vertex_db_path); + // present at the path prefix `vertex_db_path`, into the output file at `output_file_path`. + void extract_detached_chordless_cycles(const std::string& vertex_db_path, const std::string& output_file_path); // Scans the vertices provided to the thread with id `thread_id` from the parser `vertex_parser` // for potential detached chordless cycles. If a vertex `v` is found to be not marked as present @@ -174,14 +174,16 @@ class Read_CdBG_Extractor // parameters wrapped inside `params`, and uses the Cuttlefish hash table `hash_table`. Read_CdBG_Extractor(const Build_Params& params, Kmer_Hash_Table& hash_table); - // Extracts the maximal unitigs of the de Bruijn graph with the vertex set at path prefix `vertex_db_path`. - void extract_maximal_unitigs(const std::string& vertex_db_path); + // Extracts the maximal unitigs of the de Bruijn graph with the vertex set at path prefix `vertex_db_path`, + // into the output file at `output_file_path`. + void extract_maximal_unitigs(const std::string& vertex_db_path, const std::string& output_file_path); // Extracts the chordless cycles from the de Bruijn graph that are completely disconnected from the - // rest of the graph. The graph is to contain its vertex set at the path prefix `vertex_db_path`. 
- // `dbg_info` is used to determine whether the compacted graph had been constructed earlier — in - // which case some data structures are re-used from the earlier construction. - void extract_detached_cycles(const std::string& vertex_db_path, const dBG_Info& dbg_info); + // rest of the graph. The graph is to contain its vertex set at the path prefix `vertex_db_path`, + // and the cycles are appeneded to the output file at `output_file_path`. `dbg_info` is used to + // determine whether the compacted graph had been constructed earlier—in which case some data + // structures are re-used from the earlier construction. + void extract_detached_cycles(const std::string& vertex_db_path, const std::string& output_file_path, const dBG_Info& dbg_info); // Returns the parameters collection for the compacted graph construction. const Build_Params& get_params() const; diff --git a/src/Detached_Cycles_Extractor.cpp b/src/Detached_Cycles_Extractor.cpp index 444d7b9d..a4343ee5 100644 --- a/src/Detached_Cycles_Extractor.cpp +++ b/src/Detached_Cycles_Extractor.cpp @@ -9,7 +9,7 @@ template -void Read_CdBG_Extractor::extract_detached_cycles(const std::string& vertex_db_path, const dBG_Info& dbg_info) +void Read_CdBG_Extractor::extract_detached_cycles(const std::string& vertex_db_path, const std::string& output_file_path, const dBG_Info& dbg_info) { std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); @@ -22,7 +22,7 @@ void Read_CdBG_Extractor::extract_detached_cycles(const std::string& vertex_d } std::cout << "Extracting the cycles.\n"; - extract_detached_chordless_cycles(vertex_db_path); + extract_detached_chordless_cycles(vertex_db_path, output_file_path); std::cout << "\nNumber of detached chordless cycles: " << unipaths_meta_info_.dcc_count() << ".\n" "Number of vertices in the cycles: " << unipaths_meta_info_.dcc_kmer_count() << ".\n"; @@ -142,7 +142,7 @@ std::size_t Read_CdBG_Extractor::mark_maximal_unitig(const Kmer& v_hat, co template -void Read_CdBG_Extractor::extract_detached_chordless_cycles(const std::string& vertex_db_path) +void Read_CdBG_Extractor::extract_detached_chordless_cycles(const std::string& vertex_db_path, const std::string& output_file_path) { // Construct a thread pool. const uint16_t thread_count = params.thread_count(); @@ -156,7 +156,7 @@ void Read_CdBG_Extractor::extract_detached_chordless_cycles(const std::string vertex_parser.launch_production(); // Initialize the output sink. - init_output_sink(params.output_file_path()); + init_output_sink(output_file_path); // Launch (multi-threaded) marking of the vertices present in the maximal unitigs. 
const uint64_t thread_load_percentile = static_cast(std::round((vertex_count() / 100.0) / params.thread_count())); diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index ecbb13ce..b515e4f2 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -67,7 +67,7 @@ void Read_CdBG::construct() std::cout << "\nExtracting the maximal unitigs.\n"; - extract_maximal_unitigs(vertex_db_path); + extract_maximal_unitigs(vertex_db_path, params.output_file_path()); if(!dbg_info.has_dcc() || dbg_info.dcc_extracted()) hash_table->remove(params); @@ -120,12 +120,12 @@ void Read_CdBG::compute_DFA_states(const std::string& edge_db_path) template -void Read_CdBG::extract_maximal_unitigs(const std::string& vertex_db_path) +void Read_CdBG::extract_maximal_unitigs(const std::string& vertex_db_path, const std::string& output_file_path) { Read_CdBG_Extractor cdBg_extractor(params, *hash_table); if(!is_constructed(params)) { - cdBg_extractor.extract_maximal_unitigs(vertex_db_path); + cdBg_extractor.extract_maximal_unitigs(vertex_db_path, output_file_path); dbg_info.add_unipaths_info(cdBg_extractor); @@ -133,7 +133,7 @@ void Read_CdBG::extract_maximal_unitigs(const std::string& vertex_db_path) { if(params.extract_cycles()) { - cdBg_extractor.extract_detached_cycles(vertex_db_path, dbg_info); + cdBg_extractor.extract_detached_cycles(vertex_db_path, output_file_path, dbg_info); dbg_info.add_DCC_info(cdBg_extractor); } @@ -147,7 +147,7 @@ void Read_CdBG::extract_maximal_unitigs(const std::string& vertex_db_path) { if(!dbg_info.dcc_extracted()) { - cdBg_extractor.extract_detached_cycles(vertex_db_path, dbg_info); + cdBg_extractor.extract_detached_cycles(vertex_db_path, output_file_path, dbg_info); dbg_info.add_DCC_info(cdBg_extractor); } diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index c61fe42c..532a70cd 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -16,7 +16,7 @@ Read_CdBG_Extractor::Read_CdBG_Extractor(const Build_Params& params, Kmer_Has template -void Read_CdBG_Extractor::extract_maximal_unitigs(const std::string& vertex_db_path) +void Read_CdBG_Extractor::extract_maximal_unitigs(const std::string& vertex_db_path, const std::string& output_file_path) { // std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); @@ -33,8 +33,8 @@ void Read_CdBG_Extractor::extract_maximal_unitigs(const std::string& vertex_d vertex_parser.launch_production(); // Clear the output file and initialize the output sink. - clear_file(params.output_file_path()); - init_output_sink(params.output_file_path()); + clear_file(output_file_path); + init_output_sink(output_file_path); // Launch (multi-threaded) extraction of the maximal unitigs. 
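// (`thread_load_percentile` on the next line works out to one percent of a thread's
// expected share of the vertices; it presumably sets the granularity at which each
// thread's progress over the vertex set is tracked.)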
const uint64_t thread_load_percentile = static_cast(std::round((vertex_count() / 100.0) / params.thread_count())); From a6e3777444f5ca6a45d2e0a59d6faf06c7ae33a2 Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 14 Sep 2021 22:18:40 -0400 Subject: [PATCH 162/350] Add corner check for DCC status --- src/Read_CdBG.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index b515e4f2..be619528 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -19,9 +19,12 @@ Read_CdBG::Read_CdBG(const Build_Params& params): template void Read_CdBG::construct() { - if(is_constructed(params) && (!dbg_info.has_dcc() || dbg_info.dcc_extracted())) + if(is_constructed(params) && (!dbg_info.has_dcc() || dbg_info.dcc_extracted() || !params.extract_cycles())) { - std::cout << "\nThe compacted de Bruijn graph has already been completely constructed earlier.\n"; + std::cout << "\nThe compacted de Bruijn graph has already been constructed earlier.\n"; + if(dbg_info.has_dcc() && !dbg_info.dcc_extracted()) + std::cout << "There are Detached Chordless Cycles (DCC) present in the graph; run Cuttlefish with the `cycles` argument to extract those.\n"; + return; } From 155267010a6a721afa1752cb78275ce3d52c971e Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 14 Sep 2021 22:24:33 -0400 Subject: [PATCH 163/350] Better dir info --- include/Build_Params.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index 4a62404f..8f362623 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -73,7 +73,7 @@ class Build_Params strict_memory_(strict_memory), output_file_path_(output_file_path), output_format_(cuttlefish::Output_Format(output_format)), - working_dir_path_(working_dir_path), + working_dir_path_(working_dir_path.back() == '/' ? working_dir_path : working_dir_path + "/"), remove_kmc_db_(remove_kmc_db), mph_file_path_(mph_file_path), buckets_file_path_(buckets_file_path), From 3dc02ce4737a068548e528260228001e606c9470 Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 14 Sep 2021 22:31:08 -0400 Subject: [PATCH 164/350] Update filesystem functionality --- include/utility.hpp | 8 ++++++++ src/utility.cpp | 13 +++++++++++++ 2 files changed, 21 insertions(+) diff --git a/include/utility.hpp b/include/utility.hpp index 1e0fc5bf..1c127171 100644 --- a/include/utility.hpp +++ b/include/utility.hpp @@ -5,6 +5,8 @@ #include +// TODO: wrap everything here in some namespaces. + // Returns a random string of length `len`, using characters from `alphabet`. std::string get_random_string(size_t len, const char* alphabet = "0123456789" @@ -31,6 +33,12 @@ void remove_kmer_set(const std::string& kmc_file_pref); // Clears the content of the file at path `file_path`. void clear_file(const std::string& file_path); +// Returns the name of the file present at the path `file_path`. +const std::string filename(const std::string& file_path); + +// Moves the file present at path `from_path` to the path `to_path`. 
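// (Per the implementation in utility.cpp further below, the move is performed as a
// copy followed by a removal of the source, which, unlike a plain rename, also works
// when the source and the destination reside on different filesystems.)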
+void move_file(const std::string& from_path, const std::string& to_path); + #endif diff --git a/src/utility.cpp b/src/utility.cpp index bcb5ad24..fe481b66 100644 --- a/src/utility.cpp +++ b/src/utility.cpp @@ -90,3 +90,16 @@ void clear_file(const std::string& file_path) file.close(); } + + +const std::string filename(const std::string& file_path) +{ + return ghc::filesystem::path(file_path).filename().string(); +} + + +void move_file(const std::string& from_path, const std::string& to_path) +{ + ghc::filesystem::copy(from_path, to_path); + ghc::filesystem::remove(from_path); +} From ab40356ddb5212cad3ec5c12157716d9446cea05 Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 14 Sep 2021 22:38:40 -0400 Subject: [PATCH 165/350] Reduce fragile code --- include/Build_Params.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index 8f362623..1f01805c 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -28,13 +28,13 @@ class Build_Params const uint16_t thread_count_; // Number of threads to work with. const std::size_t max_memory_; // Soft maximum memory limit. const bool strict_memory_; // Whether strict memory limit restriction is specifiied. - const std::string& output_file_path_; // Path to the output file. + const std::string output_file_path_; // Path to the output file. const cuttlefish::Output_Format output_format_; // Output format (0: txt, 1: GFAv1, 2: GFAv2). - const std::string& working_dir_path_; // Path to the working directory (for temporary files). + const std::string working_dir_path_; // Path to the working directory (for temporary files). const bool remove_kmc_db_; // Option to remove the KMC database, once no longer required. - const std::string& mph_file_path_; // Optional path to file storing an MPH over the k-mer set. - const std::string& buckets_file_path_; // Optional path to file storing the hash table buckets for the k-mer set. - const std::string& json_file_path_; // Optional path to file storing meta-information about the graph and cuttlefish executions. + const std::string mph_file_path_; // Optional path to file storing an MPH over the k-mer set. + const std::string buckets_file_path_; // Optional path to file storing the hash table buckets for the k-mer set. + const std::string json_file_path_; // Optional path to file storing meta-information about the graph and cuttlefish executions. const bool dcc_opt_; // Option to optimize post-cdBG-construction extraction of DCCs (Detached Chordless Cycles). const bool extract_cycles_; // Option to extract detached chordless cycles from the de Bruijn graph after compaction. From 66b793a96d2fdab9cd0af0fdaf2bbf469e6826f2 Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 14 Sep 2021 22:43:46 -0400 Subject: [PATCH 166/350] Output (temp) to working dir --- include/File_Extensions.hpp | 5 +++-- include/Read_CdBG.hpp | 4 ++-- src/Read_CdBG.cpp | 14 ++++++++++---- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/include/File_Extensions.hpp b/include/File_Extensions.hpp index 52a2c06f..1f69f247 100644 --- a/include/File_Extensions.hpp +++ b/include/File_Extensions.hpp @@ -9,12 +9,13 @@ namespace cuttlefish // File extensions for the data structures and files output by the algorithm. 
namespace file_ext { - constexpr char edges_ext[] = ".edges"; - constexpr char vertices_ext[] = ".vertices"; + constexpr char edges_ext[] = ".cf_edges"; + constexpr char vertices_ext[] = ".cf_vertices"; constexpr char hash_ext[] = ".cf_hf"; constexpr char buckets_ext[] = ".cf_hb"; constexpr char unipaths_ext[] = ".fa"; constexpr char json_ext[] = ".json"; + constexpr char temp[] = ".cf_op"; // For reference dBGs only: diff --git a/include/Read_CdBG.hpp b/include/Read_CdBG.hpp index 62aeef3e..a3d15ca0 100644 --- a/include/Read_CdBG.hpp +++ b/include/Read_CdBG.hpp @@ -45,8 +45,8 @@ class Read_CdBG void compute_DFA_states(const std::string& edge_db_path); // Extracts the maximal unitigs from the graph having its vertex set present at the - // path prefix `vertex_db_path`, into the output file at `output_file_path`. - void extract_maximal_unitigs(const std::string& vertex_db_path, const std::string& output_file_path); + // path prefix `vertex_db_path`. + void extract_maximal_unitigs(const std::string& vertex_db_path); public: diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index be619528..2e202d29 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -4,6 +4,7 @@ #include "Read_CdBG_Constructor.hpp" #include "Read_CdBG_Extractor.hpp" #include "File_Extensions.hpp" +#include "utility.hpp" #include "kmc_runner.h" #include @@ -70,7 +71,7 @@ void Read_CdBG::construct() std::cout << "\nExtracting the maximal unitigs.\n"; - extract_maximal_unitigs(vertex_db_path, params.output_file_path()); + extract_maximal_unitigs(vertex_db_path); if(!dbg_info.has_dcc() || dbg_info.dcc_extracted()) hash_table->remove(params); @@ -123,12 +124,15 @@ void Read_CdBG::compute_DFA_states(const std::string& edge_db_path) template -void Read_CdBG::extract_maximal_unitigs(const std::string& vertex_db_path, const std::string& output_file_path) +void Read_CdBG::extract_maximal_unitigs(const std::string& vertex_db_path) { Read_CdBG_Extractor cdBg_extractor(params, *hash_table); + const std::string temp_output_path = params.working_dir_path() + filename(params.output_prefix()) + cuttlefish::file_ext::temp; + const std::string output_file_path = params.output_file_path(); + if(!is_constructed(params)) { - cdBg_extractor.extract_maximal_unitigs(vertex_db_path, output_file_path); + cdBg_extractor.extract_maximal_unitigs(vertex_db_path, temp_output_path); dbg_info.add_unipaths_info(cdBg_extractor); @@ -136,13 +140,15 @@ void Read_CdBG::extract_maximal_unitigs(const std::string& vertex_db_path, co { if(params.extract_cycles()) { - cdBg_extractor.extract_detached_cycles(vertex_db_path, output_file_path, dbg_info); + cdBg_extractor.extract_detached_cycles(vertex_db_path, temp_output_path, dbg_info); dbg_info.add_DCC_info(cdBg_extractor); } else if(params.dcc_opt()) hash_table->save(params); } + + move_file(temp_output_path, output_file_path); } else if(params.extract_cycles()) { From 07346999e32026a28419f56e30324fc387499b9f Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 14 Sep 2021 23:10:12 -0400 Subject: [PATCH 167/350] Remove inner-step time-log --- src/Kmer_Hash_Table.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Kmer_Hash_Table.cpp b/src/Kmer_Hash_Table.cpp index f9725db5..56406a42 100644 --- a/src/Kmer_Hash_Table.cpp +++ b/src/Kmer_Hash_Table.cpp @@ -155,7 +155,7 @@ void Kmer_Hash_Table::remove(const Build_Params& params) const template void Kmer_Hash_Table::construct(const uint16_t thread_count, const std::string& working_dir_path, const std::string& mph_file_path) { - 
std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); + // std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); std::cout << "Total number of k-mers in the set (KMC database): " << kmer_count << ".\n"; @@ -179,9 +179,9 @@ void Kmer_Hash_Table::construct(const uint16_t thread_count, co " Bits per k-mer: " << (total_mem * 8.0) / kmer_count << ".\n"; - std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); - const double elapsed_seconds = std::chrono::duration_cast>(t_end - t_start).count(); - std::cout << "Done allocating the hash table. Time taken = " << elapsed_seconds << " seconds.\n"; + // std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); + // const double elapsed_seconds = std::chrono::duration_cast>(t_end - t_start).count(); + // std::cout << "Done allocating the hash table. Time taken = " << elapsed_seconds << " seconds.\n"; } From 0feb2834dec264c38e6ae1f0ec54cdce0882d545 Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 14 Sep 2021 23:28:57 -0400 Subject: [PATCH 168/350] Better file removal --- include/Kmer_Container.hpp | 2 ++ include/utility.hpp | 5 +++-- src/CdBG.cpp | 2 +- src/Kmer_Container.cpp | 14 ++++++++++++++ src/utility.cpp | 11 ++--------- 5 files changed, 22 insertions(+), 12 deletions(-) diff --git a/include/Kmer_Container.hpp b/include/Kmer_Container.hpp index be215842..f6309025 100644 --- a/include/Kmer_Container.hpp +++ b/include/Kmer_Container.hpp @@ -46,6 +46,8 @@ class Kmer_Container // Returns the number of k-mers present in the k-mer database with path prefix `kmc_db_path`. static uint64_t size(const std::string& kmc_db_path); + // Removes the KMC database at path `kmc_db_prefix` from disk. + static void remove(const std::string& kmc_db_prefix); // Returns an iterator pointing to the beginning of the underlying k-mer // database. diff --git a/include/utility.hpp b/include/utility.hpp index 1c127171..26d2e224 100644 --- a/include/utility.hpp +++ b/include/utility.hpp @@ -27,8 +27,9 @@ bool file_prefix_exists(const std::string& path, const std::string& prefix); // Returns a string that is a copy of `s` but has all the whitespaces removed. std::string remove_whitespaces(const char* s); -// Removes the k-mer set (KMC database) with the path prefix `kmc_file_pref`. -void remove_kmer_set(const std::string& kmc_file_pref); +// Removes the file at path `file_path` from disk. Returns `true` iff the +// removal is successful. +bool remove_file(const std::string& file_path); // Clears the content of the file at path `file_path`. 
void clear_file(const std::string& file_path); diff --git a/src/CdBG.cpp b/src/CdBG.cpp index e34670f7..71e18e7f 100644 --- a/src/CdBG.cpp +++ b/src/CdBG.cpp @@ -20,7 +20,7 @@ void CdBG::construct() if(params.remove_kmc_db()) { - remove_kmer_set(params.vertex_db_path()); + Kmer_Container::remove(params.vertex_db_path()); std::cout << "\nRemoved the KMC database from disk.\n"; } diff --git a/src/Kmer_Container.cpp b/src/Kmer_Container.cpp index e5b82eec..35e5422f 100644 --- a/src/Kmer_Container.cpp +++ b/src/Kmer_Container.cpp @@ -60,6 +60,20 @@ uint64_t Kmer_Container::size(const std::string& kmc_db_path) } +template +void Kmer_Container::remove(const std::string& kmc_db_path) +{ + const std::string kmc_pref_file(kmc_db_path + ".kmc_pre"); + const std::string kmc_suff_file(kmc_db_path + ".kmc_suf"); + + if(!remove_file(kmc_pref_file) || !remove_file(kmc_suff_file)) + { + std::cerr << "Error removing the KMC database file from path prefix " << kmc_db_path << ". Aborting.\n"; + std::exit(EXIT_FAILURE); + } +} + + // template // typename Kmer_Container::iterator Kmer_Container::end() const // { diff --git a/src/utility.cpp b/src/utility.cpp index fe481b66..fb65ccc5 100644 --- a/src/utility.cpp +++ b/src/utility.cpp @@ -66,16 +66,9 @@ std::string remove_whitespaces(const char* s) } -void remove_kmer_set(const std::string& kmc_file_pref) +bool remove_file(const std::string& file_path) { - const std::string kmc_file1_path(kmc_file_pref + ".kmc_pre"); - const std::string kmc_file2_path(kmc_file_pref + ".kmc_suf"); - - if(std::remove(kmc_file1_path.c_str()) || std::remove(kmc_file2_path.c_str())) - { - std::cerr << "Error removing the KMC database file from path prefix " << kmc_file_pref << ". Aborting.\n"; - std::exit(EXIT_FAILURE); - } + return ghc::filesystem::remove(file_path); } From 0605dcb59f75b360da3a98d109aee13286fa4bd3 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 15 Sep 2021 00:43:35 -0400 Subject: [PATCH 169/350] Better const methods --- include/Read_CdBG.hpp | 4 ++-- src/Read_CdBG.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/Read_CdBG.hpp b/include/Read_CdBG.hpp index a3d15ca0..6d673252 100644 --- a/include/Read_CdBG.hpp +++ b/include/Read_CdBG.hpp @@ -29,12 +29,12 @@ class Read_CdBG // Enumerates the edges of the de Bruijn graph in a database at path `edge_db_path`, // and returns summary statistics of the enumearation. - kmer_Enumeration_Stats enumerate_edges(const std::string& edge_db_path); + kmer_Enumeration_Stats enumerate_edges(const std::string& edge_db_path) const; // Enumerates the vertices of the de Bruijn graph in a database at path `vertex_db_path`, // from the edge database present at `edge_db_path`, using at most `max_memory` amount of // memory. Returns summary statistics of the enumeration. - kmer_Enumeration_Stats enumerate_vertices(const std::string& edge_db_path, const std::string& vertex_db_path, std::size_t max_memory); + kmer_Enumeration_Stats enumerate_vertices(const std::string& edge_db_path, const std::string& vertex_db_path, std::size_t max_memory) const; // Constructs the Cuttlefish hash table for the `vertex_count` vertices in the database // at path `vertex_db_path`. 
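A side note on the two filesystem helpers introduced above (PATCH 164's `move_file` and PATCH 168's `remove_file` / `Kmer_Container::remove`): a KMC database at prefix P is the file pair P.kmc_pre / P.kmc_suf, so both files have to be deleted, and an output file is moved by copy-then-delete, which keeps working when the source and the destination sit on different filesystems. A minimal self-contained sketch of the same pattern follows; it uses std::filesystem instead of the bundled ghc::filesystem that the patches call, and the helper name `remove_kmc_db` as well as the path in main() are made up for illustration.

#include <filesystem>
#include <iostream>
#include <string>

namespace fs = std::filesystem;

// Removes the two component files (.kmc_pre / .kmc_suf) of a KMC database at
// path prefix `db_prefix`; returns true iff both files were present and removed.
bool remove_kmc_db(const std::string& db_prefix)
{
    const bool pre_gone = fs::remove(db_prefix + ".kmc_pre");
    const bool suf_gone = fs::remove(db_prefix + ".kmc_suf");
    return pre_gone && suf_gone;
}

// Moves a file by copy-then-delete, mirroring the `move_file` definition shown
// in src/utility.cpp above; unlike a plain rename, this also works across filesystems.
void move_file_sketch(const std::string& from_path, const std::string& to_path)
{
    fs::copy(from_path, to_path);
    fs::remove(from_path);
}

int main()
{
    if(!remove_kmc_db("/tmp/cf_test_db"))   // hypothetical path, for illustration only
        std::cerr << "KMC database (or part of it) was absent.\n";

    return 0;
}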
diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 2e202d29..e52814de 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -86,7 +86,7 @@ void Read_CdBG::construct() template -kmer_Enumeration_Stats Read_CdBG::enumerate_edges(const std::string& edge_db_path) +kmer_Enumeration_Stats Read_CdBG::enumerate_edges(const std::string& edge_db_path) const { return kmer_Enumerator().enumerate( KMC::InputFileType::FASTQ, params.sequence_input().seqs(), params.cutoff(), @@ -96,7 +96,7 @@ kmer_Enumeration_Stats Read_CdBG::enumerate_edges(const std::string& edge_db_ template -kmer_Enumeration_Stats Read_CdBG::enumerate_vertices(const std::string& edge_db_path, const std::string& vertex_db_path, const std::size_t max_memory) +kmer_Enumeration_Stats Read_CdBG::enumerate_vertices(const std::string& edge_db_path, const std::string& vertex_db_path, const std::size_t max_memory) const { return kmer_Enumerator().enumerate( KMC::InputFileType::KMC, std::vector(1, edge_db_path), 1, From 34af052b0e13bccb90b494d94ee4919ffe3c9743 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 15 Sep 2021 18:32:35 -0400 Subject: [PATCH 170/350] Add hash table loader interface --- include/Kmer_Hash_Table.hpp | 4 ++++ src/Kmer_Hash_Table.cpp | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index 248e940d..435e67da 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -126,6 +126,10 @@ class Kmer_Hash_Table // paths determined from the parameters collection `params`. void save(const Build_Params& params) const; + // Loads the hash table from disk files, determined from the parameters + // collection `params`. + void load(const Build_Params& params); + // Removes the hash table files (if exists) from disk, with the file paths // being determined from the parameters collection `params`. void remove(const Build_Params& params) const; diff --git a/src/Kmer_Hash_Table.cpp b/src/Kmer_Hash_Table.cpp index 56406a42..ca94382d 100644 --- a/src/Kmer_Hash_Table.cpp +++ b/src/Kmer_Hash_Table.cpp @@ -137,6 +137,14 @@ void Kmer_Hash_Table::save(const Build_Params& params) const } +template +void Kmer_Hash_Table::load(const Build_Params& params) +{ + load_mph_function(params.mph_file_path()); + load_hash_buckets(params.buckets_file_path()); +} + + template void Kmer_Hash_Table::remove(const Build_Params& params) const { From e140821ad3370f3eb77ee549dd717e12ff3425b2 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 15 Sep 2021 18:33:29 -0400 Subject: [PATCH 171/350] Add kmc-db existence checker --- include/Kmer_Container.hpp | 3 +++ src/Kmer_Container.cpp | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/include/Kmer_Container.hpp b/include/Kmer_Container.hpp index f6309025..ae44cc60 100644 --- a/include/Kmer_Container.hpp +++ b/include/Kmer_Container.hpp @@ -46,6 +46,9 @@ class Kmer_Container // Returns the number of k-mers present in the k-mer database with path prefix `kmc_db_path`. static uint64_t size(const std::string& kmc_db_path); + // Returns `true` iff a KMC database is present at the path prefix `kmc_db_prefix`. + static bool exists(const std::string& kmc_db_prefix); + // Removes the KMC database at path `kmc_db_prefix` from disk. 
static void remove(const std::string& kmc_db_prefix); diff --git a/src/Kmer_Container.cpp b/src/Kmer_Container.cpp index 35e5422f..796e2aa7 100644 --- a/src/Kmer_Container.cpp +++ b/src/Kmer_Container.cpp @@ -60,6 +60,16 @@ uint64_t Kmer_Container::size(const std::string& kmc_db_path) } +template +bool Kmer_Container::exists(const std::string& kmc_db_path) +{ + const std::string kmc_pref_file(kmc_db_path + ".kmc_pre"); + const std::string kmc_suff_file(kmc_db_path + ".kmc_suf"); + + return file_exists(kmc_pref_file) && file_exists(kmc_suff_file); +} + + template void Kmer_Container::remove(const std::string& kmc_db_path) { From 165eee9427a0c9d03bd91abc8013103108c140e1 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 15 Sep 2021 18:34:33 -0400 Subject: [PATCH 172/350] Match file extension standard --- include/File_Extensions.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/File_Extensions.hpp b/include/File_Extensions.hpp index 1f69f247..9d164ac7 100644 --- a/include/File_Extensions.hpp +++ b/include/File_Extensions.hpp @@ -9,8 +9,8 @@ namespace cuttlefish // File extensions for the data structures and files output by the algorithm. namespace file_ext { - constexpr char edges_ext[] = ".cf_edges"; - constexpr char vertices_ext[] = ".cf_vertices"; + constexpr char edges_ext[] = ".cf_E"; + constexpr char vertices_ext[] = ".cf_V"; constexpr char hash_ext[] = ".cf_hf"; constexpr char buckets_ext[] = ".cf_hb"; constexpr char unipaths_ext[] = ".fa"; From 4248a46abf17f6131c585479e090ae47a68d25df Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 15 Sep 2021 19:32:56 -0400 Subject: [PATCH 173/350] Better interfacing with DCC extractor --- include/Read_CdBG.hpp | 60 +++++++---- src/Detached_Cycles_Extractor.cpp | 2 +- src/Read_CdBG.cpp | 164 ++++++++++++++++++++---------- 3 files changed, 148 insertions(+), 78 deletions(-) diff --git a/include/Read_CdBG.hpp b/include/Read_CdBG.hpp index 6d673252..01aa2103 100644 --- a/include/Read_CdBG.hpp +++ b/include/Read_CdBG.hpp @@ -27,26 +27,49 @@ class Read_CdBG dBG_Info dbg_info; // Wrapper object for structural information of the graph. - // Enumerates the edges of the de Bruijn graph in a database at path `edge_db_path`, - // and returns summary statistics of the enumearation. - kmer_Enumeration_Stats enumerate_edges(const std::string& edge_db_path) const; + // Enumerates the edges of the de Bruijn graph and returns summary statistics of the + // enumearation. + kmer_Enumeration_Stats enumerate_edges() const; - // Enumerates the vertices of the de Bruijn graph in a database at path `vertex_db_path`, - // from the edge database present at `edge_db_path`, using at most `max_memory` amount of - // memory. Returns summary statistics of the enumeration. - kmer_Enumeration_Stats enumerate_vertices(const std::string& edge_db_path, const std::string& vertex_db_path, std::size_t max_memory) const; + // Enumerates the vertices of the de Bruijn graph using at most `max_memory` amount of + // memory, and returns summary statistics of the enumeration. + kmer_Enumeration_Stats enumerate_vertices(std::size_t max_memory) const; - // Constructs the Cuttlefish hash table for the `vertex_count` vertices in the database - // at path `vertex_db_path`. - void construct_hash_table(const std::string& vertex_db_path, uint64_t vertex_count); + // Constructs the Cuttlefish hash table for the `vertex_count` vertices of the graph. + // If `load` is specified, then it is loaded from disk. 
+ void construct_hash_table(uint64_t vertex_count, bool load = false); - // Computes the states of the automata, i.e. the vertices of the graph having it edge - // set present at the path prefix `edge_db_path`. - void compute_DFA_states(const std::string& edge_db_path); + // Computes the states of the automata, i.e. the vertices of the graph. + void compute_DFA_states(); - // Extracts the maximal unitigs from the graph having its vertex set present at the - // path prefix `vertex_db_path`. - void extract_maximal_unitigs(const std::string& vertex_db_path); + // Extracts the maximal unitigs from the graph. + void extract_maximal_unitigs(); + + // Extracts the detached chordless cycles of the graph and appends the output to the + // output file at path `output_file_path`. Specifying `rerun` implies that the graph + // has been compacted earlier in a separate run of Cuttlefish; otherwise it's done + // in this same run. Returns `true` iff either there is no DCC in the graph, or the + // DCCs have already been extracted earlier. + bool extract_DCCs(const std::string& output_file_path, bool rerun = false); + + // Returns the path prefix to the edge database being used by Cuttlefish. + const std::string edge_db_path() const; + + // Returns the path prefix to the vertex database being used by Cuttlefish. + const std::string vertex_db_path() const; + + // Returns `true` iff the compacted de Bruijn graph to be built from the parameters + // collection `params` had been constructed in an earlier execution. + // NB: only the existence of the output meta-info file is checked for this purpose. + bool is_constructed() const; + + // Returns `true` iff the graph contains detached chordless cycles and the current + // execution is configured to extract those in this same run. + bool extract_DCCs_this_run() const; + + // Returns `true` iff the data structures required for DCC-extraction is present + // from an earlier execution of the algorithm. + bool DCC_data_structs_exist() const; public: @@ -58,11 +81,6 @@ class Read_CdBG // Constructs the compacted read de Bruijn graph, employing the parameters received // with the object-constructor. void construct(); - - // Returns `true` iff the compacted de Bruijn graph to be built from the parameters - // collection `params` had been constructed in an earlier execution. - // NB: only the existence of the output meta-info file is checked for this purpose. 
- static bool is_constructed(const Build_Params& params); }; diff --git a/src/Detached_Cycles_Extractor.cpp b/src/Detached_Cycles_Extractor.cpp index a4343ee5..bf18c87c 100644 --- a/src/Detached_Cycles_Extractor.cpp +++ b/src/Detached_Cycles_Extractor.cpp @@ -14,7 +14,7 @@ void Read_CdBG_Extractor::extract_detached_cycles(const std::string& vertex_d std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); - if(Read_CdBG::is_constructed(params) && !dbg_info.dcc_opt_performed()) + if(!dbg_info.dcc_opt_performed()) { std::cout << "Marking the vertices present in the extracted maximal unitigs.\n"; mark_maximal_unitig_vertices(vertex_db_path); diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index e52814de..0c323147 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -13,6 +13,7 @@ template Read_CdBG::Read_CdBG(const Build_Params& params): params(params), + hash_table(nullptr), dbg_info(params.json_file_path()) {} @@ -20,31 +21,28 @@ Read_CdBG::Read_CdBG(const Build_Params& params): template void Read_CdBG::construct() { - if(is_constructed(params) && (!dbg_info.has_dcc() || dbg_info.dcc_extracted() || !params.extract_cycles())) + if(is_constructed()) { - std::cout << "\nThe compacted de Bruijn graph has already been constructed earlier.\n"; - if(dbg_info.has_dcc() && !dbg_info.dcc_extracted()) - std::cout << "There are Detached Chordless Cycles (DCC) present in the graph; run Cuttlefish with the `cycles` argument to extract those.\n"; - + std::cout << "\nThe compacted de Bruijn graph has been constructed earlier.\n"; + extract_DCCs(params.output_file_path(), true); return; } + dbg_info.add_build_params(params); std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); std::cout << "\nEnumerating the edges of the de Bruijn graph.\n"; - const std::string edge_db_path = params.output_prefix() + cuttlefish::file_ext::edges_ext; - kmer_Enumeration_Stats edge_stats = enumerate_edges(edge_db_path); + kmer_Enumeration_Stats edge_stats = enumerate_edges(); std::chrono::high_resolution_clock::time_point t_edges = std::chrono::high_resolution_clock::now(); std::cout << "Enumerated the edge set of the graph. Time taken = " << std::chrono::duration_cast>(t_edges - t_start).count() << " seconds.\n"; std::cout << "\nEnumerating the vertices of the de Bruijn graph.\n"; - const std::string vertex_db_path = params.output_prefix() + cuttlefish::file_ext::vertices_ext; - kmer_Enumeration_Stats vertex_stats = enumerate_vertices(edge_db_path, vertex_db_path, edge_stats.max_memory()); + kmer_Enumeration_Stats vertex_stats = enumerate_vertices(edge_stats.max_memory()); std::chrono::high_resolution_clock::time_point t_vertices = std::chrono::high_resolution_clock::now(); std::cout << "Enumerated the vertex set of the graph. Time taken = " << std::chrono::duration_cast>(t_vertices - t_edges).count() << " seconds.\n"; @@ -54,15 +52,16 @@ void Read_CdBG::construct() std::cout << "\nConstructing the minimal perfect hash function (MPHF) over the vertex set.\n"; - construct_hash_table(vertex_db_path, vertex_stats.kmer_count()); + construct_hash_table(vertex_stats.kmer_count()); std::chrono::high_resolution_clock::time_point t_mphf = std::chrono::high_resolution_clock::now(); std::cout << "Constructed the minimal perfect hash function for the vertices. 
Time taken = " << std::chrono::duration_cast>(t_mphf - t_vertices).count() << " seconds.\n"; std::cout << "\nComputing the DFA states.\n"; - compute_DFA_states(edge_db_path); + compute_DFA_states(); + Kmer_Container::remove(edge_db_path()); if(!params.extract_cycles() && !params.dcc_opt()) hash_table->save(params); @@ -71,10 +70,13 @@ void Read_CdBG::construct() std::cout << "\nExtracting the maximal unitigs.\n"; - extract_maximal_unitigs(vertex_db_path); + extract_maximal_unitigs(); - if(!dbg_info.has_dcc() || dbg_info.dcc_extracted()) + if(!dbg_info.has_dcc() || dbg_info.dcc_extracted()) // Either there are no DCCs, or the DCCs have already been extracted in this run. + { + Kmer_Container::remove(vertex_db_path()); hash_table->remove(params); + } std::chrono::high_resolution_clock::time_point t_extract = std::chrono::high_resolution_clock::now(); std::cout << "Extracted the maximal unitigs. Time taken = " << std::chrono::duration_cast>(t_extract - t_dfa).count() << " seconds.\n"; @@ -86,98 +88,148 @@ void Read_CdBG::construct() template -kmer_Enumeration_Stats Read_CdBG::enumerate_edges(const std::string& edge_db_path) const +kmer_Enumeration_Stats Read_CdBG::enumerate_edges() const { return kmer_Enumerator().enumerate( KMC::InputFileType::FASTQ, params.sequence_input().seqs(), params.cutoff(), params.thread_count(), params.max_memory(), params.strict_memory(), true, - params.working_dir_path(), edge_db_path); + params.working_dir_path(), edge_db_path()); } template -kmer_Enumeration_Stats Read_CdBG::enumerate_vertices(const std::string& edge_db_path, const std::string& vertex_db_path, const std::size_t max_memory) const +kmer_Enumeration_Stats Read_CdBG::enumerate_vertices(const std::size_t max_memory) const { return kmer_Enumerator().enumerate( - KMC::InputFileType::KMC, std::vector(1, edge_db_path), 1, + KMC::InputFileType::KMC, std::vector(1, edge_db_path()), 1, params.thread_count(), max_memory, params.strict_memory(), false, - params.working_dir_path(), vertex_db_path); + params.working_dir_path(), vertex_db_path()); } template -void Read_CdBG::construct_hash_table(const std::string& vertex_db_path, const uint64_t vertex_count) +void Read_CdBG::construct_hash_table(const uint64_t vertex_count, const bool load) { - hash_table = std::make_unique>(vertex_db_path, vertex_count); - hash_table->construct(params.thread_count(), params.working_dir_path(), params.mph_file_path()); + hash_table = std::make_unique>(vertex_db_path(), vertex_count); + load ? 
hash_table->load(params) : + hash_table->construct(params.thread_count(), params.working_dir_path(), params.mph_file_path()); } template -void Read_CdBG::compute_DFA_states(const std::string& edge_db_path) +void Read_CdBG::compute_DFA_states() { Read_CdBG_Constructor cdBg_constructor(params, *hash_table); - cdBg_constructor.compute_DFA_states(edge_db_path); + cdBg_constructor.compute_DFA_states(edge_db_path()); dbg_info.add_basic_info(cdBg_constructor); } template -void Read_CdBG::extract_maximal_unitigs(const std::string& vertex_db_path) +void Read_CdBG::extract_maximal_unitigs() { Read_CdBG_Extractor cdBg_extractor(params, *hash_table); const std::string temp_output_path = params.working_dir_path() + filename(params.output_prefix()) + cuttlefish::file_ext::temp; const std::string output_file_path = params.output_file_path(); - if(!is_constructed(params)) - { - cdBg_extractor.extract_maximal_unitigs(vertex_db_path, temp_output_path); - - dbg_info.add_unipaths_info(cdBg_extractor); + cdBg_extractor.extract_maximal_unitigs(vertex_db_path(), temp_output_path); + dbg_info.add_unipaths_info(cdBg_extractor); + + if(!extract_DCCs(temp_output_path) && params.dcc_opt()) + hash_table->save(params); + + move_file(temp_output_path, output_file_path); +} + + +template +bool Read_CdBG::extract_DCCs(const std::string& output_file_path, const bool rerun) +{ + if(!extract_DCCs_this_run()) + return !dbg_info.has_dcc(); - if(cdBg_extractor.has_dcc()) + if(rerun) + { + if(!DCC_data_structs_exist()) { - if(params.extract_cycles()) - { - cdBg_extractor.extract_detached_cycles(vertex_db_path, temp_output_path, dbg_info); - - dbg_info.add_DCC_info(cdBg_extractor); - } - else if(params.dcc_opt()) - hash_table->save(params); + std::cout << "The data structure(s) required for the cycles extraction have been removed.\n" + "Please re-run Cuttlefish with the originial parameters to recover those.\n"; + return false; } - move_file(temp_output_path, output_file_path); + construct_hash_table(Kmer_Container::size(vertex_db_path()), true); } - else if(params.extract_cycles()) + + + Read_CdBG_Extractor cdBg_extractor(params, *hash_table); + cdBg_extractor.extract_detached_cycles(vertex_db_path(), output_file_path, dbg_info); + + dbg_info.add_DCC_info(cdBg_extractor); + + return true; +} + + +template +bool Read_CdBG::extract_DCCs_this_run() const +{ + if(!dbg_info.has_dcc()) { - if(dbg_info.has_dcc()) - { - if(!dbg_info.dcc_extracted()) - { - cdBg_extractor.extract_detached_cycles(vertex_db_path, output_file_path, dbg_info); - - dbg_info.add_DCC_info(cdBg_extractor); - } - else - std::cout << "\nThe DCCs (Detached Chordless Cycles) have already been extracted earlier.\n"; - } - else - std::cout << "\nThe de Bruijn graph has no DCCs (Detached Chordless Cycles).\n"; + std::cout << "The graph does not contain any detached chordless cycles.\n"; + return false; + } + + if(dbg_info.dcc_extracted()) + { + std::cout << "The detached chordless cycles have been extracted earlier.\n"; + return false; + } + + if(!params.extract_cycles()) + { + std::cout << "There are Detached Chordless Cycles (DCC) present in the graph.\n" + "Run Cuttlefish with the `cycles` argument to extract those.\n"; + return false; } - else - std::cout << "\nNothing to do.\n"; + + + return true; +} + + +template +bool Read_CdBG::DCC_data_structs_exist() const +{ + const std::string vertex_db_path = params.output_prefix() + cuttlefish::file_ext::vertices_ext; + const std::string mph_path = params.mph_file_path(); + const std::string buckets_path = 
params.buckets_file_path(); + + return Kmer_Container::exists(vertex_db_path) && file_exists(mph_path) && file_exists(buckets_path); } template -bool Read_CdBG::is_constructed(const Build_Params& params) +bool Read_CdBG::is_constructed() const { return file_exists(params.json_file_path()); } +template +const std::string Read_CdBG::edge_db_path() const +{ + return params.output_prefix() + cuttlefish::file_ext::edges_ext; +} + + +template +const std::string Read_CdBG::vertex_db_path() const +{ + return params.output_prefix() + cuttlefish::file_ext::vertices_ext; +} + + // Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Read_CdBG) From 2f8b7f643adad33117907aeca5c6b2eac7d73fd7 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 15 Sep 2021 22:01:18 -0400 Subject: [PATCH 174/350] Fix sneaky bug fragile code --- include/Kmer_Hash_Table.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index 435e67da..3eb0afd6 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -29,7 +29,7 @@ class Kmer_Hash_Table constexpr static double GAMMA_FACTOR = 2.0; // Path to the underlying k-mer database, over which the hash table is constructed. - const std::string& kmc_db_path; + const std::string kmc_db_path; // Number of keys (`Kmer`s) in the hash table. const uint64_t kmer_count; From 15ec87c9f4f5a04022f72247377c7a840905e34e Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 15 Sep 2021 22:12:41 -0400 Subject: [PATCH 175/350] Fix subtle bug was losing execution information --- include/Read_CdBG.hpp | 4 ++++ src/Read_CdBG.cpp | 14 ++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/include/Read_CdBG.hpp b/include/Read_CdBG.hpp index 01aa2103..9705402b 100644 --- a/include/Read_CdBG.hpp +++ b/include/Read_CdBG.hpp @@ -78,6 +78,10 @@ class Read_CdBG // the compacted representation of the underlying read de Bruijn graph wrapped in `params`. Read_CdBG(const Build_Params& params); + // Destructs the compacted graph builder object, freeing its hash table and dumping the + // graph information to disk. + ~Read_CdBG(); + // Constructs the compacted read de Bruijn graph, employing the parameters received // with the object-constructor. void construct(); diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 0c323147..f883f522 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -18,6 +18,16 @@ Read_CdBG::Read_CdBG(const Build_Params& params): {} +template +Read_CdBG::~Read_CdBG() +{ + if(hash_table != nullptr) + hash_table->clear(); + + dbg_info.dump_info(); +} + + template void Read_CdBG::construct() { @@ -80,10 +90,6 @@ void Read_CdBG::construct() std::chrono::high_resolution_clock::time_point t_extract = std::chrono::high_resolution_clock::now(); std::cout << "Extracted the maximal unitigs. 
Time taken = " << std::chrono::duration_cast>(t_extract - t_dfa).count() << " seconds.\n"; - - - hash_table->clear(); - dbg_info.dump_info(); } From 88d933295a21ce2a9c7fa65285e202b4e495d8cd Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 16 Sep 2021 23:31:13 -0400 Subject: [PATCH 176/350] Fix missing output --- include/Async_Logger_Wrapper.hpp | 3 +++ include/Output_Sink.hpp | 2 +- src/Async_Logger_Wrapper.cpp | 12 ++++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/include/Async_Logger_Wrapper.hpp b/include/Async_Logger_Wrapper.hpp index f2404027..47da8ace 100644 --- a/include/Async_Logger_Wrapper.hpp +++ b/include/Async_Logger_Wrapper.hpp @@ -41,6 +41,9 @@ class Async_Logger_Wrapper // Log the passed null-terminated message `str`. void write(const char* str) const; + + // Posts flush request for the logger, and closes it. + void close_logger(); }; diff --git a/include/Output_Sink.hpp b/include/Output_Sink.hpp index f7aa257c..376804f8 100644 --- a/include/Output_Sink.hpp +++ b/include/Output_Sink.hpp @@ -65,7 +65,7 @@ class Output_Sink void close_sink() { - spdlog::drop_all(); + output_.close_logger(); } }; diff --git a/src/Async_Logger_Wrapper.cpp b/src/Async_Logger_Wrapper.cpp index b94b2dea..8b186fde 100644 --- a/src/Async_Logger_Wrapper.cpp +++ b/src/Async_Logger_Wrapper.cpp @@ -19,3 +19,15 @@ void Async_Logger_Wrapper::init_logger(const std::string& output_file_path) // Set the log message pattern. logger->set_pattern("%v"); } + + +void Async_Logger_Wrapper::close_logger() +{ + // Note: For `spdlog`, `logger->flush()` posts a message to the queue requesting the flush operation, + // so the function returns immediately. Hence a forceful eviction is necessary by dropping the `spdlog` + // thread pools for the output to force-flush the pending logs. `spdlog::shutdown()` force-flushes + // messages from the global pool only. + + logger->flush(); + tp.reset(); +} From b565cef912cb27e4f73a4d2a2104ed56e2ad4d6e Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 16 Sep 2021 23:59:44 -0400 Subject: [PATCH 177/350] Fix race bug --- src/Detached_Cycles_Extractor.cpp | 10 +++++----- src/Unipaths_Meta_info.cpp | 4 ++++ 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/Detached_Cycles_Extractor.cpp b/src/Detached_Cycles_Extractor.cpp index bf18c87c..50906976 100644 --- a/src/Detached_Cycles_Extractor.cpp +++ b/src/Detached_Cycles_Extractor.cpp @@ -185,8 +185,7 @@ void Read_CdBG_Extractor::extract_detached_chordless_cycles(Kmer_SPMC_Iterato std::size_t pivot; // Index of the lexicographically lowest (canonical) k-mer in the cycle. uint64_t vertex_count = 0; // Number of vertices scanned by this thread. - uint64_t cycles_extracted = 0; // Number of detached chordless cycles extracted by this thread. - uint64_t cycle_vertices = 0; // Number of vertices found to be in detached chordless cycles by this thread. + Unipaths_Meta_info extracted_dcc_info; // Meta-information over the DCCs extracted by this thread. uint64_t progress = 0; // Number of vertices scanned by the thread; is reset at reaching 1% of its approximate workload. Character_Buffer output_buffer(output_sink.sink()); // The output buffer for the cycles. 
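The race addressed in this patch comes from every extractor thread updating the shared `unipaths_meta_info_` directly while scanning; the first hunk above gives each thread a private `Unipaths_Meta_info` (`extracted_dcc_info`), and the next hunk merges it into the shared object inside the already-existing locked section. Below is a self-contained sketch of that accumulate-locally-then-aggregate pattern, with a made-up `Meta_Info` type standing in for `Unipaths_Meta_info` and a std::mutex standing in for the extractor's own lock.

#include <cstdint>
#include <mutex>
#include <thread>
#include <vector>

// A stand-in for `Unipaths_Meta_info`, for illustration only.
struct Meta_Info
{
    uint64_t count = 0;
    uint64_t sum_len = 0;

    void add(const uint64_t len) { count++; sum_len += len; }
    void aggregate(const Meta_Info& other) { count += other.count; sum_len += other.sum_len; }
};

int main()
{
    Meta_Info global_info;      // shared across threads
    std::mutex lock;

    std::vector<std::thread> workers;
    for(int t = 0; t < 4; ++t)
        workers.emplace_back(
            [&global_info, &lock, t]()
            {
                Meta_Info local_info;               // thread-private accumulator
                for(int i = 0; i < 1000; ++i)
                    local_info.add(t + i);          // no locking on the hot path

                std::lock_guard<std::mutex> guard(lock);
                global_info.aggregate(local_info);  // one short critical section per thread
            });

    for(auto& w : workers)
        w.join();

    return 0;
}

Keeping the critical section down to a single aggregate() call per thread, rather than one shared update per extracted cycle, is what removes both the data race and the contention.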
@@ -200,13 +199,13 @@ void Read_CdBG_Extractor::extract_detached_chordless_cycles(Kmer_SPMC_Iterato if(!state.is_outputted()) if(extract_cycle(v, id, cycle, pivot)) { - cycles_extracted++; - cycle_vertices += cycle.size() - (k - 1); - unipaths_meta_info_.add_DCC(cycle); + extracted_dcc_info.add_DCC(cycle); // cycle.emplace_back('\n'); // output_buffer += FASTA_Record>(id, cycle); output_buffer.rotate_append(FASTA_Record>(id, cycle), pivot); + + // TODO: mark the path vertices, or not? } vertex_count++; @@ -220,6 +219,7 @@ void Read_CdBG_Extractor::extract_detached_chordless_cycles(Kmer_SPMC_Iterato lock.lock(); vertices_scanned += vertex_count; + unipaths_meta_info_.aggregate(extracted_dcc_info); lock.unlock(); } diff --git a/src/Unipaths_Meta_info.cpp b/src/Unipaths_Meta_info.cpp index e323c2ce..6c430da3 100644 --- a/src/Unipaths_Meta_info.cpp +++ b/src/Unipaths_Meta_info.cpp @@ -30,6 +30,10 @@ void Unipaths_Meta_info::aggregate(const Unipaths_Meta_info& other) max_len_ = std::max(max_len_, other.max_len_); min_len_ = std::min(min_len_, other.min_len_); sum_len_ += other.sum_len_; + + dcc_count_ += other.dcc_count_; + dcc_kmer_count_ += other.dcc_kmer_count_; + dcc_sum_len_ += other.dcc_sum_len_; } From 3a9b50a926af0955484b43bc3850c21c15b25420 Mon Sep 17 00:00:00 2001 From: jamshed Date: Fri, 17 Sep 2021 12:03:06 -0400 Subject: [PATCH 178/350] Save a comparison for large k-mers --- include/Directed_Vertex.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/Directed_Vertex.hpp b/include/Directed_Vertex.hpp index 67b43dc7..5c783d0b 100644 --- a/include/Directed_Vertex.hpp +++ b/include/Directed_Vertex.hpp @@ -108,7 +108,7 @@ template inline Directed_Vertex::Directed_Vertex(const Directed_Vertex& rhs): kmer_(rhs.kmer_), kmer_bar_(rhs.kmer_bar_), - kmer_hat_ptr(Kmer::canonical(kmer_, kmer_bar_)), // TODO: replace with pointer-check based assignment (check `operator=`). + kmer_hat_ptr(rhs.kmer_hat_ptr == &rhs.kmer_ ? &kmer_ : &kmer_bar_), h(rhs.h) {} From 8e8cc6ce27a8600659bc1275314a54e517a94cdb Mon Sep 17 00:00:00 2001 From: jamshed Date: Fri, 17 Sep 2021 21:48:14 -0400 Subject: [PATCH 179/350] (Temp.) Modify to cross-check o/p k-mers --- src/Detached_Cycles_Extractor.cpp | 2 ++ src/Read_CdBG.cpp | 2 +- src/kmer_Enumerator.cpp | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Detached_Cycles_Extractor.cpp b/src/Detached_Cycles_Extractor.cpp index 50906976..6b475925 100644 --- a/src/Detached_Cycles_Extractor.cpp +++ b/src/Detached_Cycles_Extractor.cpp @@ -267,6 +267,8 @@ bool Read_CdBG_Extractor::extract_cycle(const Kmer& v_hat, uint64_t& id, s cycle.emplace_back(Kmer::map_char(b_ext)); } + // if(!anchor.is_same_vertex(sign_vertex)) + // return false; if(!mark_vertex(sign_vertex)) return false; diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index f883f522..7597b979 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -84,7 +84,7 @@ void Read_CdBG::construct() if(!dbg_info.has_dcc() || dbg_info.dcc_extracted()) // Either there are no DCCs, or the DCCs have already been extracted in this run. 
{ - Kmer_Container::remove(vertex_db_path()); + // Kmer_Container::remove(vertex_db_path()); hash_table->remove(params); } diff --git a/src/kmer_Enumerator.cpp b/src/kmer_Enumerator.cpp index d99557a4..c87a9be9 100644 --- a/src/kmer_Enumerator.cpp +++ b/src/kmer_Enumerator.cpp @@ -36,7 +36,7 @@ kmer_Enumeration_Stats kmer_Enumerator::enumerate( .SetNThreads(thread_count) .SetMaxRamGB(memory) .SetStrictMemoryMode(strict_memory) - .SetCounterMax(counter_max) + // .SetCounterMax(counter_max) .SetOutputFileName(output_db_path) ; From 224fefae672fa82cd3f7cb9ebdbdb1dc8aa0eb71 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sat, 18 Sep 2021 17:38:39 -0400 Subject: [PATCH 180/350] Add option to save vertex set --- include/Build_Params.hpp | 10 ++++++++++ src/Read_CdBG.cpp | 4 +++- src/main.cpp | 6 +++++- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index 1f01805c..6e5c60a2 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -34,6 +34,7 @@ class Build_Params const bool remove_kmc_db_; // Option to remove the KMC database, once no longer required. const std::string mph_file_path_; // Optional path to file storing an MPH over the k-mer set. const std::string buckets_file_path_; // Optional path to file storing the hash table buckets for the k-mer set. + const bool save_vertices_; // Option to save the vertex set of the de Bruijn graph (in KMC database format). const std::string json_file_path_; // Optional path to file storing meta-information about the graph and cuttlefish executions. const bool dcc_opt_; // Option to optimize post-cdBG-construction extraction of DCCs (Detached Chordless Cycles). const bool extract_cycles_; // Option to extract detached chordless cycles from the de Bruijn graph after compaction. @@ -59,6 +60,7 @@ class Build_Params const bool remove_kmc_db, const std::string& mph_file_path, const std::string& buckets_file_path, + const bool save_vertices, const std::string& json_file_path, const bool dcc_opt, const bool extract_cycles): @@ -77,6 +79,7 @@ class Build_Params remove_kmc_db_(remove_kmc_db), mph_file_path_(mph_file_path), buckets_file_path_(buckets_file_path), + save_vertices_(save_vertices), json_file_path_(json_file_path), dcc_opt_(dcc_opt), extract_cycles_(extract_cycles) @@ -195,6 +198,13 @@ class Build_Params } + // Returns whether the option to save the vertex set of the de Bruijn graph (in KMC database format) is specified or not. + bool save_vertices() const + { + return save_vertices_; + } + + // Returns the path to the optional file storing meta-information about the graph and cuttlefish executions. const std::string json_file_path() const { diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 7597b979..0317cbf8 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -84,7 +84,9 @@ void Read_CdBG::construct() if(!dbg_info.has_dcc() || dbg_info.dcc_extracted()) // Either there are no DCCs, or the DCCs have already been extracted in this run. 
{ - // Kmer_Container::remove(vertex_db_path()); + if(!params.save_vertices()) + Kmer_Container::remove(vertex_db_path()); + hash_table->remove(params); } diff --git a/src/main.cpp b/src/main.cpp index 5c2fb588..68566d8a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -42,7 +42,9 @@ void build(int argc, char** argv) // TODO: repurpose the following two options ("mph", "minimal perfect hash (BBHash) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("buckets", "hash table buckets (cuttlefish) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) + ("save-vertices", "save the vertex set of the graph") ("json", "meta-info (JSON) file", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) + // TODO: remove the following arg ("no-dcc", "turn off optimization for post-construction extraction of DCCs (Detached Chordless Cycles)") ("cycles", "extract the detached chordless cycles of the graph") ("h,help", "print usage"); @@ -73,6 +75,7 @@ void build(int argc, char** argv) const auto working_dir = result["work_dir"].as(); const auto mph_file = result["mph"].as(); const auto buckets_file = result["buckets"].as(); + const auto save_vertices = result["save-vertices"].as(); const auto json_file = result["json"].as(); const auto dcc_opt = !result["no-dcc"].as(); const auto extract_cycles = result["cycles"].as(); @@ -80,7 +83,8 @@ void build(int argc, char** argv) const Build_Params params( is_read_graph, refs, lists, dirs, k, cutoff, kmer_database, edge_database, thread_count, max_memory, strict_memory, - output_file, format, working_dir, remove_kmc_db, mph_file, buckets_file, json_file, + output_file, format, working_dir, + remove_kmc_db, mph_file, buckets_file, save_vertices, json_file, dcc_opt, extract_cycles); if(!params.is_valid()) { From 29c2ad68dc8136a97d7ed3105e774fea79edb665 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sat, 18 Sep 2021 18:26:05 -0400 Subject: [PATCH 181/350] Not skip k-mer counts if in validation mode --- CMakeLists.txt | 5 +++++ src/kmer_Enumerator.cpp | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6216de42..d183032a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,11 +38,16 @@ set(OPTIMIZE_FLAGS -funroll-loops) # Add the required preprocessor definitions (`#define`s) to pass on. +# TODO: replace `add_definitions` with `add_compile_definitions`. add_definitions(-D__STDC_FORMAT_MACROS -DSPDLOG_FMT_EXTERNAL_HO -DFMT_HEADER_ONLY -DXXH_INLINE_ALL) if(INSTANCE_COUNT) add_definitions(-DINSTANCE_COUNT=${INSTANCE_COUNT}) endif() +if(VALIDATION_MODE) + add_definitions(-DVALIDATION_MODE) +endif() + # Search the file system for the appropriate threads package for this platform, and then set # the `CMAKE_THREAD_LIBS_INIT` variable (and some other variables as well). 
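The CMakeLists.txt change above turns a configure-time switch (e.g. `cmake -DVALIDATION_MODE=ON ..`) into a preprocessor definition, and the kmer_Enumerator change below uses that definition to keep exact k-mer counts when validating instead of capping the counters. A tiny sketch of the same gating pattern, independent of the Cuttlefish sources (the cap value and the "0 means no cap" convention are purely illustrative):

#include <cstdint>
#include <iostream>

// Built normally, the counter is capped; built with -DVALIDATION_MODE (as produced
// by `add_compile_definitions` above), the cap is skipped so exact abundances survive.
int main()
{
    uint32_t counter_max = 0;   // 0: no cap — this sketch's convention, not KMC's

#ifndef VALIDATION_MODE
    counter_max = 255;          // an arbitrary cap, for illustration
#endif

    std::cout << "counter_max = " << counter_max << "\n";
    return 0;
}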
diff --git a/src/kmer_Enumerator.cpp b/src/kmer_Enumerator.cpp index c87a9be9..0882b163 100644 --- a/src/kmer_Enumerator.cpp +++ b/src/kmer_Enumerator.cpp @@ -36,7 +36,9 @@ kmer_Enumeration_Stats kmer_Enumerator::enumerate( .SetNThreads(thread_count) .SetMaxRamGB(memory) .SetStrictMemoryMode(strict_memory) - // .SetCounterMax(counter_max) +#ifndef VALIDATION_MODE + .SetCounterMax(counter_max) +#endif .SetOutputFileName(output_db_path) ; From 510e3d6d78e0f79f72ec9736d1f16f6ffefafc48 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sat, 18 Sep 2021 18:43:18 -0400 Subject: [PATCH 182/350] Fix logic bug in rotating cycles --- include/Character_Buffer.hpp | 15 ++++++++------- include/FASTA_Record.hpp | 14 +++++++++----- src/Detached_Cycles_Extractor.cpp | 4 ++-- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/include/Character_Buffer.hpp b/include/Character_Buffer.hpp index c373333f..79f95685 100644 --- a/include/Character_Buffer.hpp +++ b/include/Character_Buffer.hpp @@ -49,13 +49,14 @@ class Character_Buffer template void operator+=(const FASTA_Record& fasta_rec); - // Appends the content of the FASTA record `fasta_rec` to the buffer. The FASTA - // added sequence is rotated around its index `pivot` — the entire sequence is + // Appends the content of the FASTA record `fasta_cycle` to the buffer, that is + // supposed to be a cycle in a de Bruijn graph `G(·, k)`. The cyclic FASTA + // sequence is rotated around its index `pivot` — the entire sequence is // right-rotated so that the `pivot`-index character is at index 0 finally. A // line-break is added at the end of the sequence, since the user might not be // able to provide it with the "to be rotated" sequence. - template - void rotate_append(const FASTA_Record& fasta_rec, std::size_t pivot); + template + void rotate_append_cycle(const FASTA_Record& fasta_cycle, std::size_t pivot); // Destructs the buffer object, flushing it if content are present. ~Character_Buffer(); @@ -141,14 +142,14 @@ inline void Character_Buffer::operator+=(const FASTA_Record -template -inline void Character_Buffer::rotate_append(const FASTA_Record& fasta_rec, const std::size_t pivot) +template +inline void Character_Buffer::rotate_append_cycle(const FASTA_Record& fasta_rec, const std::size_t pivot) { ensure_space(fasta_rec.header_size() + 1 + fasta_rec.seq_size() + 1); // 2 extra bytes for two line-breaks. fasta_rec.append_header(buffer); // Append the header. buffer.emplace_back('\n'); // Break line. - fasta_rec.append_rotated_seq(buffer, pivot); // Append the sequence right-rotated around index `pivot`. + fasta_rec.template append_rotated_cycle(buffer, pivot); // Append the sequence right-rotated around index `pivot`. buffer.emplace_back('\n'); // End the sequence. } diff --git a/include/FASTA_Record.hpp b/include/FASTA_Record.hpp index 57a94eda..b1499889 100644 --- a/include/FASTA_Record.hpp +++ b/include/FASTA_Record.hpp @@ -6,6 +6,7 @@ #include "fmt/format.h" + // ============================================================================= // A class wrapping a basic FASTA record: the sequence of type `T_seq_` and its // header/identifier of type `T_id`. The class is specifically designed for @@ -39,9 +40,11 @@ class FASTA_Record void append_seq(std::vector& buffer) const; // Appends the FASTA sequence to the vector `buffer` in a rotated form — the - // added sequence is right rotated so that the character at index `pivot` is - // at index 0 finally. 
- void append_rotated_seq(std::vector& buffer, std::size_t pivot) const; + // added sequence is supposed to be a cycle in a de Bruijn graph `G(·, k)`, + // and it is right rotated so that the character at index `pivot` is at + // index 0 finally. + template + void append_rotated_cycle(std::vector& buffer, std::size_t pivot) const; }; @@ -84,10 +87,11 @@ inline void FASTA_Record::append_seq(std::vector& buffer) c template -inline void FASTA_Record::append_rotated_seq(std::vector& buffer, const std::size_t pivot) const +template +inline void FASTA_Record::append_rotated_cycle(std::vector& buffer, const std::size_t pivot) const { buffer.insert(buffer.end(), seq_.begin() + pivot, seq_.end()); - buffer.insert(buffer.end(), seq_.begin(), seq_.begin() + pivot); + buffer.insert(buffer.end(), seq_.begin() + k - 1, seq_.begin() + k - 1 + pivot); } diff --git a/src/Detached_Cycles_Extractor.cpp b/src/Detached_Cycles_Extractor.cpp index 6b475925..7a130d5e 100644 --- a/src/Detached_Cycles_Extractor.cpp +++ b/src/Detached_Cycles_Extractor.cpp @@ -203,7 +203,7 @@ void Read_CdBG_Extractor::extract_detached_chordless_cycles(Kmer_SPMC_Iterato // cycle.emplace_back('\n'); // output_buffer += FASTA_Record>(id, cycle); - output_buffer.rotate_append(FASTA_Record>(id, cycle), pivot); + output_buffer.rotate_append_cycle(FASTA_Record>(id, cycle), pivot); // TODO: mark the path vertices, or not? } @@ -276,7 +276,7 @@ bool Read_CdBG_Extractor::extract_cycle(const Kmer& v_hat, uint64_t& id, s if(!sign_vertex.in_canonical_form()) { reverse_complement(cycle); - pivot = (cycle.size() - 1) - pivot - (k - 1); + pivot = (cycle.size() - 1) - (pivot + k - 1); } id = sign_vertex.hash(); From a7711963ba58a3e9fdd788995a788a499b7849e9 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sat, 18 Sep 2021 21:00:26 -0400 Subject: [PATCH 183/350] Separate k-mer utils from DNA --- include/DNA_Utility.hpp | 29 ------------------------ include/Kmer.hpp | 3 ++- include/Kmer_Utility.hpp | 49 ++++++++++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 1 + src/DNA_Utility.cpp | 1 - src/Kmer_Utility.cpp | 5 ++++ 6 files changed, 57 insertions(+), 31 deletions(-) create mode 100644 include/Kmer_Utility.hpp create mode 100644 src/Kmer_Utility.cpp diff --git a/include/DNA_Utility.hpp b/include/DNA_Utility.hpp index 641d006f..9f96e3b1 100644 --- a/include/DNA_Utility.hpp +++ b/include/DNA_Utility.hpp @@ -83,28 +83,6 @@ class DNA_Utility 1, 1, 1, 1, 1, 1, 1, 1 // 120 - 127 }; - // TODO: Move these new k-mer specific (and not DNA-base specific) stuffs to a separate class. - // Reverse complement (in the `DNA::Base` representation) of all possible bytes. 
- static constexpr uint8_t REVERSE_COMPLEMENT_BYTE[256] = - { - 255, 191, 127, 63, 239, 175, 111, 47, 223, 159, 95, 31, 207, 143, 79, 15, - 251, 187, 123, 59, 235, 171, 107, 43, 219, 155, 91, 27, 203, 139, 75, 11, - 247, 183, 119, 55, 231, 167, 103, 39, 215, 151, 87, 23, 199, 135, 71, 7, - 243, 179, 115, 51, 227, 163, 99, 35, 211, 147, 83, 19, 195, 131, 67, 3, - 254, 190, 126, 62, 238, 174, 110, 46, 222, 158, 94, 30, 206, 142, 78, 14, - 250, 186, 122, 58, 234, 170, 106, 42, 218, 154, 90, 26, 202, 138, 74, 10, - 246, 182, 118, 54, 230, 166, 102, 38, 214, 150, 86, 22, 198, 134, 70, 6, - 242, 178, 114, 50, 226, 162, 98, 34, 210, 146, 82, 18, 194, 130, 66, 2, - 253, 189, 125, 61, 237, 173, 109, 45, 221, 157, 93, 29, 205, 141, 77, 13, - 249, 185, 121, 57, 233, 169, 105, 41, 217, 153, 89, 25, 201, 137, 73, 9, - 245, 181, 117, 53, 229, 165, 101, 37, 213, 149, 85, 21, 197, 133, 69, 5, - 241, 177, 113, 49, 225, 161, 97, 33, 209, 145, 81, 17, 193, 129, 65, 1, - 252, 188, 124, 60, 236, 172, 108, 44, 220, 156, 92, 28, 204, 140, 76, 12, - 248, 184, 120, 56, 232, 168, 104, 40, 216, 152, 88, 24, 200, 136, 72, 8, - 244, 180, 116, 52, 228, 164, 100, 36, 212, 148, 84, 20, 196, 132, 68, 4, - 240, 176, 112, 48, 224, 160, 96, 32, 208, 144, 80, 16, 192, 128, 64, 0 - }; - // Mapped `DNA::Extended_Base` for the corresponding `DNA::Base`, i.e. // a mapping from [0(A) — T(3)] to [1(A) — 4(T)]. static constexpr DNA::Extended_Base MAPPED_EXTENDED_BASE[4] = @@ -158,13 +136,6 @@ class DNA_Utility return base <= 'T' ? base : (base - ('a' - 'A')); } - // Returns the reverse completement byte of the 4-mer `byte`; - // both are to be in the `DNA::Base` representation. - static uint8_t reverse_complement(const uint8_t byte) - { - return REVERSE_COMPLEMENT_BYTE[byte]; - } - // Returns the mapping `DNA::Extended_Base` representation of the // `DNA::Base` representation `base`. static DNA::Extended_Base map_extended_base(const DNA::Base base) diff --git a/include/Kmer.hpp b/include/Kmer.hpp index 74f09024..276eb52d 100644 --- a/include/Kmer.hpp +++ b/include/Kmer.hpp @@ -5,6 +5,7 @@ #include "DNA_Utility.hpp" +#include "Kmer_Utility.hpp" #include "utility.hpp" #include "kmc_api/kmc_file.h" #include "xxHash/xxh3.h" @@ -452,7 +453,7 @@ inline void Kmer::as_reverse_complement(const Kmer& other) constexpr uint16_t packed_byte_count = k / 4; for(uint16_t byte_idx = 0; byte_idx < packed_byte_count; ++byte_idx) - rev_compl[packed_byte_count - 1 - byte_idx] = DNA_Utility::reverse_complement(data[byte_idx]); + rev_compl[packed_byte_count - 1 - byte_idx] = Kmer_Utility::reverse_complement(data[byte_idx]); // Get the reverse complement for the only byte that might be partially packed (possible for the highest-indexed byte only). diff --git a/include/Kmer_Utility.hpp b/include/Kmer_Utility.hpp new file mode 100644 index 00000000..a399631f --- /dev/null +++ b/include/Kmer_Utility.hpp @@ -0,0 +1,49 @@ + +#ifndef KMER_UTILITY_HPP +#define KMER_UTILITY_HPP + + + +#include + + +class Kmer_Utility +{ +private: + + // Reverse complement (in the `DNA::Base` representation) of all possible bytes. 
+ static constexpr uint8_t REVERSE_COMPLEMENT_BYTE[256] = + { + 255, 191, 127, 63, 239, 175, 111, 47, 223, 159, 95, 31, 207, 143, 79, 15, + 251, 187, 123, 59, 235, 171, 107, 43, 219, 155, 91, 27, 203, 139, 75, 11, + 247, 183, 119, 55, 231, 167, 103, 39, 215, 151, 87, 23, 199, 135, 71, 7, + 243, 179, 115, 51, 227, 163, 99, 35, 211, 147, 83, 19, 195, 131, 67, 3, + 254, 190, 126, 62, 238, 174, 110, 46, 222, 158, 94, 30, 206, 142, 78, 14, + 250, 186, 122, 58, 234, 170, 106, 42, 218, 154, 90, 26, 202, 138, 74, 10, + 246, 182, 118, 54, 230, 166, 102, 38, 214, 150, 86, 22, 198, 134, 70, 6, + 242, 178, 114, 50, 226, 162, 98, 34, 210, 146, 82, 18, 194, 130, 66, 2, + 253, 189, 125, 61, 237, 173, 109, 45, 221, 157, 93, 29, 205, 141, 77, 13, + 249, 185, 121, 57, 233, 169, 105, 41, 217, 153, 89, 25, 201, 137, 73, 9, + 245, 181, 117, 53, 229, 165, 101, 37, 213, 149, 85, 21, 197, 133, 69, 5, + 241, 177, 113, 49, 225, 161, 97, 33, 209, 145, 81, 17, 193, 129, 65, 1, + 252, 188, 124, 60, 236, 172, 108, 44, 220, 156, 92, 28, 204, 140, 76, 12, + 248, 184, 120, 56, 232, 168, 104, 40, 216, 152, 88, 24, 200, 136, 72, 8, + 244, 180, 116, 52, 228, 164, 100, 36, 212, 148, 84, 20, 196, 132, 68, 4, + 240, 176, 112, 48, 224, 160, 96, 32, 208, 144, 80, 16, 192, 128, 64, 0 + }; + + +public: + + + // Returns the reverse completement byte of the 4-mer `byte`; + // both are to be in the `DNA::Base` representation. + static uint8_t reverse_complement(const uint8_t byte) + { + return REVERSE_COMPLEMENT_BYTE[byte]; + } +}; + + + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 432d59f8..d72acc12 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -14,6 +14,7 @@ set(PROJECT_SRC Async_Logger_Wrapper.cpp Thread_Pool.cpp DNA_Utility.cpp + Kmer_Utility.cpp Kmer_u64.cpp Vertex.cpp State.cpp diff --git a/src/DNA_Utility.cpp b/src/DNA_Utility.cpp index ccbdc7b5..f153ad71 100644 --- a/src/DNA_Utility.cpp +++ b/src/DNA_Utility.cpp @@ -6,7 +6,6 @@ constexpr DNA::Base DNA_Utility::MAPPED_BASE[128]; constexpr DNA::Base DNA_Utility::COMPLEMENTED_BASE[5]; constexpr char DNA_Utility::COMPLEMENTED_CHAR[128]; constexpr bool DNA_Utility::IS_PLACEHOLDER[128]; -constexpr uint8_t DNA_Utility::REVERSE_COMPLEMENT_BYTE[256]; constexpr DNA::Extended_Base DNA_Utility::MAPPED_EXTENDED_BASE[4]; constexpr DNA::Base DNA_Utility::REVERSE_MAPPED_EXTENDED_BASE[5]; constexpr char DNA_Utility::MAPPED_CHAR[4]; diff --git a/src/Kmer_Utility.cpp b/src/Kmer_Utility.cpp new file mode 100644 index 00000000..f449a9d5 --- /dev/null +++ b/src/Kmer_Utility.cpp @@ -0,0 +1,5 @@ + +#include "Kmer_Utility.hpp" + + +constexpr uint8_t Kmer_Utility::REVERSE_COMPLEMENT_BYTE[256]; From 3fa7a3bcaf15806d86e0a939f39d011eb066455a Mon Sep 17 00:00:00 2001 From: jamshed Date: Sat, 18 Sep 2021 21:09:17 -0400 Subject: [PATCH 184/350] Reduce redundant code --- include/Directed_Vertex.hpp | 12 ++++++++++ include/Endpoint.hpp | 44 +++++++++++-------------------------- 2 files changed, 25 insertions(+), 31 deletions(-) diff --git a/include/Directed_Vertex.hpp b/include/Directed_Vertex.hpp index 5c783d0b..21022cc9 100644 --- a/include/Directed_Vertex.hpp +++ b/include/Directed_Vertex.hpp @@ -80,6 +80,11 @@ class Directed_Vertex // of that edge. cuttlefish::side_t exit_side() const; + // Returns the side of the vertex which is to be the incidence side of some bidirected + // edge instance if this vertex instance were to be the sink vertex (i.e. suffix k-mer) + // of that edge. 
+ cuttlefish::side_t entrance_side() const; + // Returns `true` iff this vertex and the vertex `v` are the same vertex, without the // directionality. bool is_same_vertex(const Directed_Vertex& v) const; @@ -193,6 +198,13 @@ inline cuttlefish::side_t Directed_Vertex::exit_side() const } +template +inline cuttlefish::side_t Directed_Vertex::entrance_side() const +{ + return &kmer_ == kmer_hat_ptr ? cuttlefish::side_t::front : cuttlefish::side_t::back; +} + + template inline bool Directed_Vertex::is_same_vertex(const Directed_Vertex& v) const { diff --git a/include/Endpoint.hpp b/include/Endpoint.hpp index 6ad79eda..0391f139 100644 --- a/include/Endpoint.hpp +++ b/include/Endpoint.hpp @@ -6,6 +6,7 @@ #include "Kmer.hpp" +#include "Directed_Vertex.hpp" #include "globals.hpp" #include "Kmer_Hash_Table.hpp" @@ -18,14 +19,9 @@ class Endpoint { private: - // TODO: Refactor the class with inclusion of a `Directed_Vertex` instance, replacing four fields. - - Kmer kmer_; // The endpoint k-mer spelled by the edge instance. - Kmer kmer_bar_; // Reverse complement of the k-mer spelled by the edge instance. - const Kmer* kmer_hat_ptr; // Pointer to the canonical form of the endpoint k-mer. - cuttlefish::side_t s; // The side of the endpoint vertex to which the edge instance is incident to. + Directed_Vertex v; // The endpoint vertex. + cuttlefish::side_t s; // The side of `v` to which the edge instance is incident to. cuttlefish::edge_encoding_t e; // The `DNA::Extended_Base` encoding of the edge instance incident to this endpoint. - uint64_t h; // Hash value of the vertex, i.e. canonical k-mer. // Constructs an `Endpoint` object that appears in the form `kmer` in an edge instance, and @@ -89,56 +85,42 @@ class Endpoint template inline Endpoint::Endpoint(const Kmer& kmer, const bool is_source, const Kmer_Hash_Table& hash): - kmer_(kmer) -{ - kmer_bar_.as_reverse_complement(kmer_); - kmer_hat_ptr = Kmer::canonical(kmer_, kmer_bar_); - - s = (is_source ? exit_side() : entrance_side()); - - h = hash(*kmer_hat_ptr); -} + v(kmer, hash), + s(is_source ? exit_side() : entrance_side()) +{} template inline void Endpoint::from_prefix(const Kmer& e, const Kmer_Hash_Table& hash) { - kmer_.from_prefix(e); - kmer_bar_.as_reverse_complement(kmer_); - kmer_hat_ptr = Kmer::canonical(kmer_, kmer_bar_); + v.from_prefix(e, hash); s = exit_side(); this->e = exit_edge(e); - - h = hash(*kmer_hat_ptr); } template inline void Endpoint::from_suffix(const Kmer& e, const Kmer_Hash_Table& hash) { - kmer_.from_suffix(e); - kmer_bar_.as_reverse_complement(kmer_); - kmer_hat_ptr = Kmer::canonical(kmer_, kmer_bar_); + v.from_suffix(e, hash); s = entrance_side(); this->e = entrance_edge(e); - - h = hash(*kmer_hat_ptr); } template inline cuttlefish::side_t Endpoint::exit_side() const { - return &kmer_ == kmer_hat_ptr ? cuttlefish::side_t::back : cuttlefish::side_t::front; + return v.exit_side(); } template inline cuttlefish::side_t Endpoint::entrance_side() const { - return &kmer_ == kmer_hat_ptr ? 
cuttlefish::side_t::front : cuttlefish::side_t::back; + return v.entrance_side(); } @@ -161,7 +143,7 @@ inline cuttlefish::edge_encoding_t Endpoint::entrance_edge(const Kmer& template inline Endpoint Endpoint::neighbor_endpoint(const cuttlefish::edge_encoding_t e, const Kmer_Hash_Table& hash) const { - Kmer kmer(*kmer_hat_ptr); + Kmer kmer(canonical()); if(s == cuttlefish::side_t::back) { @@ -177,7 +159,7 @@ inline Endpoint Endpoint::neighbor_endpoint(const cuttlefish::edge_encodin template inline const Kmer& Endpoint::canonical() const { - return *kmer_hat_ptr; + return v.canonical(); } @@ -198,7 +180,7 @@ inline cuttlefish::edge_encoding_t Endpoint::edge() const template inline uint64_t Endpoint::hash() const { - return h; + return v.hash(); } From 0a7cb512459cffb77c57b3a33adc08ced505d45c Mon Sep 17 00:00:00 2001 From: jamshed Date: Sat, 18 Sep 2021 21:30:43 -0400 Subject: [PATCH 185/350] Update with superseding CMake command add_definitions -> add_compile_definitions --- CMakeLists.txt | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d183032a..11834864 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,14 +38,15 @@ set(OPTIMIZE_FLAGS -funroll-loops) # Add the required preprocessor definitions (`#define`s) to pass on. -# TODO: replace `add_definitions` with `add_compile_definitions`. -add_definitions(-D__STDC_FORMAT_MACROS -DSPDLOG_FMT_EXTERNAL_HO -DFMT_HEADER_ONLY -DXXH_INLINE_ALL) +# TODO: find out what are `__STDC_FORMAT_MACROS` and `SPDLOG_FMT_EXTERNAL_HO` for. +add_compile_definitions(__STDC_FORMAT_MACROS SPDLOG_FMT_EXTERNAL_HO FMT_HEADER_ONLY XXH_INLINE_ALL) + if(INSTANCE_COUNT) - add_definitions(-DINSTANCE_COUNT=${INSTANCE_COUNT}) + add_compile_definitions(INSTANCE_COUNT=${INSTANCE_COUNT}) endif() if(VALIDATION_MODE) - add_definitions(-DVALIDATION_MODE) + add_compile_definitions(VALIDATION_MODE) endif() From 9df09e11abd4e44d4b9cc3f803d3a814aeed1e5b Mon Sep 17 00:00:00 2001 From: jamshed Date: Sat, 18 Sep 2021 22:12:00 -0400 Subject: [PATCH 186/350] Save input seq paths in o/p meta file --- include/utility.hpp | 4 ++++ src/dBG_Info.cpp | 2 +- src/utility.cpp | 11 +++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/include/utility.hpp b/include/utility.hpp index 26d2e224..adca1e1c 100644 --- a/include/utility.hpp +++ b/include/utility.hpp @@ -27,6 +27,10 @@ bool file_prefix_exists(const std::string& path, const std::string& prefix); // Returns a string that is a copy of `s` but has all the whitespaces removed. std::string remove_whitespaces(const char* s); +// Given the collection of strings `s`, returns the concatenated string +// `s0 : s1 : ... : s_m`, where successive strings are separated by `delimiter`. +const std::string concat_strings(const std::vector& s, const std::string& delimiter = ", "); + // Removes the file at path `file_path` from disk. Returns `true` iff the // removal is successful. bool remove_file(const std::string& file_path); diff --git a/src/dBG_Info.cpp b/src/dBG_Info.cpp index 99a522f7..f5540c53 100644 --- a/src/dBG_Info.cpp +++ b/src/dBG_Info.cpp @@ -78,7 +78,7 @@ void dBG_Info::add_DCC_info(const Read_CdBG_Extractor& cdbg_extractor) template void dBG_Info::add_build_params(const Build_Params& params) { - // TODO: add input files information — after major generalization of the class `Reference_Input` and KMC library integration. 
+ dBg_info[params_field]["input"] = concat_strings(params.sequence_input().seqs()); dBg_info[params_field]["k"] = params.k(); dBg_info[params_field]["output prefix"] = params.output_prefix(); } diff --git a/src/utility.cpp b/src/utility.cpp index fb65ccc5..bf2d8fc6 100644 --- a/src/utility.cpp +++ b/src/utility.cpp @@ -66,6 +66,17 @@ std::string remove_whitespaces(const char* s) } +const std::string concat_strings(const std::vector& s, const std::string& delimiter) +{ + std::ostringstream concat_stream; + std::copy(s.begin(), s.end(), std::ostream_iterator(concat_stream, delimiter.c_str())); + + std::string concat_str(concat_stream.str()); + concat_str.erase(concat_str.size() - delimiter.size(), delimiter.size()); + return concat_str; +} + + bool remove_file(const std::string& file_path) { return ghc::filesystem::remove(file_path); From 62792ed5be21a1235ac0155296d1a7934b1d52a3 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 19 Sep 2021 12:08:14 -0400 Subject: [PATCH 187/350] Use default KMC args, no estimation w/o strict-mem --- src/Read_CdBG.cpp | 2 +- src/kmer_Enumerator.cpp | 15 ++++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 0317cbf8..6599411b 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -100,7 +100,7 @@ kmer_Enumeration_Stats Read_CdBG::enumerate_edges() const { return kmer_Enumerator().enumerate( KMC::InputFileType::FASTQ, params.sequence_input().seqs(), params.cutoff(), - params.thread_count(), params.max_memory(), params.strict_memory(), true, + params.thread_count(), params.max_memory(), params.strict_memory(), params.strict_memory(), params.working_dir_path(), edge_db_path()); } diff --git a/src/kmer_Enumerator.cpp b/src/kmer_Enumerator.cpp index 0882b163..ef882f86 100644 --- a/src/kmer_Enumerator.cpp +++ b/src/kmer_Enumerator.cpp @@ -1,6 +1,5 @@ #include "kmer_Enumerator.hpp" -#include "globals.hpp" template @@ -17,14 +16,18 @@ kmer_Enumeration_Stats kmer_Enumerator::enumerate( .SetInputFiles(seqs) .SetKmerLen(k) .SetNThreads(thread_count) - .SetMaxRamGB(memory) - .SetSignatureLen(signature_len) - .SetNBins(bin_count) .SetTmpPath(working_dir_path) .SetEstimateHistogramCfg(estimate_mem_usage ? 
KMC::EstimateHistogramCfg::ESTIMATE_AND_COUNT_KMERS : KMC::EstimateHistogramCfg::DONT_ESTIMATE) .SetPercentProgressObserver(&progress) ; + if(strict_memory) + stage1_params + .SetMaxRamGB(memory) + .SetSignatureLen(signature_len) + .SetNBins(bin_count) + ; + stage1_results = kmc.RunStage1(stage1_params); @@ -34,7 +37,6 @@ kmer_Enumeration_Stats kmer_Enumerator::enumerate( stage2_params .SetCutoffMin(cutoff) .SetNThreads(thread_count) - .SetMaxRamGB(memory) .SetStrictMemoryMode(strict_memory) #ifndef VALIDATION_MODE .SetCounterMax(counter_max) @@ -42,6 +44,9 @@ kmer_Enumeration_Stats kmer_Enumerator::enumerate( .SetOutputFileName(output_db_path) ; + if(strict_memory) + stage2_params.SetMaxRamGB(memory); + stage2_results = kmc.RunStage2(stage2_params); From 12d39743d464933f2a6ee120717ad9f5b6e1ff2b Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 19 Sep 2021 12:18:56 -0400 Subject: [PATCH 188/350] Skip funny log --- src/kmer_Enumerator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kmer_Enumerator.cpp b/src/kmer_Enumerator.cpp index ef882f86..ac0a9e85 100644 --- a/src/kmer_Enumerator.cpp +++ b/src/kmer_Enumerator.cpp @@ -18,7 +18,7 @@ kmer_Enumeration_Stats kmer_Enumerator::enumerate( .SetNThreads(thread_count) .SetTmpPath(working_dir_path) .SetEstimateHistogramCfg(estimate_mem_usage ? KMC::EstimateHistogramCfg::ESTIMATE_AND_COUNT_KMERS : KMC::EstimateHistogramCfg::DONT_ESTIMATE) - .SetPercentProgressObserver(&progress) + // .SetPercentProgressObserver(&progress) ; if(strict_memory) From edbb23cca9c88e628f1935d68bfe4dd360a1b42e Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 19 Sep 2021 12:57:49 -0400 Subject: [PATCH 189/350] Remove repeated computation in k-mer parsing --- include/kmc_api/kmc_file.h | 12 ++++++------ src/kmc_api/kmc_file.cpp | 4 ++++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/kmc_api/kmc_file.h b/include/kmc_api/kmc_file.h index 48958d0e..aef5eaba 100644 --- a/include/kmc_api/kmc_file.h +++ b/include/kmc_api/kmc_file.h @@ -79,6 +79,10 @@ class CKMC_DB uint32 original_min_count; uint64 original_max_count; + // Auxiliary fields to aid in k-mer parsing by Cuttlefish. + uint64_t prefix_mask_; //for kmc2 db + uint8_t byte_alignment_; + static uint64 part_size; // the size of a block readed to sufix_file_buf, in listing mode bool BinarySearch(int64 index_start, int64 index_stop, const CKmerAPI& kmer, uint64& counter, uint32 pattern_offset); @@ -569,12 +573,8 @@ inline void CKMC_DB::parse_kmer_buf(std::vector>:: const uint64_t prefix = prefix_it->first; prefix_it->second--; - - // TODO: make some of these constant class-fields, to avoid repeated calculations. - const uint64_t prefix_mask = (1 << 2 * lut_prefix_length) - 1; //for kmc2 db - constexpr uint8_t byte_alignment = (k % 4 != 0 ? 4 - (k % 4) : 0); - uint32_t off = (sizeof(prefix) * 8) - (lut_prefix_length * 2) - byte_alignment * 2; - const uint64_t temp_prefix = (prefix & prefix_mask) << off; // shift prefix towards MSD. "& prefix_mask" necessary for kmc2 db format + uint32_t off = (sizeof(prefix) * 8) - (lut_prefix_length * 2) - byte_alignment_ * 2; + const uint64_t temp_prefix = (prefix & prefix_mask_) << off; // shift prefix towards MSD. "& prefix_mask" necessary for kmc2 db format // Store prefix in a KMC alignment (differs in endianness from Cuttlefish's). 
kmc_data[0] = temp_prefix; diff --git a/src/kmc_api/kmc_file.cpp b/src/kmc_api/kmc_file.cpp index 3b2aacba..c677668f 100644 --- a/src/kmc_api/kmc_file.cpp +++ b/src/kmc_api/kmc_file.cpp @@ -301,6 +301,10 @@ bool CKMC_DB::ReadParamsFrom_prefix_file_buf(uint64 &size, const bool load_pref_ single_LUT_size = 1 << (2 * lut_prefix_length); uint64 last_data_index = lut_area_size_in_bytes / sizeof(uint64); + // Set auxiliary fields aiding in k-mer parsing by Cuttlefish. + prefix_mask_ = (1 << 2 * lut_prefix_length) - 1; + byte_alignment_ = (kmer_length % 4 != 0 ? 4 - (kmer_length % 4) : 0); + if(load_pref_file) { std::rewind(file_pre); From abcb19ecdd484f2c7b30d2bdfe95fa35fdb8422c Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 19 Sep 2021 13:27:44 -0400 Subject: [PATCH 190/350] Match CLI standards _ -> - --- src/main.cpp | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 68566d8a..f85d33ea 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -22,22 +22,21 @@ void build(int argc, char** argv) { cxxopts::Options options("cuttlefish build", "Efficiently construct the compacted de Bruijn graph from references or reads"); options.add_options() - // TODO: replace CLI underscores with hyphens // TODO: better indent the following wall of text ("read", "construct a compacted read de Bruijn graph") ("r,refs", "reference files", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) ("l,lists", "reference file lists", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) ("d,dirs", "reference file directories", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) - ("k,kmer_len", "k-mer length", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::K))) + ("k,kmer-len", "k-mer length", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::K))) ("c,cutoff", "frequency cutoff for (k + 1)-mers (inapplicable for references)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::CUTOFF_FREQ))) - ("s,kmc_db", "set of vertices, i.e. k-mers (KMC database) prefix", cxxopts::value()->default_value(cuttlefish::_default::WORK_DIR)) - ("e,edge_db", "set of edges, i.e. (k + 1)-mers (KMC database) prefix", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) + ("s,kmc-db", "set of vertices, i.e. k-mers (KMC database) prefix", cxxopts::value()->default_value(cuttlefish::_default::WORK_DIR)) + ("e,edge-db", "set of edges, i.e. 
(k + 1)-mers (KMC database) prefix", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("t,threads", "number of threads to use", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::THREAD_COUNT))) ("m,max-memory", "soft maximum memory limit (in GB)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::MAX_MEMORY))) ("unrestrict-memory", "do not impose memory usage restriction") ("o,output", "output file", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("f,format", "output format (0: txt, 1: GFA 1.0, 2: GFA 2.0, 3: GFA-reduced)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::OP_FORMAT))) - ("w,work_dir", "working directory", cxxopts::value()->default_value(cuttlefish::_default::WORK_DIR)) + ("w,work-dir", "working directory", cxxopts::value()->default_value(cuttlefish::_default::WORK_DIR)) ("rm", "remove the KMC database") // TODO: repurpose the following two options ("mph", "minimal perfect hash (BBHash) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) @@ -62,17 +61,17 @@ void build(int argc, char** argv) const auto refs = result["refs"].as>(); const auto lists = result["lists"].as>(); const auto dirs = result["dirs"].as>(); - const auto k = result["kmer_len"].as(); + const auto k = result["kmer-len"].as(); const auto cutoff = result["cutoff"].as(); - const auto kmer_database = result["kmc_db"].as(); - const auto edge_database = result["edge_db"].as(); + const auto kmer_database = result["kmc-db"].as(); + const auto edge_database = result["edge-db"].as(); const auto thread_count = result["threads"].as(); const auto max_memory = result["max-memory"].as(); const auto strict_memory = !result["unrestrict-memory"].as(); const auto output_file = result["output"].as(); const auto format = result["format"].as(); const auto remove_kmc_db = result["rm"].as(); - const auto working_dir = result["work_dir"].as(); + const auto working_dir = result["work-dir"].as(); const auto mph_file = result["mph"].as(); const auto buckets_file = result["buckets"].as(); const auto save_vertices = result["save-vertices"].as(); From 6cc47a5437f41d8da943b9dd0dcb663f37cf1f74 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 19 Sep 2021 16:50:51 -0400 Subject: [PATCH 191/350] Remove impossible todo-note must have non-volatile value-type for STL containers --- include/Kmer_SPMC_Iterator.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/include/Kmer_SPMC_Iterator.hpp b/include/Kmer_SPMC_Iterator.hpp index e2d019a1..8a798b0d 100644 --- a/include/Kmer_SPMC_Iterator.hpp +++ b/include/Kmer_SPMC_Iterator.hpp @@ -59,7 +59,6 @@ class Kmer_SPMC_Iterator no_more, // no k-mers will be provided anymore. }; - // TODO: replace the raw pointer with a vector maybe? volatile Task_Status* task_status{nullptr}; // Collection of the task statuses of the consumers. From 620b12c043cb462044b4f3d13a382c5b8877b326 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 19 Sep 2021 17:35:10 -0400 Subject: [PATCH 192/350] Resolve pedantic warnings (bar C++17) --- CMakeLists.txt | 4 ++-- include/BBHash/BooPHF.h | 2 +- include/Progress_Tracker.hpp | 2 +- src/Ref_Parser.cpp | 2 +- src/test.cpp | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 11834864..d38ed2a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,7 +14,7 @@ project(${PROJECT_NAME} # Fix language standards, and set hard requirements for such. # All targets defined from this point onward will pick up these requirements. 
-set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 14) # Bump to 17 set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_C_STANDARD 11) @@ -28,7 +28,7 @@ set(CMAKE_C_EXTENSIONS OFF) # Bundle the warning flags that we want to pass on to the compiler. # Disable unknown pragmas, b/c bbhash uses them extensively. # Reference: https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html -set(WARNING_FLAGS -Wall -Wextra) +set(WARNING_FLAGS -Wall -Wextra -pedantic) set(SUPPRESS_WARNING_FLAGS -Wno-unknown-pragmas) # Bundle the extra optimization flags (not associated with the `-O` levels) diff --git a/include/BBHash/BooPHF.h b/include/BBHash/BooPHF.h index b4377077..1c7fb449 100644 --- a/include/BBHash/BooPHF.h +++ b/include/BBHash/BooPHF.h @@ -79,7 +79,7 @@ namespace boomphf { int reso = fseek(_is,0,SEEK_SET); if (reso) { fprintf(stderr, "fseek failed on FILE* %p with return code %d", - is, reso); + static_cast(is), reso); } // advance(); peek(); diff --git a/include/Progress_Tracker.hpp b/include/Progress_Tracker.hpp index 1f2bff4c..b33778aa 100644 --- a/include/Progress_Tracker.hpp +++ b/include/Progress_Tracker.hpp @@ -52,7 +52,7 @@ inline void Progress_Tracker::track_work(uint64_t& work_chunk_size) if(percent_work_done < new_percent) { percent_work_done = new_percent; - std::cerr << "\r[" << log_message << "]\t" << percent_work_done << "\%"; + std::cerr << "\r[" << log_message << "]\t" << percent_work_done << "%"; } lock.unlock(); diff --git a/src/Ref_Parser.cpp b/src/Ref_Parser.cpp index bfa23818..bd50f821 100644 --- a/src/Ref_Parser.cpp +++ b/src/Ref_Parser.cpp @@ -9,7 +9,7 @@ // Declare the type of file handler and the read() function. // Required for FASTA/FASTQ file reading using the kseq library. -KSEQ_INIT(gzFile, gzread); +KSEQ_INIT(gzFile, gzread) Ref_Parser::Ref_Parser(const std::string& file_path) diff --git a/src/test.cpp b/src/test.cpp index 0c10272f..562d653f 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -27,7 +27,7 @@ // STEP 1: declare the type of file handler and the read() function -KSEQ_INIT(int, read); +KSEQ_INIT(int, read) void test_kseq(const char* fileName) From a8fbf6d2d525f7617d4d492ef13d307c4c237cfd Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 19 Sep 2021 17:43:29 -0400 Subject: [PATCH 193/350] Do pedantics --- include/kmer_Enumerator.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/kmer_Enumerator.hpp b/include/kmer_Enumerator.hpp index bcfae1ae..26c69bfb 100644 --- a/include/kmer_Enumerator.hpp +++ b/include/kmer_Enumerator.hpp @@ -62,9 +62,9 @@ class kmer_Enumeration_Stats { private: - uint64_t kmer_count_; - std::size_t max_memory_; - std::size_t temp_disk_usage_; + const uint64_t kmer_count_; + const std::size_t max_memory_; + const std::size_t temp_disk_usage_; public: From a279e93d147bd230dd54d58fed4b8119dd92361d Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 19 Sep 2021 17:44:18 -0400 Subject: [PATCH 194/350] Print max disk-usage --- src/Read_CdBG.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 6599411b..6c1672c6 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -92,6 +92,9 @@ void Read_CdBG::construct() std::chrono::high_resolution_clock::time_point t_extract = std::chrono::high_resolution_clock::now(); std::cout << "Extracted the maximal unitigs. 
Time taken = " << std::chrono::duration_cast>(t_extract - t_dfa).count() << " seconds.\n"; + + const double max_disk_usage = std::max(edge_stats.temp_disk_usage(), vertex_stats.temp_disk_usage()) / (1024.0 * 1024.0 * 1024.0); + std::cout << "\nMaximum temporary disk-usage: " << max_disk_usage << "GB.\n"; } From 2d0f5ef2f70901e339f5593d85572d8bdf8ce5c0 Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 20 Sep 2021 12:01:06 -0400 Subject: [PATCH 195/350] Remove pedantic warnings --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d38ed2a8..96792a69 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,7 @@ set(CMAKE_C_EXTENSIONS OFF) # Bundle the warning flags that we want to pass on to the compiler. # Disable unknown pragmas, b/c bbhash uses them extensively. # Reference: https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html -set(WARNING_FLAGS -Wall -Wextra -pedantic) +set(WARNING_FLAGS -Wall -Wextra) set(SUPPRESS_WARNING_FLAGS -Wno-unknown-pragmas) # Bundle the extra optimization flags (not associated with the `-O` levels) From 9a4095d1fd04a05f2a0e390452ab1d6515f45bc8 Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 20 Sep 2021 12:53:04 -0400 Subject: [PATCH 196/350] Better false sharing avoidance --- CMakeLists.txt | 7 +++++++ include/Kmer_SPMC_Iterator.hpp | 9 ++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 96792a69..a5129f62 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,6 +49,13 @@ if(VALIDATION_MODE) add_compile_definitions(VALIDATION_MODE) endif() +execute_process( + COMMAND getconf LEVEL1_DCACHE_LINESIZE + COMMAND tr -d '\n' + OUTPUT_VARIABLE L1_CACHE_LINE_SIZE +) +add_compile_definitions(L1_CACHE_LINE_SIZE=${L1_CACHE_LINE_SIZE}) + # Search the file system for the appropriate threads package for this platform, and then set # the `CMAKE_THREAD_LIBS_INIT` variable (and some other variables as well). diff --git a/include/Kmer_SPMC_Iterator.hpp b/include/Kmer_SPMC_Iterator.hpp index 8a798b0d..ae1eef01 100644 --- a/include/Kmer_SPMC_Iterator.hpp +++ b/include/Kmer_SPMC_Iterator.hpp @@ -14,15 +14,18 @@ // Data required by the consumers to correctly parse raw binary k-mers. -struct Consumer_Data +struct +#ifdef L1_CACHE_LINE_SIZE + alignas(L1_CACHE_LINE_SIZE) +#endif + Consumer_Data { uint8_t* suff_buf{nullptr}; // Buffer for the raw binary suffixes of the k-mers. uint64_t kmers_available; // Number of k-mers present in the current buffer. uint64_t kmers_parsed; // Number of k-mers parsed from the current buffers. std::vector> pref_buf; // Buffer for the raw binary prefixes of the k-mers, in the form: std::vector>::iterator pref_it; // Pointer to the prefix to start parsing k-mers from. - uint64_t pad_[1]; // Padding to avoid false-sharing. - // TODO: use better soln: https://en.cppreference.com/w/cpp/thread/hardware_destructive_interference_size + // uint64_t pad_[1]; // Padding to avoid false-sharing. 
}; // An "iterator" class to iterate over a k-mer database on disk, where a single producer thread From 32aa7e66f4b33cb8f20e1818c206acd36860cec5 Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 20 Sep 2021 12:55:40 -0400 Subject: [PATCH 197/350] Must have cache size --- include/Kmer_SPMC_Iterator.hpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/include/Kmer_SPMC_Iterator.hpp b/include/Kmer_SPMC_Iterator.hpp index ae1eef01..e8a21b0e 100644 --- a/include/Kmer_SPMC_Iterator.hpp +++ b/include/Kmer_SPMC_Iterator.hpp @@ -14,10 +14,7 @@ // Data required by the consumers to correctly parse raw binary k-mers. -struct -#ifdef L1_CACHE_LINE_SIZE - alignas(L1_CACHE_LINE_SIZE) -#endif +struct alignas(L1_CACHE_LINE_SIZE) Consumer_Data { uint8_t* suff_buf{nullptr}; // Buffer for the raw binary suffixes of the k-mers. From 9bbb8d3671fc80b8f1dea18e9f4299368f182e23 Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 20 Sep 2021 13:59:42 -0400 Subject: [PATCH 198/350] Use infinite jobs for KMC --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a5129f62..f509fc1f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -120,7 +120,7 @@ ExternalProject_Add(prj_kmc BUILD_IN_SOURCE TRUE INSTALL_DIR ${CMAKE_SOURCE_DIR}/external/ CONFIGURE_COMMAND "" - BUILD_COMMAND make -j16 + BUILD_COMMAND make -j INSTALL_COMMAND cp bin/libkmc_core.a ${EXT_LIB} && cp include/kmc_runner.h ${EXT_INCLUDE} ) From 2202ff035a8b28859e9347a4ff4ae878cb20afb1 Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 20 Sep 2021 14:50:08 -0400 Subject: [PATCH 199/350] Break timing between unipaths and DCCs --- src/Detached_Cycles_Extractor.cpp | 2 +- src/Read_CdBG.cpp | 2 +- src/Read_CdBG_Extractor.cpp | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Detached_Cycles_Extractor.cpp b/src/Detached_Cycles_Extractor.cpp index 7a130d5e..348a5703 100644 --- a/src/Detached_Cycles_Extractor.cpp +++ b/src/Detached_Cycles_Extractor.cpp @@ -30,7 +30,7 @@ void Read_CdBG_Extractor::extract_detached_cycles(const std::string& vertex_d std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); double elapsed_seconds = std::chrono::duration_cast>(t_end - t_start).count(); - std::cout << "Done extracting the cycles. Time taken = " << elapsed_seconds << " seconds.\n"; + std::cout << "Extracted the detached chordless cycles. Time taken = " << elapsed_seconds << " seconds.\n"; } diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 6c1672c6..00189c81 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -91,7 +91,7 @@ void Read_CdBG::construct() } std::chrono::high_resolution_clock::time_point t_extract = std::chrono::high_resolution_clock::now(); - std::cout << "Extracted the maximal unitigs. Time taken = " << std::chrono::duration_cast>(t_extract - t_dfa).count() << " seconds.\n"; + std::cout << "Extracted the maximal unitigs and DCCs. 
Time taken = " << std::chrono::duration_cast>(t_extract - t_dfa).count() << " seconds.\n"; const double max_disk_usage = std::max(edge_stats.temp_disk_usage(), vertex_stats.temp_disk_usage()) / (1024.0 * 1024.0 * 1024.0); std::cout << "\nMaximum temporary disk-usage: " << max_disk_usage << "GB.\n"; diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 532a70cd..3e2653d1 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -18,7 +18,7 @@ Read_CdBG_Extractor::Read_CdBG_Extractor(const Build_Params& params, Kmer_Has template void Read_CdBG_Extractor::extract_maximal_unitigs(const std::string& vertex_db_path, const std::string& output_file_path) { - // std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); + std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); // Construct a thread pool. @@ -59,9 +59,9 @@ void Read_CdBG_Extractor::extract_maximal_unitigs(const std::string& vertex_d " I.e. the cycles are graph components exclusively on their own.\n\n"; - // std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); - // double elapsed_seconds = std::chrono::duration_cast>(t_end - t_start).count(); - // std::cout << "Done extracting the maximal unitigs. Time taken = " << elapsed_seconds << " seconds.\n"; + std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); + double elapsed_seconds = std::chrono::duration_cast>(t_end - t_start).count(); + std::cout << "Extracted the maximal unitigs. Time taken = " << elapsed_seconds << " seconds.\n"; } From d121db40c7d81dc4bab0c4035a5b2dd61e409fad Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 20 Sep 2021 16:17:00 -0400 Subject: [PATCH 200/350] Remove ununsed var --- include/Kmer.hpp | 18 +++++++++++++----- src/kmer_Enumerator.cpp | 2 +- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/include/Kmer.hpp b/include/Kmer.hpp index 276eb52d..011901b8 100644 --- a/include/Kmer.hpp +++ b/include/Kmer.hpp @@ -70,6 +70,10 @@ class Kmer: public DNA_Utility // `label[kmer_idx,...,kmer_idx + k - 1]`. Kmer(const char* label, size_t kmer_idx); + // Constructs a k-mer from the provided characters at + // `label[0, ..., k - 1]`. + Kmer(const char* label); + // Constructs a k-mer from the provided string `label`. Kmer(const std::string& label); @@ -272,7 +276,7 @@ inline void Kmer::left_shift(char(*)[B == 0]) template -inline uint64_t Kmer::to_u64(uint64_t seed) const +inline uint64_t Kmer::to_u64(const uint64_t seed) const { constexpr const uint16_t NUM_BYTES = (k + 3) / 4; return XXH3_64bits_withSeed(kmer_data, NUM_BYTES, seed); @@ -289,6 +293,7 @@ template inline Kmer::Kmer(const char* const label, const size_t kmer_idx): Kmer() { + // TODO: avoid the chaining left-shift at each turn. Insert the 2-bit base directly at it's target position. 
for(size_t idx = kmer_idx; idx < kmer_idx + k; ++idx) { const DNA::Base base = map_base(label[idx]); @@ -300,14 +305,17 @@ inline Kmer::Kmer(const char* const label, const size_t kmer_idx): template -inline Kmer::Kmer(const std::string& label): - Kmer(label.c_str(), 0) +inline Kmer::Kmer(const char* const label): Kmer(label, 0) +{} + + +template +inline Kmer::Kmer(const std::string& label): Kmer(label.c_str()) {} template -inline Kmer::Kmer(const std::string& label, const size_t kmer_idx): - Kmer(label.c_str(), kmer_idx) +inline Kmer::Kmer(const std::string& label, const size_t kmer_idx): Kmer(label.c_str(), kmer_idx) {} diff --git a/src/kmer_Enumerator.cpp b/src/kmer_Enumerator.cpp index ac0a9e85..5b1dfb7d 100644 --- a/src/kmer_Enumerator.cpp +++ b/src/kmer_Enumerator.cpp @@ -8,7 +8,7 @@ kmer_Enumeration_Stats kmer_Enumerator::enumerate( const uint16_t thread_count, const std::size_t max_memory, const bool strict_memory, const bool estimate_mem_usage, const std::string& working_dir_path, const std::string& output_db_path) { - FunnyProgress progress; + // FunnyProgress progress; std::size_t memory = std::max(max_memory, min_memory); stage1_params From 00d7c33932eb64de3f84b8f15c8ef353b24e20f8 Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 20 Sep 2021 16:32:22 -0400 Subject: [PATCH 201/350] Fix compilation --- include/utility.hpp | 1 + src/utility.cpp | 2 ++ 2 files changed, 3 insertions(+) diff --git a/include/utility.hpp b/include/utility.hpp index adca1e1c..4bc1ff7e 100644 --- a/include/utility.hpp +++ b/include/utility.hpp @@ -4,6 +4,7 @@ #include +#include // TODO: wrap everything here in some namespaces. diff --git a/src/utility.cpp b/src/utility.cpp index bf2d8fc6..49fc60ed 100644 --- a/src/utility.cpp +++ b/src/utility.cpp @@ -7,6 +7,8 @@ #include #include #include +#include +#include std::string get_random_string(const size_t len, const char* const alphabet) From 51c5994e347a62492e292afd9401037930c7d696 Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 21 Sep 2021 17:46:57 -0400 Subject: [PATCH 202/350] Add SPSS CLI-arg --- include/Build_Params.hpp | 10 ++++++++++ src/main.cpp | 3 +++ 2 files changed, 13 insertions(+) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index 6e5c60a2..dff3382f 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -31,6 +31,7 @@ class Build_Params const std::string output_file_path_; // Path to the output file. const cuttlefish::Output_Format output_format_; // Output format (0: txt, 1: GFAv1, 2: GFAv2). const std::string working_dir_path_; // Path to the working directory (for temporary files). + const bool spss_; // Whether to extract a set of simplitigs, i.e. an SPSS (Spectrum-Preserving String Set). const bool remove_kmc_db_; // Option to remove the KMC database, once no longer required. const std::string mph_file_path_; // Optional path to file storing an MPH over the k-mer set. const std::string buckets_file_path_; // Optional path to file storing the hash table buckets for the k-mer set. @@ -57,6 +58,7 @@ class Build_Params const std::string& output_file_path, const uint8_t output_format, const std::string& working_dir_path, + const bool spss, const bool remove_kmc_db, const std::string& mph_file_path, const std::string& buckets_file_path, @@ -76,6 +78,7 @@ class Build_Params output_file_path_(output_file_path), output_format_(cuttlefish::Output_Format(output_format)), working_dir_path_(working_dir_path.back() == '/' ? 
working_dir_path : working_dir_path + "/"), + spss_(spss), remove_kmc_db_(remove_kmc_db), mph_file_path_(mph_file_path), buckets_file_path_(buckets_file_path), @@ -177,6 +180,13 @@ class Build_Params } + // Returns whether to extract a set of simplitigs, i.e. an SPSS (Spectrum-Preserving String Set). + bool spss() const + { + return spss_; + } + + // Returns the boolean flag for removing the KMC database. bool remove_kmc_db() const { diff --git a/src/main.cpp b/src/main.cpp index f85d33ea..0d0a771c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -37,6 +37,7 @@ void build(int argc, char** argv) ("o,output", "output file", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("f,format", "output format (0: txt, 1: GFA 1.0, 2: GFA 2.0, 3: GFA-reduced)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::OP_FORMAT))) ("w,work-dir", "working directory", cxxopts::value()->default_value(cuttlefish::_default::WORK_DIR)) + ("spss", "extract a set of simplitigs that form an SPSS (Spectrum-Preserving String Set) of the input") ("rm", "remove the KMC database") // TODO: repurpose the following two options ("mph", "minimal perfect hash (BBHash) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) @@ -72,6 +73,7 @@ void build(int argc, char** argv) const auto format = result["format"].as(); const auto remove_kmc_db = result["rm"].as(); const auto working_dir = result["work-dir"].as(); + const auto spss = result["spss"].as(); const auto mph_file = result["mph"].as(); const auto buckets_file = result["buckets"].as(); const auto save_vertices = result["save-vertices"].as(); @@ -83,6 +85,7 @@ void build(int argc, char** argv) refs, lists, dirs, k, cutoff, kmer_database, edge_database, thread_count, max_memory, strict_memory, output_file, format, working_dir, + spss, remove_kmc_db, mph_file, buckets_file, save_vertices, json_file, dcc_opt, extract_cycles); if(!params.is_valid()) From 81f5f8aca868c17a1a1dbda5371c0dd0bb9dd319 Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 21 Sep 2021 17:53:56 -0400 Subject: [PATCH 203/350] Include end-purpose of edge-processing in driver name --- include/Read_CdBG_Constructor.hpp | 4 ++-- src/Read_CdBG_Constructor.cpp | 2 +- src/Thread_Pool.cpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/Read_CdBG_Constructor.hpp b/include/Read_CdBG_Constructor.hpp index 8cef4d9b..e4e6d7b9 100644 --- a/include/Read_CdBG_Constructor.hpp +++ b/include/Read_CdBG_Constructor.hpp @@ -42,8 +42,8 @@ class Read_CdBG_Constructor // Processes the edges provided to the thread with id `thread_id` from the parser `edge_parser`, // i.e. makes state-transitions for the DFA of the vertices `u` and `v` for each bidirected edge - // `(u, v)` provided to that thread. - void process_edges(Kmer_SPMC_Iterator* edge_parser, uint16_t thread_id); + // `(u, v)` provided to that thread, in order to construct a CdBG. 
+ void process_cdbg_edges(Kmer_SPMC_Iterator* edge_parser, uint16_t thread_id); // Adds the information of an incident edge `e` to the side `s` of some vertex `v`, all wrapped // inside the edge-endpoint object `endpoint` — making the appropriate state transitions for the diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index fb19e003..b33a30b5 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -78,7 +78,7 @@ void Read_CdBG_Constructor::distribute_states_computation(Kmer_SPMC_Iterator< template -void Read_CdBG_Constructor::process_edges(Kmer_SPMC_Iterator* const edge_parser, const uint16_t thread_id) +void Read_CdBG_Constructor::process_cdbg_edges(Kmer_SPMC_Iterator* const edge_parser, const uint16_t thread_id) { // Data locations to be reused per each edge processed. Edge e; // For the edges to be processed one-by-one. diff --git a/src/Thread_Pool.cpp b/src/Thread_Pool.cpp index e31d0035..df9d7a9e 100644 --- a/src/Thread_Pool.cpp +++ b/src/Thread_Pool.cpp @@ -96,7 +96,7 @@ void Thread_Pool::task(const uint16_t thread_id) { const Read_dBG_Compaction_Params& params = read_dBG_compaction_params[thread_id]; static_cast*>(dBG)-> - process_edges(static_cast*>(params.parser), params.thread_id); + process_cdbg_edges(static_cast*>(params.parser), params.thread_id); } break; From 7e9a673876b796dba1d8eda4f87cb6a64bc4fa82 Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 21 Sep 2021 20:55:46 -0400 Subject: [PATCH 204/350] Not mess with precise timing log --- src/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.cpp b/src/main.cpp index f85d33ea..ff985211 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -91,7 +91,7 @@ void build(int argc, char** argv) std::exit(EXIT_FAILURE); } - std::cout.precision(3); + // std::cout.precision(3); const std::string dBg_type(params.is_read_graph() ? "read" : "reference"); From 38d7330997cc7e3f547e01f79affc996d048c474 Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 21 Sep 2021 21:01:52 -0400 Subject: [PATCH 205/350] Wrap different edge-processing types --- include/Read_CdBG_Constructor.hpp | 9 +++++++++ src/Read_CdBG_Constructor.cpp | 15 +++++++++++++++ src/Thread_Pool.cpp | 2 +- 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/include/Read_CdBG_Constructor.hpp b/include/Read_CdBG_Constructor.hpp index e4e6d7b9..694deba9 100644 --- a/include/Read_CdBG_Constructor.hpp +++ b/include/Read_CdBG_Constructor.hpp @@ -40,11 +40,20 @@ class Read_CdBG_Constructor // for the edges to be processed by making appropriate state transitions for their endpoints. void distribute_states_computation(Kmer_SPMC_Iterator* edge_parser, Thread_Pool& thread_pool); + // Processes the edges provided to the thread with id `thread_id` from the parser `edge_parser`, + // based on the end-purpose of extracting either the maximal unitigs or the simplitigs. + void process_edges(Kmer_SPMC_Iterator* edge_parser, uint16_t thread_id); + // Processes the edges provided to the thread with id `thread_id` from the parser `edge_parser`, // i.e. makes state-transitions for the DFA of the vertices `u` and `v` for each bidirected edge // `(u, v)` provided to that thread, in order to construct a CdBG. void process_cdbg_edges(Kmer_SPMC_Iterator* edge_parser, uint16_t thread_id); + // Processes the edges provided to the thread with id `thread_id` from the parser `edge_parser`, + // i.e. 
makes state-transitions for the DFA of the vertices `u` and `v` for each bidirected edge + // `(u, v)` provided to that thread, in order to construct an SPSS. + void process_spss_edges(Kmer_SPMC_Iterator* edge_parser, uint16_t thread_id); + // Adds the information of an incident edge `e` to the side `s` of some vertex `v`, all wrapped // inside the edge-endpoint object `endpoint` — making the appropriate state transitions for the // DFA of `v`. Also stores the edge encodings of the incidence information of the side `s` before diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index b33a30b5..ad97bdea 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -77,6 +77,16 @@ void Read_CdBG_Constructor::distribute_states_computation(Kmer_SPMC_Iterator< } +template +void Read_CdBG_Constructor::process_edges(Kmer_SPMC_Iterator* const edge_parser, const uint16_t thread_id) +{ + if(params.spss()) + process_spss_edges(edge_parser, thread_id); + else + process_cdbg_edges(edge_parser, thread_id); +} + + template void Read_CdBG_Constructor::process_cdbg_edges(Kmer_SPMC_Iterator* const edge_parser, const uint16_t thread_id) { @@ -133,6 +143,11 @@ void Read_CdBG_Constructor::process_cdbg_edges(Kmer_SPMC_Iterator* con } +template +void Read_CdBG_Constructor::process_spss_edges(Kmer_SPMC_Iterator* const edge_parser, const uint16_t thread_id) +{} + + template uint64_t Read_CdBG_Constructor::vertex_count() const { diff --git a/src/Thread_Pool.cpp b/src/Thread_Pool.cpp index df9d7a9e..e31d0035 100644 --- a/src/Thread_Pool.cpp +++ b/src/Thread_Pool.cpp @@ -96,7 +96,7 @@ void Thread_Pool::task(const uint16_t thread_id) { const Read_dBG_Compaction_Params& params = read_dBG_compaction_params[thread_id]; static_cast*>(dBG)-> - process_cdbg_edges(static_cast*>(params.parser), params.thread_id); + process_edges(static_cast*>(params.parser), params.thread_id); } break; From 8bcdd73da10bf3532638604d2d3adae3abae4bf1 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 22 Sep 2021 13:08:05 -0400 Subject: [PATCH 206/350] Correct terminology --- include/Build_Params.hpp | 12 ++++++------ src/Read_CdBG_Constructor.cpp | 2 +- src/main.cpp | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index dff3382f..f22def02 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -31,7 +31,7 @@ class Build_Params const std::string output_file_path_; // Path to the output file. const cuttlefish::Output_Format output_format_; // Output format (0: txt, 1: GFAv1, 2: GFAv2). const std::string working_dir_path_; // Path to the working directory (for temporary files). - const bool spss_; // Whether to extract a set of simplitigs, i.e. an SPSS (Spectrum-Preserving String Set). + const bool simplitigs_; // Whether to extract a set of maximal simplitigs, i.e. vertex-disjoint paths. const bool remove_kmc_db_; // Option to remove the KMC database, once no longer required. const std::string mph_file_path_; // Optional path to file storing an MPH over the k-mer set. const std::string buckets_file_path_; // Optional path to file storing the hash table buckets for the k-mer set. 
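
(An illustrative aside, not part of these patches: with the terminology settled on in this commit, the vertex-disjoint path cover is requested through `Build_Params::simplitigs()` and the renamed `--simplitigs` command-line option, so an invocation would look roughly like `cuttlefish build --read -l reads.lst -k 27 -c 2 -t 16 -w tmp/ -o cdbg --simplitigs`, where the input list, working directory, and output name are hypothetical.)
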
@@ -58,7 +58,7 @@ class Build_Params const std::string& output_file_path, const uint8_t output_format, const std::string& working_dir_path, - const bool spss, + const bool simplitig, const bool remove_kmc_db, const std::string& mph_file_path, const std::string& buckets_file_path, @@ -78,7 +78,7 @@ class Build_Params output_file_path_(output_file_path), output_format_(cuttlefish::Output_Format(output_format)), working_dir_path_(working_dir_path.back() == '/' ? working_dir_path : working_dir_path + "/"), - spss_(spss), + simplitigs_(simplitig), remove_kmc_db_(remove_kmc_db), mph_file_path_(mph_file_path), buckets_file_path_(buckets_file_path), @@ -180,10 +180,10 @@ class Build_Params } - // Returns whether to extract a set of simplitigs, i.e. an SPSS (Spectrum-Preserving String Set). - bool spss() const + // Returns whether to extract a set of maximal simplitigs, i.e. vertex-disjoint paths. + bool simplitigs() const { - return spss_; + return simplitigs_; } diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index ad97bdea..95363e11 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -80,7 +80,7 @@ void Read_CdBG_Constructor::distribute_states_computation(Kmer_SPMC_Iterator< template void Read_CdBG_Constructor::process_edges(Kmer_SPMC_Iterator* const edge_parser, const uint16_t thread_id) { - if(params.spss()) + if(params.simplitigs()) process_spss_edges(edge_parser, thread_id); else process_cdbg_edges(edge_parser, thread_id); diff --git a/src/main.cpp b/src/main.cpp index 0d0a771c..43b32705 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -37,7 +37,7 @@ void build(int argc, char** argv) ("o,output", "output file", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("f,format", "output format (0: txt, 1: GFA 1.0, 2: GFA 2.0, 3: GFA-reduced)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::OP_FORMAT))) ("w,work-dir", "working directory", cxxopts::value()->default_value(cuttlefish::_default::WORK_DIR)) - ("spss", "extract a set of simplitigs that form an SPSS (Spectrum-Preserving String Set) of the input") + ("simplitigs", "extract a set of maximal simplitigs, i.e. 
vertex-disjoint paths") ("rm", "remove the KMC database") // TODO: repurpose the following two options ("mph", "minimal perfect hash (BBHash) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) @@ -73,7 +73,7 @@ void build(int argc, char** argv) const auto format = result["format"].as(); const auto remove_kmc_db = result["rm"].as(); const auto working_dir = result["work-dir"].as(); - const auto spss = result["spss"].as(); + const auto simplitigs = result["simplitigs"].as(); const auto mph_file = result["mph"].as(); const auto buckets_file = result["buckets"].as(); const auto save_vertices = result["save-vertices"].as(); @@ -85,7 +85,7 @@ void build(int argc, char** argv) refs, lists, dirs, k, cutoff, kmer_database, edge_database, thread_count, max_memory, strict_memory, output_file, format, working_dir, - spss, + simplitigs, remove_kmc_db, mph_file, buckets_file, save_vertices, json_file, dcc_opt, extract_cycles); if(!params.is_valid()) From 673b911713ed7c3c7f91c86cbe90fb32b073b95f Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 22 Sep 2021 14:41:43 -0400 Subject: [PATCH 207/350] Add concurrent hash table update for two entries --- include/Kmer_Hash_Table.hpp | 40 +++++++++++++++++++++++++++++++++++++ include/Sparse_Lock.hpp | 24 ++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index 3eb0afd6..03f290d1 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -110,6 +110,13 @@ class Kmer_Hash_Table // `bucket_id` with the state-value `state`. void update(uint64_t bucket_id, const State_Read_Space& state); + // Attempts to update the hash table entries for the API objects `api_1` and + // `api_2` concurrently, i.e. both the updates need to happen in a tied manner + // — both successful or failing. Returns `true` iff the updates succeed. If + // either of the table positions contains a different state than the one + // expected by the API objects, then the concurrent update fails. + bool update_concurrent(Kmer_Hash_Entry_API& api_1, Kmer_Hash_Entry_API& api_2); + // Returns the number of keys in the hash table. uint64_t size() const; @@ -218,6 +225,39 @@ inline void Kmer_Hash_Table::update(const uint64_t bucket_id, c } +template +inline bool Kmer_Hash_Table::update_concurrent(Kmer_Hash_Entry_API& api_1, Kmer_Hash_Entry_API& api_2) +{ + Kmer_Hash_Entry_API* api_l = &api_1; + Kmer_Hash_Entry_API* api_r = &api_2; + uint64_t bucket_l = std::distance(hash_table.begin(), &(api_1.bv_entry)); + uint64_t bucket_r = std::distance(hash_table.begin(), &(api_2.bv_entry)); + + // Resolution for potential deadlocks. + if(bucket_l > bucket_r) + std::swap(api_l, api_r), + std::swap(bucket_l, bucket_r); + + + sparse_lock.lock(bucket_l); + bool success = (api_l->bv_entry == api_l->get_read_state()); + if(success) + { + sparse_lock.lock_if_different(bucket_l, bucket_r); + + success = (api_r->bv_entry == api_r->get_read_state()); + if(success) + api_l->bv_entry = api_l->get_current_state(), + api_r->bv_entry = api_r->get_current_state(); + + sparse_lock.unlock_if_different(bucket_l, bucket_r); + } + sparse_lock.unlock(bucket_l); + + return success; +} + + template inline uint64_t Kmer_Hash_Table::size() const { diff --git a/include/Sparse_Lock.hpp b/include/Sparse_Lock.hpp index 6703339c..8b6ac5e5 100644 --- a/include/Sparse_Lock.hpp +++ b/include/Sparse_Lock.hpp @@ -35,6 +35,14 @@ class Sparse_Lock // Releases lock for the entry with index `idx`. 
void unlock(size_t idx); + + // Acquires lock for the entry with index `curr_idx` iff the corresponding lock for the index `prev_idx` + // is a different lock. + void lock_if_different(std::size_t prev_idx, std::size_t curr_idx); + + // Releases lock for the entry with index `curr_idx` iff the corresponding lock for the index `prev_idx` + // is a different lock. + void unlock_if_different(std::size_t prev_idx, std::size_t curr_idx); }; @@ -62,5 +70,21 @@ inline void Sparse_Lock::unlock(const size_t idx) } +template +inline void Sparse_Lock::lock_if_different(const std::size_t prev_idx, const std::size_t curr_idx) +{ + if((curr_idx >> lg_per_lock_range) != (prev_idx >> lg_per_lock_range)) + lock_[curr_idx >> lg_per_lock_range].lock(); +} + + +template +inline void Sparse_Lock::unlock_if_different(const std::size_t prev_idx, const std::size_t curr_idx) +{ + if((curr_idx >> lg_per_lock_range) != (prev_idx >> lg_per_lock_range)) + lock_[curr_idx >> lg_per_lock_range].unlock(); +} + + #endif From 955ae03e66dcc1df685586ff94f7d432aa511b29 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 22 Sep 2021 22:45:31 -0400 Subject: [PATCH 208/350] Add dedicated lock-id func --- include/Sparse_Lock.hpp | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/include/Sparse_Lock.hpp b/include/Sparse_Lock.hpp index 8b6ac5e5..12e76a51 100644 --- a/include/Sparse_Lock.hpp +++ b/include/Sparse_Lock.hpp @@ -25,6 +25,10 @@ class Sparse_Lock std::vector lock_; // The collection of locks. + // Returns the ID of the lock that the index `idx` corresponds to. + std::size_t lock_id(std::size_t idx) const; + + public: // Constructs a sparse-lock collection consisting of `lock_count` locks, for `range_size` number of entries. @@ -56,33 +60,40 @@ inline Sparse_Lock::Sparse_Lock(const size_t range_size, const size_t lo {} +template +inline std::size_t Sparse_Lock::lock_id(const std::size_t idx) const +{ + return idx >> lg_per_lock_range; +} + + template inline void Sparse_Lock::lock(const size_t idx) { - lock_[idx >> lg_per_lock_range].lock(); + lock_[lock_id(idx)].lock(); } template inline void Sparse_Lock::unlock(const size_t idx) { - lock_[idx >> lg_per_lock_range].unlock(); + lock_[lock_id(idx)].unlock(); } template inline void Sparse_Lock::lock_if_different(const std::size_t prev_idx, const std::size_t curr_idx) { - if((curr_idx >> lg_per_lock_range) != (prev_idx >> lg_per_lock_range)) - lock_[curr_idx >> lg_per_lock_range].lock(); + if(lock_id(curr_idx) != lock_id(prev_idx)) + lock_[lock_id(curr_idx)].lock(); } template inline void Sparse_Lock::unlock_if_different(const std::size_t prev_idx, const std::size_t curr_idx) { - if((curr_idx >> lg_per_lock_range) != (prev_idx >> lg_per_lock_range)) - lock_[curr_idx >> lg_per_lock_range].unlock(); + if(lock_id(curr_idx) != lock_id(prev_idx)) + lock_[lock_id(curr_idx)].unlock(); } From 9a8416799a29fdf86be37371da105fb77bee40a9 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 22 Sep 2021 23:01:20 -0400 Subject: [PATCH 209/350] Add simplitigs extraction --- include/Read_CdBG_Constructor.hpp | 35 ++++++++++++++++++++++++++++-- src/Read_CdBG_Constructor.cpp | 36 +++++++++++++++++++++++++------ 2 files changed, 63 insertions(+), 8 deletions(-) diff --git a/include/Read_CdBG_Constructor.hpp b/include/Read_CdBG_Constructor.hpp index 694deba9..595b65e2 100644 --- a/include/Read_CdBG_Constructor.hpp +++ b/include/Read_CdBG_Constructor.hpp @@ -7,6 +7,7 @@ #include "globals.hpp" #include "Kmer_Hash_Table.hpp" #include "State_Read_Space.hpp" +#include 
"Edge.hpp" #include "Endpoint.hpp" #include "Build_Params.hpp" #include "Thread_Pool.hpp" @@ -51,8 +52,8 @@ class Read_CdBG_Constructor // Processes the edges provided to the thread with id `thread_id` from the parser `edge_parser`, // i.e. makes state-transitions for the DFA of the vertices `u` and `v` for each bidirected edge - // `(u, v)` provided to that thread, in order to construct an SPSS. - void process_spss_edges(Kmer_SPMC_Iterator* edge_parser, uint16_t thread_id); + // `(u, v)` provided to that thread, to construct a set of maximal simplitigs covering the dBG. + void process_simplitig_edges(Kmer_SPMC_Iterator* edge_parser, uint16_t thread_id); // Adds the information of an incident edge `e` to the side `s` of some vertex `v`, all wrapped // inside the edge-endpoint object `endpoint` — making the appropriate state transitions for the @@ -94,6 +95,13 @@ class Read_CdBG_Constructor // `v_end` through the unique edge encoded with `e` — making the appropriate state transition. void discard_neighbor_side(const Endpoint& v, cuttlefish::edge_encoding_t e); + // Adds the information of the edge `e = {u, v}` to its endpoint vertices `u` and `v` iff this + // edge connects sides of `u` and `v` that do not have any edges added yet, which ensures that + // neither of the vertices belong to two different simplitig paths; and makes the appropriate + // state transitions for the DFAs of `u` and `v`. Returns `false` iff the edge could not be + // added as a simplitig edge. + bool add_simplitig_edge(const Edge& e); + public: @@ -231,5 +239,28 @@ inline void Read_CdBG_Constructor::discard_neighbor_side(const Endpoint& v } +template +bool Read_CdBG_Constructor::add_simplitig_edge(const Edge& e) +{ + // Fetch the hash table entry for the vertices associated to the endpoints. + + Kmer_Hash_Entry_API bucket_u = hash_table[e.u().canonical()]; + State_Read_Space& st_u = bucket_u.get_state(); + if(st_u.edge_at(e.u().side()) != cuttlefish::edge_encoding_t::E) + return false; + + Kmer_Hash_Entry_API bucket_v = hash_table[e.v().canonical()]; + State_Read_Space& st_v = bucket_v.get_state(); + if(st_v.edge_at(e.v().side()) != cuttlefish::edge_encoding_t::E) + return false; + + + st_u.update_edge_at(e.u().side(), e.u().edge()); + st_v.update_edge_at(e.v().side(), e.v().edge()); + + return hash_table.update_concurrent(bucket_u, bucket_v); +} + + #endif diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index 95363e11..de86128e 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -81,7 +81,7 @@ template void Read_CdBG_Constructor::process_edges(Kmer_SPMC_Iterator* const edge_parser, const uint16_t thread_id) { if(params.simplitigs()) - process_spss_edges(edge_parser, thread_id); + process_simplitig_edges(edge_parser, thread_id); else process_cdbg_edges(edge_parser, thread_id); } @@ -91,7 +91,7 @@ template void Read_CdBG_Constructor::process_cdbg_edges(Kmer_SPMC_Iterator* const edge_parser, const uint16_t thread_id) { // Data locations to be reused per each edge processed. - Edge e; // For the edges to be processed one-by-one. + Edge e; // For the edges to be processed one-by-one; say this is between the vertices `u` and `v`. cuttlefish::edge_encoding_t e_front, e_back; // Edges incident to the front and to the back of a vertex with a crossing loop. cuttlefish::edge_encoding_t e_u_old, e_u_new; // Edges incident to some particular side of a vertex `u`, before and after the addition of a new edge. 
cuttlefish::edge_encoding_t e_v_old, e_v_new; // Edges incident to some particular side of a vertex `v`, before and after the addition of a new edge. @@ -99,6 +99,7 @@ void Read_CdBG_Constructor::process_cdbg_edges(Kmer_SPMC_Iterator* con uint64_t edge_count = 0; // Number of edges processed by this thread. uint64_t progress = 0; // Number of edges processed by the thread; is reset at reaching 1% of its approximate workload. + while(edge_parser->tasks_expected(thread_id)) if(edge_parser->value_at(thread_id, e.e())) { @@ -131,8 +132,6 @@ void Read_CdBG_Constructor::process_cdbg_edges(Kmer_SPMC_Iterator* con } edge_count++; - - progress_tracker.track_work(++progress); } @@ -144,8 +143,33 @@ void Read_CdBG_Constructor::process_cdbg_edges(Kmer_SPMC_Iterator* con template -void Read_CdBG_Constructor::process_spss_edges(Kmer_SPMC_Iterator* const edge_parser, const uint16_t thread_id) -{} +void Read_CdBG_Constructor::process_simplitig_edges(Kmer_SPMC_Iterator* const edge_parser, const uint16_t thread_id) +{ + Edge e; // For the edges to be processed one-by-one; say this is between the vertices `u` and `v`. + + uint64_t edge_count = 0; // Number of edges processed by this thread. + uint64_t progress = 0; // Number of edges processed by the thread; is reset at reaching 1% of its approximate workload. + + + while(edge_parser->tasks_expected(thread_id)) + if(edge_parser->value_at(thread_id, e.e())) + { + e.configure(hash_table); // A new edge (k + 1)-mer has been parsed; set information for its two endpoints. + + edge_count++; + progress_tracker.track_work(++progress); + + if(e.is_loop()) + continue; + else // It connects two endpoints `u` and `v` of two distinct vertex. + add_simplitig_edge(e); + } + + + lock.lock(); + edges_processed += edge_count; + lock.unlock(); +} template From 631574775260beb6263cb282a07e34ec8431df05 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 29 Sep 2021 14:46:59 -0400 Subject: [PATCH 210/350] Add peak-mem peeker --- include/utility.hpp | 4 ++++ src/utility.cpp | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/include/utility.hpp b/include/utility.hpp index 4bc1ff7e..55d55c10 100644 --- a/include/utility.hpp +++ b/include/utility.hpp @@ -45,6 +45,10 @@ const std::string filename(const std::string& file_path); // Moves the file present at path `from_path` to the path `to_path`. void move_file(const std::string& from_path, const std::string& to_path); +// Returns the maximum memory ("high-water-mark") used by the running +// process in bytes. Returns `0` in case of errors encountered. 
+std::size_t process_peak_memory(); + #endif diff --git a/src/utility.cpp b/src/utility.cpp index 49fc60ed..037b4075 100644 --- a/src/utility.cpp +++ b/src/utility.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -109,3 +110,37 @@ void move_file(const std::string& from_path, const std::string& to_path) ghc::filesystem::copy(from_path, to_path); ghc::filesystem::remove(from_path); } + + +std::size_t process_peak_memory() +{ + constexpr const char* process_file = "/proc/self/status"; + constexpr const char* peak_mem_field = "VmHWM:"; + constexpr std::size_t field_len = std::strlen(peak_mem_field); + + std::FILE* fp = std::fopen(process_file, "r"); + if(fp == NULL) + { + std::cerr << "Error opening the process information file.\n"; + return 0; + } + + char line[1024]; + std::size_t peak_mem = 0; + while(std::fgets(line, sizeof(line) - 1, fp)) + if(std::strncmp(line, peak_mem_field, field_len) == 0) + { + peak_mem = std::strtoul(line + field_len, NULL, 0); + break; + } + + + if(std::ferror(fp)) + { + std::cerr << "Error reading the process information file.\n"; + return 0; + } + + + return peak_mem * 1024; +} From 7529908089e7bd93abdae65d896b7263f0b4487f Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 29 Sep 2021 14:56:13 -0400 Subject: [PATCH 211/350] Add iterator memory peeker --- include/Kmer_SPMC_Iterator.hpp | 10 ++++++++++ include/kmc_api/Virtual_Prefix_File.hpp | 9 +++++++++ include/kmc_api/kmc_file.h | 11 +++++++++++ 3 files changed, 30 insertions(+) diff --git a/include/Kmer_SPMC_Iterator.hpp b/include/Kmer_SPMC_Iterator.hpp index e8a21b0e..1c10293c 100644 --- a/include/Kmer_SPMC_Iterator.hpp +++ b/include/Kmer_SPMC_Iterator.hpp @@ -128,6 +128,9 @@ class Kmer_SPMC_Iterator // Returns `true` iff a task is available for the consumer with id `consumer_id`. bool task_available(size_t consumer_id) const; + // Returns the memory (in bytes) used by the iterator. + std::size_t memory() const; + // Dummy methods. const iterator& operator++() { return *this; } Kmer operator*() { return Kmer(); } @@ -355,5 +358,12 @@ inline bool Kmer_SPMC_Iterator::task_available(const size_t consumer_id) cons } +template +inline std::size_t Kmer_SPMC_Iterator::memory() const +{ + return kmer_database.pref_buf_memory() + (consumer_count * BUF_SZ_PER_CONSUMER); +} + + #endif diff --git a/include/kmc_api/Virtual_Prefix_File.hpp b/include/kmc_api/Virtual_Prefix_File.hpp index ddaffef6..0036b818 100644 --- a/include/kmc_api/Virtual_Prefix_File.hpp +++ b/include/kmc_api/Virtual_Prefix_File.hpp @@ -55,6 +55,9 @@ class Virtual_Prefix_File // Returns the data at index `idx` of the prefix-file. uint64_t operator[](std::size_t idx); + + // Returns the size of the buffer in bytes. + constexpr std::size_t memory() const; }; @@ -89,5 +92,11 @@ inline uint64_t Virtual_Prefix_File::operator[](const std::size_t idx) } +inline constexpr std::size_t Virtual_Prefix_File::memory() const +{ + return buffer_elem_count * sizeof(uint64_t); +} + + #endif diff --git a/include/kmc_api/kmc_file.h b/include/kmc_api/kmc_file.h index aef5eaba..e47c8599 100644 --- a/include/kmc_api/kmc_file.h +++ b/include/kmc_api/kmc_file.h @@ -147,6 +147,9 @@ class CKMC_DB // where "abundance" is the count of remaining k-mers to be parsed having this "prefix". The // iterator is adjusted accordingly for the next parse operation from the buffers. template void parse_kmer_buf(std::vector>::iterator& prefix_it, const uint8_t* suff_buf, size_t buf_idx, Kmer& kmer) const; + + // Returns the memory (in bytes) used by the prefix file buffer. 
+ constexpr std::size_t pref_buf_memory() const; // Return next kmer in CKmerAPI &kmer. Return its counter in float &count. Return true if not EOF bool ReadNextKmer(CKmerAPI &kmer, float &count); @@ -607,6 +610,14 @@ inline void CKMC_DB::parse_kmer_buf(std::vector>:: kmer.from_KMC_data(kmc_data); } + +inline constexpr std::size_t CKMC_DB::pref_buf_memory() const +{ + return prefix_virt_buf.memory(); +} + + + #endif // ***** EOF From 5465829d98ad5019907b6da024a0d1f5300958ac Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 29 Sep 2021 16:38:37 -0400 Subject: [PATCH 212/350] Add static memory-peeker for iterator --- include/Kmer_SPMC_Iterator.hpp | 12 +++++++++++- include/kmc_api/Virtual_Prefix_File.hpp | 4 ++-- include/kmc_api/kmc_file.h | 6 +++--- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/include/Kmer_SPMC_Iterator.hpp b/include/Kmer_SPMC_Iterator.hpp index 1c10293c..70ba56ad 100644 --- a/include/Kmer_SPMC_Iterator.hpp +++ b/include/Kmer_SPMC_Iterator.hpp @@ -131,6 +131,9 @@ class Kmer_SPMC_Iterator // Returns the memory (in bytes) used by the iterator. std::size_t memory() const; + // Returns the memory (in bytes) to be used by an iterator supporting `consumer_count` consumers. + static std::size_t memory(std::size_t consumer_count); + // Dummy methods. const iterator& operator++() { return *this; } Kmer operator*() { return Kmer(); } @@ -361,7 +364,14 @@ inline bool Kmer_SPMC_Iterator::task_available(const size_t consumer_id) cons template inline std::size_t Kmer_SPMC_Iterator::memory() const { - return kmer_database.pref_buf_memory() + (consumer_count * BUF_SZ_PER_CONSUMER); + return CKMC_DB::pref_buf_memory() + (consumer_count * BUF_SZ_PER_CONSUMER); +} + + +template +inline std::size_t Kmer_SPMC_Iterator::memory(const std::size_t consumer_count) +{ + return CKMC_DB::pref_buf_memory() + (consumer_count * BUF_SZ_PER_CONSUMER); } diff --git a/include/kmc_api/Virtual_Prefix_File.hpp b/include/kmc_api/Virtual_Prefix_File.hpp index 0036b818..815a1922 100644 --- a/include/kmc_api/Virtual_Prefix_File.hpp +++ b/include/kmc_api/Virtual_Prefix_File.hpp @@ -57,7 +57,7 @@ class Virtual_Prefix_File uint64_t operator[](std::size_t idx); // Returns the size of the buffer in bytes. - constexpr std::size_t memory() const; + static constexpr std::size_t memory(); }; @@ -92,7 +92,7 @@ inline uint64_t Virtual_Prefix_File::operator[](const std::size_t idx) } -inline constexpr std::size_t Virtual_Prefix_File::memory() const +inline constexpr std::size_t Virtual_Prefix_File::memory() { return buffer_elem_count * sizeof(uint64_t); } diff --git a/include/kmc_api/kmc_file.h b/include/kmc_api/kmc_file.h index e47c8599..31d90fd3 100644 --- a/include/kmc_api/kmc_file.h +++ b/include/kmc_api/kmc_file.h @@ -149,7 +149,7 @@ class CKMC_DB template void parse_kmer_buf(std::vector>::iterator& prefix_it, const uint8_t* suff_buf, size_t buf_idx, Kmer& kmer) const; // Returns the memory (in bytes) used by the prefix file buffer. - constexpr std::size_t pref_buf_memory() const; + static constexpr std::size_t pref_buf_memory(); // Return next kmer in CKmerAPI &kmer. Return its counter in float &count. 
Return true if not EOF bool ReadNextKmer(CKmerAPI &kmer, float &count); @@ -611,9 +611,9 @@ inline void CKMC_DB::parse_kmer_buf(std::vector>:: } -inline constexpr std::size_t CKMC_DB::pref_buf_memory() const +inline constexpr std::size_t CKMC_DB::pref_buf_memory() { - return prefix_virt_buf.memory(); + return Virtual_Prefix_File::memory(); } From f5a7be95c46e764805668ae864a6900f06d3ce19 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 29 Sep 2021 16:45:41 -0400 Subject: [PATCH 213/350] Better deligate ctr --- src/Kmer_Hash_Table.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/Kmer_Hash_Table.cpp b/src/Kmer_Hash_Table.cpp index ca94382d..663fc213 100644 --- a/src/Kmer_Hash_Table.cpp +++ b/src/Kmer_Hash_Table.cpp @@ -11,10 +11,7 @@ template -Kmer_Hash_Table::Kmer_Hash_Table(const std::string& kmc_db_path): - kmc_db_path(kmc_db_path), - kmer_count{Kmer_Container::size(kmc_db_path)}, - sparse_lock(kmer_count, lock_count) +Kmer_Hash_Table::Kmer_Hash_Table(const std::string& kmc_db_path): Kmer_Hash_Table(kmc_db_path, Kmer_Container::size(kmc_db_path)) {} From 1abeb2c5f819b450288e962c4db0e99b5a218dc4 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 29 Sep 2021 16:54:10 -0400 Subject: [PATCH 214/350] =?UTF-8?q?Make=20=CE=B3=20object-specific?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/Kmer_Hash_Table.hpp | 2 +- src/Kmer_Hash_Table.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index 3eb0afd6..99aafd89 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -26,7 +26,7 @@ class Kmer_Hash_Table private: // Lowest bits/elem is achieved with gamma = 1, higher values lead to larger mphf but faster construction/query. - constexpr static double GAMMA_FACTOR = 2.0; + double gamma = 2.0; // Path to the underlying k-mer database, over which the hash table is constructed. const std::string kmc_db_path; diff --git a/src/Kmer_Hash_Table.cpp b/src/Kmer_Hash_Table.cpp index 663fc213..dff94b80 100644 --- a/src/Kmer_Hash_Table.cpp +++ b/src/Kmer_Hash_Table.cpp @@ -47,7 +47,7 @@ void Kmer_Hash_Table::build_mph_function(const uint16_t thread_ // auto data_iterator = boomphf::range(kmer_container.buf_begin(), kmer_container.buf_end()); const auto data_iterator = boomphf::range(kmer_container.spmc_begin(thread_count), kmer_container.spmc_end(thread_count)); - mph = new mphf_t(kmer_count, data_iterator, working_dir_path, thread_count, GAMMA_FACTOR); + mph = new mphf_t(kmer_count, data_iterator, working_dir_path, thread_count, gamma); std::cout << "Built the MPHF in memory.\n"; From 058aeb0001d6a92bf2a689a6ce57b587761975bf Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 29 Sep 2021 18:51:51 -0400 Subject: [PATCH 215/350] Add memory-constrained hash-table design --- include/Kmer_Hash_Table.hpp | 13 ++++++++++++- src/Kmer_Hash_Table.cpp | 21 +++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index 99aafd89..15238cbe 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -25,8 +25,14 @@ class Kmer_Hash_Table private: + // The minimum gamma-value that we require for BBHash. + static constexpr uint8_t gamma_min = 2; + + // Empiricial bits-per-key requirement for each gamma in the range (0, 10]. 
+ static constexpr double bits_per_gamma[] = {0, 3.07, 3.71, 4.71, 5.78, 6.87, 7.97, 9.08, 10.20, 11.30, 12.4}; + // Lowest bits/elem is achieved with gamma = 1, higher values lead to larger mphf but faster construction/query. - double gamma = 2.0; + double gamma; // Path to the underlying k-mer database, over which the hash table is constructed. const std::string kmc_db_path; @@ -74,6 +80,11 @@ class Kmer_Hash_Table // database having path prefix `kmer_db_path` and `kmer_count` distinct k-mers. Kmer_Hash_Table(const std::string& kmc_db_path, uint64_t kmer_count); + // Constructs a k-mer hash table where the table is to be built over the k-mer + // database having path prefix `kmer_db_path` and `kmer_count` distinct k-mers. + // The hash table may use at most `max_memory` bytes of memory. + Kmer_Hash_Table(const std::string& kmc_db_path, uint64_t kmer_count, std::size_t max_memory); + // Constructs a minimal perfect hash function (specifically, the BBHash) for // the collection of k-mers present at the KMC database at path `kmc_db_path`, // using up-to `thread_count` number of threads. If a non-empty path is passed diff --git a/src/Kmer_Hash_Table.cpp b/src/Kmer_Hash_Table.cpp index dff94b80..17c4f697 100644 --- a/src/Kmer_Hash_Table.cpp +++ b/src/Kmer_Hash_Table.cpp @@ -17,12 +17,33 @@ Kmer_Hash_Table::Kmer_Hash_Table(const std::string& kmc_db_path template Kmer_Hash_Table::Kmer_Hash_Table(const std::string& kmc_db_path, const uint64_t kmer_count): + gamma(static_cast(gamma_min)), kmc_db_path(kmc_db_path), kmer_count(kmer_count), sparse_lock(kmer_count, lock_count) {} +template +Kmer_Hash_Table::Kmer_Hash_Table(const std::string& kmc_db_path, const uint64_t kmer_count, const std::size_t max_memory): Kmer_Hash_Table(kmc_db_path, kmer_count) +{ + const std::size_t max_memory_bits = max_memory * 8U; + const std::size_t min_memory_bits = static_cast(kmer_count * bits_per_gamma[gamma_min]); + if(max_memory_bits > min_memory_bits) + { + const double max_bits_per_hash_key = (static_cast(max_memory_bits) / kmer_count) - BITS_PER_KEY; + constexpr double eps = 0.01; + + std::size_t gamma_idx = sizeof(bits_per_gamma) / sizeof(*bits_per_gamma) - 1; + for(; gamma_idx > gamma_min; --gamma_idx) + if(bits_per_gamma[gamma_idx] - eps < max_bits_per_hash_key) + break; + + gamma = gamma_idx; + } +} + + template void Kmer_Hash_Table::build_mph_function(const uint16_t thread_count, const std::string& working_dir_path, const std::string& mph_file_path) { From c6c0df82a5b6a25fe1be8589a03ee0b58fc20878 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 30 Sep 2021 12:16:44 -0400 Subject: [PATCH 216/350] Separate gamma-setter --- include/Kmer_Hash_Table.hpp | 4 ++++ src/Kmer_Hash_Table.cpp | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index 15238cbe..eb76608b 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -55,6 +55,10 @@ class Kmer_Hash_Table // The locks to maintain mutually exclusive access for threads to the same indices into the bitvector `hash_table`. mutable Sparse_Lock sparse_lock; + + // Sets the `gamma` parameter of the hash function to the maximum amount so that the + // hash table does not incur more than `max_memory` bytes of space. 
+ void set_gamma(std::size_t max_memory); // Builds the minimal perfect hash function `mph` over the set of // k-mers present at the KMC database container `kmer_container`, diff --git a/src/Kmer_Hash_Table.cpp b/src/Kmer_Hash_Table.cpp index 17c4f697..64b96fa0 100644 --- a/src/Kmer_Hash_Table.cpp +++ b/src/Kmer_Hash_Table.cpp @@ -26,6 +26,13 @@ Kmer_Hash_Table::Kmer_Hash_Table(const std::string& kmc_db_path template Kmer_Hash_Table::Kmer_Hash_Table(const std::string& kmc_db_path, const uint64_t kmer_count, const std::size_t max_memory): Kmer_Hash_Table(kmc_db_path, kmer_count) +{ + set_gamma(max_memory); +} + + +template +void Kmer_Hash_Table::set_gamma(const std::size_t max_memory) { const std::size_t max_memory_bits = max_memory * 8U; const std::size_t min_memory_bits = static_cast(kmer_count * bits_per_gamma[gamma_min]); From 13da23ea9e8a32a1aed9be497f4962f44553a29d Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 30 Sep 2021 12:30:20 -0400 Subject: [PATCH 217/350] Reduce single unipath buffer size (empirical) --- include/Read_CdBG_Extractor.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 52e67e04..1b48df3e 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -41,7 +41,7 @@ class Read_CdBG_Extractor // TODO: give these limits more thought, especially their exact impact on the memory usage. static constexpr std::size_t BUFF_SZ = 100 * 1024ULL; // 100 KB (soft limit) worth of maximal unitigs can be retained in memory, at most, before flushing. - static constexpr std::size_t SEQ_SZ = 5 * 1024ULL * 1024ULL; // 5 MB (soft limit) sized maximal unitig, at most, is constructed at a time. + static constexpr std::size_t SEQ_SZ = 1 * 1024ULL * 1024ULL; // 1 MB (soft limit) sized maximal unitig, at most, is constructed at a time. mutable uint64_t vertices_scanned = 0; // Total number of vertices scanned from the database. mutable Spin_Lock lock; // Mutual exclusion lock to access various unique resources by threads spawned off this class' methods. From 48658813efdcb60fa093de0146116507233dfb63 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 30 Sep 2021 14:18:55 -0400 Subject: [PATCH 218/350] Disambiguate doc --- include/Build_Params.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index 6e5c60a2..44cfbb91 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -26,7 +26,7 @@ class Build_Params const std::string vertex_db_path_; // Path to the KMC database containing the vertices (canonical k-mers). const std::string edge_db_path_; // Path to the KMC database containing the edges (canonical (k + 1)-mers). const uint16_t thread_count_; // Number of threads to work with. - const std::size_t max_memory_; // Soft maximum memory limit. + const std::size_t max_memory_; // Soft maximum memory limit (in GB). const bool strict_memory_; // Whether strict memory limit restriction is specified. const std::string output_file_path_; // Path to the output file. const cuttlefish::Output_Format output_format_; // Output format (0: txt, 1: GFAv1, 2: GFAv2). @@ -135,7 +135,7 @@ class Build_Params } - // Returns the soft maximum memory limit. + // Returns the soft maximum memory limit (in GB).
std::size_t max_memory() const { return max_memory_; From b424d794cca47e502cb808b0ac7d3825e5a7a11c Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 30 Sep 2021 14:43:15 -0400 Subject: [PATCH 219/350] Use memory-constrained hash table --- src/Read_CdBG.cpp | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 00189c81..5f012cf6 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -121,9 +121,20 @@ kmer_Enumeration_Stats Read_CdBG::enumerate_vertices(const std::size_t max_me template void Read_CdBG::construct_hash_table(const uint64_t vertex_count, const bool load) { - hash_table = std::make_unique>(vertex_db_path(), vertex_count); - load ? hash_table->load(params) : - hash_table->construct(params.thread_count(), params.working_dir_path(), params.mph_file_path()); + if(load) + { + hash_table = std::make_unique>(vertex_db_path(), vertex_count); + hash_table->load(params); + } + else + { + std::size_t max_memory = std::max(process_peak_memory(), params.max_memory() * 1024U * 1024U * 1024U); + const std::size_t parser_memory = Kmer_SPMC_Iterator::memory(params.thread_count()); + max_memory = (max_memory > parser_memory ? max_memory - parser_memory : 0); + + hash_table = std::make_unique>(vertex_db_path(), vertex_count, max_memory); + hash_table->construct(params.thread_count(), params.working_dir_path(), params.mph_file_path()); + } } From 92d6829dc669b36f739049ea8b1e2f2ceeed476e Mon Sep 17 00:00:00 2001 From: jamshed Date: Fri, 1 Oct 2021 16:27:17 -0400 Subject: [PATCH 220/350] Use finer-resolution memory should squeeze more performance --- include/Kmer_Hash_Table.hpp | 23 ++++++++++++++++++++--- src/Kmer_Hash_Table.cpp | 25 ++++++++++++++----------- 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index eb76608b..6ad5d9a0 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -26,11 +26,28 @@ class Kmer_Hash_Table private: // The minimum gamma-value that we require for BBHash. - static constexpr uint8_t gamma_min = 2; + static constexpr double gamma_min = 2.0; - // Empiricial bits-per-key requirement for each gamma in the range (0, 10]. - static constexpr double bits_per_gamma[] = {0, 3.07, 3.71, 4.71, 5.78, 6.87, 7.97, 9.08, 10.20, 11.30, 12.4}; + // The minimum bits per hash key we require for BBHash. + static constexpr double min_bits_per_key = 3.71; + // Empiricial bits-per-key requirement for each gamma in the range (0, 10]. + static constexpr double bits_per_gamma[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 3.06, 3.07, 3.11, 3.16, 3.22, 3.29, 3.36, 3.44, 3.53, 3.62, + 3.71, 3.80, 3.90, 4.00, 4.10, 4.20, 4.30, 4.40, 4.50, 4.61, + 4.71, 4.82, 4.92, 5.03, 5.13, 5.24, 5.35, 5.45, 5.56, 5.67, + 5.78, 5.89, 6.00, 6.10, 6.21, 6.32, 6.43, 6.54, 6.65, 6.76, + 6.87, 6.98, 7.09, 7.20, 7.31, 7.42, 7.53, 7.64, 7.75, 7.86, + 7.97, 8.08, 8.20, 8.31, 8.42, 8.53, 8.64, 8.75, 8.86, 8.97, + 9.08, 9.20, 9.31, 9.42, 9.53, 9.64, 9.75, 9.86, 9.98, 10.09, + 10.20, 10.31, 10.42, 10.53, 10.64, 10.76, 10.87, 10.98, 11.09, 11.20, + 11.31, 11.43, 11.54, 11.65, 11.76, 11.87, 11.99, 12.10, 12.21, 12.32, + 12.43}; + + // The resolution of gamma that we support. + static constexpr double gamma_resolution = 0.1; + + // The gamma parameter of the BBHash function. // Lowest bits/elem is achieved with gamma = 1, higher values lead to larger mphf but faster construction/query. 
double gamma; diff --git a/src/Kmer_Hash_Table.cpp b/src/Kmer_Hash_Table.cpp index 64b96fa0..2a4262bc 100644 --- a/src/Kmer_Hash_Table.cpp +++ b/src/Kmer_Hash_Table.cpp @@ -5,11 +5,19 @@ #include #include +#include #include #include #include + +template constexpr double Kmer_Hash_Table::gamma_min; +template constexpr double Kmer_Hash_Table::min_bits_per_key; +template constexpr double Kmer_Hash_Table::bits_per_gamma[]; +template constexpr double Kmer_Hash_Table::gamma_resolution; + + template Kmer_Hash_Table::Kmer_Hash_Table(const std::string& kmc_db_path): Kmer_Hash_Table(kmc_db_path, Kmer_Container::size(kmc_db_path)) {} @@ -17,7 +25,7 @@ Kmer_Hash_Table::Kmer_Hash_Table(const std::string& kmc_db_path template Kmer_Hash_Table::Kmer_Hash_Table(const std::string& kmc_db_path, const uint64_t kmer_count): - gamma(static_cast(gamma_min)), + gamma(gamma_min), kmc_db_path(kmc_db_path), kmer_count(kmer_count), sparse_lock(kmer_count, lock_count) @@ -35,18 +43,12 @@ template void Kmer_Hash_Table::set_gamma(const std::size_t max_memory) { const std::size_t max_memory_bits = max_memory * 8U; - const std::size_t min_memory_bits = static_cast(kmer_count * bits_per_gamma[gamma_min]); + const std::size_t min_memory_bits = static_cast(kmer_count * min_bits_per_key); if(max_memory_bits > min_memory_bits) { const double max_bits_per_hash_key = (static_cast(max_memory_bits) / kmer_count) - BITS_PER_KEY; - constexpr double eps = 0.01; - - std::size_t gamma_idx = sizeof(bits_per_gamma) / sizeof(*bits_per_gamma) - 1; - for(; gamma_idx > gamma_min; --gamma_idx) - if(bits_per_gamma[gamma_idx] - eps < max_bits_per_hash_key) - break; - - gamma = gamma_idx; + const std::size_t gamma_idx = (std::upper_bound(bits_per_gamma, bits_per_gamma + (sizeof(bits_per_gamma) / sizeof(*bits_per_gamma)), max_bits_per_hash_key) - 1) - bits_per_gamma; + gamma = gamma_idx * gamma_resolution; } } @@ -75,6 +77,7 @@ void Kmer_Hash_Table::build_mph_function(const uint16_t thread_ // auto data_iterator = boomphf::range(kmer_container.buf_begin(), kmer_container.buf_end()); const auto data_iterator = boomphf::range(kmer_container.spmc_begin(thread_count), kmer_container.spmc_end(thread_count)); + std::cout << "Using gamma = " << gamma << ".\n"; mph = new mphf_t(kmer_count, data_iterator, working_dir_path, thread_count, gamma); std::cout << "Built the MPHF in memory.\n"; @@ -199,7 +202,7 @@ void Kmer_Hash_Table::construct(const uint16_t thread_count, co const uint64_t total_bits = mph->totalBitSize(); std::cout << "\nTotal MPHF size: " << total_bits / (8 * 1024 * 1024) << " MB." - " Bits per k-mer: " << (float)(total_bits) / kmer_count << ".\n"; + " Bits per k-mer: " << static_cast(total_bits) / kmer_count << ".\n"; // Allocate the hash table buckets. hash_table.resize(kmer_count); From f90d19241cc59b5a9d4ca9761928e0d1d3cfd50a Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 5 Oct 2021 19:25:11 -0400 Subject: [PATCH 221/350] Bump min memory req. to 3 --- include/kmer_Enumerator.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/kmer_Enumerator.hpp b/include/kmer_Enumerator.hpp index 26c69bfb..a0f1252e 100644 --- a/include/kmer_Enumerator.hpp +++ b/include/kmer_Enumerator.hpp @@ -17,7 +17,7 @@ class kmer_Enumerator { private: - static constexpr std::size_t min_memory = 2; // In GB; set as per the KMC3 library requirement. + static constexpr std::size_t min_memory = 3; // In GB; set as per the KMC3 library requirement. 
static constexpr uint16_t bin_count = 2000; static constexpr uint16_t signature_len = 11; static constexpr double bits_per_kmer = 9.71; From 0108f3cda1933d3d0debbabe3c5f9dc288bde339 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 14 Oct 2021 18:54:30 -0400 Subject: [PATCH 222/350] Rectify potential symbol clash --- CMakeLists.txt | 4 ++-- src/kmer_Enumerator.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 369028a1..af225451 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,8 +45,8 @@ if(INSTANCE_COUNT) add_compile_definitions(INSTANCE_COUNT=${INSTANCE_COUNT}) endif() -if(VALIDATION_MODE) - add_compile_definitions(VALIDATION_MODE) +if(CF_VALIDATION_MODE) + add_compile_definitions(CF_VALIDATION_MODE) endif() execute_process( diff --git a/src/kmer_Enumerator.cpp b/src/kmer_Enumerator.cpp index 5b1dfb7d..1109b64d 100644 --- a/src/kmer_Enumerator.cpp +++ b/src/kmer_Enumerator.cpp @@ -38,7 +38,7 @@ kmer_Enumeration_Stats kmer_Enumerator::enumerate( .SetCutoffMin(cutoff) .SetNThreads(thread_count) .SetStrictMemoryMode(strict_memory) -#ifndef VALIDATION_MODE +#ifndef CF_VALIDATION_MODE .SetCounterMax(counter_max) #endif .SetOutputFileName(output_db_path) From 910ae31b1a439491f9d0d56bc565271ce26805f6 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 14 Oct 2021 18:58:24 -0400 Subject: [PATCH 223/350] Introduce develop mode --- CMakeLists.txt | 4 ++++ src/main.cpp | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index af225451..04a8ced1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,6 +49,10 @@ if(CF_VALIDATION_MODE) add_compile_definitions(CF_VALIDATION_MODE) endif() +if(CF_DEVELOP_MODE) + add_compile_definitions(CF_DEVELOP_MODE) +endif() + execute_process( COMMAND getconf LEVEL1_DCACHE_LINESIZE COMMAND tr -d '\n' diff --git a/src/main.cpp b/src/main.cpp index ff985211..b1273c00 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -172,6 +172,10 @@ void validate(int argc, char** argv) int main(int argc, char** argv) { +#ifdef CF_DEVELOP_MODE + std::cout << "Warning: Executing in Develop Mode.\n"; +#endif + if(argc < 2) { std::cout << "Usage:\ncuttlefish [OPTIONS]" << std::endl; From 6e57469f09161cfa15b157046be702daf62f4b88 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 14 Oct 2021 19:11:02 -0400 Subject: [PATCH 224/350] Fix misused include --- include/Input_Defaults.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/Input_Defaults.hpp b/include/Input_Defaults.hpp index f571981d..35e53857 100644 --- a/include/Input_Defaults.hpp +++ b/include/Input_Defaults.hpp @@ -4,7 +4,7 @@ -#include +#include "Output_Format.hpp" namespace cuttlefish From 5ef3f433390a5f6e8a20b20f3a7dc0653e941912 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 14 Oct 2021 22:26:09 -0400 Subject: [PATCH 225/350] Let developer override gamma --- include/Build_Params.hpp | 21 ++++++++++++++++++++- include/Input_Defaults.hpp | 3 +++ include/Kmer_Hash_Table.hpp | 9 +++++++++ src/Kmer_Hash_Table.cpp | 12 ++++++++++++ src/Read_CdBG.cpp | 7 ++++++- src/main.cpp | 12 +++++++++++- 6 files changed, 61 insertions(+), 3 deletions(-) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index 44cfbb91..fd051ab2 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -38,6 +38,9 @@ class Build_Params const std::string json_file_path_; // Optional path to file storing meta-information about the graph and cuttlefish executions. 
const bool dcc_opt_; // Option to optimize post-cdBG-construction extraction of DCCs (Detached Chordless Cycles). const bool extract_cycles_; // Option to extract detached chordless cycles from the de Bruijn graph after compaction. +#ifdef CF_DEVELOP_MODE + const double gamma_; // The gamma parameter for the BBHash MPHF. +#endif public: @@ -63,7 +66,11 @@ class Build_Params const bool save_vertices, const std::string& json_file_path, const bool dcc_opt, - const bool extract_cycles): + const bool extract_cycles +#ifdef CF_DEVELOP_MODE + , const double gamma +#endif + ): is_read_graph_(is_read_graph), seq_input_(seq_paths, list_paths, dir_paths), k_(k), @@ -83,6 +90,9 @@ class Build_Params json_file_path_(json_file_path), dcc_opt_(dcc_opt), extract_cycles_(extract_cycles) +#ifdef CF_DEVELOP_MODE + , gamma_(gamma) +#endif {} @@ -226,6 +236,15 @@ class Build_Params } +#ifdef CF_DEVELOP_MODE + // Returns the gamma parameter for the BBHash MPHF. + double gamma() const + { + return gamma_; + } +#endif + + // Returns `true` iff the parameters selections are valid. bool is_valid() const; }; diff --git a/include/Input_Defaults.hpp b/include/Input_Defaults.hpp index 35e53857..cf4ee5a9 100644 --- a/include/Input_Defaults.hpp +++ b/include/Input_Defaults.hpp @@ -17,6 +17,9 @@ namespace cuttlefish constexpr uint32_t CUTOFF_FREQ = 2; // Typical practice constexpr uint16_t THREAD_COUNT = 1; constexpr std::size_t MAX_MEMORY = 2; // Set as per KMC3 library requirement. +#ifdef CF_DEVELOP_MODE + constexpr double GAMMA = 0; +#endif constexpr uint16_t OP_FORMAT = Output_Format::txt; constexpr char WORK_DIR[] = "."; } diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index 6ad5d9a0..4b150182 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -106,6 +106,15 @@ class Kmer_Hash_Table // The hash table may use at most `max_memory` bytes of memory. Kmer_Hash_Table(const std::string& kmc_db_path, uint64_t kmer_count, std::size_t max_memory); +#ifdef CF_DEVELOP_MODE + // Constructs a k-mer hash table where the table is to be built over the k-mer + // database having path prefix `kmer_db_path` and `kmer_count` distinct k-mers. + // The gamma factor of the BBHash MPHF of the table is attempted to be set to + // `gamma`, if it is non-zero. Otherwise, it is set such that the hash + // table may use at most `max_memory` bytes of memory. + Kmer_Hash_Table(const std::string& kmc_db_path, uint64_t kmer_count, std::size_t max_memory, double gamma); +#endif + // Constructs a minimal perfect hash function (specifically, the BBHash) for // the collection of k-mers present at the KMC database at path `kmc_db_path`, // using up-to `thread_count` number of threads.
If a non-empty path is passed diff --git a/src/Kmer_Hash_Table.cpp b/src/Kmer_Hash_Table.cpp index 2a4262bc..9744dabe 100644 --- a/src/Kmer_Hash_Table.cpp +++ b/src/Kmer_Hash_Table.cpp @@ -39,6 +39,18 @@ Kmer_Hash_Table::Kmer_Hash_Table(const std::string& kmc_db_path } +#ifdef CF_DEVELOP_MODE +template +Kmer_Hash_Table::Kmer_Hash_Table(const std::string& kmc_db_path, const uint64_t kmer_count, const std::size_t max_memory, const double gamma): Kmer_Hash_Table(kmc_db_path, kmer_count) +{ + if(gamma > 0) + this->gamma = gamma; + else + set_gamma(max_memory); +} +#endif + + template void Kmer_Hash_Table::set_gamma(const std::size_t max_memory) { diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 5f012cf6..32b8681b 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -132,7 +132,12 @@ void Read_CdBG::construct_hash_table(const uint64_t vertex_count, const bool const std::size_t parser_memory = Kmer_SPMC_Iterator::memory(params.thread_count()); max_memory = (max_memory > parser_memory ? max_memory - parser_memory : 0); - hash_table = std::make_unique>(vertex_db_path(), vertex_count, max_memory); + hash_table = +#ifdef CF_DEVELOP_MODE + std::make_unique>(vertex_db_path(), vertex_count, max_memory, params.gamma()); +#else + std::make_unique>(vertex_db_path(), vertex_count, max_memory); +#endif hash_table->construct(params.thread_count(), params.working_dir_path(), params.mph_file_path()); } } diff --git a/src/main.cpp b/src/main.cpp index b1273c00..6800b0d4 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -46,6 +46,9 @@ void build(int argc, char** argv) // TODO: remove the following arg ("no-dcc", "turn off optimization for post-construction extraction of DCCs (Detached Chordless Cycles)") ("cycles", "extract the detached chordless cycles of the graph") +#ifdef CF_DEVELOP_MODE + ("gamma", "gamma for the BBHash MPHF", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::GAMMA))) +#endif ("h,help", "print usage"); try @@ -78,13 +81,20 @@ void build(int argc, char** argv) const auto json_file = result["json"].as(); const auto dcc_opt = !result["no-dcc"].as(); const auto extract_cycles = result["cycles"].as(); +#ifdef CF_DEVELOP_MODE + const double gamma = result["gamma"].as(); +#endif const Build_Params params( is_read_graph, refs, lists, dirs, k, cutoff, kmer_database, edge_database, thread_count, max_memory, strict_memory, output_file, format, working_dir, remove_kmc_db, mph_file, buckets_file, save_vertices, json_file, - dcc_opt, extract_cycles); + dcc_opt, extract_cycles +#ifdef CF_DEVELOP_MODE + , gamma +#endif + ); if(!params.is_valid()) { std::cerr << "Invalid input configuration. Aborting.\n"; From 6c2260811887064ccc0f2e793a1430d59361c38d Mon Sep 17 00:00:00 2001 From: jamshed Date: Fri, 15 Oct 2021 22:30:20 -0400 Subject: [PATCH 226/350] Fix logic bug --- include/Kmer_Hash_Table.hpp | 2 +- src/Kmer_Hash_Table.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index 4b150182..f88a5a17 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -29,7 +29,7 @@ class Kmer_Hash_Table static constexpr double gamma_min = 2.0; // The minimum bits per hash key we require for BBHash. - static constexpr double min_bits_per_key = 3.71; + static constexpr double min_bits_per_hash_key = 3.71; // Empiricial bits-per-key requirement for each gamma in the range (0, 10]. 
static constexpr double bits_per_gamma[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, diff --git a/src/Kmer_Hash_Table.cpp b/src/Kmer_Hash_Table.cpp index 9744dabe..34775fdd 100644 --- a/src/Kmer_Hash_Table.cpp +++ b/src/Kmer_Hash_Table.cpp @@ -13,7 +13,7 @@ template constexpr double Kmer_Hash_Table::gamma_min; -template constexpr double Kmer_Hash_Table::min_bits_per_key; +template constexpr double Kmer_Hash_Table::min_bits_per_hash_key; template constexpr double Kmer_Hash_Table::bits_per_gamma[]; template constexpr double Kmer_Hash_Table::gamma_resolution; @@ -55,7 +55,7 @@ template void Kmer_Hash_Table::set_gamma(const std::size_t max_memory) { const std::size_t max_memory_bits = max_memory * 8U; - const std::size_t min_memory_bits = static_cast(kmer_count * min_bits_per_key); + const std::size_t min_memory_bits = static_cast(kmer_count * (min_bits_per_hash_key + BITS_PER_KEY)); if(max_memory_bits > min_memory_bits) { const double max_bits_per_hash_key = (static_cast(max_memory_bits) / kmer_count) - BITS_PER_KEY; From 1b2b78ab60c12f1610f7fbbd81290df27a289c3c Mon Sep 17 00:00:00 2001 From: jamshed Date: Sat, 16 Oct 2021 18:24:39 -0400 Subject: [PATCH 227/350] Let developer override graph DBs --- src/Read_CdBG.cpp | 52 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 32b8681b..47ec7e99 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -43,6 +43,33 @@ void Read_CdBG::construct() std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); +#ifdef CF_DEVELOP_MODE + + uint64_t edge_count; + uint64_t vertex_count; + + if(params.edge_db_path().empty()) + { + kmer_Enumeration_Stats edge_stats = enumerate_edges(); + kmer_Enumeration_Stats vertex_stats = enumerate_vertices(edge_stats.max_memory()); + + edge_count = edge_stats.kmer_count(); + vertex_count = vertex_stats.kmer_count(); + } + else if(!params.vertex_db_path().empty()) + { + edge_count = Kmer_Container::size(params.edge_db_path()); + vertex_count = Kmer_Container::size(params.vertex_db_path()); + } + else + { + std::cerr << "Vertex database must also be provided if edge database is passed. Aborting.\n"; + std::exit(EXIT_FAILURE); + } + + std::chrono::high_resolution_clock::time_point t_vertices = std::chrono::high_resolution_clock::now(); + std::cout << "Enumerated the edge and the vertex set of the graph. Time taken = " << std::chrono::duration_cast>(t_vertices - t_start).count() << " seconds.\n"; +#else std::cout << "\nEnumerating the edges of the de Bruijn graph.\n"; kmer_Enumeration_Stats edge_stats = enumerate_edges(); @@ -57,12 +84,15 @@ void Read_CdBG::construct() std::chrono::high_resolution_clock::time_point t_vertices = std::chrono::high_resolution_clock::now(); std::cout << "Enumerated the vertex set of the graph. 
Time taken = " << std::chrono::duration_cast>(t_vertices - t_edges).count() << " seconds.\n"; - std::cout << "Number of edges: " << edge_stats.kmer_count() << ".\n"; - std::cout << "Number of vertices: " << vertex_stats.kmer_count() << ".\n"; + const uint64_t edge_count = edge_stats.kmer_count(); + const uint64_t vertex_count = vertex_stats.kmer_count(); +#endif + std::cout << "Number of edges: " << edge_count << ".\n"; + std::cout << "Number of vertices: " << vertex_count << ".\n"; std::cout << "\nConstructing the minimal perfect hash function (MPHF) over the vertex set.\n"; - construct_hash_table(vertex_stats.kmer_count()); + construct_hash_table(vertex_count); std::chrono::high_resolution_clock::time_point t_mphf = std::chrono::high_resolution_clock::now(); std::cout << "Constructed the minimal perfect hash function for the vertices. Time taken = " << std::chrono::duration_cast>(t_mphf - t_vertices).count() << " seconds.\n"; @@ -71,6 +101,9 @@ void Read_CdBG::construct() std::cout << "\nComputing the DFA states.\n"; compute_DFA_states(); +#ifdef CF_DEVELOP_MODE + if(params.edge_db_path().empty()) +#endif Kmer_Container::remove(edge_db_path()); if(!params.extract_cycles() && !params.dcc_opt()) hash_table->save(params); @@ -84,6 +117,9 @@ void Read_CdBG::construct() if(!dbg_info.has_dcc() || dbg_info.dcc_extracted()) // Either there are no DCCs, or the DCCs have already been extracted in this run. { +#ifdef CF_DEVELOP_MODE + if(params.vertex_db_path().empty()) +#endif if(!params.save_vertices()) Kmer_Container::remove(vertex_db_path()); @@ -93,8 +129,10 @@ void Read_CdBG::construct() std::chrono::high_resolution_clock::time_point t_extract = std::chrono::high_resolution_clock::now(); std::cout << "Extracted the maximal unitigs and DCCs. Time taken = " << std::chrono::duration_cast>(t_extract - t_dfa).count() << " seconds.\n"; +#ifndef CF_DEVELOP_MODE const double max_disk_usage = std::max(edge_stats.temp_disk_usage(), vertex_stats.temp_disk_usage()) / (1024.0 * 1024.0 * 1024.0); std::cout << "\nMaximum temporary disk-usage: " << max_disk_usage << "GB.\n"; +#endif } @@ -246,6 +284,10 @@ bool Read_CdBG::is_constructed() const template const std::string Read_CdBG::edge_db_path() const { +#ifdef CF_DEVELOP_MODE + return params.edge_db_path().empty()? (params.output_prefix() + cuttlefish::file_ext::edges_ext) : params.edge_db_path(); +#endif + return params.output_prefix() + cuttlefish::file_ext::edges_ext; } @@ -253,6 +295,10 @@ const std::string Read_CdBG::edge_db_path() const template const std::string Read_CdBG::vertex_db_path() const { +#ifdef CF_DEVELOP_MODE + return params.vertex_db_path().empty() ? (params.output_prefix() + cuttlefish::file_ext::vertices_ext) : params.vertex_db_path(); +#endif + return params.output_prefix() + cuttlefish::file_ext::vertices_ext; } From d3bb3518e1bfb0c27864ddb2a29376f2237d9449 Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 18 Oct 2021 21:33:26 -0400 Subject: [PATCH 228/350] Templatize k-mer enum stats --- include/Read_CdBG.hpp | 6 +++--- include/kmer_Enumerator.hpp | 5 +++-- src/Read_CdBG.cpp | 8 ++++---- src/kmer_Enumerator.cpp | 4 ++-- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/Read_CdBG.hpp b/include/Read_CdBG.hpp index 9705402b..e276511e 100644 --- a/include/Read_CdBG.hpp +++ b/include/Read_CdBG.hpp @@ -12,7 +12,7 @@ #include -class kmer_Enumeration_Stats; +template class kmer_Enumeration_Stats; // Read de Bruijn graph class to support the compaction algorithm. 
@@ -29,11 +29,11 @@ class Read_CdBG // Enumerates the edges of the de Bruijn graph and returns summary statistics of the // enumeration. - kmer_Enumeration_Stats enumerate_edges() const; + kmer_Enumeration_Stats enumerate_edges() const; // Enumerates the vertices of the de Bruijn graph using at most `max_memory` amount of // memory, and returns summary statistics of the enumeration. - kmer_Enumeration_Stats enumerate_vertices(std::size_t max_memory) const; + kmer_Enumeration_Stats enumerate_vertices(std::size_t max_memory) const; // Constructs the Cuttlefish hash table for the `vertex_count` vertices of the graph. // If `load` is specified, then it is loaded from disk. diff --git a/include/kmer_Enumerator.hpp b/include/kmer_Enumerator.hpp index a0f1252e..fabfa08e 100644 --- a/include/kmer_Enumerator.hpp +++ b/include/kmer_Enumerator.hpp @@ -8,7 +8,7 @@ #include "kmc_runner.h" -class kmer_Enumeration_Stats; +template class kmer_Enumeration_Stats; // Class to enumerate all the k-mers for some provided input collection. @@ -50,7 +50,7 @@ class kmer_Enumerator // `estimate_mem_usage` is `true`, otherwise `max_memory` is the limit. Temporary files are // written to `working_dir_path`. The output database is stored at path prefix `output_db_path`. // Returns summary statistics of the enumeration. - kmer_Enumeration_Stats enumerate( + kmer_Enumeration_Stats enumerate( KMC::InputFileType input_file_type, const std::vector& seqs, uint32_t cutoff, uint16_t thread_count, std::size_t max_memory, bool strict_memory, bool estimate_mem_usage, const std::string& working_dir_path, const std::string& output_db_path); @@ -58,6 +58,7 @@ // A class to wrap summary statistics of k-mer enumeration by `kmer_Enumerator`. +template class kmer_Enumeration_Stats { private: diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 47ec7e99..a11992f0 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -72,14 +72,14 @@ void Read_CdBG::construct() #else std::cout << "\nEnumerating the edges of the de Bruijn graph.\n"; - kmer_Enumeration_Stats edge_stats = enumerate_edges(); + kmer_Enumeration_Stats edge_stats = enumerate_edges(); std::chrono::high_resolution_clock::time_point t_edges = std::chrono::high_resolution_clock::now(); std::cout << "Enumerated the edge set of the graph. Time taken = " << std::chrono::duration_cast>(t_edges - t_start).count() << " seconds.\n"; std::cout << "\nEnumerating the vertices of the de Bruijn graph.\n"; - kmer_Enumeration_Stats vertex_stats = enumerate_vertices(edge_stats.max_memory()); + kmer_Enumeration_Stats vertex_stats = enumerate_vertices(edge_stats.max_memory()); std::chrono::high_resolution_clock::time_point t_vertices = std::chrono::high_resolution_clock::now(); std::cout << "Enumerated the vertex set of the graph.
Time taken = " << std::chrono::duration_cast>(t_vertices - t_edges).count() << " seconds.\n"; @@ -137,7 +137,7 @@ void Read_CdBG::construct() template -kmer_Enumeration_Stats Read_CdBG::enumerate_edges() const +kmer_Enumeration_Stats Read_CdBG::enumerate_edges() const { return kmer_Enumerator().enumerate( KMC::InputFileType::FASTQ, params.sequence_input().seqs(), params.cutoff(), @@ -147,7 +147,7 @@ kmer_Enumeration_Stats Read_CdBG::enumerate_edges() const template -kmer_Enumeration_Stats Read_CdBG::enumerate_vertices(const std::size_t max_memory) const +kmer_Enumeration_Stats Read_CdBG::enumerate_vertices(const std::size_t max_memory) const { return kmer_Enumerator().enumerate( KMC::InputFileType::KMC, std::vector(1, edge_db_path()), 1, diff --git a/src/kmer_Enumerator.cpp b/src/kmer_Enumerator.cpp index 1109b64d..84da5401 100644 --- a/src/kmer_Enumerator.cpp +++ b/src/kmer_Enumerator.cpp @@ -3,7 +3,7 @@ template -kmer_Enumeration_Stats kmer_Enumerator::enumerate( +kmer_Enumeration_Stats kmer_Enumerator::enumerate( const KMC::InputFileType input_file_type, const std::vector& seqs, const uint32_t cutoff, const uint16_t thread_count, const std::size_t max_memory, const bool strict_memory, const bool estimate_mem_usage, const std::string& working_dir_path, const std::string& output_db_path) @@ -51,7 +51,7 @@ kmer_Enumeration_Stats kmer_Enumerator::enumerate( const uint64_t kmer_count = stage2_results.nUniqueKmers - stage2_results.nBelowCutoffMin - stage2_results.nAboveCutoffMax; - return kmer_Enumeration_Stats(kmer_count, memory, stage2_results.maxDiskUsage); + return kmer_Enumeration_Stats(kmer_count, memory, stage2_results.maxDiskUsage); } From ff952d32a1aecafd22912e914b3f8e1e013467da Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 18 Oct 2021 22:37:43 -0400 Subject: [PATCH 229/350] Revamp k-mer enum stats --- include/kmer_Enumerator.hpp | 68 +++++++++++++++++++++++++++++++------ src/Read_CdBG.cpp | 5 +-- src/kmer_Enumerator.cpp | 3 +- 3 files changed, 62 insertions(+), 14 deletions(-) diff --git a/include/kmer_Enumerator.hpp b/include/kmer_Enumerator.hpp index fabfa08e..50eb05ee 100644 --- a/include/kmer_Enumerator.hpp +++ b/include/kmer_Enumerator.hpp @@ -63,23 +63,59 @@ class kmer_Enumeration_Stats { private: - const uint64_t kmer_count_; - const std::size_t max_memory_; - const std::size_t temp_disk_usage_; + const KMC::Stage1Results stage1_results; // Results stats of KMC stage 1 execution. + const KMC::Stage2Results stage2_results; // Results stats of KMC stage 2 execution. + const std::size_t max_memory_; // Maximum memory usage allowed for the KMC executions. 
public: - kmer_Enumeration_Stats(const uint64_t kmer_count, const std::size_t max_memory, const std::size_t temp_disk_usage): - kmer_count_(kmer_count), - max_memory_(max_memory), - temp_disk_usage_(temp_disk_usage) + kmer_Enumeration_Stats(const KMC::Stage1Results& stage1_results, const KMC::Stage2Results& stage2_results, const std::size_t max_memory): + stage1_results(stage1_results), + stage2_results(stage2_results), + max_memory_(max_memory) {} - uint64_t kmer_count() const + uint64_t seq_count() const { - return kmer_count_; + return stage1_results.nSeqences; + } + + + uint64_t seq_len() const + { + return total_kmer_count() + (seq_count() * (k - 1)); + } + + + uint64_t total_kmer_count() const + { + return stage2_results.nTotalKmers; + } + + + uint64_t unique_kmer_count() const + { + return stage2_results.nUniqueKmers; + } + + + uint64_t below_min_cutoff_kmer_count() const + { + return stage2_results.nBelowCutoffMin; + } + + + uint64_t above_max_cutoff_kmer_count() const + { + return stage2_results.nAboveCutoffMax; + } + + + uint64_t counted_kmer_count() const + { + return unique_kmer_count() - (below_min_cutoff_kmer_count() + above_max_cutoff_kmer_count()); } @@ -91,7 +127,19 @@ class kmer_Enumeration_Stats std::size_t temp_disk_usage() const { - return temp_disk_usage_; + return stage2_results.maxDiskUsage; + } + + + void log_stats() const + { + std::cout << k << "-mer enumeration statistics:\n"; + + std::cout << "\tNumber of sequences:\t" << seq_count() << ".\n"; + std::cout << "\tTotal sequence length:\t" << seq_len() << ".\n"; + std::cout << "\tTotal number of " << k << "-mers:\t" << total_kmer_count() << ".\n"; + std::cout << "\tNumber of unique " << k << "-mers:\t" << unique_kmer_count() << ".\n"; + std::cout << "\tNumber of counted " << k << "-mers:\t" << counted_kmer_count() << ".\n"; } }; diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index a11992f0..fdf60295 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -73,6 +73,7 @@ void Read_CdBG::construct() std::cout << "\nEnumerating the edges of the de Bruijn graph.\n"; kmer_Enumeration_Stats edge_stats = enumerate_edges(); + edge_stats.log_stats(); std::chrono::high_resolution_clock::time_point t_edges = std::chrono::high_resolution_clock::now(); std::cout << "Enumerated the edge set of the graph. Time taken = " << std::chrono::duration_cast>(t_edges - t_start).count() << " seconds.\n"; @@ -84,8 +85,8 @@ void Read_CdBG::construct() std::chrono::high_resolution_clock::time_point t_vertices = std::chrono::high_resolution_clock::now(); std::cout << "Enumerated the vertex set of the graph. 
Time taken = " << std::chrono::duration_cast>(t_vertices - t_edges).count() << " seconds.\n"; - const uint64_t edge_count = edge_stats.kmer_count(); - const uint64_t vertex_count = vertex_stats.kmer_count(); + const uint64_t edge_count = edge_stats.counted_kmer_count(); + const uint64_t vertex_count = vertex_stats.counted_kmer_count(); #endif std::cout << "Number of edges: " << edge_count << ".\n"; std::cout << "Number of vertices: " << vertex_count << ".\n"; diff --git a/src/kmer_Enumerator.cpp b/src/kmer_Enumerator.cpp index 84da5401..4c8d5006 100644 --- a/src/kmer_Enumerator.cpp +++ b/src/kmer_Enumerator.cpp @@ -50,8 +50,7 @@ kmer_Enumeration_Stats kmer_Enumerator::enumerate( stage2_results = kmc.RunStage2(stage2_params); - const uint64_t kmer_count = stage2_results.nUniqueKmers - stage2_results.nBelowCutoffMin - stage2_results.nAboveCutoffMax; - return kmer_Enumeration_Stats(kmer_count, memory, stage2_results.maxDiskUsage); + return kmer_Enumeration_Stats(stage1_results, stage2_results, memory); } From 045b4d1c20f0539f445bb4b9300f020969bfa3a0 Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 18 Oct 2021 22:56:18 -0400 Subject: [PATCH 230/350] Clarify documentation --- include/kmer_Enumerator.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/kmer_Enumerator.hpp b/include/kmer_Enumerator.hpp index 50eb05ee..47258027 100644 --- a/include/kmer_Enumerator.hpp +++ b/include/kmer_Enumerator.hpp @@ -35,7 +35,7 @@ class kmer_Enumerator // estimated through KMC3's approximation step. uint64_t solid_kmer_count_approx(uint16_t cutoff) const; - // Returns the strict memory limit for the actual KMC3 execution, based on the number of + // Returns the strict memory limit (in GB) for the actual KMC3 execution, based on the number of // unique k-mers `unique_kmer_count` (typically approximated earlier). std::size_t memory_limit(uint64_t unique_kmer_count) const; @@ -44,9 +44,9 @@ class kmer_Enumerator // Enumerates the k-mers from the sequences (of type `input_file_type`) present is `seqs`, that // are present at least `cutoff` times. Employs `thread_count` number of processor threads and - // uses a soft memory-cap of `max_memory`. If `strict_memory` is `true`, then the memory usage - // is attempted to be kept within a limit—the max of `max_memory` and the estimated memory to - // be used by the downstream stages of Cuttlefish. This memory estimation is made only if + // uses a soft memory-cap of `max_memory` (in GB). If `strict_memory` is `true`, then the memory + // usage is attempted to be kept within a limit—the max of `max_memory` and the estimated memory + // to be used by the downstream stages of Cuttlefish. This memory estimation is made only if // `estimate_mem_usage` is `true`, otherwise `max_memory` is the limit. Temporary files are // written to `working_dir_path`. The output database is stored at path prefix `output_db_path`. // Returns summary statistics of the enumeration. 
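The memory-constrained gamma selection assembled in the patches above reduces to a short computation: derive the bits available per hash key from the byte budget, then pick the largest tabulated gamma whose empirical bits-per-key cost still fits. The following is a minimal, self-contained sketch of that logic; it mirrors `Kmer_Hash_Table::set_gamma`, but the vertex count, the per-key state width standing in for `BITS_PER_KEY`, and the memory budget are assumed values chosen purely for illustration, and the coarser per-integer-gamma table from [PATCH 215/350] is used for brevity.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>

int main()
{
    // Empirical MPHF cost (bits per key) for gamma = 1, 2, ..., 10 (the coarse table from the earlier patch).
    constexpr double bits_per_gamma[] = {0, 3.07, 3.71, 4.71, 5.78, 6.87, 7.97, 9.08, 10.20, 11.30, 12.4};

    constexpr uint64_t kmer_count = 1000000000ULL;      // Assumed (hypothetical) number of distinct vertices.
    constexpr double bits_per_key = 8;                  // Assumed (hypothetical) per-key state width, standing in for BITS_PER_KEY.
    constexpr std::size_t max_memory = 1536ULL << 20;   // Assumed budget for the whole table: 1.5 GiB, in bytes.

    // Bits left per key for the MPHF once the state buckets take their share: about 4.88 here.
    const double max_bits_per_hash_key = (max_memory * 8.0 / kmer_count) - bits_per_key;

    // Largest tabulated gamma whose empirical bits-per-key still fits the leftover budget.
    const auto table_end = bits_per_gamma + sizeof(bits_per_gamma) / sizeof(*bits_per_gamma);
    const std::size_t gamma = (std::upper_bound(bits_per_gamma, table_end, max_bits_per_hash_key) - 1) - bits_per_gamma;

    std::cout << "Selected gamma = " << gamma << "\n";  // Prints 3 for these numbers.
}

The committed implementation works over the finer 0.1-resolution table with `gamma_resolution`, and it first checks that the budget covers at least (`min_bits_per_hash_key` + `BITS_PER_KEY`) bits per key, leaving `gamma` at `gamma_min` otherwise.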
From 80bcc16eda8e9f0d0d676369cb1bc9fa6f29c31a Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 18 Oct 2021 23:05:13 -0400 Subject: [PATCH 231/350] Separate k-mer enum stats files --- include/kmer_Enumeration_Stats.hpp | 68 +++++++++++++++++++++ include/kmer_Enumerator.hpp | 87 +-------------------------- src/CMakeLists.txt | 1 + src/kmer_Enumeration_Stats.cpp | 94 ++++++++++++++++++++++++++++++ 4 files changed, 164 insertions(+), 86 deletions(-) create mode 100644 include/kmer_Enumeration_Stats.hpp create mode 100644 src/kmer_Enumeration_Stats.cpp diff --git a/include/kmer_Enumeration_Stats.hpp b/include/kmer_Enumeration_Stats.hpp new file mode 100644 index 00000000..6bf17b8d --- /dev/null +++ b/include/kmer_Enumeration_Stats.hpp @@ -0,0 +1,68 @@ + +#ifndef KMER_ENUMERATION_STATS_HPP +#define KMER_ENUMERATION_STATS_HPP + + + +#include +#include + +#include "kmc_runner.h" + + +// A class to wrap summary statistics of k-mer enumeration by `kmer_Enumerator`. +template +class kmer_Enumeration_Stats +{ +private: + + const KMC::Stage1Results stage1_results; // Results stats of KMC stage 1 execution. + const KMC::Stage2Results stage2_results; // Results stats of KMC stage 2 execution. + const std::size_t max_memory_; // Maximum memory usage allowed for the KMC executions. + + +public: + + // Constructs a a k-mer enumeration stats wrapper object for a KMC execution with + // first stage results in `stage1_results`, second stage results in `stage2_results`, + // and maximum allowed memory usage to be `max_memory` (in GB). + kmer_Enumeration_Stats(const KMC::Stage1Results& stage1_results, const KMC::Stage2Results& stage2_results, const std::size_t max_memory); + + // Returns the number of sequences in the execution input. + uint64_t seq_count() const; + + // Returns the total length of the sequences in the execution input. + uint64_t seq_len() const; + + // Returns the total number of k-mers in the execution input. + uint64_t total_kmer_count() const; + + // Returns the number of unique k-mers (irrespective of the cutoff frequency used) in the + // execution input. + uint64_t unique_kmer_count() const; + + // Returns the number of unique k-mers in the execution input that have frequency below + // the minimum cutoff frequency used. + uint64_t below_min_cutoff_kmer_count() const; + + // Returns the number of unique k-mers in the execution input that have frequency above + // the maximum cutoff frequency used. + uint64_t above_max_cutoff_kmer_count() const; + + // Returns the number of unique k-mers in the execution input that have frequencies within + // the min and max cutoff frequencies used. + uint64_t counted_kmer_count() const; + + // Returns the maximum memory (in GB) allowed for the execution. + std::size_t max_memory() const; + + // Returns the temporary disk usage (in bytes) used by the execution. + std::size_t temp_disk_usage() const; + + // Logs a summary statistics of the execution. + void log_stats() const; +}; + + + +#endif diff --git a/include/kmer_Enumerator.hpp b/include/kmer_Enumerator.hpp index 47258027..5ba8ae76 100644 --- a/include/kmer_Enumerator.hpp +++ b/include/kmer_Enumerator.hpp @@ -5,6 +5,7 @@ #include "Build_Params.hpp" +#include "kmer_Enumeration_Stats.hpp" #include "kmc_runner.h" @@ -57,92 +58,6 @@ class kmer_Enumerator }; -// A class to wrap summary statistics of k-mer enumeration by `kmer_Enumerator`. -template -class kmer_Enumeration_Stats -{ -private: - - const KMC::Stage1Results stage1_results; // Results stats of KMC stage 1 execution. 
- const KMC::Stage2Results stage2_results; // Results stats of KMC stage 2 execution. - const std::size_t max_memory_; // Maximum memory usage allowed for the KMC executions. - - -public: - - kmer_Enumeration_Stats(const KMC::Stage1Results& stage1_results, const KMC::Stage2Results& stage2_results, const std::size_t max_memory): - stage1_results(stage1_results), - stage2_results(stage2_results), - max_memory_(max_memory) - {} - - - uint64_t seq_count() const - { - return stage1_results.nSeqences; - } - - - uint64_t seq_len() const - { - return total_kmer_count() + (seq_count() * (k - 1)); - } - - - uint64_t total_kmer_count() const - { - return stage2_results.nTotalKmers; - } - - - uint64_t unique_kmer_count() const - { - return stage2_results.nUniqueKmers; - } - - - uint64_t below_min_cutoff_kmer_count() const - { - return stage2_results.nBelowCutoffMin; - } - - - uint64_t above_max_cutoff_kmer_count() const - { - return stage2_results.nAboveCutoffMax; - } - - - uint64_t counted_kmer_count() const - { - return unique_kmer_count() - (below_min_cutoff_kmer_count() + above_max_cutoff_kmer_count()); - } - - - std::size_t max_memory() const - { - return max_memory_; - } - - - std::size_t temp_disk_usage() const - { - return stage2_results.maxDiskUsage; - } - - - void log_stats() const - { - std::cout << k << "-mer enumeration statistics:\n"; - - std::cout << "\tNumber of sequences:\t" << seq_count() << ".\n"; - std::cout << "\tTotal sequence length:\t" << seq_len() << ".\n"; - std::cout << "\tTotal number of " << k << "-mers:\t" << total_kmer_count() << ".\n"; - std::cout << "\tNumber of unique " << k << "-mers:\t" << unique_kmer_count() << ".\n"; - std::cout << "\tNumber of counted " << k << "-mers:\t" << counted_kmer_count() << ".\n"; - } -}; - // A class to display progress of the k-mer enumeration execution. 
class FunnyProgress: public KMC::IPercentProgressObserver diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8e357b92..81204833 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -27,6 +27,7 @@ set(PROJECT_SRC CdBG_GFA_Writer.cpp CdBG_GFA_Reduced_Writer.cpp kmer_Enumerator.cpp + kmer_Enumeration_Stats.cpp State_Read_Space.cpp Read_CdBG.cpp Read_CdBG_Constructor.cpp diff --git a/src/kmer_Enumeration_Stats.cpp b/src/kmer_Enumeration_Stats.cpp new file mode 100644 index 00000000..0efa6053 --- /dev/null +++ b/src/kmer_Enumeration_Stats.cpp @@ -0,0 +1,94 @@ + +#include + +#include "kmer_Enumeration_Stats.hpp" +#include "globals.hpp" + + +template +kmer_Enumeration_Stats::kmer_Enumeration_Stats(const KMC::Stage1Results& stage1_results, const KMC::Stage2Results& stage2_results, const std::size_t max_memory): + stage1_results(stage1_results), + stage2_results(stage2_results), + max_memory_(max_memory) +{} + + +template +uint64_t kmer_Enumeration_Stats::seq_count() const +{ + return stage1_results.nSeqences; +} + + +template +uint64_t kmer_Enumeration_Stats::seq_len() const +{ + return total_kmer_count() + (seq_count() * (k - 1)); +} + + +template +uint64_t kmer_Enumeration_Stats::total_kmer_count() const +{ + return stage2_results.nTotalKmers; +} + + +template +uint64_t kmer_Enumeration_Stats::unique_kmer_count() const +{ + return stage2_results.nUniqueKmers; +} + + +template +uint64_t kmer_Enumeration_Stats::below_min_cutoff_kmer_count() const +{ + return stage2_results.nBelowCutoffMin; +} + + +template +uint64_t kmer_Enumeration_Stats::above_max_cutoff_kmer_count() const +{ + return stage2_results.nAboveCutoffMax; +} + + +template +uint64_t kmer_Enumeration_Stats::counted_kmer_count() const +{ + return unique_kmer_count() - (below_min_cutoff_kmer_count() + above_max_cutoff_kmer_count()); +} + + +template +std::size_t kmer_Enumeration_Stats::max_memory() const +{ + return max_memory_; +} + + +template +std::size_t kmer_Enumeration_Stats::temp_disk_usage() const +{ + return stage2_results.maxDiskUsage; +} + + +template +void kmer_Enumeration_Stats::log_stats() const +{ + std::cout << k << "-mer enumeration statistics:\n"; + + std::cout << "\tNumber of sequences:\t" << seq_count() << ".\n"; + std::cout << "\tTotal sequence length:\t" << seq_len() << ".\n"; + std::cout << "\tTotal number of " << k << "-mers:\t" << total_kmer_count() << ".\n"; + std::cout << "\tNumber of unique " << k << "-mers:\t" << unique_kmer_count() << ".\n"; + std::cout << "\tNumber of counted " << k << "-mers:\t" << counted_kmer_count() << ".\n"; +} + + + +// Template instantiations for the required instances. 
+ENUMERATE(INSTANCE_COUNT, INSTANTIATE_ALL, kmer_Enumeration_Stats) From 0ba9ffbe725237d942b2a2cf30d9e5ff6a8abbb4 Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 26 Oct 2021 19:37:50 -0400 Subject: [PATCH 232/350] Skip temp o/p file and move --- src/Read_CdBG.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index fdf60295..457852cc 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -196,16 +196,13 @@ template void Read_CdBG::extract_maximal_unitigs() { Read_CdBG_Extractor cdBg_extractor(params, *hash_table); - const std::string temp_output_path = params.working_dir_path() + filename(params.output_prefix()) + cuttlefish::file_ext::temp; const std::string output_file_path = params.output_file_path(); - cdBg_extractor.extract_maximal_unitigs(vertex_db_path(), temp_output_path); + cdBg_extractor.extract_maximal_unitigs(vertex_db_path(), output_file_path); dbg_info.add_unipaths_info(cdBg_extractor); - if(!extract_DCCs(temp_output_path) && params.dcc_opt()) + if(!extract_DCCs(output_file_path) && params.dcc_opt()) hash_table->save(params); - - move_file(temp_output_path, output_file_path); } @@ -297,10 +294,10 @@ template const std::string Read_CdBG::vertex_db_path() const { #ifdef CF_DEVELOP_MODE - return params.vertex_db_path().empty() ? (params.output_prefix() + cuttlefish::file_ext::vertices_ext) : params.vertex_db_path(); + return params.vertex_db_path().empty() ? (params.working_dir_path() + filename(params.output_prefix()) + cuttlefish::file_ext::vertices_ext) : params.vertex_db_path(); #endif - return params.output_prefix() + cuttlefish::file_ext::vertices_ext; + return params.working_dir_path() + filename(params.output_prefix()) + cuttlefish::file_ext::vertices_ext; } From 38672643df7ad07f2c8c9d92787b5b8d6e0e7c81 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 27 Oct 2021 19:25:51 -0400 Subject: [PATCH 233/350] Misc. 
changes in k-mers reading --- include/kmc_api/kmc_file.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/include/kmc_api/kmc_file.h b/include/kmc_api/kmc_file.h index 31d90fd3..70c9b8a0 100644 --- a/include/kmc_api/kmc_file.h +++ b/include/kmc_api/kmc_file.h @@ -509,13 +509,11 @@ inline uint64_t CKMC_DB::read_raw_suffixes(uint8_t* const suff_buf, std::vector< const uint64_t suff_to_read = suff_id_next - sufix_number; if(suff_to_read > 0) { - const uint64_t prev_sufix_number = sufix_number; - if(suff_read_count + suff_to_read <= max_suff_count) { - suff_read_count += suff_to_read; sufix_number += suff_to_read; - pref_buf.emplace_back(prefix_index, sufix_number - prev_sufix_number); + pref_buf.emplace_back(prefix_index, suff_to_read); + suff_read_count += suff_to_read; if(sufix_number == total_kmers) end_of_file = true; @@ -523,8 +521,8 @@ inline uint64_t CKMC_DB::read_raw_suffixes(uint8_t* const suff_buf, std::vector< else { sufix_number += (max_suff_count - suff_read_count); + pref_buf.emplace_back(prefix_index, max_suff_count - suff_read_count); suff_read_count = max_suff_count; - pref_buf.emplace_back(prefix_index, sufix_number - prev_sufix_number); break; } From 7dec684fa47407fe80522d9c18cdc04641be23c6 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 28 Oct 2021 19:18:41 -0400 Subject: [PATCH 234/350] Change handle type for fasta seq ref -> ptr --- include/FASTA_Record.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/FASTA_Record.hpp b/include/FASTA_Record.hpp index b1499889..22105041 100644 --- a/include/FASTA_Record.hpp +++ b/include/FASTA_Record.hpp @@ -17,7 +17,7 @@ class FASTA_Record private: const T_id_ id_; // Identifier for the FASTA sequence. - const T_seq_& seq_; // The FASTA sequence. + const T_seq_* const seq_; // Pointer to the FASTA sequence. public: @@ -51,7 +51,7 @@ class FASTA_Record template inline FASTA_Record::FASTA_Record(const uint64_t id, const T_seq_& seq): id_(id), - seq_(seq) + seq_(&seq) {} @@ -65,7 +65,7 @@ inline std::size_t FASTA_Record::header_size() const template inline std::size_t FASTA_Record::seq_size() const { - return seq_.size(); + return seq_->size(); } @@ -82,7 +82,7 @@ template inline void FASTA_Record::append_seq(std::vector& buffer) const { // `std::memcpy` at the end of `buffer` does not update the size of the vector `buffer`. - buffer.insert(buffer.end(), seq_.begin(), seq_.end()); + buffer.insert(buffer.end(), seq_->begin(), seq_->end()); } @@ -90,8 +90,8 @@ template template inline void FASTA_Record::append_rotated_cycle(std::vector& buffer, const std::size_t pivot) const { - buffer.insert(buffer.end(), seq_.begin() + pivot, seq_.end()); - buffer.insert(buffer.end(), seq_.begin() + k - 1, seq_.begin() + k - 1 + pivot); + buffer.insert(buffer.end(), seq_->begin() + pivot, seq_->end()); + buffer.insert(buffer.end(), seq_->begin() + k - 1, seq_->begin() + k - 1 + pivot); } From be50c27b5244bcfa9ef54bb090d66d2a93716a46 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 28 Oct 2021 20:58:09 -0400 Subject: [PATCH 235/350] Add provision for two seq containers in o/p record --- include/FASTA_Record.hpp | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/include/FASTA_Record.hpp b/include/FASTA_Record.hpp index 22105041..ee602c6b 100644 --- a/include/FASTA_Record.hpp +++ b/include/FASTA_Record.hpp @@ -18,6 +18,7 @@ class FASTA_Record const T_id_ id_; // Identifier for the FASTA sequence. 
const T_seq_* const seq_; // Pointer to the FASTA sequence. + const T_seq_* const seq_add_; // Additional FASTA sequence (in case the original sequence `*seq` is broken into two parts). public: @@ -27,6 +28,12 @@ class FASTA_Record // correctness holds as long as the referred sequence itself remains unaltered. FASTA_Record(uint64_t id, const T_seq_& str); + // Constructs a FASTA header with identifier `id`, along with the sequences + // `seq` and `seq_add`. Only constant references to the sequences are captured, + // so the record's correctness holds as long as the referred sequences themselves + // remains unaltered. + FASTA_Record(uint64_t id, const T_seq_* seq, const T_seq_* seq_add = nullptr); + // Returns the length of the header line of the record. std::size_t header_size() const; @@ -51,7 +58,16 @@ class FASTA_Record template inline FASTA_Record::FASTA_Record(const uint64_t id, const T_seq_& seq): id_(id), - seq_(&seq) + seq_(&seq), + seq_add_(nullptr) +{} + + +template +inline FASTA_Record::FASTA_Record(const uint64_t id, const T_seq_* const seq, const T_seq_* const seq_add): + id_(id), + seq_(seq), + seq_add_(seq_add) {} @@ -65,7 +81,7 @@ inline std::size_t FASTA_Record::header_size() const template inline std::size_t FASTA_Record::seq_size() const { - return seq_->size(); + return seq_->size() + (seq_add_ != nullptr ? seq_add_->size() : 0); } @@ -83,6 +99,8 @@ inline void FASTA_Record::append_seq(std::vector& buffer) c { // `std::memcpy` at the end of `buffer` does not update the size of the vector `buffer`. buffer.insert(buffer.end(), seq_->begin(), seq_->end()); + if(seq_add_ != nullptr) + buffer.insert(buffer.end(), seq_add_->begin(), seq_add_->end()); } From fde4df4c7ca55189f370226a7679daf23b74cd77 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sat, 30 Oct 2021 12:31:59 -0400 Subject: [PATCH 236/350] Resolve potential mult-def prob --- include/Read_CdBG_Extractor.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 1b48df3e..10b1b019 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -274,7 +274,7 @@ inline void Read_CdBG_Extractor::reverse_complement(T_container_& seq) template -void Read_CdBG_Extractor::mark_path(const std::vector& path_hashes) +inline void Read_CdBG_Extractor::mark_path(const std::vector& path_hashes) { for(const uint64_t hash: path_hashes) hash_table.update(hash, State_Read_Space::get_outputted_state()); From 8e6558287ed7fe3e514f468c91b7a5cbaa05c47e Mon Sep 17 00:00:00 2001 From: jamshed Date: Sat, 30 Oct 2021 13:07:59 -0400 Subject: [PATCH 237/350] Restruct fasta-rec ctrs --- include/FASTA_Record.hpp | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/include/FASTA_Record.hpp b/include/FASTA_Record.hpp index ee602c6b..5a0a3efd 100644 --- a/include/FASTA_Record.hpp +++ b/include/FASTA_Record.hpp @@ -21,6 +21,13 @@ class FASTA_Record const T_seq_* const seq_add_; // Additional FASTA sequence (in case the original sequence `*seq` is broken into two parts). + // Constructs a FASTA header with identifier `id`, along with the sequences + // `seq` and `seq_add`. Only constant references to the sequences are captured, + // so the record's correctness holds as long as the referred sequences themselves + // remains unaltered. + FASTA_Record(uint64_t id, const T_seq_* seq, const T_seq_* seq_add = nullptr); + + public: // Constructs a FASTA header with identifier `id` and the sequence `seq`. 
@@ -32,7 +39,7 @@ class FASTA_Record // `seq` and `seq_add`. Only constant references to the sequences are captured, // so the record's correctness holds as long as the referred sequences themselves // remains unaltered. - FASTA_Record(uint64_t id, const T_seq_* seq, const T_seq_* seq_add = nullptr); + FASTA_Record(uint64_t id, const T_seq_& seq, const T_seq_& seq_add); // Returns the length of the header line of the record. std::size_t header_size() const; @@ -56,10 +63,12 @@ class FASTA_Record template -inline FASTA_Record::FASTA_Record(const uint64_t id, const T_seq_& seq): - id_(id), - seq_(&seq), - seq_add_(nullptr) +inline FASTA_Record::FASTA_Record(const uint64_t id, const T_seq_& seq): FASTA_Record(id, &seq) +{} + + +template +inline FASTA_Record::FASTA_Record(const uint64_t id, const T_seq_& seq, const T_seq_& seq_add): FASTA_Record(id, &seq, &seq_add) {} From d11067bca76cd859a121b86a2b1d9bde09a03831 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sat, 30 Oct 2021 13:27:57 -0400 Subject: [PATCH 238/350] Be able to skip fasta-seq prefix --- include/FASTA_Record.hpp | 41 +++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/include/FASTA_Record.hpp b/include/FASTA_Record.hpp index 5a0a3efd..7ba8e69f 100644 --- a/include/FASTA_Record.hpp +++ b/include/FASTA_Record.hpp @@ -6,6 +6,8 @@ #include "fmt/format.h" +#include + // ============================================================================= // A class wrapping a basic FASTA record: the sequence of type `T_seq_` and its @@ -19,27 +21,32 @@ class FASTA_Record const T_id_ id_; // Identifier for the FASTA sequence. const T_seq_* const seq_; // Pointer to the FASTA sequence. const T_seq_* const seq_add_; // Additional FASTA sequence (in case the original sequence `*seq` is broken into two parts). + const std::size_t offset_; // Offset position into the sequence `seq_`—data before this index will be skipped in the record. + const std::size_t offset_add_; // Offset position into the additional sequence `seq_add`—data before this index will be skipped in the record. // Constructs a FASTA header with identifier `id`, along with the sequences - // `seq` and `seq_add`. Only constant references to the sequences are captured, + // `seq` and `seq_add` (onward their indices `offset` and `offset_add`, + // respectively). Only constant references to the sequences are captured, // so the record's correctness holds as long as the referred sequences themselves // remains unaltered. - FASTA_Record(uint64_t id, const T_seq_* seq, const T_seq_* seq_add = nullptr); + FASTA_Record(uint64_t id, const T_seq_* seq, const T_seq_* seq_add, std::size_t offset = 0, std::size_t offset_add = 0); public: - // Constructs a FASTA header with identifier `id` and the sequence `seq`. - // Only a constant reference to the sequence is captured, so the record's - // correctness holds as long as the referred sequence itself remains unaltered. - FASTA_Record(uint64_t id, const T_seq_& str); + // Constructs a FASTA header with identifier `id` and the sequence `seq` + // (onward its index `offset`). Only a constant reference to the sequence + // is captured, so the record's correctness holds as long as the referred + // sequence itself remains unaltered. + FASTA_Record(uint64_t id, const T_seq_& str, std::size_t offset = 0); // Constructs a FASTA header with identifier `id`, along with the sequences - // `seq` and `seq_add`. 
Only constant references to the sequences are captured, + // `seq` and `seq_add` (onward their indices `offset` and `offset_add`, + // respectively). Only constant references to the sequences are captured, // so the record's correctness holds as long as the referred sequences themselves // remains unaltered. - FASTA_Record(uint64_t id, const T_seq_& seq, const T_seq_& seq_add); + FASTA_Record(uint64_t id, const T_seq_& seq, const T_seq_& seq_add, std::size_t offset = 0, std::size_t offset_add = 0); // Returns the length of the header line of the record. std::size_t header_size() const; @@ -63,20 +70,24 @@ class FASTA_Record template -inline FASTA_Record::FASTA_Record(const uint64_t id, const T_seq_& seq): FASTA_Record(id, &seq) +inline FASTA_Record::FASTA_Record(const uint64_t id, const T_seq_& seq, const std::size_t offset): + FASTA_Record(id, &seq, nullptr, offset) {} template -inline FASTA_Record::FASTA_Record(const uint64_t id, const T_seq_& seq, const T_seq_& seq_add): FASTA_Record(id, &seq, &seq_add) +inline FASTA_Record::FASTA_Record(const uint64_t id, const T_seq_& seq, const T_seq_& seq_add, const std::size_t offset, const std::size_t offset_add): + FASTA_Record(id, &seq, &seq_add, offset, offset_add) {} template -inline FASTA_Record::FASTA_Record(const uint64_t id, const T_seq_* const seq, const T_seq_* const seq_add): +inline FASTA_Record::FASTA_Record(const uint64_t id, const T_seq_* const seq, const T_seq_* const seq_add, const std::size_t offset, const std::size_t offset_add): id_(id), seq_(seq), - seq_add_(seq_add) + seq_add_(seq_add), + offset_(offset), + offset_add_(offset_add) {} @@ -90,7 +101,7 @@ inline std::size_t FASTA_Record::header_size() const template inline std::size_t FASTA_Record::seq_size() const { - return seq_->size() + (seq_add_ != nullptr ? seq_add_->size() : 0); + return (seq_->size() - offset_) + (seq_add_ != nullptr ? (seq_add_->size() - offset_add_) : 0); } @@ -107,9 +118,9 @@ template inline void FASTA_Record::append_seq(std::vector& buffer) const { // `std::memcpy` at the end of `buffer` does not update the size of the vector `buffer`. - buffer.insert(buffer.end(), seq_->begin(), seq_->end()); + buffer.insert(buffer.end(), seq_->begin() + offset_, seq_->end()); if(seq_add_ != nullptr) - buffer.insert(buffer.end(), seq_add_->begin(), seq_add_->end()); + buffer.insert(buffer.end(), seq_add_->begin() + offset_add_, seq_add_->end()); } From 26a41fcaf85cbe58fad86453e2d0d94f4554a05d Mon Sep 17 00:00:00 2001 From: jamshed Date: Sat, 30 Oct 2021 14:13:36 -0400 Subject: [PATCH 239/350] Delegate unipath line-break to buffer --- include/Character_Buffer.hpp | 5 +++-- src/Read_CdBG_Extractor.cpp | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/Character_Buffer.hpp b/include/Character_Buffer.hpp index 79f95685..afe193aa 100644 --- a/include/Character_Buffer.hpp +++ b/include/Character_Buffer.hpp @@ -133,11 +133,12 @@ template template inline void Character_Buffer::operator+=(const FASTA_Record& fasta_rec) { - ensure_space(fasta_rec.header_size() + 1 + fasta_rec.seq_size()); // 1 extra byte for the line-break. + ensure_space(fasta_rec.header_size() + 1 + fasta_rec.seq_size() + 1); // Two extra bytes for the line-breaks. fasta_rec.append_header(buffer); // Append the header. buffer.emplace_back('\n'); // Break line. fasta_rec.append_seq(buffer); // Append the sequence. + buffer.emplace_back('\n'); // Break line. 
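// To see how the reserved space lines up with what gets appended: a record built from two
// label pieces contributes `header_size() + 1 + seq_size() + 1` bytes, where `seq_size()`
// already discounts the per-piece offsets. The following is a self-contained sketch with
// plain `std::string` pieces and hypothetical values; the real code works on the templated
// containers and the fmt-formatted header.

#include <cstddef>
#include <string>
#include <vector>
#include <iostream>

// Mirrors `append_seq`: copies the suffix of `seq` starting at `offset` into `buf`.
static void append_piece(std::vector<char>& buf, const std::string& seq, std::size_t offset)
{
    buf.insert(buf.end(), seq.begin() + offset, seq.end());
}

int main()
{
    const std::string header = ">7";          // Hypothetical record header.
    const std::string piece_1 = "ACGTACGT";   // Hypothetical first label piece.
    const std::string piece_2 = "ACGTTTTT";   // Hypothetical second piece; its first 4 symbols repeat piece_1's tail.
    const std::size_t offset_2 = 4;           // Skip that repeated prefix when emitting.

    // Space needed: header + '\n' + both pieces past their offsets + '\n'.
    const std::size_t bytes = header.size() + 1 + (piece_1.size() + (piece_2.size() - offset_2)) + 1;

    std::vector<char> buf;
    buf.reserve(bytes);
    buf.insert(buf.end(), header.begin(), header.end());
    buf.push_back('\n');
    append_piece(buf, piece_1, 0);
    append_piece(buf, piece_2, offset_2);
    buf.push_back('\n');

    std::cout << std::string(buf.begin(), buf.end());   // ">7\nACGTACGTTTTT\n"
    return (buf.size() == bytes ? 0 : 1);
}
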
} @@ -145,7 +146,7 @@ template template inline void Character_Buffer::rotate_append_cycle(const FASTA_Record& fasta_rec, const std::size_t pivot) { - ensure_space(fasta_rec.header_size() + 1 + fasta_rec.seq_size() + 1); // 2 extra bytes for two line-breaks. + ensure_space(fasta_rec.header_size() + 1 + fasta_rec.seq_size() + 1); // Two extra bytes for two line-breaks. fasta_rec.append_header(buffer); // Append the header. buffer.emplace_back('\n'); // Break line. diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 3e2653d1..70f1c558 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -111,7 +111,7 @@ void Read_CdBG_Extractor::scan_vertices(Kmer_SPMC_Iterator* const vertex_p { extracted_unipaths_info.add_maximal_unitig(unipath); - unipath.emplace_back('\n'); + // unipath.emplace_back('\n'); // output_buffer += unipath; output_buffer += FASTA_Record>(id, unipath); // unipath.clear(); From 4805a5e6db4f8563694f9b25f8175a053dd65c64 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sat, 30 Oct 2021 14:35:27 -0400 Subject: [PATCH 240/350] Be able to build vertex from k-mer --- include/Directed_Vertex.hpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/Directed_Vertex.hpp b/include/Directed_Vertex.hpp index 21022cc9..37e69f26 100644 --- a/include/Directed_Vertex.hpp +++ b/include/Directed_Vertex.hpp @@ -49,6 +49,10 @@ class Directed_Vertex // Returns `true` iff the k-mer observed for the vertex is in its canonical form. bool in_canonical_form() const; + // Configures the vertex with the k-mer `v`, and uses the hash table `hash` to get the + // hash value of the vertex. + void from_kmer(const Kmer& v, const Kmer_Hash_Table& hash); + // Configures the vertex with the source (i.e. prefix) k-mer of the edge (k + 1)-mer `e`; // and uses the hash table `hash` to get the hash value of the vertex. void from_prefix(const Kmer& e, const Kmer_Hash_Table& hash); @@ -137,6 +141,14 @@ inline bool Directed_Vertex::in_canonical_form() const } +template +inline void Directed_Vertex::from_kmer(const Kmer& v, const Kmer_Hash_Table& hash) +{ + kmer_ = v; + init(hash); +} + + template inline void Directed_Vertex::from_prefix(const Kmer& e, const Kmer_Hash_Table& hash) { From e32863d99fb2f3720fd5f41e826d12c1100e7af9 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sat, 30 Oct 2021 14:49:08 -0400 Subject: [PATCH 241/350] Add branching-ness checker for states --- include/State_Read_Space.hpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/State_Read_Space.hpp b/include/State_Read_Space.hpp index c07eaddb..7c29c77f 100644 --- a/include/State_Read_Space.hpp +++ b/include/State_Read_Space.hpp @@ -67,6 +67,10 @@ class State_Read_Space // `side` of a vertex having this state. cuttlefish::edge_encoding_t edge_at(cuttlefish::side_t side) const; + // Returns `true` iff some vertex having this state is branching (i.e. has + // multiple incident edges) at its side `side`. + bool is_branching_side(cuttlefish::side_t side) const; + // Updates the `Extended_Base` encoding of the side `side` of this state, with // `edge`. For optimization purposes, only certain edge-updates have defined // behavior: empty-to-rest and unique-to-multi. 
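The `is_branching_side()` check introduced here works because the per-side encoding collapses to `N` as soon as a second distinct edge is observed on that side, so branching detection reduces to a single comparison. A simplified, self-contained model of that folding follows; the enum values mirror `DNA::Extended_Base`, but the `fold` function is only an illustration of the idea, not the actual DFA transition code.

#include <cstdint>
#include <cassert>

// Simplified stand-ins for the per-side edge encodings (E = none, A..T = one unique edge, N = multiple).
enum class Edge : uint8_t { E = 0, A = 1, C = 2, G = 3, T = 4, N = 7 };

// Folds a newly observed edge `e` into the current encoding `cur` of a vertex side.
static Edge fold(Edge cur, Edge e)
{
    if(cur == Edge::E)  return e;      // First edge seen on this side.
    if(cur == e)        return cur;    // Same edge again: still unique.
    return Edge::N;                    // A different edge: the side becomes branching.
}

static bool is_branching(Edge cur) { return cur == Edge::N; }

int main()
{
    Edge side = Edge::E;
    side = fold(side, Edge::C);
    assert(!is_branching(side));       // One incident edge: not branching.
    side = fold(side, Edge::C);
    assert(!is_branching(side));       // Repeated edge: still a unique encoding.
    side = fold(side, Edge::T);
    assert(is_branching(side));        // Second distinct edge: branching.
    return 0;
}
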
@@ -123,6 +127,12 @@ inline cuttlefish::edge_encoding_t State_Read_Space::edge_at(const cuttlefish::s } +inline bool State_Read_Space::is_branching_side(const cuttlefish::side_t side) const +{ + return edge_at(side) == cuttlefish::edge_encoding_t::N; +} + + inline void State_Read_Space::update_edge_at(const cuttlefish::side_t side, const cuttlefish::edge_encoding_t edge) { side == cuttlefish::side_t::front ? set_front_encoding(edge) : set_back_encoding(edge); From 680112efbdbd760752056ab50eaa853452248b28 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sat, 30 Oct 2021 15:43:09 -0400 Subject: [PATCH 242/350] Separate some dBG utils --- include/Read_CdBG_Extractor.hpp | 26 ------------- include/dBG_Utilities.hpp | 61 +++++++++++++++++++++++++++++++ src/CMakeLists.txt | 1 + src/Detached_Cycles_Extractor.cpp | 3 +- src/Read_CdBG_Extractor.cpp | 2 +- src/dBG_Utilities.cpp | 2 + 6 files changed, 67 insertions(+), 28 deletions(-) create mode 100644 include/dBG_Utilities.hpp create mode 100644 src/dBG_Utilities.cpp diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 10b1b019..21b323c6 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -131,10 +131,6 @@ class Read_CdBG_Extractor // Closes the output sink. void close_output_sink(); - // Replaces the character sequence `seq` in-place with its reverse complement. - template - static void reverse_complement(T_container_& seq); - // Note: The following methods are only applicable when the heuristic of information-discarding // from branching vertices to their neighbors has been implemented in the DFA states computation // phase. In the general case, these functions with their specified input parameters and their @@ -251,28 +247,6 @@ inline bool Read_CdBG_Extractor::is_flanking_side(const State_Read_Space stat } -template -template -inline void Read_CdBG_Extractor::reverse_complement(T_container_& seq) -{ - assert(!seq.empty()); - - auto fwd = seq.begin(); - auto bwd = seq.end() - 1; - - for(; fwd < bwd; ++fwd, --bwd) - { - std::swap(*fwd, *bwd); - - *fwd = DNA_Utility::complement(*fwd), - *bwd = DNA_Utility::complement(*bwd); - } - - if(fwd == bwd) - *fwd = DNA_Utility::complement(*fwd); -} - - template inline void Read_CdBG_Extractor::mark_path(const std::vector& path_hashes) { diff --git a/include/dBG_Utilities.hpp b/include/dBG_Utilities.hpp new file mode 100644 index 00000000..39e1fc9a --- /dev/null +++ b/include/dBG_Utilities.hpp @@ -0,0 +1,61 @@ + +#ifndef DBG_UTILITIES_HPP +#define DBG_UTILITIES_HPP + + + +#include "DNA_Utility.hpp" +#include "globals.hpp" + + +// ============================================================================= +namespace cuttlefish +{ + // Returns `true` iff the edge encoding `e` is fuzzy, i.e. a unique encoding + // is not known for the corresponding edge(s). + bool is_fuzzy_edge(const edge_encoding_t e); + + // Returns the opposite (or complement) side of the vertex-side `s`. + side_t opposite_side(const side_t s); + + + // Replaces the sequence `seq` in-place with its reverse complement. + template void reverse_complement(T_container_& seq); +} + + +inline bool cuttlefish::is_fuzzy_edge(const edge_encoding_t e) +{ + return e == edge_encoding_t::N || e == edge_encoding_t::E; +} + + +inline cuttlefish::side_t cuttlefish::opposite_side(const side_t s) +{ + return s == side_t::back ? 
side_t::front : side_t::back; +} + + +template +inline void cuttlefish::reverse_complement(T_container_& seq) +{ + assert(!seq.empty()); + + auto fwd = seq.begin(); + auto bwd = seq.end() - 1; + + for(; fwd < bwd; ++fwd, --bwd) + { + std::swap(*fwd, *bwd); + + *fwd = DNA_Utility::complement(*fwd), + *bwd = DNA_Utility::complement(*bwd); + } + + if(fwd == bwd) + *fwd = DNA_Utility::complement(*fwd); +} + + + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 81204833..6a1d8da9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -34,6 +34,7 @@ set(PROJECT_SRC Read_CdBG_Extractor.cpp Unipaths_Meta_info.cpp Detached_Cycles_Extractor.cpp + dBG_Utilities.cpp Character_Buffer_Flusher.cpp Progress_Tracker.cpp dBG_Info.cpp diff --git a/src/Detached_Cycles_Extractor.cpp b/src/Detached_Cycles_Extractor.cpp index 348a5703..d5cdf484 100644 --- a/src/Detached_Cycles_Extractor.cpp +++ b/src/Detached_Cycles_Extractor.cpp @@ -2,6 +2,7 @@ #include "Read_CdBG_Extractor.hpp" #include "Read_CdBG.hpp" #include "Kmer_SPMC_Iterator.hpp" +#include "dBG_Utilities.hpp" #include "FASTA_Record.hpp" #include "Character_Buffer.hpp" #include "Thread_Pool.hpp" @@ -275,7 +276,7 @@ bool Read_CdBG_Extractor::extract_cycle(const Kmer& v_hat, uint64_t& id, s if(!sign_vertex.in_canonical_form()) { - reverse_complement(cycle); + cuttlefish::reverse_complement(cycle); pivot = (cycle.size() - 1) - (pivot + k - 1); } diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 70f1c558..b7e1c087 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -191,7 +191,7 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const return false; if(!in_canonical) - reverse_complement(unipath); + cuttlefish::reverse_complement(unipath); id = sign_vertex.hash(); diff --git a/src/dBG_Utilities.cpp b/src/dBG_Utilities.cpp new file mode 100644 index 00000000..89cff8fe --- /dev/null +++ b/src/dBG_Utilities.cpp @@ -0,0 +1,2 @@ + +#include "dBG_Utilities.hpp" From 1be4642999aa9de69668749a4342190e4eeb4ee9 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sat, 30 Oct 2021 16:05:36 -0400 Subject: [PATCH 243/350] Fix develop mode compilation --- src/Read_CdBG.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 457852cc..a690bdae 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -50,11 +50,11 @@ void Read_CdBG::construct() if(params.edge_db_path().empty()) { - kmer_Enumeration_Stats edge_stats = enumerate_edges(); - kmer_Enumeration_Stats vertex_stats = enumerate_vertices(edge_stats.max_memory()); + kmer_Enumeration_Stats edge_stats = enumerate_edges(); + kmer_Enumeration_Stats vertex_stats = enumerate_vertices(edge_stats.max_memory()); - edge_count = edge_stats.kmer_count(); - vertex_count = vertex_stats.kmer_count(); + edge_count = edge_stats.counted_kmer_count(); + vertex_count = vertex_stats.counted_kmer_count(); } else if(!params.vertex_db_path().empty()) { From ec63c75c70752b936cdbaca81dd9d15c50f5018f Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 31 Oct 2021 00:12:17 -0400 Subject: [PATCH 244/350] Add unitig scratches w/ functionality --- include/Maximal_Unitig_Scratch.hpp | 135 +++++++++++++++++++++++++++++ include/Unitig_Scratch.hpp | 128 +++++++++++++++++++++++++++ src/Maximal_Unitig_Scratch.cpp | 12 +++ src/Unitig_Scratch.cpp | 15 ++++ 4 files changed, 290 insertions(+) create mode 100644 include/Maximal_Unitig_Scratch.hpp create mode 100644 include/Unitig_Scratch.hpp create mode 100644 
src/Maximal_Unitig_Scratch.cpp create mode 100644 src/Unitig_Scratch.cpp diff --git a/include/Maximal_Unitig_Scratch.hpp b/include/Maximal_Unitig_Scratch.hpp new file mode 100644 index 00000000..2dc447ca --- /dev/null +++ b/include/Maximal_Unitig_Scratch.hpp @@ -0,0 +1,135 @@ + +#ifndef MAXIMAL_UNITIG_SCRATCH_HPP +#define MAXIMAL_UNITIG_SCRATCH_HPP + + + +#include "Unitig_Scratch.hpp" +#include "FASTA_Record.hpp" +#include "globals.hpp" + +#include +#include + + +// ============================================================================= +// A class to keep scratch data for building maximal unitigs from two of its +// constituent unitigs that cover it and overlap at a meeting-point vertex. +// That is, the maximal unitig is split into two unitigs `u_b` and `u_f`, at +// some vertex `v`—`u_b` and `u_f` are connected to the front and to the back +// of `v`, respectively. The unitigs are built such that the paths start from +// `v`. Thus, the maximal unitig in literal form is `\bar(u_f) \glue_k u_b` +// (or its reverse complement). +template +class Maximal_Unitig_Scratch +{ +private: + + Unitig_Scratch unitig_back; // The unitig `u_b` (see note above class body). + Unitig_Scratch unitig_front; // The unitig `u_f` (see note above class body). + + uint64_t id_; // The unique ID of the maximal unitig. + + + // Returns whether the maximal unitig is in canonical form. + bool is_canonical() const; + + +public: + + // Constructs an empty scratch space for the unitig. + Maximal_Unitig_Scratch(); + + // Returns the unitig scratch `u_b` or `u_f`, based on `s` (see note above + // class body). + Unitig_Scratch& unitig(const cuttlefish::side_t s); + + // Returns the unique ID of the maximal unitig. + uint64_t id() const; + + // Returns the hashes of the vertices of the unitig at side `s`. + const std::vector& unitig_hash(cuttlefish::side_t s) const; + + // Returns the count of vertices in the maximal unitig. + std::size_t size() const; + + // Returns the signature vertex of the maximal unitig, which is the first + // vertex in the canonical form of the unitig. + const Directed_Vertex& sign_vertex() const; + + // Signals the scratch that the unitig pieces `u_b` and `u_f` are in their + // final forms and will not be modified anymore. So it restructures the + // maximal unitig so as to put its label in canonical form and sets its + // unique ID. + void finalize(); + + // Returns a FASTA record of the maximal unitig (in canonical form). + const FASTA_Record> fasta_rec() const; +}; + + +template +inline Unitig_Scratch& Maximal_Unitig_Scratch::unitig(const cuttlefish::side_t s) +{ + return s == cuttlefish::side_t::back ? unitig_back : unitig_front; +} + + +template +inline bool Maximal_Unitig_Scratch::is_canonical() const +{ + return unitig_front.endpoint().kmer_bar() < unitig_back.endpoint().kmer_bar(); +} + + +template +inline uint64_t Maximal_Unitig_Scratch::id() const +{ + return id_; +} + + +template +inline const std::vector& Maximal_Unitig_Scratch::unitig_hash(const cuttlefish::side_t s) const +{ + return (s == cuttlefish::side_t::back ? unitig_back.hash() : unitig_front.hash()); +} + + +template +inline std::size_t Maximal_Unitig_Scratch::size() const +{ + return unitig_back.size() + unitig_front.size() - 1; +} + + +template +inline const Directed_Vertex& Maximal_Unitig_Scratch::sign_vertex() const +{ + return is_canonical() ? 
unitig_front.endpoint() : unitig_back.endpoint(); +} + + +template +inline void Maximal_Unitig_Scratch::finalize() +{ + if(is_canonical()) + id_ = unitig_front.endpoint().hash(), + unitig_front.rev_compl_label(); + else + id_ = unitig_back.endpoint().hash(), + unitig_back.rev_compl_label(); +} + + +template +inline const FASTA_Record> Maximal_Unitig_Scratch::fasta_rec() const +{ + return is_canonical() ? + FASTA_Record>(id(), unitig_front.label(), unitig_back.label(), 0, k) : + FASTA_Record>(id(), unitig_back.label(), unitig_front.label(), 0, k); +} + + + +#endif diff --git a/include/Unitig_Scratch.hpp b/include/Unitig_Scratch.hpp new file mode 100644 index 00000000..5263cb58 --- /dev/null +++ b/include/Unitig_Scratch.hpp @@ -0,0 +1,128 @@ + +#ifndef UNITIG_SCRATCH_HPP +#define UNITIG_SCRATCH_HPP + + + +#include "Directed_Vertex.hpp" +#include "dBG_Utilities.hpp" + +#include +#include + + +// ============================================================================= +// A class to keep scratch data (i.e. working space) for unitigs. +template +class Unitig_Scratch +{ +private: + + // 100K (soft limit) unitig vertices can be retained in memory, at most, before reallocations. + static constexpr std::size_t BUFF_SZ = 100 * 1024UL; + + Directed_Vertex endpoint_; // The current end of the unitig through which farther extensions can be done. + // (The side for the extension is to be handled by the client code, although can + // also be inferred from the "directed" vertex.) + std::vector label_; // Literal label of the unitig. + std::vector hash_; // Hashes of the constituent vertices of the unitig. + + + // Clears the scratch data. + void clear(); + +public: + + // Constructs an empty unitig scratch. + Unitig_Scratch(); + + // Initializes the unitig scratch with the vertex `v`. + void init(const Directed_Vertex& v); + + // Extends the unitig scratch with the vertex `v`, and its literal form + // with the symbol `b`. + void extend(const Directed_Vertex& v, char b); + + // Reverse complements the label sequence (literal form) of the unitig. + void rev_compl_label(); + + // Returns the literal label of the unitig. + const std::vector& label() const; + + // Returns the hash collection of the unitig vertices. + const std::vector& hash() const; + + // Returns the current extension-end vertex of the unitig. + const Directed_Vertex& endpoint() const; + + // Returns the count of vertices in this unitig. 
+ std::size_t size() const; +}; + + +template +inline void Unitig_Scratch::clear() +{ + label_.clear(); + hash_.clear(); +} + + +template +inline void Unitig_Scratch::init(const Directed_Vertex& v) +{ + clear(); + + endpoint_ = v; + endpoint_.kmer().get_label(label_); + hash_.emplace_back(endpoint_.hash()); +} + + +template +inline void Unitig_Scratch::extend(const Directed_Vertex& v, const char b) +{ + endpoint_ = v; + + label_.emplace_back(b); + hash_.emplace_back(endpoint_.hash()); +} + + +template +inline void Unitig_Scratch::rev_compl_label() +{ + cuttlefish::reverse_complement(label_); +} + + +template +inline const std::vector& Unitig_Scratch::label() const +{ + return label_; +} + + +template +inline const std::vector& Unitig_Scratch::hash() const +{ + return hash_; +} + + +template +inline const Directed_Vertex& Unitig_Scratch::endpoint() const +{ + return endpoint_; +} + + +template +inline std::size_t Unitig_Scratch::size() const +{ + return hash_.size(); +} + + + +#endif diff --git a/src/Maximal_Unitig_Scratch.cpp b/src/Maximal_Unitig_Scratch.cpp new file mode 100644 index 00000000..82b5fb0c --- /dev/null +++ b/src/Maximal_Unitig_Scratch.cpp @@ -0,0 +1,12 @@ + +#include "Maximal_Unitig_Scratch.hpp" + + +template +Maximal_Unitig_Scratch::Maximal_Unitig_Scratch() +{} + + + +// Template instantiations for the required instances. +ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Maximal_Unitig_Scratch) diff --git a/src/Unitig_Scratch.cpp b/src/Unitig_Scratch.cpp new file mode 100644 index 00000000..fea449f6 --- /dev/null +++ b/src/Unitig_Scratch.cpp @@ -0,0 +1,15 @@ + +#include "Unitig_Scratch.hpp" + + +template +Unitig_Scratch::Unitig_Scratch() +{ + label_.reserve(BUFF_SZ + k - 1), + hash_.reserve(BUFF_SZ); +} + + + +// Template instantiations for the required instances. +ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Unitig_Scratch) From 295f4c93a28ff33177ec263131e10e41a8a70055 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 31 Oct 2021 00:30:30 -0400 Subject: [PATCH 245/350] Fix incomplete commit --- src/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6a1d8da9..52f572b6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -32,6 +32,8 @@ set(PROJECT_SRC Read_CdBG.cpp Read_CdBG_Constructor.cpp Read_CdBG_Extractor.cpp + Unitig_Scratch.cpp + Maximal_Unitig_Scratch.cpp Unipaths_Meta_info.cpp Detached_Cycles_Extractor.cpp dBG_Utilities.cpp From 627bda7cb5fb7ae1016824fe7c7c87b89a18c1fa Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 31 Oct 2021 12:37:30 -0400 Subject: [PATCH 246/350] Extract maximal unitigs from internal vertices --- include/Read_CdBG_Extractor.hpp | 106 +++++++++++++++++++++++++++++++- include/Unipaths_Meta_info.hpp | 24 ++++++++ src/Read_CdBG_Extractor.cpp | 40 ++++++++++++ src/Thread_Pool.cpp | 2 +- 4 files changed, 169 insertions(+), 3 deletions(-) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 21b323c6..bb4d28d8 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -3,10 +3,12 @@ #define READ_CDBG_EXTRACTOR_HPP - +// TODO: reduce header-inclusions throughout the entire headers-collection using forward decl. 
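In use, the scratch grows a unitig one vertex at a time: `init` seeds the label with the first k-mer and records its hash, and each `extend` appends one base and one hash (a later patch in this series also lets `extend` refuse a vertex that equals the anchor, flagging the unitig as a cycle). A toy, non-templated sketch of that bookkeeping follows, with plain strings and made-up integer hashes standing in for `Directed_Vertex` and the MPHF values.

#include <string>
#include <vector>
#include <cassert>
#include <cstdint>

struct Toy_Unitig_Scratch
{
    std::string anchor;             // k-mer the walk started from.
    std::string label;              // Literal label grown so far.
    std::vector<uint64_t> hash;     // One (mock) hash per constituent vertex.
    bool is_cycle = false;

    void init(const std::string& kmer, uint64_t h)
    {
        anchor = kmer;
        label = kmer;
        hash.assign(1, h);
        is_cycle = false;
    }

    // Returns `false` (and marks a cycle) if the walk has come back to the anchor.
    bool extend(const std::string& kmer, uint64_t h, char b)
    {
        if(kmer == anchor) { is_cycle = true; return false; }

        label.push_back(b);
        hash.push_back(h);
        return true;
    }
};

int main()
{
    Toy_Unitig_Scratch u;                   // k = 3 in this toy example.
    u.init("ACG", 11);
    assert(u.extend("CGT", 12, 'T'));       // ACG -> CGT; label becomes "ACGT".
    assert(u.extend("GTA", 13, 'A'));       // Label "ACGTA"; 3 vertices so far.
    assert(!u.extend("ACG", 11, 'G'));      // Back at the anchor: a cycle.
    assert(u.is_cycle && u.label == "ACGTA" && u.hash.size() == 3);
    return 0;
}
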
#include "globals.hpp" #include "Kmer_Hash_Table.hpp" #include "Directed_Vertex.hpp" +#include "Maximal_Unitig_Scratch.hpp" +#include "dBG_Utilities.hpp" #include "Build_Params.hpp" #include "Spin_Lock.hpp" #include "Async_Logger_Wrapper.hpp" @@ -40,7 +42,7 @@ class Read_CdBG_Extractor Output_Sink output_sink; // Sink for the output maximal unitigs. // TODO: give these limits more thoughts, especially their exact impact on the memory usage. - static constexpr std::size_t BUFF_SZ = 100 * 1024ULL; // 100 KB (soft limit) worth of maximal unitigs can be retained in memory, at most, before flushing. + static constexpr std::size_t BUFF_SZ = 100 * 1024ULL; // 100 KB (soft limit) worth of maximal unitig records (FASTA) can be retained in memory, at most, before flushing. static constexpr std::size_t SEQ_SZ = 1 * 1024ULL * 1024ULL; // 1 MB (soft limit) sized maximal unitig, at most, is constructed at a time. mutable uint64_t vertices_scanned = 0; // Total number of vertices scanned from the database. @@ -64,6 +66,11 @@ class Read_CdBG_Extractor // the corresponding unipath. void scan_vertices(Kmer_SPMC_Iterator* vertex_parser, uint16_t thread_id); + // Prcesses the vertices provided to the thread with id `thread_id` from the parser + // `vertex_parser`, i.e. for each vertex `v` provided to that thread, attempts to + // piece-wise construct its containing maximal unitig. + void process_vertices(Kmer_SPMC_Iterator* vertex_parser, uint16_t thread_id); + // Extracts the maximal unitig `p` that is flanked by the vertex `v_hat` and connects to `v_hat` // through its side `s_v_hat`. Returns `true` iff the extraction is successful, which happens when // the maximal unitig is encountered and attempted for output-marking _first_, by some thread. If @@ -73,9 +80,25 @@ class Read_CdBG_Extractor // `unipath` and `path_hashes` may contain partial form of the path, and `id` is unaltered. bool extract_maximal_unitig(const Kmer& v_hat, cuttlefish::side_t s_v_hat, uint64_t& id, std::vector& unipath, std::vector& path_hashes); + // Extracts the maximal unitig `p` that contains the vertex `v_hat`, and `maximal_unitig` is + // used as the working scratch for the extraction, i.e. to build and store the two unitigs + // connecting to the two sides of `v_hat`. Returns `true` iff the extraction is successful, + // which happens when `p` is attempted for output-marking first by this thread. + bool extract_maximal_unitig(const Kmer& v_hat, Maximal_Unitig_Scratch& maximal_unitig); + + // Traverses a unitig starting from the vertex `v_hat`, exiting it through the side `s_v_hat`. + // The DFA of `v_hat` is supposed to have the state `st_v`. `unitig` is used as the working + // scratch to build the unitig. Returns `true` iff the unitig could have been traversed + // maximally up-to its endpoint in the direction of the walk from `v_hat`, which is possible + // iff no other thread output-marks it in the meantime. + bool walk_unitig(const Kmer& v_hat, State_Read_Space st_v, cuttlefish::side_t s_v_hat, Unitig_Scratch& unitig); + // Marks all the vertices which have their hashes present in `path_hashes` as outputted. void mark_path(const std::vector& path_hashes); + // Marks all the vertices in the constituent unitigs of `maximal_unitig` as outputted. + void mark_maximal_unitig(const Maximal_Unitig_Scratch& maximal_unitig); + // Marks all the vertices that are present in the maximal unitigs of the graph with its vertex // set being present at the path prefix `vertex_db_path`. 
void mark_maximal_unitig_vertices(const std::string& vertex_db_path); @@ -255,5 +278,84 @@ inline void Read_CdBG_Extractor::mark_path(const std::vector& path_ } +template +inline void Read_CdBG_Extractor::mark_maximal_unitig(const Maximal_Unitig_Scratch& maximal_unitig) +{ + mark_path(maximal_unitig.unitig_hash(cuttlefish::side_t::back)); + mark_path(maximal_unitig.unitig_hash(cuttlefish::side_t::front)); +} + + +template +inline bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, Maximal_Unitig_Scratch& maximal_unitig) +{ + static constexpr cuttlefish::side_t back = cuttlefish::side_t::back; + static constexpr cuttlefish::side_t front = cuttlefish::side_t::front; + + + State_Read_Space state = hash_table[v_hat].state(); // State of the vertex `v_hat`. + if(state.is_outputted()) // The containing maximal unitig has already been outputted. + return false; + + + if( !walk_unitig(v_hat, state, back, maximal_unitig.unitig(back)) || + !walk_unitig(v_hat, state, front, maximal_unitig.unitig(front))) + return false; + + if(!mark_vertex(maximal_unitig.sign_vertex())) + return false; + + + maximal_unitig.finalize(); + + return true; +} + + +template +inline bool Read_CdBG_Extractor::walk_unitig(const Kmer& v_hat, const State_Read_Space st_v, const cuttlefish::side_t s_v_hat, Unitig_Scratch& unitig) +{ + // Data structures to be reused per each vertex extension of the unitig. + + cuttlefish::side_t s_v = s_v_hat; // The side of the current vertex `v_hat` through which to extend the unitig, i.e. exit `v_hat`. + Directed_Vertex v(s_v == cuttlefish::side_t::back ? v_hat : v_hat.reverse_complement(), hash_table); // Current vertex being added to the unitig. + State_Read_Space state = st_v; // State of the vertex `v`. + cuttlefish::edge_encoding_t e_v; // The potential next edge from `v` to include into the unitig. + cuttlefish::base_t b_ext; // The nucleobase corresponding to the edge `e_v` and the exiting side `s_v` from `v` to potentially add to the literal form of the unitig. + const Directed_Vertex anchor(v); // The anchor vertex where the unitig traversal starts from. + + unitig.init(v); // Initialize the unitig with the current vertex. + + + while(true) + { + if(state.is_outputted()) // The unitig has already been outputted earlier / in the meantime. + return false; + + e_v = state.edge_at(s_v); + if(cuttlefish::is_fuzzy_edge(e_v)) // Reached an endpoint. + break; + + b_ext = (s_v == cuttlefish::side_t::back ? DNA_Utility::map_base(e_v) : DNA_Utility::complement(DNA_Utility::map_base(e_v))); + v.roll_forward(b_ext, hash_table); + + if(v.is_same_vertex(anchor)) // The unitig is a DCC (Detached Chordless Cycle). + return false; // Temporary omission. TODO: include DCCs well. + + state = hash_table[v.hash()].state(); + s_v = v.entrance_side(); + if(state.is_branching_side(s_v)) // Crossed an endpoint and reached a different unitig. + break; + + // Still within the unitig. + unitig.extend(v, Kmer::map_char(b_ext)); + s_v = cuttlefish::opposite_side(s_v); + } + + + return true; +} + + #endif diff --git a/include/Unipaths_Meta_info.hpp b/include/Unipaths_Meta_info.hpp index df2199ad..535b6761 100644 --- a/include/Unipaths_Meta_info.hpp +++ b/include/Unipaths_Meta_info.hpp @@ -4,6 +4,7 @@ +#include "Maximal_Unitig_Scratch.hpp" #include "globals.hpp" #include @@ -36,6 +37,10 @@ class Unipaths_Meta_info template void add_maximal_unitig(const T_container_& unipath); + // Adds information of the maximal unitig at the scratch space `unipath_scratch` + // to the tracker. 
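The walk above can be pictured on a small example: starting from an internal vertex, one unitig is traversed out of its back side and one out of its front side, each stopping at an endpoint, and the two halves are stitched with the shared starting vertex counted once. The following is a compact sketch of that control flow on a hypothetical four-vertex path; plain strings and an explicit adjacency map stand in for the k-mers, DFA states and hash table, and concurrent output-marking is ignored.

#include <string>
#include <vector>
#include <map>
#include <utility>
#include <iostream>
#include <algorithm>

// Toy graph: for each vertex, its unique successor and predecessor on a linear path
// (an empty string means no unique neighbour, i.e. a unitig endpoint in this toy model).
static std::map<std::string, std::pair<std::string, std::string>> adj = {
    //  vertex     successor  predecessor
    { "A", { "B", ""  } },
    { "B", { "C", "A" } },
    { "C", { "D", "B" } },
    { "D", { "",  "C" } },
};

// Walks from `v` in one direction (0: successors, 1: predecessors) and returns the visited vertices.
static std::vector<std::string> walk(std::string v, int dir)
{
    std::vector<std::string> path{ v };
    while(true)
    {
        const std::string next = (dir == 0 ? adj[v].first : adj[v].second);
        if(next.empty())    // Reached an endpoint of the unitig.
            break;

        path.push_back(next);
        v = next;
    }

    return path;
}

int main()
{
    // Start from an internal vertex and stitch the two half-walks, as `extract_maximal_unitig` does.
    std::vector<std::string> back = walk("C", 0);       // C D
    std::vector<std::string> front = walk("C", 1);      // C B A
    std::reverse(front.begin(), front.end());           // A B C
    front.insert(front.end(), back.begin() + 1, back.end());   // A B C D, counting the shared vertex once.

    for(const std::string& v : front) std::cout << v << ' ';
    std::cout << '\n';                                   // Prints: A B C D
    return 0;
}
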
+ void add_maximal_unitig(const Maximal_Unitig_Scratch& unipath_scratch); + // Adds information of the DCC (Detached Chordless Cycle) `cycle` to the tracker. template void add_DCC(const T_container_& cycle); @@ -93,6 +98,25 @@ inline void Unipaths_Meta_info::add_maximal_unitig(const T_container_& unipat } +template +inline void Unipaths_Meta_info::add_maximal_unitig(const Maximal_Unitig_Scratch& maximal_unitig) +{ + unipath_count_++; + + const std::size_t vertex_count = maximal_unitig.size(); + const std::size_t unipath_size = vertex_count + (k - 1); + kmer_count_ += vertex_count; + + if(max_len_ < unipath_size) + max_len_ = unipath_size; + + if(min_len_ > unipath_size) + min_len_ = unipath_size; + + sum_len_ += unipath_size; +} + + template template inline void Unipaths_Meta_info::add_DCC(const T_container_& cycle) diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index b7e1c087..6116a660 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -138,6 +138,46 @@ void Read_CdBG_Extractor::scan_vertices(Kmer_SPMC_Iterator* const vertex_p } +template +void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const vertex_parser, const uint16_t thread_id) +{ + // Data structures to be reused per each vertex scanned. + Kmer v_hat; // The vertex copy to be scanned one-by-one. + Maximal_Unitig_Scratch maximal_unitig; // The scratch space to be used to construct the containing maximal unitig of `v_hat`. + + uint64_t vertex_count = 0; // Number of vertices scanned by this thread. + Unipaths_Meta_info extracted_unipaths_info; // Meta-information over the maximal unitigs extracted by this thread. + uint64_t progress = 0; // Number of vertices scanned by the thread; is reset at reaching 1% of its approximate workload. + + Character_Buffer output_buffer(output_sink.sink()); // The output buffer for maximal unitigs. + + + while(vertex_parser->tasks_expected(thread_id)) + if(vertex_parser->value_at(thread_id, v_hat)) + { + if(extract_maximal_unitig(v_hat, maximal_unitig)) + { + mark_maximal_unitig(maximal_unitig); + + extracted_unipaths_info.add_maximal_unitig(maximal_unitig); + output_buffer += maximal_unitig.fasta_rec(); + } + + vertex_count++; + progress_tracker.track_work(++progress); + } + + + // Aggregate the meta-information over the extracted maximal unitigs and the thread-executions. 
+ lock.lock(); + + vertices_scanned += vertex_count; + unipaths_meta_info_.aggregate(extracted_unipaths_info); + + lock.unlock(); +} + + template bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const cuttlefish::side_t s_v_hat, uint64_t& id, std::vector& unipath, std::vector& path_hashes) { diff --git a/src/Thread_Pool.cpp b/src/Thread_Pool.cpp index e31d0035..b6c2247a 100644 --- a/src/Thread_Pool.cpp +++ b/src/Thread_Pool.cpp @@ -104,7 +104,7 @@ void Thread_Pool::task(const uint16_t thread_id) { const Read_dBG_Compaction_Params& params = read_dBG_compaction_params[thread_id]; static_cast*>(dBG)-> - scan_vertices(static_cast*>(params.parser), params.thread_id); + process_vertices(static_cast*>(params.parser), params.thread_id); } break; From e7a0c352fb49875980b55f19a0086c573b888baf Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 31 Oct 2021 13:29:45 -0400 Subject: [PATCH 247/350] Delegate cycle-check to unitig scratch --- include/Read_CdBG_Extractor.hpp | 20 +++++++++++--------- include/Unitig_Scratch.hpp | 31 +++++++++++++++++++++++++++---- 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index bb4d28d8..c7ca5be1 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -298,16 +298,20 @@ inline bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, return false; - if( !walk_unitig(v_hat, state, back, maximal_unitig.unitig(back)) || - !walk_unitig(v_hat, state, front, maximal_unitig.unitig(front))) + if(!walk_unitig(v_hat, state, back, maximal_unitig.unitig(back))) return false; - if(!mark_vertex(maximal_unitig.sign_vertex())) + if(maximal_unitig.unitig(back).is_cycle()) + return false; // Temporary omission. TODO: include DCCs well. + + if(!walk_unitig(v_hat, state, front, maximal_unitig.unitig(front))) return false; - maximal_unitig.finalize(); + if(!mark_vertex(maximal_unitig.sign_vertex())) + return false; + maximal_unitig.finalize(); return true; } @@ -322,7 +326,6 @@ inline bool Read_CdBG_Extractor::walk_unitig(const Kmer& v_hat, const Stat State_Read_Space state = st_v; // State of the vertex `v`. cuttlefish::edge_encoding_t e_v; // The potential next edge from `v` to include into the unitig. cuttlefish::base_t b_ext; // The nucleobase corresponding to the edge `e_v` and the exiting side `s_v` from `v` to potentially add to the literal form of the unitig. - const Directed_Vertex anchor(v); // The anchor vertex where the unitig traversal starts from. unitig.init(v); // Initialize the unitig with the current vertex. @@ -339,16 +342,15 @@ inline bool Read_CdBG_Extractor::walk_unitig(const Kmer& v_hat, const Stat b_ext = (s_v == cuttlefish::side_t::back ? DNA_Utility::map_base(e_v) : DNA_Utility::complement(DNA_Utility::map_base(e_v))); v.roll_forward(b_ext, hash_table); - if(v.is_same_vertex(anchor)) // The unitig is a DCC (Detached Chordless Cycle). - return false; // Temporary omission. TODO: include DCCs well. - state = hash_table[v.hash()].state(); s_v = v.entrance_side(); if(state.is_branching_side(s_v)) // Crossed an endpoint and reached a different unitig. break; // Still within the unitig. - unitig.extend(v, Kmer::map_char(b_ext)); + if(!unitig.extend(v, Kmer::map_char(b_ext))) + break; // The unitig is a DCC (Detached Chordless Cycle). 
+ s_v = cuttlefish::opposite_side(s_v); } diff --git a/include/Unitig_Scratch.hpp b/include/Unitig_Scratch.hpp index 5263cb58..aa977c49 100644 --- a/include/Unitig_Scratch.hpp +++ b/include/Unitig_Scratch.hpp @@ -21,11 +21,13 @@ class Unitig_Scratch // 100K (soft limit) unitig vertices can be retained in memory, at most, before reallocations. static constexpr std::size_t BUFF_SZ = 100 * 1024UL; + Directed_Vertex anchor; // The anchor vertex of the unitig traversal. Directed_Vertex endpoint_; // The current end of the unitig through which farther extensions can be done. // (The side for the extension is to be handled by the client code, although can // also be inferred from the "directed" vertex.) std::vector label_; // Literal label of the unitig. std::vector hash_; // Hashes of the constituent vertices of the unitig. + bool is_cycle_; // Whether the unitig is cyclical or not. // Clears the scratch data. @@ -40,8 +42,9 @@ class Unitig_Scratch void init(const Directed_Vertex& v); // Extends the unitig scratch with the vertex `v`, and its literal form - // with the symbol `b`. - void extend(const Directed_Vertex& v, char b); + // with the symbol `b`. Returns `true` iff adding `v` to the unitig does + // not render itself a cycle. + bool extend(const Directed_Vertex& v, char b); // Reverse complements the label sequence (literal form) of the unitig. void rev_compl_label(); @@ -57,6 +60,9 @@ class Unitig_Scratch // Returns the count of vertices in this unitig. std::size_t size() const; + + // Returns `true` iff unitig is a cycle. + bool is_cycle() const; }; @@ -73,19 +79,29 @@ inline void Unitig_Scratch::init(const Directed_Vertex& v) { clear(); - endpoint_ = v; + endpoint_ = anchor = v; endpoint_.kmer().get_label(label_); hash_.emplace_back(endpoint_.hash()); + is_cycle_ = false; } template -inline void Unitig_Scratch::extend(const Directed_Vertex& v, const char b) +inline bool Unitig_Scratch::extend(const Directed_Vertex& v, const char b) { + if(v.is_same_vertex(anchor)) + { + is_cycle_ = true; + return false; + } + + endpoint_ = v; label_.emplace_back(b); hash_.emplace_back(endpoint_.hash()); + + return true; } @@ -124,5 +140,12 @@ inline std::size_t Unitig_Scratch::size() const } +template +inline bool Unitig_Scratch::is_cycle() const +{ + return is_cycle_; +} + + #endif From 861100a0859bc02543f0045432e26f58a330ba07 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 31 Oct 2021 22:37:23 -0400 Subject: [PATCH 248/350] Simultaneously extract DCCs with unipaths --- include/Maximal_Unitig_Scratch.hpp | 86 +++++++++++++++++++++++++++--- include/Read_CdBG_Extractor.hpp | 19 ++++--- include/Unitig_Scratch.hpp | 40 ++++++++++++-- src/Read_CdBG_Extractor.cpp | 3 +- 4 files changed, 130 insertions(+), 18 deletions(-) diff --git a/include/Maximal_Unitig_Scratch.hpp b/include/Maximal_Unitig_Scratch.hpp index 2dc447ca..42e67208 100644 --- a/include/Maximal_Unitig_Scratch.hpp +++ b/include/Maximal_Unitig_Scratch.hpp @@ -6,6 +6,7 @@ #include "Unitig_Scratch.hpp" #include "FASTA_Record.hpp" +#include "Character_Buffer.hpp" #include "globals.hpp" #include @@ -30,6 +31,8 @@ class Maximal_Unitig_Scratch uint64_t id_; // The unique ID of the maximal unitig. + Unitig_Scratch* cycle; // Pointer to either `u_b` or `u_f`, whichever contains the maximal unitig in case of it's a cycle. + // Returns whether the maximal unitig is in canonical form. bool is_canonical() const; @@ -47,9 +50,17 @@ class Maximal_Unitig_Scratch // Returns the unique ID of the maximal unitig. 
uint64_t id() const; + // Returns whether the maximal unitig is linear, i.e. it is a linear path + // and not a Detached Chordless Cycle (DCC) in the underlying graph. + bool is_linear() const; + // Returns the hashes of the vertices of the unitig at side `s`. const std::vector& unitig_hash(cuttlefish::side_t s) const; + // Returns the hashes of the vertices in the maximal unitig in case it's + // a DCC. + const std::vector& cycle_hash() const; + // Returns the count of vertices in the maximal unitig. std::size_t size() const; @@ -57,6 +68,13 @@ class Maximal_Unitig_Scratch // vertex in the canonical form of the unitig. const Directed_Vertex& sign_vertex() const; + // Marks the maximal unitig as linear, i.e not a DCC. + void mark_linear(); + + // Marks the maximal unitig as a DCC, and signals that the cycle has been + // extracted in the unitig scratch at side `s`. + void mark_cycle(cuttlefish::side_t s); + // Signals the scratch that the unitig pieces `u_b` and `u_f` are in their // final forms and will not be modified anymore. So it restructures the // maximal unitig so as to put its label in canonical form and sets its @@ -64,7 +82,11 @@ class Maximal_Unitig_Scratch void finalize(); // Returns a FASTA record of the maximal unitig (in canonical form). + // Applicable when the maximal unitig is linear. const FASTA_Record> fasta_rec() const; + + // Adds a corresponding FASTA record for the maximal unitig into `buffer`. + template void add_fasta_rec_to_buffer(Character_Buffer& buffer) const; }; @@ -82,6 +104,13 @@ inline bool Maximal_Unitig_Scratch::is_canonical() const } +template +inline bool Maximal_Unitig_Scratch::is_linear() const +{ + return cycle == nullptr; +} + + template inline uint64_t Maximal_Unitig_Scratch::id() const { @@ -89,6 +118,13 @@ inline uint64_t Maximal_Unitig_Scratch::id() const } +template +inline void Maximal_Unitig_Scratch::mark_linear() +{ + cycle = nullptr; +} + + template inline const std::vector& Maximal_Unitig_Scratch::unitig_hash(const cuttlefish::side_t s) const { @@ -96,29 +132,54 @@ inline const std::vector& Maximal_Unitig_Scratch::unitig_hash(const } +template +inline const std::vector& Maximal_Unitig_Scratch::cycle_hash() const +{ + return cycle->hash(); +} + + template inline std::size_t Maximal_Unitig_Scratch::size() const { - return unitig_back.size() + unitig_front.size() - 1; + return is_linear() ? (unitig_back.size() + unitig_front.size() - 1) : + cycle->size(); } template inline const Directed_Vertex& Maximal_Unitig_Scratch::sign_vertex() const { - return is_canonical() ? unitig_front.endpoint() : unitig_back.endpoint(); + return is_linear() ? (is_canonical() ? unitig_front.endpoint() : unitig_back.endpoint()) : + cycle->min_vertex(); +} + + +template +inline void Maximal_Unitig_Scratch::mark_cycle(const cuttlefish::side_t s) +{ + cycle = &(s == cuttlefish::side_t::back ? 
unitig_back : unitig_front); } template inline void Maximal_Unitig_Scratch::finalize() { - if(is_canonical()) - id_ = unitig_front.endpoint().hash(), - unitig_front.rev_compl_label(); + if(is_linear()) + { + if(is_canonical()) + id_ = unitig_front.endpoint().hash(), + unitig_front.reverse_complement(); + else + id_ = unitig_back.endpoint().hash(), + unitig_back.reverse_complement(); + } else - id_ = unitig_back.endpoint().hash(), - unitig_back.rev_compl_label(); + { + id_ = cycle->min_vertex().hash(); + if(!cycle->min_vertex().in_canonical_form()) + cycle->reverse_complement(); + } } @@ -131,5 +192,16 @@ inline const FASTA_Record> Maximal_Unitig_Scratch::fasta_re } +template +template +inline void Maximal_Unitig_Scratch::add_fasta_rec_to_buffer(Character_Buffer& buffer) const +{ + if(is_linear()) + buffer += fasta_rec(); + else + buffer.template rotate_append_cycle(FASTA_Record>(id(), cycle->label()), cycle->min_vertex_idx()); +} + + #endif diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index c7ca5be1..f64a4b39 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -281,8 +281,13 @@ inline void Read_CdBG_Extractor::mark_path(const std::vector& path_ template inline void Read_CdBG_Extractor::mark_maximal_unitig(const Maximal_Unitig_Scratch& maximal_unitig) { - mark_path(maximal_unitig.unitig_hash(cuttlefish::side_t::back)); - mark_path(maximal_unitig.unitig_hash(cuttlefish::side_t::front)); + if(maximal_unitig.is_linear()) + { + mark_path(maximal_unitig.unitig_hash(cuttlefish::side_t::back)); + mark_path(maximal_unitig.unitig_hash(cuttlefish::side_t::front)); + } + else + mark_path(maximal_unitig.cycle_hash()); } @@ -298,14 +303,16 @@ inline bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, return false; + maximal_unitig.mark_linear(); + if(!walk_unitig(v_hat, state, back, maximal_unitig.unitig(back))) return false; if(maximal_unitig.unitig(back).is_cycle()) - return false; // Temporary omission. TODO: include DCCs well. - - if(!walk_unitig(v_hat, state, front, maximal_unitig.unitig(front))) - return false; + maximal_unitig.mark_cycle(back); + else + if(!walk_unitig(v_hat, state, front, maximal_unitig.unitig(front))) + return false; if(!mark_vertex(maximal_unitig.sign_vertex())) diff --git a/include/Unitig_Scratch.hpp b/include/Unitig_Scratch.hpp index aa977c49..23d6157f 100644 --- a/include/Unitig_Scratch.hpp +++ b/include/Unitig_Scratch.hpp @@ -25,6 +25,10 @@ class Unitig_Scratch Directed_Vertex endpoint_; // The current end of the unitig through which farther extensions can be done. // (The side for the extension is to be handled by the client code, although can // also be inferred from the "directed" vertex.) + Directed_Vertex min_vertex_; // The lexicographically minimum vertex in the unitig. + std::size_t vertex_idx; // Index of the vertex in the path being traversed. + std::size_t min_v_idx; // Index of the lexicographically minimum vertex in the path. + std::vector label_; // Literal label of the unitig. std::vector hash_; // Hashes of the constituent vertices of the unitig. bool is_cycle_; // Whether the unitig is cyclical or not. @@ -46,8 +50,8 @@ class Unitig_Scratch // not render itself a cycle. bool extend(const Directed_Vertex& v, char b); - // Reverse complements the label sequence (literal form) of the unitig. - void rev_compl_label(); + // Reverse complements the unitig. + void reverse_complement(); // Returns the literal label of the unitig. 
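When the maximal unitig turns out to be a detached chordless cycle, the writer emits its label rotated so that the lexicographically minimum vertex comes first, which is what `rotate_append_cycle` consumes via `min_vertex_idx()`. A small sketch of that rotation on a plain string with a hypothetical pivot follows; it assumes, as the extractor produces, that the cycle's label ends with a repeat of its first (k - 1) symbols.

#include <cstddef>
#include <string>
#include <iostream>

// Rotates the label of a cyclic unitig so that the vertex at index `pivot` comes first.
// Mirrors the two-part copy done by `FASTA_Record::append_rotated_cycle`.
static std::string rotate_cycle(const std::string& label, std::size_t pivot, std::size_t k)
{
    return label.substr(pivot) + label.substr(k - 1, pivot);
}

int main()
{
    const std::size_t k = 3;
    // Hypothetical 4-vertex cycle (k = 3): vertices ACG, CGT, GTA, TAC; the label wraps with "AC".
    const std::string label = "ACGTAC";

    std::cout << rotate_cycle(label, 0, k) << '\n';   // ACGTAC (already starts at vertex 0).
    std::cout << rotate_cycle(label, 2, k) << '\n';   // GTACGT (starts at vertex 2: GTA).
    return 0;
}
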
const std::vector& label() const; @@ -63,6 +67,12 @@ class Unitig_Scratch // Returns `true` iff unitig is a cycle. bool is_cycle() const; + + // Returns the lexicographically minimum vertex in the unitig. + const Directed_Vertex& min_vertex() const; + + // Returns the index of the lexicographically minimum vertex in the unitig. + std::size_t min_vertex_idx() const; }; @@ -79,7 +89,9 @@ inline void Unitig_Scratch::init(const Directed_Vertex& v) { clear(); - endpoint_ = anchor = v; + min_vertex_ = endpoint_ = anchor = v; + min_v_idx = vertex_idx = 0; + endpoint_.kmer().get_label(label_); hash_.emplace_back(endpoint_.hash()); is_cycle_ = false; @@ -97,6 +109,11 @@ inline bool Unitig_Scratch::extend(const Directed_Vertex& v, const char b) endpoint_ = v; + vertex_idx++; + + if(min_vertex_.canonical() > endpoint_.canonical()) + min_vertex_ = endpoint_, + min_v_idx = vertex_idx; label_.emplace_back(b); hash_.emplace_back(endpoint_.hash()); @@ -106,9 +123,10 @@ inline bool Unitig_Scratch::extend(const Directed_Vertex& v, const char b) template -inline void Unitig_Scratch::rev_compl_label() +inline void Unitig_Scratch::reverse_complement() { cuttlefish::reverse_complement(label_); + min_v_idx = (hash_.size() - 1 - min_v_idx); } @@ -147,5 +165,19 @@ inline bool Unitig_Scratch::is_cycle() const } +template +inline const Directed_Vertex& Unitig_Scratch::min_vertex() const +{ + return min_vertex_; +} + + +template +inline std::size_t Unitig_Scratch::min_vertex_idx() const +{ + return min_v_idx; +} + + #endif diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 6116a660..8b44711e 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -160,7 +160,8 @@ void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const verte mark_maximal_unitig(maximal_unitig); extracted_unipaths_info.add_maximal_unitig(maximal_unitig); - output_buffer += maximal_unitig.fasta_rec(); + // output_buffer += maximal_unitig.fasta_rec(); + maximal_unitig.add_fasta_rec_to_buffer(output_buffer); } vertex_count++; From f9418a7c4e1ad4fbcd85b263d658def82eb5c707 Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 2 Nov 2021 12:36:45 -0400 Subject: [PATCH 249/350] Allow arbitrary edge updates not just the earlier restricted set (premature optimization) --- include/State_Read_Space.hpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/include/State_Read_Space.hpp b/include/State_Read_Space.hpp index 7c29c77f..92eb03a3 100644 --- a/include/State_Read_Space.hpp +++ b/include/State_Read_Space.hpp @@ -44,11 +44,9 @@ class State_Read_Space State_Read_Space(cuttlefish::state_code_t code); // Sets the back-encoding of the state to the `Extended_Base`-encoding `edge`. - // Requirement: except while for setting `Extended_Base::N`, the bits must be zero beforehand. void set_back_encoding(cuttlefish::edge_encoding_t edge); // Sets the front-encoding of the state to the `Extended_Base`-encoding `edge`. - // Requirement: except while for setting `Extended_Base::N`, the bits must be zero beforehand. void set_front_encoding(cuttlefish::edge_encoding_t edge); @@ -72,8 +70,7 @@ class State_Read_Space bool is_branching_side(cuttlefish::side_t side) const; // Updates the `Extended_Base` encoding of the side `side` of this state, with - // `edge`. For optimization purposes, only certain edge-updates have defined - // behavior: empty-to-rest and unique-to-multi. + // `edge`. 
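+    // (Arbitrary overwrites are safe now: `set_back_encoding` and `set_front_encoding` below first
+    // clear the target side's bits, keeping only the opposite side's mask, before OR-ing in the new value.)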
void update_edge_at(cuttlefish::side_t side, cuttlefish::edge_encoding_t edge); // Marks the state as already been outputted. @@ -99,13 +96,13 @@ inline State_Read_Space::State_Read_Space(const cuttlefish::state_code_t code): inline void State_Read_Space::set_back_encoding(cuttlefish::edge_encoding_t edge) { - code |= (static_cast(edge) << BACK_IDX); + code = (code & FRONT_MASK) | (static_cast(edge) << BACK_IDX); } inline void State_Read_Space::set_front_encoding(cuttlefish::edge_encoding_t edge) { - code |= (static_cast(edge) << FRONT_IDX); + code = (code & BACK_MASK) | (static_cast(edge) << FRONT_IDX); } From 10f1a24cc274261302a1b92b19725e43ffdbcaae Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 2 Nov 2021 18:30:10 -0400 Subject: [PATCH 250/350] Track more info w/ output-marked states "branching-ness" of the sides --- include/DNA.hpp | 8 +++--- include/Kmer_Hash_Table.hpp | 13 ++++++++++ include/Read_CdBG_Extractor.hpp | 2 +- include/State_Read_Space.hpp | 46 +++++++++++++++++++++------------ src/State_Read_Space.cpp | 3 --- 5 files changed, 48 insertions(+), 24 deletions(-) diff --git a/include/DNA.hpp b/include/DNA.hpp index 4a373d78..e6c6898e 100644 --- a/include/DNA.hpp +++ b/include/DNA.hpp @@ -24,10 +24,8 @@ namespace DNA }; - // E = 0, A = 1, C = 2, G = 3, T = 4, N = 7. - // Implementation of the state class for the read de Bruijn graph uses intricacies - // of this mapping (and the underlying transition function of the DFA) heavily. Do - // not alter the mapping without updating the state-class. + // E = 0, A = 1, C = 2, G = 3, T = 4, N = 7; + // O/P and — non-branching = 5, branching = 6. enum class Extended_Base: uint8_t { E = 0b000, // 0 @@ -36,6 +34,8 @@ namespace DNA G = 0b011, // 3 T = 0b100, // 4 N = 0b111, // 7 + OP_non_branch = 0b101, // 5 + OP_branching = 0b110, // 6 }; } diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index f88a5a17..b6fc8dca 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -151,6 +151,10 @@ class Kmer_Hash_Table // `bucket_id` with the state-value `state`. void update(uint64_t bucket_id, const State_Read_Space& state); + // Transforms the state-entry in the hash-table that's at the bucket with ID + // `bucket_id` through the function `transform`. + void update(uint64_t bucket_id, cuttlefish::state_code_t (*transform)(cuttlefish::state_code_t)); + // Returns the number of keys in the hash table. 
uint64_t size() const; @@ -259,6 +263,15 @@ inline void Kmer_Hash_Table::update(const uint64_t bucket_id, c } +template +inline void Kmer_Hash_Table::update(const uint64_t bucket_id, cuttlefish::state_code_t (* const transform)(cuttlefish::state_code_t)) +{ + sparse_lock.lock(bucket_id); + hash_table[bucket_id] = transform(hash_table[bucket_id]); + sparse_lock.unlock(bucket_id); +} + + template inline uint64_t Kmer_Hash_Table::size() const { diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index f64a4b39..be828168 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -274,7 +274,7 @@ template inline void Read_CdBG_Extractor::mark_path(const std::vector& path_hashes) { for(const uint64_t hash: path_hashes) - hash_table.update(hash, State_Read_Space::get_outputted_state()); + hash_table.update(hash, State_Read_Space::mark_outputted); } diff --git a/include/State_Read_Space.hpp b/include/State_Read_Space.hpp index 92eb03a3..c8a2bb23 100644 --- a/include/State_Read_Space.hpp +++ b/include/State_Read_Space.hpp @@ -32,13 +32,6 @@ class State_Read_Space // Bitmask used to extract the 'Extended_Base`-encoding of the edge(s) incident to the back side of a vertex. static constexpr cuttlefish::state_code_t BACK_MASK = SIDE_MASK << BACK_IDX; - // State code for vertices that have been outputted. - // TODO: Use a well-thought-out value as the marker. - static constexpr cuttlefish::state_code_t OUTPUTTED = static_cast((0b101 << FRONT_IDX) | 0b101 << BACK_IDX); - - // State for the vertices that have been outputted. - static const State_Read_Space outputted_state; - // Constructs a state that wraps the provided numeric value `code`. State_Read_Space(cuttlefish::state_code_t code); @@ -81,6 +74,11 @@ class State_Read_Space // Returns the state for the vertices that have been marked as outputted. static const State_Read_Space& get_outputted_state(); + + // For the given code `code` of some state `s`, returns the code of the + // state `s_op` which is the corresponding state where the vertices having + // the DFA state `s` in the underlying graph transition to when outputted. + static cuttlefish::state_code_t mark_outputted(cuttlefish::state_code_t code); }; @@ -112,12 +110,6 @@ inline cuttlefish::state_code_t State_Read_Space::get_state() const } -inline bool State_Read_Space::is_outputted() const -{ - return code == OUTPUTTED; -} - - inline cuttlefish::edge_encoding_t State_Read_Space::edge_at(const cuttlefish::side_t side) const { return static_cast(side == cuttlefish::side_t::front ? (code & FRONT_MASK) >> FRONT_IDX : (code & BACK_MASK) >> BACK_IDX); @@ -138,7 +130,26 @@ inline void State_Read_Space::update_edge_at(const cuttlefish::side_t side, cons inline void State_Read_Space::mark_outputted() { - code = OUTPUTTED; + static constexpr cuttlefish::edge_encoding_t OP_non_branch = cuttlefish::edge_encoding_t::OP_non_branch; + static constexpr cuttlefish::edge_encoding_t OP_branching = cuttlefish::edge_encoding_t::OP_branching; + + if(!is_outputted()) + { + set_back_encoding(is_branching_side(cuttlefish::side_t::back) ? OP_branching : OP_non_branch); + set_front_encoding(is_branching_side(cuttlefish::side_t::front) ? 
OP_branching : OP_non_branch); + } +} + + +inline bool State_Read_Space::is_outputted() const +{ + static constexpr uint8_t OP_non_branch = static_cast(cuttlefish::edge_encoding_t::OP_non_branch); + static constexpr uint8_t OP_branching = static_cast(cuttlefish::edge_encoding_t::OP_branching); + + return code == ((OP_non_branch << FRONT_IDX) | (OP_non_branch << BACK_IDX)) || + code == ((OP_non_branch << FRONT_IDX) | (OP_branching << BACK_IDX)) || + code == ((OP_branching << FRONT_IDX) | (OP_non_branch << BACK_IDX)) || + code == ((OP_branching << FRONT_IDX) | (OP_branching << BACK_IDX)); } @@ -148,9 +159,12 @@ inline bool State_Read_Space::operator==(const State_Read_Space& rhs) const } -inline const State_Read_Space& State_Read_Space::get_outputted_state() +inline cuttlefish::state_code_t State_Read_Space::mark_outputted(const cuttlefish::state_code_t code) { - return outputted_state; + State_Read_Space state(code); + state.mark_outputted(); + + return state.get_state(); } diff --git a/src/State_Read_Space.cpp b/src/State_Read_Space.cpp index e4edc32a..486b54eb 100644 --- a/src/State_Read_Space.cpp +++ b/src/State_Read_Space.cpp @@ -1,5 +1,2 @@ #include "State_Read_Space.hpp" - - -const State_Read_Space State_Read_Space::outputted_state(State_Read_Space::OUTPUTTED); From 1941018593537a28b8048ee4caab9d5e7e3770cd Mon Sep 17 00:00:00 2001 From: Juri Kuronen Date: Wed, 3 Nov 2021 14:37:49 +0200 Subject: [PATCH 251/350] Fix GFA1 path overlaps --- src/CdBG_GFA_Writer.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/CdBG_GFA_Writer.cpp b/src/CdBG_GFA_Writer.cpp index 0513c046..26df89de 100644 --- a/src/CdBG_GFA_Writer.cpp +++ b/src/CdBG_GFA_Writer.cpp @@ -591,7 +591,7 @@ void CdBG::append_link_to_path(const uint16_t thread_id, const Oriented_Uniti p_buffer += (right_unitig.dir == cuttlefish::FWD ? "+" : "-"); std::string& o_buffer = overlap_buffer[thread_id]; - o_buffer += ","; + if (!o_buffer.empty()) o_buffer += ","; o_buffer += fmt::format_int(right_unitig.start_kmer_idx == left_unitig.end_kmer_idx + 1 ? k - 1 : 0).c_str(); o_buffer += "M"; @@ -797,10 +797,6 @@ void CdBG::write_gfa_path(const std::string& path_name) output << "*"; // Write an empty CIGAR string at the 'Overlaps' field. else { - // The first overlap of the path (not inferrable from the path output files). - const uint16_t overlap = (right_unitig.start_kmer_idx == left_unitig.end_kmer_idx + 1 ? k - 1 : 0); - output << overlap << "M"; - // Copy the thread-specific overlap output file contents to the GFA output file. for(uint16_t t_id = 0; t_id < thread_count; ++t_id) { From 956fe7643cdd089a51bf1ae35731bde3f8201f69 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 3 Nov 2021 17:04:31 -0400 Subject: [PATCH 252/350] Skip propagation of edge-discard heuristic --- include/Read_CdBG_Constructor.hpp | 87 +++++++++++++++++++++++++++++++ include/Read_CdBG_Extractor.hpp | 11 ++-- include/State_Read_Space.hpp | 12 ++++- src/Read_CdBG_Constructor.cpp | 12 +++++ 4 files changed, 117 insertions(+), 5 deletions(-) diff --git a/include/Read_CdBG_Constructor.hpp b/include/Read_CdBG_Constructor.hpp index 8cef4d9b..f1c55104 100644 --- a/include/Read_CdBG_Constructor.hpp +++ b/include/Read_CdBG_Constructor.hpp @@ -52,12 +52,22 @@ class Read_CdBG_Constructor // attempted state transition failed. 
bool add_incident_edge(const Endpoint& endpoint, cuttlefish::edge_encoding_t& e_old, cuttlefish::edge_encoding_t& e_new); + // Adds the information of an incident edge `e` to the side `s` of some vertex `v`, all wrapped + // inside the edge-endpoint object `endpoint` — making the appropriate state transitions for the + // DFA of `v`. Returns `false` iff an attempted state transition failed. + bool add_incident_edge(const Endpoint& endpoint); + // Adds the information of an incident loop that connects the two different endpoints of some // vertex `v`, wrapped inside the edge-endpoint object `endpoint` — making the appropriate state // transition for the DFA of `v`. Also stores the edge encodings of the incidence information of // the front and the back sides before this addition, in `e_front` and `e_back` respectively. // Returns `false` iff an attempted state transition failed. bool add_crossing_loop(const Endpoint& endpoint, cuttlefish::edge_encoding_t& e_front, cuttlefish::edge_encoding_t& e_back); + + // Adds the information of an incident loop that connects the two different endpoints of some + // vertex `v`, wrapped inside the edge-endpoint object `endpoint` — making the appropriate state + // transition for the DFA of `v`. Returns `false` iff an attempted state transition failed. + bool add_crossing_loop(const Endpoint& endpoint); // Adds the information of an incident loop for some vertex `v` that connects its side `s` to // the side itself, all wrapped inside the edge-endpoint object `endpoint` — making the @@ -66,6 +76,12 @@ class Read_CdBG_Constructor // state transition failed. bool add_one_sided_loop(const Endpoint& endpoint, cuttlefish::edge_encoding_t& e_old); + // Adds the information of an incident loop for some vertex `v` that connects its side `s` to + // the side itself, all wrapped inside the edge-endpoint object `endpoint` — making the + // appropriate state transition for the DFA of `v`. Returns `false` iff an attempted state + // transition failed. + bool add_one_sided_loop(const Endpoint& endpoint); + // If the endpoint object `v_end` connects to some neighboring endpoint `w_end` through a unique // edge encoded with `e`, then discards the incidence information of `w_end` — making the // appropriate state transition for the corresponding neighboring vertex `w`. @@ -135,6 +151,38 @@ inline bool Read_CdBG_Constructor::add_incident_edge(const Endpoint& endpo } +template +inline bool Read_CdBG_Constructor::add_incident_edge(const Endpoint& endpoint) +{ + // Fetch the hash table entry for the vertex associated to the endpoint. + + Kmer_Hash_Entry_API bucket = hash_table[endpoint.hash()]; + State_Read_Space& state = bucket.get_state(); + const cuttlefish::edge_encoding_t e_curr = state.edge_at(endpoint.side()); + + // If we've already discarded the incidence information for this side, then a self-transition happens. + if(e_curr == cuttlefish::edge_encoding_t::N) + return true; // The side has already been determined to be branching—nothing to update here anymore. + + cuttlefish::edge_encoding_t e_new = endpoint.edge(); + if(e_curr != cuttlefish::edge_encoding_t::E) // The side is not empty. 
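+    // (Sketch of the per-side transitions: an empty side (`E`) takes on the encoding of its first
+    // observed edge; seeing that same edge again changes nothing; seeing a different edge collapses
+    // the side to `N`, marking it as branching.)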
+ { + // We can get away without updating the same value again, because — (1) even if this DFA's state changes + // in the hash table by the time this method completes, making no updates at this point is theoretically + // equivalent to returning instantaneously as soon as the hash table value had been read; and also (2) the + // ordering of the edges processed does not matter in the algorithm. + if(e_new == e_curr) + return true; + + // This side has been visited earlier, but with a different edge—discard the incidence information. + e_new = cuttlefish::edge_encoding_t::N; + } + + state.update_edge_at(endpoint.side(), e_new); + return hash_table.update(bucket); +} + + template inline bool Read_CdBG_Constructor::add_crossing_loop(const Endpoint& endpoint, cuttlefish::edge_encoding_t& e_front, cuttlefish::edge_encoding_t& e_back) { @@ -158,6 +206,27 @@ inline bool Read_CdBG_Constructor::add_crossing_loop(const Endpoint& endpo } +template +inline bool Read_CdBG_Constructor::add_crossing_loop(const Endpoint& endpoint) +{ + // Fetch the hash table entry for the DFA of vertex associated to the endpoint. + + Kmer_Hash_Entry_API bucket = hash_table[endpoint.hash()]; + State_Read_Space& state = bucket.get_state(); + + const State_Read_Space state_curr = state; + + if(state.edge_at(cuttlefish::side_t::front) != cuttlefish::edge_encoding_t::N) // Discard the front-incidence information, if not done already. + state.update_edge_at(cuttlefish::side_t::front, cuttlefish::edge_encoding_t::N); + + if(state.edge_at(cuttlefish::side_t::back) != cuttlefish::edge_encoding_t::N) // Discard the back-incidence information, if not done already. + state.update_edge_at(cuttlefish::side_t::back, cuttlefish::edge_encoding_t::N); + + // We can get away without updating the same value again: see detailed comment in `add_incident_edge`. + return state == state_curr ? true : hash_table.update(bucket); +} + + template inline bool Read_CdBG_Constructor::add_one_sided_loop(const Endpoint& endpoint, cuttlefish::edge_encoding_t& e_old) { @@ -177,6 +246,24 @@ inline bool Read_CdBG_Constructor::add_one_sided_loop(const Endpoint& endp } +template +inline bool Read_CdBG_Constructor::add_one_sided_loop(const Endpoint& endpoint) +{ + // Fetch the hash table entry for the vertex associated to the endpoint. + + Kmer_Hash_Entry_API bucket = hash_table[endpoint.hash()]; + State_Read_Space& state = bucket.get_state(); + + // We can get away without updating the same value again: see detailed comment in `add_incident_edge`. + if(state.edge_at(endpoint.side()) == cuttlefish::edge_encoding_t::N) // The incidence information has already been discarded. + return true; + + // Discard the incidence information. + state.update_edge_at(endpoint.side(), cuttlefish::edge_encoding_t::N); + return hash_table.update(bucket); +} + + template inline void Read_CdBG_Constructor::propagate_discard(const Endpoint& v_end, const cuttlefish::edge_encoding_t e) { diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index be828168..b385242f 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -339,18 +339,21 @@ inline bool Read_CdBG_Extractor::walk_unitig(const Kmer& v_hat, const Stat while(true) { - if(state.is_outputted()) // The unitig has already been outputted earlier / in the meantime. - return false; - e_v = state.edge_at(s_v); if(cuttlefish::is_fuzzy_edge(e_v)) // Reached an endpoint. break; b_ext = (s_v == cuttlefish::side_t::back ? 
DNA_Utility::map_base(e_v) : DNA_Utility::complement(DNA_Utility::map_base(e_v))); - v.roll_forward(b_ext, hash_table); + v.roll_forward(b_ext, hash_table); // Walk to the next vertex. state = hash_table[v.hash()].state(); s_v = v.entrance_side(); + + if(state.is_outputted()) + return state.was_branching_side(s_v); // If `s_v` was a branching side, then the walk just crossed to a different unitig; + // so this unitig is depleted. Otherwise, `s_v` must belong to this unitig. In that + // case, the unitig has already been outputted earlier. + if(state.is_branching_side(s_v)) // Crossed an endpoint and reached a different unitig. break; diff --git a/include/State_Read_Space.hpp b/include/State_Read_Space.hpp index c8a2bb23..5fe312ab 100644 --- a/include/State_Read_Space.hpp +++ b/include/State_Read_Space.hpp @@ -59,9 +59,13 @@ class State_Read_Space cuttlefish::edge_encoding_t edge_at(cuttlefish::side_t side) const; // Returns `true` iff some vertex having this state is branching (i.e. has - // multiple incident edges) at its side `side`. + // multiple incident edges) at its side `side`, and hasn't been outputted yet. bool is_branching_side(cuttlefish::side_t side) const; + // Returns `true` iff some vertex having this state is branching (i.e. has + // multiple incident edges) at its side `side`, and has already been outputted. + bool was_branching_side(cuttlefish::side_t side) const; + // Updates the `Extended_Base` encoding of the side `side` of this state, with // `edge`. void update_edge_at(cuttlefish::side_t side, cuttlefish::edge_encoding_t edge); @@ -122,6 +126,12 @@ inline bool State_Read_Space::is_branching_side(const cuttlefish::side_t side) c } +inline bool State_Read_Space::was_branching_side(const cuttlefish::side_t side) const +{ + return edge_at(side) == cuttlefish::edge_encoding_t::OP_branching; +} + + inline void State_Read_Space::update_edge_at(const cuttlefish::side_t side, const cuttlefish::edge_encoding_t edge) { side == cuttlefish::side_t::front ? set_front_encoding(edge) : set_back_encoding(edge); diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index fb19e003..cb2dafb0 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -82,9 +82,11 @@ void Read_CdBG_Constructor::process_edges(Kmer_SPMC_Iterator* const ed { // Data locations to be reused per each edge processed. Edge e; // For the edges to be processed one-by-one. +/* cuttlefish::edge_encoding_t e_front, e_back; // Edges incident to the front and to the back of a vertex with a crossing loop. cuttlefish::edge_encoding_t e_u_old, e_u_new; // Edges incident to some particular side of a vertex `u`, before and after the addition of a new edge. cuttlefish::edge_encoding_t e_v_old, e_v_new; // Edges incident to some particular side of a vertex `v`, before and after the addition of a new edge. +*/ uint64_t edge_count = 0; // Number of edges processed by this thread. uint64_t progress = 0; // Number of edges processed by the thread; is reset at reaching 1% of its approximate workload. @@ -97,19 +99,28 @@ void Read_CdBG_Constructor::process_edges(Kmer_SPMC_Iterator* const ed if(e.is_loop()) if(e.u().side() != e.v().side()) // It is a crossing loop. { + while(!add_crossing_loop(e.u())); +/* while(!add_crossing_loop(e.u(), e_front, e_back)); propagate_discard(e.u(), e.u().side() == cuttlefish::side_t::front ? e_front : e_back); propagate_discard(e.v(), e.v().side() == cuttlefish::side_t::front ? e_front : e_back); +*/ } else // A one-sided loop. 
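            // (The `propagate_discard` calls in these branches remain only as commented-out
            // reference; the edge-discard propagation heuristic is skipped in this version.)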
{ + while(!add_one_sided_loop(e.u())); +/* while(!add_one_sided_loop(e.u(), e_u_old)); propagate_discard(e.u(), e_u_old); +*/ } else // It connects two endpoints `u` and `v` of two distinct vertex. { + while(!add_incident_edge(e.u())); + while(!add_incident_edge(e.v())); +/* while(!add_incident_edge(e.u(), e_u_old, e_u_new)); while(!add_incident_edge(e.v(), e_v_old, e_v_new)); @@ -118,6 +129,7 @@ void Read_CdBG_Constructor::process_edges(Kmer_SPMC_Iterator* const ed if(e_v_new == cuttlefish::edge_encoding_t::N) propagate_discard(e.v(), e.u(), e_v_old); +*/ } edge_count++; From 49ab64ddaf1270e9c82679445fd8fc49cdbf6222 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 3 Nov 2021 20:18:39 -0400 Subject: [PATCH 253/350] Separate (to be) obsolete code --- include/Read_CdBG.hpp | 34 ++-- include/Read_CdBG_Constructor.hpp | 187 ++++------------- include/Read_CdBG_Extractor.hpp | 142 +++++-------- src/CMakeLists.txt | 3 + .../Read_CdBG_Constructor_Obsolete.cpp | 128 ++++++++++++ src/Obsolete/Read_CdBG_Extractor_Obsolete.cpp | 192 ++++++++++++++++++ src/Obsolete/Read_CdBG_Obsolete.cpp | 75 +++++++ src/Read_CdBG.cpp | 66 ------ src/Read_CdBG_Extractor.cpp | 135 ------------ 9 files changed, 504 insertions(+), 458 deletions(-) create mode 100644 src/Obsolete/Read_CdBG_Constructor_Obsolete.cpp create mode 100644 src/Obsolete/Read_CdBG_Extractor_Obsolete.cpp create mode 100644 src/Obsolete/Read_CdBG_Obsolete.cpp diff --git a/include/Read_CdBG.hpp b/include/Read_CdBG.hpp index e276511e..c43c270a 100644 --- a/include/Read_CdBG.hpp +++ b/include/Read_CdBG.hpp @@ -45,13 +45,6 @@ class Read_CdBG // Extracts the maximal unitigs from the graph. void extract_maximal_unitigs(); - // Extracts the detached chordless cycles of the graph and appends the output to the - // output file at path `output_file_path`. Specifying `rerun` implies that the graph - // has been compacted earlier in a separate run of Cuttlefish; otherwise it's done - // in this same run. Returns `true` iff either there is no DCC in the graph, or the - // DCCs have already been extracted earlier. - bool extract_DCCs(const std::string& output_file_path, bool rerun = false); - // Returns the path prefix to the edge database being used by Cuttlefish. const std::string edge_db_path() const; @@ -63,14 +56,6 @@ class Read_CdBG // NB: only the existence of the output meta-info file is checked for this purpose. bool is_constructed() const; - // Returns `true` iff the graph contains detached chordless cycles and the current - // execution is configured to extract those in this same run. - bool extract_DCCs_this_run() const; - - // Returns `true` iff the data structures required for DCC-extraction is present - // from an earlier execution of the algorithm. - bool DCC_data_structs_exist() const; - public: @@ -85,6 +70,25 @@ class Read_CdBG // Constructs the compacted read de Bruijn graph, employing the parameters received // with the object-constructor. void construct(); + + +// The following stuffs are not used anymore with the current algorithm. +private: + + // Extracts the detached chordless cycles of the graph and appends the output to the + // output file at path `output_file_path`. Specifying `rerun` implies that the graph + // has been compacted earlier in a separate run of Cuttlefish; otherwise it's done + // in this same run. Returns `true` iff either there is no DCC in the graph, or the + // DCCs have already been extracted earlier. 
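+    // (Not invoked by the current pipeline: DCCs are now emitted directly during maximal-unitig
+    // extraction, through the cycle-handling path of `Maximal_Unitig_Scratch`.)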
+ bool extract_DCCs(const std::string& output_file_path, bool rerun = false); + + // Returns `true` iff the graph contains detached chordless cycles and the current + // execution is configured to extract those in this same run. + bool extract_DCCs_this_run() const; + + // Returns `true` iff the data structures required for DCC-extraction is present + // from an earlier execution of the algorithm. + bool DCC_data_structs_exist() const; }; diff --git a/include/Read_CdBG_Constructor.hpp b/include/Read_CdBG_Constructor.hpp index f1c55104..b491f866 100644 --- a/include/Read_CdBG_Constructor.hpp +++ b/include/Read_CdBG_Constructor.hpp @@ -45,6 +45,42 @@ class Read_CdBG_Constructor // `(u, v)` provided to that thread. void process_edges(Kmer_SPMC_Iterator* edge_parser, uint16_t thread_id); + // Adds the information of an incident edge `e` to the side `s` of some vertex `v`, all wrapped + // inside the edge-endpoint object `endpoint` — making the appropriate state transitions for the + // DFA of `v`. Returns `false` iff an attempted state transition failed. + bool add_incident_edge(const Endpoint& endpoint); + + // Adds the information of an incident loop that connects the two different endpoints of some + // vertex `v`, wrapped inside the edge-endpoint object `endpoint` — making the appropriate state + // transition for the DFA of `v`. Returns `false` iff an attempted state transition failed. + bool add_crossing_loop(const Endpoint& endpoint); + + // Adds the information of an incident loop for some vertex `v` that connects its side `s` to + // the side itself, all wrapped inside the edge-endpoint object `endpoint` — making the + // appropriate state transition for the DFA of `v`. Returns `false` iff an attempted state + // transition failed. + bool add_one_sided_loop(const Endpoint& endpoint); + + +public: + + // Consructs a read-CdBG builder object, with the required parameters wrapped in `params`, and uses + // the Cuttlefish hash table `hash_table`. + Read_CdBG_Constructor(const Build_Params& params, Kmer_Hash_Table& hash_table); + + // Computes the states of the DFA in the de Bruijn graph with the edge set at path prefix `edge_db_path`. + void compute_DFA_states(const std::string& edge_db_path); + + // Returns the number of distinct vertices in the underlying graph. + uint64_t vertex_count() const; + + // Returns the number of distinct edges in the underlying graph. + uint64_t edge_count() const; + + +// The following methods are not used anymore with the current algorithm. +private: + // Adds the information of an incident edge `e` to the side `s` of some vertex `v`, all wrapped // inside the edge-endpoint object `endpoint` — making the appropriate state transitions for the // DFA of `v`. Also stores the edge encodings of the incidence information of the side `s` before @@ -52,22 +88,12 @@ class Read_CdBG_Constructor // attempted state transition failed. bool add_incident_edge(const Endpoint& endpoint, cuttlefish::edge_encoding_t& e_old, cuttlefish::edge_encoding_t& e_new); - // Adds the information of an incident edge `e` to the side `s` of some vertex `v`, all wrapped - // inside the edge-endpoint object `endpoint` — making the appropriate state transitions for the - // DFA of `v`. Returns `false` iff an attempted state transition failed. 
- bool add_incident_edge(const Endpoint& endpoint); - // Adds the information of an incident loop that connects the two different endpoints of some // vertex `v`, wrapped inside the edge-endpoint object `endpoint` — making the appropriate state // transition for the DFA of `v`. Also stores the edge encodings of the incidence information of // the front and the back sides before this addition, in `e_front` and `e_back` respectively. // Returns `false` iff an attempted state transition failed. bool add_crossing_loop(const Endpoint& endpoint, cuttlefish::edge_encoding_t& e_front, cuttlefish::edge_encoding_t& e_back); - - // Adds the information of an incident loop that connects the two different endpoints of some - // vertex `v`, wrapped inside the edge-endpoint object `endpoint` — making the appropriate state - // transition for the DFA of `v`. Returns `false` iff an attempted state transition failed. - bool add_crossing_loop(const Endpoint& endpoint); // Adds the information of an incident loop for some vertex `v` that connects its side `s` to // the side itself, all wrapped inside the edge-endpoint object `endpoint` — making the @@ -76,12 +102,6 @@ class Read_CdBG_Constructor // state transition failed. bool add_one_sided_loop(const Endpoint& endpoint, cuttlefish::edge_encoding_t& e_old); - // Adds the information of an incident loop for some vertex `v` that connects its side `s` to - // the side itself, all wrapped inside the edge-endpoint object `endpoint` — making the - // appropriate state transition for the DFA of `v`. Returns `false` iff an attempted state - // transition failed. - bool add_one_sided_loop(const Endpoint& endpoint); - // If the endpoint object `v_end` connects to some neighboring endpoint `w_end` through a unique // edge encoded with `e`, then discards the incidence information of `w_end` — making the // appropriate state transition for the corresponding neighboring vertex `w`. @@ -100,57 +120,9 @@ class Read_CdBG_Constructor // Discards the incidence information of some endpoint `w_end` that connects to the endpoint // `v_end` through the unique edge encoded with `e` — making the appropriate state transition. void discard_neighbor_side(const Endpoint& v, cuttlefish::edge_encoding_t e); - - -public: - - // Consructs a read-CdBG builder object, with the required parameters wrapped in `params`, and uses - // the Cuttlefish hash table `hash_table`. - Read_CdBG_Constructor(const Build_Params& params, Kmer_Hash_Table& hash_table); - - // Computes the states of the DFA in the de Bruijn graph with the edge set at path prefix `edge_db_path`. - void compute_DFA_states(const std::string& edge_db_path); - - // Returns the number of distinct vertices in the underlying graph. - uint64_t vertex_count() const; - - // Returns the number of distinct edges in the underlying graph. - uint64_t edge_count() const; }; -template -inline bool Read_CdBG_Constructor::add_incident_edge(const Endpoint& endpoint, cuttlefish::edge_encoding_t& e_old, cuttlefish::edge_encoding_t& e_new) -{ - // Fetch the hash table entry for the vertex associated to the endpoint. - - Kmer_Hash_Entry_API bucket = hash_table[endpoint.hash()]; - State_Read_Space& state = bucket.get_state(); - e_new = e_old = state.edge_at(endpoint.side()); - - - // If we've already discarded the incidence information for this side, then a self-transition happens. - if(e_old == cuttlefish::edge_encoding_t::N) - return true; // Early return w/o updating the same value again is safe — see the note at the end of the method. 
- - if(e_old == cuttlefish::edge_encoding_t::E) // This side of the vertex is observed for the first time. - e_new = endpoint.edge(); - else if(e_old != endpoint.edge()) // This side has been visited earlier, but with a different edge — discard the incidence information. - e_new = cuttlefish::edge_encoding_t::N; - - - // We can get away without updating the same value again, because — (1) even if this DFA's state changes - // in the hash table by the time this method completes, making no updates at this point is theoretically - // equivalent to returning instantaneously as soon as the hash table value had been read; and also (2) the - // ordering of the edges processed does not matter in the algorithm. - if(e_new == e_old) - return true; - - state.update_edge_at(endpoint.side(), e_new); - return hash_table.update(bucket); -} - - template inline bool Read_CdBG_Constructor::add_incident_edge(const Endpoint& endpoint) { @@ -183,29 +155,6 @@ inline bool Read_CdBG_Constructor::add_incident_edge(const Endpoint& endpo } -template -inline bool Read_CdBG_Constructor::add_crossing_loop(const Endpoint& endpoint, cuttlefish::edge_encoding_t& e_front, cuttlefish::edge_encoding_t& e_back) -{ - // Fetch the hash table entry for the DFA of vertex associated to the endpoint. - - Kmer_Hash_Entry_API bucket = hash_table[endpoint.hash()]; - State_Read_Space& state = bucket.get_state(); - e_front = state.edge_at(cuttlefish::side_t::front); - e_back = state.edge_at(cuttlefish::side_t::back); - - const State_Read_Space state_old = state; - - if(e_front != cuttlefish::edge_encoding_t::N) // Discard the front-incidence information, if not done already. - state.update_edge_at(cuttlefish::side_t::front, cuttlefish::edge_encoding_t::N); - - if(e_back != cuttlefish::edge_encoding_t::N) // Discard the back-incidence information, if not done already. - state.update_edge_at(cuttlefish::side_t::back, cuttlefish::edge_encoding_t::N); - - // We can get away without updating the same value again: see detailed comment in `add_incident_edge`. - return state == state_old ? true : hash_table.update(bucket); -} - - template inline bool Read_CdBG_Constructor::add_crossing_loop(const Endpoint& endpoint) { @@ -227,25 +176,6 @@ inline bool Read_CdBG_Constructor::add_crossing_loop(const Endpoint& endpo } -template -inline bool Read_CdBG_Constructor::add_one_sided_loop(const Endpoint& endpoint, cuttlefish::edge_encoding_t& e_old) -{ - // Fetch the hash table entry for the vertex associated to the endpoint. - - Kmer_Hash_Entry_API bucket = hash_table[endpoint.hash()]; - State_Read_Space& state = bucket.get_state(); - e_old = state.edge_at(endpoint.side()); - - // We can get away without updating the same value again: see detailed comment in `add_incident_edge`. - if(e_old == cuttlefish::edge_encoding_t::N) // The incidence information has already been discarded. - return true; - - // Discard the incidence information. - state.update_edge_at(endpoint.side(), cuttlefish::edge_encoding_t::N); - return hash_table.update(bucket); -} - - template inline bool Read_CdBG_Constructor::add_one_sided_loop(const Endpoint& endpoint) { @@ -264,50 +194,5 @@ inline bool Read_CdBG_Constructor::add_one_sided_loop(const Endpoint& endp } -template -inline void Read_CdBG_Constructor::propagate_discard(const Endpoint& v_end, const cuttlefish::edge_encoding_t e) -{ - if(e != cuttlefish::edge_encoding_t::E && e != cuttlefish::edge_encoding_t::N) // The incident edge is unique. 
- discard_neighbor_side(v_end, e); -} - - -template -inline void Read_CdBG_Constructor::propagate_discard(const Endpoint& u_end, const Endpoint& v_end, const cuttlefish::edge_encoding_t e) -{ - while(!discard_side(v_end)); // Discard the neighbor `v_end`. - - propagate_discard(u_end, e); // Discard the other neighbor. -} - - -template -inline bool Read_CdBG_Constructor::discard_side(const Endpoint& v_end) -{ - // Fetch the hash table entry for the DFA of the vertex associated to the endpoint. - - Kmer_Hash_Entry_API bucket = hash_table[v_end.hash()]; - State_Read_Space& state = bucket.get_state(); - const cuttlefish::edge_encoding_t e_curr = state.edge_at(v_end.side()); - - // We can get away without updating the same value again: see detailed comment in `add_incident_edge`. - if(e_curr == cuttlefish::edge_encoding_t::N) // The incidende information has already been discarded. - return true; - - // Discard the incidence information. - state.update_edge_at(v_end.side(), cuttlefish::edge_encoding_t::N); - return hash_table.update(bucket); -} - - -template -inline void Read_CdBG_Constructor::discard_neighbor_side(const Endpoint& v_end, const cuttlefish::edge_encoding_t e) -{ - const Endpoint w = v_end.neighbor_endpoint(e, hash_table); // Get the neighboring endpoint connected with `e`. - - while(!discard_side(w)); // Discard the incidence information off that neighbor. -} - - #endif diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index b385242f..6717e52d 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -43,7 +43,6 @@ class Read_CdBG_Extractor // TODO: give these limits more thoughts, especially their exact impact on the memory usage. static constexpr std::size_t BUFF_SZ = 100 * 1024ULL; // 100 KB (soft limit) worth of maximal unitig records (FASTA) can be retained in memory, at most, before flushing. - static constexpr std::size_t SEQ_SZ = 1 * 1024ULL * 1024ULL; // 1 MB (soft limit) sized maximal unitig, at most, is constructed at a time. mutable uint64_t vertices_scanned = 0; // Total number of vertices scanned from the database. mutable Spin_Lock lock; // Mutual exclusion lock to access various unique resources by threads spawned off this class' methods. @@ -60,26 +59,11 @@ class Read_CdBG_Extractor // for the unitpath-flanking vertices to be identified and the corresponding unipaths to be extracted. void distribute_unipaths_extraction(Kmer_SPMC_Iterator* vertex_parser, Thread_Pool& thread_pool); - // Scans the vertices provided to the thread with id `thread_id` from the parser `vertex_parser` - // for potential unipath-flanking vertices, i.e. for each vertex `v` provided to that thread, - // identifies whether it is a unipath-flanking vertex, and if it is, then piece-wise constructs - // the corresponding unipath. - void scan_vertices(Kmer_SPMC_Iterator* vertex_parser, uint16_t thread_id); - // Prcesses the vertices provided to the thread with id `thread_id` from the parser // `vertex_parser`, i.e. for each vertex `v` provided to that thread, attempts to // piece-wise construct its containing maximal unitig. void process_vertices(Kmer_SPMC_Iterator* vertex_parser, uint16_t thread_id); - // Extracts the maximal unitig `p` that is flanked by the vertex `v_hat` and connects to `v_hat` - // through its side `s_v_hat`. Returns `true` iff the extraction is successful, which happens when - // the maximal unitig is encountered and attempted for output-marking _first_, by some thread. 
If - // the attempt is successful, then the maximal unitig is extracted in its canonical form into - // `unipath` (it is overwritten), a unique ID for it is put in `id`, and the hashes of the vertices - // constituting the path overwrites `path_hashes` (when the user-option is specified). If not, - // `unipath` and `path_hashes` may contain partial form of the path, and `id` is unaltered. - bool extract_maximal_unitig(const Kmer& v_hat, cuttlefish::side_t s_v_hat, uint64_t& id, std::vector& unipath, std::vector& path_hashes); - // Extracts the maximal unitig `p` that contains the vertex `v_hat`, and `maximal_unitig` is // used as the working scratch for the extraction, i.e. to build and store the two unitigs // connecting to the two sides of `v_hat`. Returns `true` iff the extraction is successful, @@ -96,6 +80,57 @@ class Read_CdBG_Extractor // Marks all the vertices which have their hashes present in `path_hashes` as outputted. void mark_path(const std::vector& path_hashes); + // Marks the vertex `v` as outputted. Returns `true` iff `v` has not been marked yet and the hash + // table update is successful. + bool mark_vertex(const Directed_Vertex& v); + + // Initializes the output sink, corresponding to the file `output_file_path`. + void init_output_sink(const std::string& output_file_path); + + // Closes the output sink. + void close_output_sink(); + + +public: + + // Constructs a vertex-extractor object for some compacted read de Bruijn graph, with the required + // parameters wrapped inside `params`, and uses the Cuttlefish hash table `hash_table`. + Read_CdBG_Extractor(const Build_Params& params, Kmer_Hash_Table& hash_table); + + // Extracts the maximal unitigs of the de Bruijn graph with the vertex set at path prefix `vertex_db_path`, + // into the output file at `output_file_path`. + void extract_maximal_unitigs(const std::string& vertex_db_path, const std::string& output_file_path); + + // Returns the parameters collection for the compacted graph construction. + const Build_Params& get_params() const; + + // Returns a wrapper over the meta-information of the extracted unitigs. + const Unipaths_Meta_info& unipaths_meta_info() const; + + // Returns the number of vertices in the underlying graph. + uint64_t vertex_count() const; + + +// The following stuffs are not used anymore with the current algorithm. +private: + + static constexpr std::size_t SEQ_SZ = 1 * 1024ULL * 1024ULL; // 1 MB (soft limit) sized maximal unitig, at most, is constructed at a time. + + // Scans the vertices provided to the thread with id `thread_id` from the parser `vertex_parser` + // for potential unipath-flanking vertices, i.e. for each vertex `v` provided to that thread, + // identifies whether it is a unipath-flanking vertex, and if it is, then piece-wise constructs + // the corresponding unipath. + void scan_vertices(Kmer_SPMC_Iterator* vertex_parser, uint16_t thread_id); + + // Extracts the maximal unitig `p` that is flanked by the vertex `v_hat` and connects to `v_hat` + // through its side `s_v_hat`. Returns `true` iff the extraction is successful, which happens when + // the maximal unitig is encountered and attempted for output-marking _first_, by some thread. If + // the attempt is successful, then the maximal unitig is extracted in its canonical form into + // `unipath` (it is overwritten), a unique ID for it is put in `id`, and the hashes of the vertices + // constituting the path overwrites `path_hashes` (when the user-option is specified). 
If not, + // `unipath` and `path_hashes` may contain partial form of the path, and `id` is unaltered. + bool extract_maximal_unitig(const Kmer& v_hat, cuttlefish::side_t s_v_hat, uint64_t& id, std::vector& unipath, std::vector& path_hashes); + // Marks all the vertices in the constituent unitigs of `maximal_unitig` as outputted. void mark_maximal_unitig(const Maximal_Unitig_Scratch& maximal_unitig); @@ -138,22 +173,12 @@ class Read_CdBG_Extractor // in `cycle` is recorded at `pivot`. bool extract_cycle(const Kmer& v_hat, uint64_t& id, std::vector& cycle, std::size_t& pivot); - // Marks the vertex `v` as outputted. Returns `true` iff `v` has not been marked yet and the hash - // table update is successful. - bool mark_vertex(const Directed_Vertex& v); - // Marks the two endpoint vertices of a maximal unitig `p` as outputted: the first vertex in the // canonical form of `p`, `sign_vertex`, and the last vertex in the form, `cosign_vertex`. Returns // `true` iff the vertices have not been marked yet and the corresponding hash table updates are // successful. bool mark_flanking_vertices(const Directed_Vertex& sign_vertex, const Directed_Vertex& cosign_vertex); - // Initializes the output sink, corresponding to the file `output_file_path`. - void init_output_sink(const std::string& output_file_path); - - // Closes the output sink. - void close_output_sink(); - // Note: The following methods are only applicable when the heuristic of information-discarding // from branching vertices to their neighbors has been implemented in the DFA states computation // phase. In the general case, these functions with their specified input parameters and their @@ -189,14 +214,6 @@ class Read_CdBG_Extractor public: - // Constructs a vertex-extractor object for some compacted read de Bruijn graph, with the required - // parameters wrapped inside `params`, and uses the Cuttlefish hash table `hash_table`. - Read_CdBG_Extractor(const Build_Params& params, Kmer_Hash_Table& hash_table); - - // Extracts the maximal unitigs of the de Bruijn graph with the vertex set at path prefix `vertex_db_path`, - // into the output file at `output_file_path`. - void extract_maximal_unitigs(const std::string& vertex_db_path, const std::string& output_file_path); - // Extracts the chordless cycles from the de Bruijn graph that are completely disconnected from the // rest of the graph. The graph is to contain its vertex set at the path prefix `vertex_db_path`, // and the cycles are appeneded to the output file at `output_file_path`. `dbg_info` is used to @@ -204,15 +221,6 @@ class Read_CdBG_Extractor // structures are re-used from the earlier construction. void extract_detached_cycles(const std::string& vertex_db_path, const std::string& output_file_path, const dBG_Info& dbg_info); - // Returns the parameters collection for the compacted graph construction. - const Build_Params& get_params() const; - - // Returns a wrapper over the meta-information of the extracted unitigs. - const Unipaths_Meta_info& unipaths_meta_info() const; - - // Returns the number of vertices in the underlying graph. - uint64_t vertex_count() const; - // Returns `true` iff the de Bruijn graph has DCCs (Detached Chordless Cycles). 
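    // (This holds iff the vertices covered by the extracted unipaths do not account for every
    // vertex of the graph; see `unipaths_vertex_count`.)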
bool has_dcc() const; @@ -235,41 +243,6 @@ inline bool Read_CdBG_Extractor::mark_vertex(const Directed_Vertex& v) } -template -inline bool Read_CdBG_Extractor::mark_flanking_vertices(const Directed_Vertex& sign_vertex, const Directed_Vertex& cosign_vertex) -{ - return mark_vertex(sign_vertex) && (sign_vertex.hash() == cosign_vertex.hash() || mark_vertex(cosign_vertex)); -} - - -template -inline bool Read_CdBG_Extractor::is_flanking_state(const State_Read_Space state, cuttlefish::side_t& unipath_side) -{ - if(is_flanking_side(state, cuttlefish::side_t::front)) - { - unipath_side = cuttlefish::side_t::back; - return true; - } - - if(is_flanking_side(state, cuttlefish::side_t::back)) - { - unipath_side = cuttlefish::side_t::front; - return true; - } - - return false; -} - - -template -inline bool Read_CdBG_Extractor::is_flanking_side(const State_Read_Space state, const cuttlefish::side_t side) -{ - const cuttlefish::edge_encoding_t edge = state.edge_at(side); - - return edge == cuttlefish::edge_encoding_t::N || edge == cuttlefish::edge_encoding_t::E; -} - - template inline void Read_CdBG_Extractor::mark_path(const std::vector& path_hashes) { @@ -278,19 +251,6 @@ inline void Read_CdBG_Extractor::mark_path(const std::vector& path_ } -template -inline void Read_CdBG_Extractor::mark_maximal_unitig(const Maximal_Unitig_Scratch& maximal_unitig) -{ - if(maximal_unitig.is_linear()) - { - mark_path(maximal_unitig.unitig_hash(cuttlefish::side_t::back)); - mark_path(maximal_unitig.unitig_hash(cuttlefish::side_t::front)); - } - else - mark_path(maximal_unitig.cycle_hash()); -} - - template inline bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, Maximal_Unitig_Scratch& maximal_unitig) { diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 52f572b6..7649546e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -30,8 +30,11 @@ set(PROJECT_SRC kmer_Enumeration_Stats.cpp State_Read_Space.cpp Read_CdBG.cpp + Obsolete/Read_CdBG_Obsolete.cpp Read_CdBG_Constructor.cpp + Obsolete/Read_CdBG_Constructor_Obsolete.cpp Read_CdBG_Extractor.cpp + Obsolete/Read_CdBG_Extractor_Obsolete.cpp Unitig_Scratch.cpp Maximal_Unitig_Scratch.cpp Unipaths_Meta_info.cpp diff --git a/src/Obsolete/Read_CdBG_Constructor_Obsolete.cpp b/src/Obsolete/Read_CdBG_Constructor_Obsolete.cpp new file mode 100644 index 00000000..28fa6c44 --- /dev/null +++ b/src/Obsolete/Read_CdBG_Constructor_Obsolete.cpp @@ -0,0 +1,128 @@ + +#include "Read_CdBG_Constructor.hpp" + + +// The following methods are not used anymore with the current algorithm. + +template +bool Read_CdBG_Constructor::add_incident_edge(const Endpoint& endpoint, cuttlefish::edge_encoding_t& e_old, cuttlefish::edge_encoding_t& e_new) +{ + // Fetch the hash table entry for the vertex associated to the endpoint. + + Kmer_Hash_Entry_API bucket = hash_table[endpoint.hash()]; + State_Read_Space& state = bucket.get_state(); + e_new = e_old = state.edge_at(endpoint.side()); + + + // If we've already discarded the incidence information for this side, then a self-transition happens. + if(e_old == cuttlefish::edge_encoding_t::N) + return true; // Early return w/o updating the same value again is safe — see the note at the end of the method. + + if(e_old == cuttlefish::edge_encoding_t::E) // This side of the vertex is observed for the first time. + e_new = endpoint.edge(); + else if(e_old != endpoint.edge()) // This side has been visited earlier, but with a different edge — discard the incidence information. 
+ e_new = cuttlefish::edge_encoding_t::N; + + + // We can get away without updating the same value again, because — (1) even if this DFA's state changes + // in the hash table by the time this method completes, making no updates at this point is theoretically + // equivalent to returning instantaneously as soon as the hash table value had been read; and also (2) the + // ordering of the edges processed does not matter in the algorithm. + if(e_new == e_old) + return true; + + state.update_edge_at(endpoint.side(), e_new); + return hash_table.update(bucket); +} + + +template +bool Read_CdBG_Constructor::add_crossing_loop(const Endpoint& endpoint, cuttlefish::edge_encoding_t& e_front, cuttlefish::edge_encoding_t& e_back) +{ + // Fetch the hash table entry for the DFA of vertex associated to the endpoint. + + Kmer_Hash_Entry_API bucket = hash_table[endpoint.hash()]; + State_Read_Space& state = bucket.get_state(); + e_front = state.edge_at(cuttlefish::side_t::front); + e_back = state.edge_at(cuttlefish::side_t::back); + + const State_Read_Space state_old = state; + + if(e_front != cuttlefish::edge_encoding_t::N) // Discard the front-incidence information, if not done already. + state.update_edge_at(cuttlefish::side_t::front, cuttlefish::edge_encoding_t::N); + + if(e_back != cuttlefish::edge_encoding_t::N) // Discard the back-incidence information, if not done already. + state.update_edge_at(cuttlefish::side_t::back, cuttlefish::edge_encoding_t::N); + + // We can get away without updating the same value again: see detailed comment in `add_incident_edge`. + return state == state_old ? true : hash_table.update(bucket); +} + + +template +bool Read_CdBG_Constructor::add_one_sided_loop(const Endpoint& endpoint, cuttlefish::edge_encoding_t& e_old) +{ + // Fetch the hash table entry for the vertex associated to the endpoint. + + Kmer_Hash_Entry_API bucket = hash_table[endpoint.hash()]; + State_Read_Space& state = bucket.get_state(); + e_old = state.edge_at(endpoint.side()); + + // We can get away without updating the same value again: see detailed comment in `add_incident_edge`. + if(e_old == cuttlefish::edge_encoding_t::N) // The incidence information has already been discarded. + return true; + + // Discard the incidence information. + state.update_edge_at(endpoint.side(), cuttlefish::edge_encoding_t::N); + return hash_table.update(bucket); +} + + +template +inline void Read_CdBG_Constructor::propagate_discard(const Endpoint& v_end, const cuttlefish::edge_encoding_t e) +{ + if(e != cuttlefish::edge_encoding_t::E && e != cuttlefish::edge_encoding_t::N) // The incident edge is unique. + discard_neighbor_side(v_end, e); +} + + +template +inline void Read_CdBG_Constructor::propagate_discard(const Endpoint& u_end, const Endpoint& v_end, const cuttlefish::edge_encoding_t e) +{ + while(!discard_side(v_end)); // Discard the neighbor `v_end`. + + propagate_discard(u_end, e); // Discard the other neighbor. +} + + +template +inline bool Read_CdBG_Constructor::discard_side(const Endpoint& v_end) +{ + // Fetch the hash table entry for the DFA of the vertex associated to the endpoint. + + Kmer_Hash_Entry_API bucket = hash_table[v_end.hash()]; + State_Read_Space& state = bucket.get_state(); + const cuttlefish::edge_encoding_t e_curr = state.edge_at(v_end.side()); + + // We can get away without updating the same value again: see detailed comment in `add_incident_edge`. + if(e_curr == cuttlefish::edge_encoding_t::N) // The incidende information has already been discarded. 
+ return true; + + // Discard the incidence information. + state.update_edge_at(v_end.side(), cuttlefish::edge_encoding_t::N); + return hash_table.update(bucket); +} + + +template +inline void Read_CdBG_Constructor::discard_neighbor_side(const Endpoint& v_end, const cuttlefish::edge_encoding_t e) +{ + const Endpoint w = v_end.neighbor_endpoint(e, hash_table); // Get the neighboring endpoint connected with `e`. + + while(!discard_side(w)); // Discard the incidence information off that neighbor. +} + + + +// Template instantiations for the required instances. +ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Read_CdBG_Constructor) diff --git a/src/Obsolete/Read_CdBG_Extractor_Obsolete.cpp b/src/Obsolete/Read_CdBG_Extractor_Obsolete.cpp new file mode 100644 index 00000000..c620cbe3 --- /dev/null +++ b/src/Obsolete/Read_CdBG_Extractor_Obsolete.cpp @@ -0,0 +1,192 @@ + +#include "Read_CdBG_Extractor.hpp" +#include "Kmer_SPMC_Iterator.hpp" + + +// The following methods are not used anymore with the current algorithm. +template +void Read_CdBG_Extractor::scan_vertices(Kmer_SPMC_Iterator* const vertex_parser, const uint16_t thread_id) +{ + // Data structures to be reused per each vertex scanned. + Kmer v; // The vertex copy to be scanned one-by-one. + cuttlefish::side_t s_v; // The side of the vertex `v` that connects it to the maximal unitig `p` containing it, if `v` is flanking. + State_Read_Space state; // State of the vertex `v`. + uint64_t id; // The unique ID of the maximal unitig `p`. + std::vector unipath; // The extracted maximal unitig `p`. + std::vector path_hashes; // Hash values of the vertices constituting the maximal unitig `p`. + + uint64_t vertex_count = 0; // Number of vertices scanned by this thread. + Unipaths_Meta_info extracted_unipaths_info; // Meta-information over the maximal unitigs extracted by this thread. + uint64_t progress = 0; // Number of vertices scanned by the thread; is reset at reaching 1% of its approximate workload. + + Character_Buffer output_buffer(output_sink.sink()); // The output buffer for maximal unitigs. + unipath.reserve(SEQ_SZ); + + const bool mark_unipaths = params.extract_cycles() || params.dcc_opt(); + if(mark_unipaths) + path_hashes.reserve(BUFF_SZ); + + + while(vertex_parser->tasks_expected(thread_id)) + if(vertex_parser->value_at(thread_id, v)) + { + state = hash_table[v].state(); + + if(!state.is_outputted() && is_flanking_state(state, s_v)) + if(extract_maximal_unitig(v, s_v, id, unipath, path_hashes)) + { + extracted_unipaths_info.add_maximal_unitig(unipath); + + // unipath.emplace_back('\n'); + // output_buffer += unipath; + output_buffer += FASTA_Record>(id, unipath); + // unipath.clear(); + + if(mark_unipaths) + mark_path(path_hashes); + } + + vertex_count++; + + + progress_tracker.track_work(++progress); + } + + + // Aggregate the meta-information over the extracted maximal unitigs and the thread-executions. + lock.lock(); + // std::cout << "Thread " << thread_id << " processed " << vertex_count << " vertices.\n"; + + vertices_scanned += vertex_count; + unipaths_meta_info_.aggregate(extracted_unipaths_info); + + lock.unlock(); +} + + +template +bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const cuttlefish::side_t s_v_hat, uint64_t& id, std::vector& unipath, std::vector& path_hashes) +{ + // Data structures to be reused per each vertex extension of the maximal unitig. + cuttlefish::side_t s_v = s_v_hat; // The side of the current vertex `v` through which to extend the maximal unitig, i.e. exit `v`. 
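+    // (The walk begins from `v_hat` itself when it is to be exited through its back, and from its
+    // reverse complement otherwise.)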
+ Directed_Vertex v(s_v == cuttlefish::side_t::back ? v_hat : v_hat.reverse_complement(), hash_table); // Current vertex being added to the maximal unitig. + State_Read_Space state = hash_table[v.hash()].state(); // State of the vertex `v`. + cuttlefish::edge_encoding_t e_v; // The next edge from `v` to include into the maximal unitig. + cuttlefish::base_t b_ext; // The nucleobase corresponding to the edge `e_v` and the exiting side `s_v` from `v` to add to the literal maximal unitig. + const bool mark_unipaths = params.extract_cycles() || params.dcc_opt(); // Whether to mark the vertices present in the maximal unitigs. + + const Directed_Vertex init_vertex(v); + init_vertex.kmer().get_label(unipath); + if(mark_unipaths) + { + path_hashes.clear(); + path_hashes.emplace_back(init_vertex.hash()); + } + + + while(true) + { + if(state.is_outputted()) // The opposite end of the maximal unitig has been reached, and the unitig is found to have already been outputted. + return false; + + if(is_flanking_side(state, s_v)) + break; + + + e_v = state.edge_at(s_v); + b_ext = (s_v == cuttlefish::side_t::back ? DNA_Utility::map_base(e_v) : DNA_Utility::complement(DNA_Utility::map_base(e_v))); + + v.roll_forward(b_ext, hash_table); + s_v = v.exit_side(); + state = hash_table[v.hash()].state(); + + unipath.emplace_back(Kmer::map_char(b_ext)); + if(mark_unipaths) + path_hashes.emplace_back(v.hash()); + // TODO: write-out to disk in case of the size crossing some threshold, and modify `mark_path` accordingly — + // would prevent unwanted memory blow-up in presence of very large maximal unitigs. + } + + const Directed_Vertex& term_vertex = v; + const bool in_canonical = (init_vertex.kmer() < term_vertex.kmer_bar()); + const Directed_Vertex& sign_vertex = (in_canonical ? init_vertex : term_vertex); + const Directed_Vertex& cosign_vertex = (in_canonical ? term_vertex : init_vertex); + + // Mark the flanking vertices as outputted. 
+ if(!mark_flanking_vertices(sign_vertex, cosign_vertex)) + return false; + + if(!in_canonical) + cuttlefish::reverse_complement(unipath); + + id = sign_vertex.hash(); + + return true; +} + + +template +inline void Read_CdBG_Extractor::mark_maximal_unitig(const Maximal_Unitig_Scratch& maximal_unitig) +{ + if(maximal_unitig.is_linear()) + { + mark_path(maximal_unitig.unitig_hash(cuttlefish::side_t::back)); + mark_path(maximal_unitig.unitig_hash(cuttlefish::side_t::front)); + } + else + mark_path(maximal_unitig.cycle_hash()); +} + + +template +bool Read_CdBG_Extractor::mark_flanking_vertices(const Directed_Vertex& sign_vertex, const Directed_Vertex& cosign_vertex) +{ + return mark_vertex(sign_vertex) && (sign_vertex.hash() == cosign_vertex.hash() || mark_vertex(cosign_vertex)); +} + + +template +bool Read_CdBG_Extractor::is_flanking_state(const State_Read_Space state, cuttlefish::side_t& unipath_side) +{ + if(is_flanking_side(state, cuttlefish::side_t::front)) + { + unipath_side = cuttlefish::side_t::back; + return true; + } + + if(is_flanking_side(state, cuttlefish::side_t::back)) + { + unipath_side = cuttlefish::side_t::front; + return true; + } + + return false; +} + + +template +bool Read_CdBG_Extractor::is_flanking_side(const State_Read_Space state, const cuttlefish::side_t side) +{ + const cuttlefish::edge_encoding_t edge = state.edge_at(side); + + return edge == cuttlefish::edge_encoding_t::N || edge == cuttlefish::edge_encoding_t::E; +} + + +template +bool Read_CdBG_Extractor::has_dcc() const +{ + return unipaths_vertex_count() != vertex_count(); +} + + +template +uint64_t Read_CdBG_Extractor::unipaths_vertex_count() const +{ + return unipaths_meta_info_.kmer_count(); +} + + + +// Template instantiations for the required instances. +ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Read_CdBG_Extractor) diff --git a/src/Obsolete/Read_CdBG_Obsolete.cpp b/src/Obsolete/Read_CdBG_Obsolete.cpp new file mode 100644 index 00000000..3eeca40c --- /dev/null +++ b/src/Obsolete/Read_CdBG_Obsolete.cpp @@ -0,0 +1,75 @@ + +#include "Read_CdBG.hpp" +#include "Read_CdBG_Extractor.hpp" + + +// The following methods are not used anymore with the current algorithm. 
+template <uint16_t k>
+bool Read_CdBG<k>::extract_DCCs(const std::string& output_file_path, const bool rerun)
+{
+    if(!extract_DCCs_this_run())
+        return !dbg_info.has_dcc();
+
+    if(rerun)
+    {
+        if(!DCC_data_structs_exist())
+        {
+            std::cout << "The data structure(s) required for the cycles extraction have been removed.\n"
+                         "Please re-run Cuttlefish with the original parameters to recover those.\n";
+            return false;
+        }
+
+        construct_hash_table(Kmer_Container<k>::size(vertex_db_path()), true);
+    }
+
+
+    Read_CdBG_Extractor<k> cdBg_extractor(params, *hash_table);
+    cdBg_extractor.extract_detached_cycles(vertex_db_path(), output_file_path, dbg_info);
+
+    dbg_info.add_DCC_info(cdBg_extractor);
+
+    return true;
+}
+
+
+template <uint16_t k>
+bool Read_CdBG<k>::extract_DCCs_this_run() const
+{
+    if(!dbg_info.has_dcc())
+    {
+        std::cout << "The graph does not contain any detached chordless cycles.\n";
+        return false;
+    }
+
+    if(dbg_info.dcc_extracted())
+    {
+        std::cout << "The detached chordless cycles have been extracted earlier.\n";
+        return false;
+    }
+
+    if(!params.extract_cycles())
+    {
+        std::cout << "There are Detached Chordless Cycles (DCC) present in the graph.\n"
+                     "Run Cuttlefish with the `cycles` argument to extract those.\n";
+        return false;
+    }
+
+
+    return true;
+}
+
+
+template <uint16_t k>
+bool Read_CdBG<k>::DCC_data_structs_exist() const
+{
+    const std::string vertex_db_path = params.output_prefix() + cuttlefish::file_ext::vertices_ext;
+    const std::string mph_path = params.mph_file_path();
+    const std::string buckets_path = params.buckets_file_path();
+
+    return Kmer_Container<k>::exists(vertex_db_path) && file_exists(mph_path) && file_exists(buckets_path);
+}
+
+
+
+// Template instantiations for the required instances.
+ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Read_CdBG)
diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp
index a690bdae..188bf0d6 100644
--- a/src/Read_CdBG.cpp
+++ b/src/Read_CdBG.cpp
@@ -206,72 +206,6 @@ void Read_CdBG<k>::extract_maximal_unitigs()
 }
 
 
-template <uint16_t k>
-bool Read_CdBG<k>::extract_DCCs(const std::string& output_file_path, const bool rerun)
-{
-    if(!extract_DCCs_this_run())
-        return !dbg_info.has_dcc();
-
-    if(rerun)
-    {
-        if(!DCC_data_structs_exist())
-        {
-            std::cout << "The data structure(s) required for the cycles extraction have been removed.\n"
-                         "Please re-run Cuttlefish with the originial parameters to recover those.\n";
-            return false;
-        }
-
-        construct_hash_table(Kmer_Container<k>::size(vertex_db_path()), true);
-    }
-
-
-    Read_CdBG_Extractor<k> cdBg_extractor(params, *hash_table);
-    cdBg_extractor.extract_detached_cycles(vertex_db_path(), output_file_path, dbg_info);
-
-    dbg_info.add_DCC_info(cdBg_extractor);
-
-    return true;
-}
-
-
-template <uint16_t k>
-bool Read_CdBG<k>::extract_DCCs_this_run() const
-{
-    if(!dbg_info.has_dcc())
-    {
-        std::cout << "The graph does not contain any detached chordless cycles.\n";
-        return false;
-    }
-
-    if(dbg_info.dcc_extracted())
-    {
-        std::cout << "The detached chordless cycles have been extracted earlier.\n";
-        return false;
-    }
-
-    if(!params.extract_cycles())
-    {
-        std::cout << "There are Detached Chordless Cycles (DCC) present in the graph.\n"
-                     "Run Cuttlefish with the `cycles` argument to extract those.\n";
-        return false;
-    }
-
-
-    return true;
-}
-
-
-template <uint16_t k>
-bool Read_CdBG<k>::DCC_data_structs_exist() const
-{
-    const std::string vertex_db_path = params.output_prefix() + cuttlefish::file_ext::vertices_ext;
-    const std::string mph_path = params.mph_file_path();
-    const std::string buckets_path = params.buckets_file_path();
-
-    return Kmer_Container<k>::exists(vertex_db_path) && 
file_exists(mph_path) && file_exists(buckets_path); -} - - template bool Read_CdBG::is_constructed() const { diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 8b44711e..75219a59 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -78,66 +78,6 @@ void Read_CdBG_Extractor::distribute_unipaths_extraction(Kmer_SPMC_Iterator -void Read_CdBG_Extractor::scan_vertices(Kmer_SPMC_Iterator* const vertex_parser, const uint16_t thread_id) -{ - // Data structures to be reused per each vertex scanned. - Kmer v; // The vertex copy to be scanned one-by-one. - cuttlefish::side_t s_v; // The side of the vertex `v` that connects it to the maximal unitig `p` containing it, if `v` is flanking. - State_Read_Space state; // State of the vertex `v`. - uint64_t id; // The unique ID of the maximal unitig `p`. - std::vector unipath; // The extracted maximal unitig `p`. - std::vector path_hashes; // Hash values of the vertices constituting the maximal unitig `p`. - - uint64_t vertex_count = 0; // Number of vertices scanned by this thread. - Unipaths_Meta_info extracted_unipaths_info; // Meta-information over the maximal unitigs extracted by this thread. - uint64_t progress = 0; // Number of vertices scanned by the thread; is reset at reaching 1% of its approximate workload. - - Character_Buffer output_buffer(output_sink.sink()); // The output buffer for maximal unitigs. - unipath.reserve(SEQ_SZ); - - const bool mark_unipaths = params.extract_cycles() || params.dcc_opt(); - if(mark_unipaths) - path_hashes.reserve(BUFF_SZ); - - - while(vertex_parser->tasks_expected(thread_id)) - if(vertex_parser->value_at(thread_id, v)) - { - state = hash_table[v].state(); - - if(!state.is_outputted() && is_flanking_state(state, s_v)) - if(extract_maximal_unitig(v, s_v, id, unipath, path_hashes)) - { - extracted_unipaths_info.add_maximal_unitig(unipath); - - // unipath.emplace_back('\n'); - // output_buffer += unipath; - output_buffer += FASTA_Record>(id, unipath); - // unipath.clear(); - - if(mark_unipaths) - mark_path(path_hashes); - } - - vertex_count++; - - - progress_tracker.track_work(++progress); - } - - - // Aggregate the meta-information over the extracted maximal unitigs and the thread-executions. - lock.lock(); - // std::cout << "Thread " << thread_id << " processed " << vertex_count << " vertices.\n"; - - vertices_scanned += vertex_count; - unipaths_meta_info_.aggregate(extracted_unipaths_info); - - lock.unlock(); -} - - template void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const vertex_parser, const uint16_t thread_id) { @@ -179,67 +119,6 @@ void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const verte } -template -bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const cuttlefish::side_t s_v_hat, uint64_t& id, std::vector& unipath, std::vector& path_hashes) -{ - // Data structures to be reused per each vertex extension of the maximal unitig. - cuttlefish::side_t s_v = s_v_hat; // The side of the current vertex `v` through which to extend the maximal unitig, i.e. exit `v`. - Directed_Vertex v(s_v == cuttlefish::side_t::back ? v_hat : v_hat.reverse_complement(), hash_table); // Current vertex being added to the maximal unitig. - State_Read_Space state = hash_table[v.hash()].state(); // State of the vertex `v`. - cuttlefish::edge_encoding_t e_v; // The next edge from `v` to include into the maximal unitig. 
- cuttlefish::base_t b_ext; // The nucleobase corresponding to the edge `e_v` and the exiting side `s_v` from `v` to add to the literal maximal unitig. - const bool mark_unipaths = params.extract_cycles() || params.dcc_opt(); // Whether to mark the vertices present in the maximal unitigs. - - const Directed_Vertex init_vertex(v); - init_vertex.kmer().get_label(unipath); - if(mark_unipaths) - { - path_hashes.clear(); - path_hashes.emplace_back(init_vertex.hash()); - } - - - while(true) - { - if(state.is_outputted()) // The opposite end of the maximal unitig has been reached, and the unitig is found to have already been outputted. - return false; - - if(is_flanking_side(state, s_v)) - break; - - - e_v = state.edge_at(s_v); - b_ext = (s_v == cuttlefish::side_t::back ? DNA_Utility::map_base(e_v) : DNA_Utility::complement(DNA_Utility::map_base(e_v))); - - v.roll_forward(b_ext, hash_table); - s_v = v.exit_side(); - state = hash_table[v.hash()].state(); - - unipath.emplace_back(Kmer::map_char(b_ext)); - if(mark_unipaths) - path_hashes.emplace_back(v.hash()); - // TODO: write-out to disk in case of the size crossing some threshold, and modify `mark_path` accordingly — - // would prevent unwanted memory blow-up in presence of very large maximal unitigs. - } - - const Directed_Vertex& term_vertex = v; - const bool in_canonical = (init_vertex.kmer() < term_vertex.kmer_bar()); - const Directed_Vertex& sign_vertex = (in_canonical ? init_vertex : term_vertex); - const Directed_Vertex& cosign_vertex = (in_canonical ? term_vertex : init_vertex); - - // Mark the flanking vertices as outputted. - if(!mark_flanking_vertices(sign_vertex, cosign_vertex)) - return false; - - if(!in_canonical) - cuttlefish::reverse_complement(unipath); - - id = sign_vertex.hash(); - - return true; -} - - template void Read_CdBG_Extractor::init_output_sink(const std::string& output_file_path) { @@ -275,20 +154,6 @@ uint64_t Read_CdBG_Extractor::vertex_count() const } -template -uint64_t Read_CdBG_Extractor::unipaths_vertex_count() const -{ - return unipaths_meta_info_.kmer_count(); -} - - -template -bool Read_CdBG_Extractor::has_dcc() const -{ - return unipaths_vertex_count() != vertex_count(); -} - - // Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Read_CdBG_Extractor) From d479e7f5c0db2919874d64ad6e330d6169e019ed Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 3 Nov 2021 22:20:53 -0400 Subject: [PATCH 254/350] Reinstate mistakenly obsoleted code --- include/Read_CdBG_Extractor.hpp | 19 ++++++++++++++++--- src/Obsolete/Read_CdBG_Extractor_Obsolete.cpp | 13 ------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 6717e52d..1d9439ab 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -80,6 +80,9 @@ class Read_CdBG_Extractor // Marks all the vertices which have their hashes present in `path_hashes` as outputted. void mark_path(const std::vector& path_hashes); + // Marks all the vertices in the constituent unitigs of `maximal_unitig` as outputted. + void mark_maximal_unitig(const Maximal_Unitig_Scratch& maximal_unitig); + // Marks the vertex `v` as outputted. Returns `true` iff `v` has not been marked yet and the hash // table update is successful. bool mark_vertex(const Directed_Vertex& v); @@ -131,9 +134,6 @@ class Read_CdBG_Extractor // `unipath` and `path_hashes` may contain partial form of the path, and `id` is unaltered. 
bool extract_maximal_unitig(const Kmer& v_hat, cuttlefish::side_t s_v_hat, uint64_t& id, std::vector& unipath, std::vector& path_hashes); - // Marks all the vertices in the constituent unitigs of `maximal_unitig` as outputted. - void mark_maximal_unitig(const Maximal_Unitig_Scratch& maximal_unitig); - // Marks all the vertices that are present in the maximal unitigs of the graph with its vertex // set being present at the path prefix `vertex_db_path`. void mark_maximal_unitig_vertices(const std::string& vertex_db_path); @@ -251,6 +251,19 @@ inline void Read_CdBG_Extractor::mark_path(const std::vector& path_ } +template +inline void Read_CdBG_Extractor::mark_maximal_unitig(const Maximal_Unitig_Scratch& maximal_unitig) +{ + if(maximal_unitig.is_linear()) + { + mark_path(maximal_unitig.unitig_hash(cuttlefish::side_t::back)); + mark_path(maximal_unitig.unitig_hash(cuttlefish::side_t::front)); + } + else + mark_path(maximal_unitig.cycle_hash()); +} + + template inline bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, Maximal_Unitig_Scratch& maximal_unitig) { diff --git a/src/Obsolete/Read_CdBG_Extractor_Obsolete.cpp b/src/Obsolete/Read_CdBG_Extractor_Obsolete.cpp index c620cbe3..3cad56be 100644 --- a/src/Obsolete/Read_CdBG_Extractor_Obsolete.cpp +++ b/src/Obsolete/Read_CdBG_Extractor_Obsolete.cpp @@ -125,19 +125,6 @@ bool Read_CdBG_Extractor::extract_maximal_unitig(const Kmer& v_hat, const } -template -inline void Read_CdBG_Extractor::mark_maximal_unitig(const Maximal_Unitig_Scratch& maximal_unitig) -{ - if(maximal_unitig.is_linear()) - { - mark_path(maximal_unitig.unitig_hash(cuttlefish::side_t::back)); - mark_path(maximal_unitig.unitig_hash(cuttlefish::side_t::front)); - } - else - mark_path(maximal_unitig.cycle_hash()); -} - - template bool Read_CdBG_Extractor::mark_flanking_vertices(const Directed_Vertex& sign_vertex, const Directed_Vertex& cosign_vertex) { From 035e9603ef06451286e2b13fc130b1356114ce1e Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 3 Nov 2021 22:34:09 -0400 Subject: [PATCH 255/350] Loose the DCC code lost in time, like tears in rain: https://en.wikipedia.org/wiki/Tears_in_rain_monologue --- include/Read_CdBG.hpp | 2 ++ include/Read_CdBG_Constructor.hpp | 2 ++ include/Read_CdBG_Extractor.hpp | 2 ++ include/Thread_Pool.hpp | 2 -- src/CMakeLists.txt | 4 ---- src/Detached_Cycles_Extractor.cpp | 3 +++ .../Read_CdBG_Constructor_Obsolete.cpp | 3 ++- src/Obsolete/Read_CdBG_Extractor_Obsolete.cpp | 2 ++ src/Obsolete/Read_CdBG_Obsolete.cpp | 2 ++ src/Read_CdBG.cpp | 9 --------- src/Read_CdBG_Extractor.cpp | 5 ----- src/Thread_Pool.cpp | 18 ------------------ src/dBG_Info.cpp | 5 ----- 13 files changed, 15 insertions(+), 44 deletions(-) diff --git a/include/Read_CdBG.hpp b/include/Read_CdBG.hpp index c43c270a..a7e4ec07 100644 --- a/include/Read_CdBG.hpp +++ b/include/Read_CdBG.hpp @@ -73,6 +73,7 @@ class Read_CdBG // The following stuffs are not used anymore with the current algorithm. +/* private: // Extracts the detached chordless cycles of the graph and appends the output to the @@ -89,6 +90,7 @@ class Read_CdBG // Returns `true` iff the data structures required for DCC-extraction is present // from an earlier execution of the algorithm. 
bool DCC_data_structs_exist() const; +*/ }; diff --git a/include/Read_CdBG_Constructor.hpp b/include/Read_CdBG_Constructor.hpp index b491f866..31fc6df0 100644 --- a/include/Read_CdBG_Constructor.hpp +++ b/include/Read_CdBG_Constructor.hpp @@ -79,6 +79,7 @@ class Read_CdBG_Constructor // The following methods are not used anymore with the current algorithm. +/* private: // Adds the information of an incident edge `e` to the side `s` of some vertex `v`, all wrapped @@ -120,6 +121,7 @@ class Read_CdBG_Constructor // Discards the incidence information of some endpoint `w_end` that connects to the endpoint // `v_end` through the unique edge encoded with `e` — making the appropriate state transition. void discard_neighbor_side(const Endpoint& v, cuttlefish::edge_encoding_t e); +*/ }; diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 1d9439ab..0bb38842 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -115,6 +115,7 @@ class Read_CdBG_Extractor // The following stuffs are not used anymore with the current algorithm. +/* private: static constexpr std::size_t SEQ_SZ = 1 * 1024ULL * 1024ULL; // 1 MB (soft limit) sized maximal unitig, at most, is constructed at a time. @@ -226,6 +227,7 @@ class Read_CdBG_Extractor // Returns the number of vertices present in maximal unitigs (excluding the DCCs). uint64_t unipaths_vertex_count() const; +*/ }; diff --git a/include/Thread_Pool.hpp b/include/Thread_Pool.hpp index 1b6fa8ac..1ab7eeb8 100644 --- a/include/Thread_Pool.hpp +++ b/include/Thread_Pool.hpp @@ -29,8 +29,6 @@ class Thread_Pool output_gfa_reduced, compute_states_read_space, extract_unipaths_read_space, - mark_unipath_vertices, - extract_cycles, }; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7649546e..42e29296 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -30,15 +30,11 @@ set(PROJECT_SRC kmer_Enumeration_Stats.cpp State_Read_Space.cpp Read_CdBG.cpp - Obsolete/Read_CdBG_Obsolete.cpp Read_CdBG_Constructor.cpp - Obsolete/Read_CdBG_Constructor_Obsolete.cpp Read_CdBG_Extractor.cpp - Obsolete/Read_CdBG_Extractor_Obsolete.cpp Unitig_Scratch.cpp Maximal_Unitig_Scratch.cpp Unipaths_Meta_info.cpp - Detached_Cycles_Extractor.cpp dBG_Utilities.cpp Character_Buffer_Flusher.cpp Progress_Tracker.cpp diff --git a/src/Detached_Cycles_Extractor.cpp b/src/Detached_Cycles_Extractor.cpp index d5cdf484..0c5f6e2c 100644 --- a/src/Detached_Cycles_Extractor.cpp +++ b/src/Detached_Cycles_Extractor.cpp @@ -9,6 +9,8 @@ #include "dBG_Info.hpp" +// The following stuffs are not used anymore with the current algorithm. +/* template void Read_CdBG_Extractor::extract_detached_cycles(const std::string& vertex_db_path, const std::string& output_file_path, const dBG_Info& dbg_info) { @@ -289,3 +291,4 @@ bool Read_CdBG_Extractor::extract_cycle(const Kmer& v_hat, uint64_t& id, s // Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Read_CdBG_Extractor) +*/ diff --git a/src/Obsolete/Read_CdBG_Constructor_Obsolete.cpp b/src/Obsolete/Read_CdBG_Constructor_Obsolete.cpp index 28fa6c44..fbf76682 100644 --- a/src/Obsolete/Read_CdBG_Constructor_Obsolete.cpp +++ b/src/Obsolete/Read_CdBG_Constructor_Obsolete.cpp @@ -3,7 +3,7 @@ // The following methods are not used anymore with the current algorithm. 
- +/* template bool Read_CdBG_Constructor::add_incident_edge(const Endpoint& endpoint, cuttlefish::edge_encoding_t& e_old, cuttlefish::edge_encoding_t& e_new) { @@ -126,3 +126,4 @@ inline void Read_CdBG_Constructor::discard_neighbor_side(const Endpoint& v // Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Read_CdBG_Constructor) +*/ diff --git a/src/Obsolete/Read_CdBG_Extractor_Obsolete.cpp b/src/Obsolete/Read_CdBG_Extractor_Obsolete.cpp index 3cad56be..d4483af1 100644 --- a/src/Obsolete/Read_CdBG_Extractor_Obsolete.cpp +++ b/src/Obsolete/Read_CdBG_Extractor_Obsolete.cpp @@ -4,6 +4,7 @@ // The following methods are not used anymore with the current algorithm. +/* template void Read_CdBG_Extractor::scan_vertices(Kmer_SPMC_Iterator* const vertex_parser, const uint16_t thread_id) { @@ -177,3 +178,4 @@ uint64_t Read_CdBG_Extractor::unipaths_vertex_count() const // Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Read_CdBG_Extractor) +*/ diff --git a/src/Obsolete/Read_CdBG_Obsolete.cpp b/src/Obsolete/Read_CdBG_Obsolete.cpp index 3eeca40c..f06bd875 100644 --- a/src/Obsolete/Read_CdBG_Obsolete.cpp +++ b/src/Obsolete/Read_CdBG_Obsolete.cpp @@ -4,6 +4,7 @@ // The following methods are not used anymore with the current algorithm. +/* template bool Read_CdBG::extract_DCCs(const std::string& output_file_path, const bool rerun) { @@ -73,3 +74,4 @@ bool Read_CdBG::DCC_data_structs_exist() const // Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Read_CdBG) +*/ diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 188bf0d6..a1e42ea2 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -34,7 +34,6 @@ void Read_CdBG::construct() if(is_constructed()) { std::cout << "\nThe compacted de Bruijn graph has been constructed earlier.\n"; - extract_DCCs(params.output_file_path(), true); return; } @@ -106,8 +105,6 @@ void Read_CdBG::construct() if(params.edge_db_path().empty()) #endif Kmer_Container::remove(edge_db_path()); - if(!params.extract_cycles() && !params.dcc_opt()) - hash_table->save(params); std::chrono::high_resolution_clock::time_point t_dfa = std::chrono::high_resolution_clock::now(); std::cout << "Computed the states of the automata. Time taken = " << std::chrono::duration_cast>(t_dfa - t_mphf).count() << " seconds.\n"; @@ -116,8 +113,6 @@ void Read_CdBG::construct() std::cout << "\nExtracting the maximal unitigs.\n"; extract_maximal_unitigs(); - if(!dbg_info.has_dcc() || dbg_info.dcc_extracted()) // Either there are no DCCs, or the DCCs have already been extracted in this run. - { #ifdef CF_DEVELOP_MODE if(params.vertex_db_path().empty()) #endif @@ -125,7 +120,6 @@ void Read_CdBG::construct() Kmer_Container::remove(vertex_db_path()); hash_table->remove(params); - } std::chrono::high_resolution_clock::time_point t_extract = std::chrono::high_resolution_clock::now(); std::cout << "Extracted the maximal unitigs and DCCs. 
Time taken = " << std::chrono::duration_cast>(t_extract - t_dfa).count() << " seconds.\n"; @@ -200,9 +194,6 @@ void Read_CdBG::extract_maximal_unitigs() cdBg_extractor.extract_maximal_unitigs(vertex_db_path(), output_file_path); dbg_info.add_unipaths_info(cdBg_extractor); - - if(!extract_DCCs(output_file_path) && params.dcc_opt()) - hash_table->save(params); } diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 75219a59..06cce2e1 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -53,11 +53,6 @@ void Read_CdBG_Extractor::extract_maximal_unitigs(const std::string& vertex_d std::cout << "\nNumber of scanned vertices: " << vertices_scanned << ".\n"; unipaths_meta_info_.print(); - // Check for the existence of cycle(s). - if(has_dcc()) - std::cout << "\nCycles disconnected from the rest of the graph are present." - " I.e. the cycles are graph components exclusively on their own.\n\n"; - std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); double elapsed_seconds = std::chrono::duration_cast>(t_end - t_start).count(); diff --git a/src/Thread_Pool.cpp b/src/Thread_Pool.cpp index b6c2247a..fb951c79 100644 --- a/src/Thread_Pool.cpp +++ b/src/Thread_Pool.cpp @@ -35,8 +35,6 @@ Thread_Pool::Thread_Pool(const uint16_t thread_count, void* const dBG, const case Task_Type::compute_states_read_space: case Task_Type::extract_unipaths_read_space: - case Task_Type::mark_unipath_vertices: - case Task_Type::extract_cycles: read_dBG_compaction_params.resize(thread_count); break; @@ -107,22 +105,6 @@ void Thread_Pool::task(const uint16_t thread_id) process_vertices(static_cast*>(params.parser), params.thread_id); } break; - - case Task_Type::mark_unipath_vertices: - { - const Read_dBG_Compaction_Params& params = read_dBG_compaction_params[thread_id]; - static_cast*>(dBG)-> - mark_maximal_unitig_vertices(static_cast*>(params.parser), params.thread_id); - } - break; - - case Task_Type::extract_cycles: - { - const Read_dBG_Compaction_Params& params = read_dBG_compaction_params[thread_id]; - static_cast*>(dBG)-> - extract_detached_chordless_cycles(static_cast*>(params.parser), params.thread_id); - } - break; } diff --git a/src/dBG_Info.cpp b/src/dBG_Info.cpp index f5540c53..7430dea4 100644 --- a/src/dBG_Info.cpp +++ b/src/dBG_Info.cpp @@ -55,11 +55,6 @@ void dBG_Info::add_unipaths_info(const Read_CdBG_Extractor& cdbg_extractor dBg_info[contigs_field]["sum maximal unitig length"] = unipaths_info.sum_len(); dBg_info[contigs_field]["avg. 
maximal unitig length"] = unipaths_info.avg_len(); dBg_info[contigs_field]["_comment"] = "lengths are in bases"; - - const Build_Params& params = cdbg_extractor.get_params(); - dBg_info[dcc_field]["DCCs present?"] = cdbg_extractor.has_dcc(); - dBg_info[dcc_field]["DCCs extracted?"] = false; - dBg_info[dcc_field]["DCC optimization performed?"] = (params.extract_cycles() || params.dcc_opt()); } From e7f8b60047104bc5e3e7cf618483ccfda2dd4c5f Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 4 Nov 2021 12:16:20 -0400 Subject: [PATCH 256/350] Restruct progress tracking --- include/Progress_Tracker.hpp | 16 ++++++++-------- src/Read_CdBG_Constructor.cpp | 5 ++--- src/Read_CdBG_Extractor.cpp | 3 ++- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/Progress_Tracker.hpp b/include/Progress_Tracker.hpp index b33778aa..b9ddb207 100644 --- a/include/Progress_Tracker.hpp +++ b/include/Progress_Tracker.hpp @@ -32,15 +32,14 @@ class Progress_Tracker // log message to be displayed over the course of tracking is `log_message`. void setup(uint64_t total_work_load, uint64_t work_chunk_threshold, const std::string& log_message); - // Tracks progress made for a work-chunk of size `work_chunk_size`. If an update it made towards progress, - // then the chunk-size is set to 0 to refresh it for the next cycle. - // Note that, the chunk-size must be at least `work_chunk_threshold` for any updates to be made towards - // the progress. All lesser sized chunk update requests are ignored. So, repeated invocation is suggested. - void track_work(uint64_t& work_chunk_size); + // Tracks progress made for a work-chunk of size `work_chunk_size`. If the passed chunk-size is large + // enough, then it is tracked and `true` is returned. All lesser sized chunk update requests are ignored + // and `false` is returned. So, repeated invocation is suggested. 
+ bool track_work(uint64_t work_chunk_size); }; -inline void Progress_Tracker::track_work(uint64_t& work_chunk_size) +inline bool Progress_Tracker::track_work(const uint64_t work_chunk_size) { if(work_chunk_size >= work_chunk_threshold) { @@ -57,9 +56,10 @@ inline void Progress_Tracker::track_work(uint64_t& work_chunk_size) lock.unlock(); - - work_chunk_size = 0; + return true; } + + return false; } diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index cb2dafb0..6525a984 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -133,9 +133,8 @@ void Read_CdBG_Constructor::process_edges(Kmer_SPMC_Iterator* const ed } edge_count++; - - - progress_tracker.track_work(++progress); + if(progress_tracker.track_work(++progress)) + progress = 0; } lock.lock(); diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 06cce2e1..41f207bc 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -100,7 +100,8 @@ void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const verte } vertex_count++; - progress_tracker.track_work(++progress); + if(progress_tracker.track_work(++progress)) + progress = 0; } From dd52968c53d53bf8b2e2eafe8942f495c98c086d Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 4 Nov 2021 12:17:46 -0400 Subject: [PATCH 257/350] Better progress-tracking for new extraction algo --- src/Read_CdBG_Extractor.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Read_CdBG_Extractor.cpp b/src/Read_CdBG_Extractor.cpp index 41f207bc..267943f3 100644 --- a/src/Read_CdBG_Extractor.cpp +++ b/src/Read_CdBG_Extractor.cpp @@ -97,11 +97,12 @@ void Read_CdBG_Extractor::process_vertices(Kmer_SPMC_Iterator* const verte extracted_unipaths_info.add_maximal_unitig(maximal_unitig); // output_buffer += maximal_unitig.fasta_rec(); maximal_unitig.add_fasta_rec_to_buffer(output_buffer); + + if(progress_tracker.track_work(progress += maximal_unitig.size())) + progress = 0; } vertex_count++; - if(progress_tracker.track_work(++progress)) - progress = 0; } From 54feeb5548baf87b9da59220ee57e0c546766a9a Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 4 Nov 2021 12:31:31 -0400 Subject: [PATCH 258/350] Move params ctr --- include/Build_Params.hpp | 25 +------------------- src/Build_Params.cpp | 50 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 24 deletions(-) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index fd051ab2..6e2430a6 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -70,30 +70,7 @@ class Build_Params #ifdef CF_DEVELOP_MODE , const double gamma #endif - ): - is_read_graph_(is_read_graph), - seq_input_(seq_paths, list_paths, dir_paths), - k_(k), - cutoff_(cutoff), - vertex_db_path_(vertex_db_path), - edge_db_path_(edge_db_path), - thread_count_(thread_count), - max_memory_(max_memory), - strict_memory_(strict_memory), - output_file_path_(output_file_path), - output_format_(cuttlefish::Output_Format(output_format)), - working_dir_path_(working_dir_path.back() == '/' ? working_dir_path : working_dir_path + "/"), - remove_kmc_db_(remove_kmc_db), - mph_file_path_(mph_file_path), - buckets_file_path_(buckets_file_path), - save_vertices_(save_vertices), - json_file_path_(json_file_path), - dcc_opt_(dcc_opt), - extract_cycles_(extract_cycles) -#ifdef CF_DEVELOP_MODE - , gamma_(gamma) -#endif - {} + ); // Returns the boolean flag to whether to build a compacted read or reference de Bruijn graph. 
diff --git a/src/Build_Params.cpp b/src/Build_Params.cpp index 0db24201..4615a6d1 100644 --- a/src/Build_Params.cpp +++ b/src/Build_Params.cpp @@ -3,6 +3,56 @@ #include "utility.hpp" +Build_Params::Build_Params( const bool is_read_graph, + const std::vector& seq_paths, + const std::vector& list_paths, + const std::vector& dir_paths, + const uint16_t k, + const uint32_t cutoff, + const std::string& vertex_db_path, + const std::string& edge_db_path, + const uint16_t thread_count, + const std::size_t max_memory, + const bool strict_memory, + const std::string& output_file_path, + const uint8_t output_format, + const std::string& working_dir_path, + const bool remove_kmc_db, + const std::string& mph_file_path, + const std::string& buckets_file_path, + const bool save_vertices, + const std::string& json_file_path, + const bool dcc_opt, + const bool extract_cycles +#ifdef CF_DEVELOP_MODE + , const double gamma +#endif + ): + is_read_graph_(is_read_graph), + seq_input_(seq_paths, list_paths, dir_paths), + k_(k), + cutoff_(cutoff), + vertex_db_path_(vertex_db_path), + edge_db_path_(edge_db_path), + thread_count_(thread_count), + max_memory_(max_memory), + strict_memory_(strict_memory), + output_file_path_(output_file_path), + output_format_(cuttlefish::Output_Format(output_format)), + working_dir_path_(working_dir_path.back() == '/' ? working_dir_path : working_dir_path + "/"), + remove_kmc_db_(remove_kmc_db), + mph_file_path_(mph_file_path), + buckets_file_path_(buckets_file_path), + save_vertices_(save_vertices), + json_file_path_(json_file_path), + dcc_opt_(dcc_opt), + extract_cycles_(extract_cycles) +#ifdef CF_DEVELOP_MODE + , gamma_(gamma) +#endif + {} + + bool Build_Params::is_valid() const { // TODO: do better — is a mess. From 7d9606fff1872f34e2ae5e4c1ec77a25f6aeff87 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 4 Nov 2021 12:49:37 -0400 Subject: [PATCH 259/350] Remove CLI for DCC --- include/Build_Params.hpp | 20 +------------------- src/Build_Params.cpp | 8 ++------ src/main.cpp | 9 ++------- 3 files changed, 5 insertions(+), 32 deletions(-) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index 6e2430a6..84fb2251 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -36,8 +36,6 @@ class Build_Params const std::string buckets_file_path_; // Optional path to file storing the hash table buckets for the k-mer set. const bool save_vertices_; // Option to save the vertex set of the de Bruijn graph (in KMC database format). const std::string json_file_path_; // Optional path to file storing meta-information about the graph and cuttlefish executions. - const bool dcc_opt_; // Option to optimize post-cdBG-construction extraction of DCCs (Detached Chordless Cycles). - const bool extract_cycles_; // Option to extract detached chordless cycles from the de Bruijn graph after compaction. #ifdef CF_DEVELOP_MODE const double gamma_; // The gamma parameter for the BBHash MPHF. #endif @@ -64,9 +62,7 @@ class Build_Params const std::string& mph_file_path, const std::string& buckets_file_path, const bool save_vertices, - const std::string& json_file_path, - const bool dcc_opt, - const bool extract_cycles + const std::string& json_file_path #ifdef CF_DEVELOP_MODE , const double gamma #endif @@ -199,20 +195,6 @@ class Build_Params } - // Returns whether the option of optimizing post-cdBG-construction extraction of DCCs is specified. - bool dcc_opt() const - { - return dcc_opt_; - } - - - // Returns whether the option of extracting detached chordless cycles is specified. 
- bool extract_cycles() const - { - return extract_cycles_; - } - - #ifdef CF_DEVELOP_MODE // Returns the gamma parameter for the BBHash MPHF. double gamma() const diff --git a/src/Build_Params.cpp b/src/Build_Params.cpp index 4615a6d1..5ae4c953 100644 --- a/src/Build_Params.cpp +++ b/src/Build_Params.cpp @@ -21,9 +21,7 @@ Build_Params::Build_Params( const bool is_read_graph, const std::string& mph_file_path, const std::string& buckets_file_path, const bool save_vertices, - const std::string& json_file_path, - const bool dcc_opt, - const bool extract_cycles + const std::string& json_file_path #ifdef CF_DEVELOP_MODE , const double gamma #endif @@ -44,9 +42,7 @@ Build_Params::Build_Params( const bool is_read_graph, mph_file_path_(mph_file_path), buckets_file_path_(buckets_file_path), save_vertices_(save_vertices), - json_file_path_(json_file_path), - dcc_opt_(dcc_opt), - extract_cycles_(extract_cycles) + json_file_path_(json_file_path) #ifdef CF_DEVELOP_MODE , gamma_(gamma) #endif diff --git a/src/main.cpp b/src/main.cpp index 6800b0d4..cbe79428 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -42,10 +42,8 @@ void build(int argc, char** argv) ("mph", "minimal perfect hash (BBHash) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("buckets", "hash table buckets (cuttlefish) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("save-vertices", "save the vertex set of the graph") - ("json", "meta-info (JSON) file", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) // TODO: remove the following arg - ("no-dcc", "turn off optimization for post-construction extraction of DCCs (Detached Chordless Cycles)") - ("cycles", "extract the detached chordless cycles of the graph") + ("json", "meta-info (JSON) file", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) #ifdef CF_DEVELOP_MODE ("gamma", "gamma for the BBHash MPHF", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::GAMMA))) #endif @@ -79,8 +77,6 @@ void build(int argc, char** argv) const auto buckets_file = result["buckets"].as(); const auto save_vertices = result["save-vertices"].as(); const auto json_file = result["json"].as(); - const auto dcc_opt = !result["no-dcc"].as(); - const auto extract_cycles = result["cycles"].as(); #ifdef CF_DEVELOP_MODE const double gamma = result["gamma"].as(); #endif @@ -89,8 +85,7 @@ void build(int argc, char** argv) refs, lists, dirs, k, cutoff, kmer_database, edge_database, thread_count, max_memory, strict_memory, output_file, format, working_dir, - remove_kmc_db, mph_file, buckets_file, save_vertices, json_file, - dcc_opt, extract_cycles + remove_kmc_db, mph_file, buckets_file, save_vertices, json_file #ifdef CF_DEVELOP_MODE , gamma #endif From 2a3233c4375016a4671eb76421b672a8e24d25eb Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 4 Nov 2021 18:40:09 -0400 Subject: [PATCH 260/350] Obsolete DCC src --- src/{ => Obsolete}/Detached_Cycles_Extractor.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/{ => Obsolete}/Detached_Cycles_Extractor.cpp (100%) diff --git a/src/Detached_Cycles_Extractor.cpp b/src/Obsolete/Detached_Cycles_Extractor.cpp similarity index 100% rename from src/Detached_Cycles_Extractor.cpp rename to src/Obsolete/Detached_Cycles_Extractor.cpp From 1e3233a186e8b55b02f07dcc42ff3bd8d67e9d7d Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 4 Nov 2021 18:58:23 -0400 Subject: [PATCH 261/350] Print DCC stats --- include/Maximal_Unitig_Scratch.hpp | 10 +++++++ 
include/Unipaths_Meta_info.hpp | 46 ++++++------------------------ src/Unipaths_Meta_info.cpp | 9 ++++++ 3 files changed, 27 insertions(+), 38 deletions(-) diff --git a/include/Maximal_Unitig_Scratch.hpp b/include/Maximal_Unitig_Scratch.hpp index 42e67208..00a90ebb 100644 --- a/include/Maximal_Unitig_Scratch.hpp +++ b/include/Maximal_Unitig_Scratch.hpp @@ -81,6 +81,9 @@ class Maximal_Unitig_Scratch // unique ID. void finalize(); + // Returns `true` iff the maximal unitig has been marked as a cycle. + bool is_cycle() const; + // Returns a FASTA record of the maximal unitig (in canonical form). // Applicable when the maximal unitig is linear. const FASTA_Record> fasta_rec() const; @@ -183,6 +186,13 @@ inline void Maximal_Unitig_Scratch::finalize() } +template +inline bool Maximal_Unitig_Scratch::is_cycle() const +{ + return !is_linear(); +} + + template inline const FASTA_Record> Maximal_Unitig_Scratch::fasta_rec() const { diff --git a/include/Unipaths_Meta_info.hpp b/include/Unipaths_Meta_info.hpp index 535b6761..94200140 100644 --- a/include/Unipaths_Meta_info.hpp +++ b/include/Unipaths_Meta_info.hpp @@ -33,17 +33,9 @@ class Unipaths_Meta_info // Constructs a meta-information tracker for maximal unitigs. Unipaths_Meta_info(); - // Adds information of the maximal unitig `unipath` to the tracker. - template - void add_maximal_unitig(const T_container_& unipath); - - // Adds information of the maximal unitig at the scratch space `unipath_scratch` + // Adds information of the maximal unitig at the scratch space `maximal_unitig` // to the tracker. - void add_maximal_unitig(const Maximal_Unitig_Scratch& unipath_scratch); - - // Adds information of the DCC (Detached Chordless Cycle) `cycle` to the tracker. - template - void add_DCC(const T_container_& cycle); + void add_maximal_unitig(const Maximal_Unitig_Scratch& maximal_unitig); // Aggregates the information of the tracker `other` to this tracker. 
void aggregate(const Unipaths_Meta_info& other); @@ -80,24 +72,6 @@ class Unipaths_Meta_info }; -template -template -inline void Unipaths_Meta_info::add_maximal_unitig(const T_container_& unipath) -{ - unipath_count_++; - - kmer_count_ += unipath.size() - (k - 1); - - if(max_len_ < unipath.size()) - max_len_ = unipath.size(); - - if(min_len_ > unipath.size()) - min_len_ = unipath.size(); - - sum_len_ += unipath.size(); -} - - template inline void Unipaths_Meta_info::add_maximal_unitig(const Maximal_Unitig_Scratch& maximal_unitig) { @@ -114,18 +88,14 @@ inline void Unipaths_Meta_info::add_maximal_unitig(const Maximal_Unitig_Scrat min_len_ = unipath_size; sum_len_ += unipath_size; -} - - -template -template -inline void Unipaths_Meta_info::add_DCC(const T_container_& cycle) -{ - dcc_count_++; - dcc_kmer_count_ += cycle.size() - (k - 1); - dcc_sum_len_ += cycle.size(); + if(maximal_unitig.is_cycle()) + { + dcc_count_++; + dcc_kmer_count_ += vertex_count; + dcc_sum_len_ += unipath_size; + } } diff --git a/src/Unipaths_Meta_info.cpp b/src/Unipaths_Meta_info.cpp index 6c430da3..a06ce3cc 100644 --- a/src/Unipaths_Meta_info.cpp +++ b/src/Unipaths_Meta_info.cpp @@ -108,6 +108,15 @@ void Unipaths_Meta_info::print() const std::cout << "Length of the longest maximal unitig (in bases): " << max_len_ << ".\n"; std::cout << "Length of the shortest maximal unitig (in bases): " << min_len_ << ".\n"; std::cout << "Sum length of the maximal unitigs (in bases): " << sum_len_ << ".\n"; + + if(dcc_count_ > 0) + { + std::cout << "\nThere are Detached Chordless Cycles (DCC) present in the graph:\n"; + + std::cout << "DCC count: " << dcc_count_ << ".\n"; + std::cout << "Number of vertices in the DCCs: " << dcc_kmer_count_ << ".\n"; + std::cout << "Sum length of the DCCs (in bases): " << dcc_sum_len_ << ".\n"; + } } From 7b7cdbb317c7925967adf1996235a8a74afe1d18 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 4 Nov 2021 19:22:04 -0400 Subject: [PATCH 262/350] Dump DCC stats --- include/dBG_Info.hpp | 9 --------- src/dBG_Info.cpp | 35 +++++------------------------------ 2 files changed, 5 insertions(+), 39 deletions(-) diff --git a/include/dBG_Info.hpp b/include/dBG_Info.hpp index 3ed79f2d..a21f07e2 100644 --- a/include/dBG_Info.hpp +++ b/include/dBG_Info.hpp @@ -54,15 +54,6 @@ class dBG_Info // Adds information about the extracted DCCs from `cdbg_extractor`. void add_DCC_info(const Read_CdBG_Extractor& cdbg_extractor); - // Writes the JSON object to aits corresponding disk-file. - void dump_info() const; - - // Returns whether the graph has been recorded to contain DCCs. - bool has_dcc() const; - - // Returns whether DCC-extraction optimization has been performed on the graph. - bool dcc_opt_performed() const; - // Returns whether the DCCs have been extracted fron the graph. bool dcc_extracted() const; }; diff --git a/src/dBG_Info.cpp b/src/dBG_Info.cpp index 7430dea4..11957ac5 100644 --- a/src/dBG_Info.cpp +++ b/src/dBG_Info.cpp @@ -55,18 +55,14 @@ void dBG_Info::add_unipaths_info(const Read_CdBG_Extractor& cdbg_extractor dBg_info[contigs_field]["sum maximal unitig length"] = unipaths_info.sum_len(); dBg_info[contigs_field]["avg. 
maximal unitig length"] = unipaths_info.avg_len(); dBg_info[contigs_field]["_comment"] = "lengths are in bases"; -} - -template -void dBG_Info::add_DCC_info(const Read_CdBG_Extractor& cdbg_extractor) -{ - const Unipaths_Meta_info& unipaths_info = cdbg_extractor.unipaths_meta_info(); - dBg_info[dcc_field]["DCCs extracted?"] = true; dBg_info[dcc_field]["DCC count"] = unipaths_info.dcc_count(); - dBg_info[dcc_field]["vertex count in the DCCs"] = unipaths_info.dcc_kmer_count(); - dBg_info[dcc_field]["sum DCC length (in bases)"] = unipaths_info.dcc_sum_len(); + if(unipaths_info.dcc_count() > 0) + { + dBg_info[dcc_field]["vertex count in the DCCs"] = unipaths_info.dcc_kmer_count(); + dBg_info[dcc_field]["sum DCC length (in bases)"] = unipaths_info.dcc_sum_len(); + } } @@ -97,27 +93,6 @@ void dBG_Info::dump_info() const } -template -bool dBG_Info::has_dcc() const -{ - return dBg_info[dcc_field]["DCCs present?"]; -} - - -template -bool dBG_Info::dcc_opt_performed() const -{ - return dBg_info[dcc_field]["DCC optimization performed?"]; -} - - -template -bool dBG_Info::dcc_extracted() const -{ - return dBg_info[dcc_field]["DCCs extracted?"]; -} - - // Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, dBG_Info) From e345393552b4044d839ffc2eb016ad2792d266b0 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 4 Nov 2021 19:40:21 -0400 Subject: [PATCH 263/350] Misc. --- include/dBG_Info.hpp | 7 ++----- src/Build_Params.cpp | 6 ------ src/Read_CdBG.cpp | 2 +- src/kmer_Enumeration_Stats.cpp | 2 +- 4 files changed, 4 insertions(+), 13 deletions(-) diff --git a/include/dBG_Info.hpp b/include/dBG_Info.hpp index a21f07e2..ff4f6f38 100644 --- a/include/dBG_Info.hpp +++ b/include/dBG_Info.hpp @@ -51,11 +51,8 @@ class dBG_Info // Adds information about the extracted maximal unitigs from `cdbg_extractor`. void add_unipaths_info(const Read_CdBG_Extractor& cdbg_extractor); - // Adds information about the extracted DCCs from `cdbg_extractor`. - void add_DCC_info(const Read_CdBG_Extractor& cdbg_extractor); - - // Returns whether the DCCs have been extracted fron the graph. - bool dcc_extracted() const; + // Writes the JSON object to its corresponding disk-file. + void dump_info() const; }; diff --git a/src/Build_Params.cpp b/src/Build_Params.cpp index 5ae4c953..93545f39 100644 --- a/src/Build_Params.cpp +++ b/src/Build_Params.cpp @@ -79,12 +79,6 @@ bool Build_Params::is_valid() const std::cout << "No edge (i.e. (k + 1)-mer) database is required for a compacted reference de Bruijn graph construction.\n"; valid = false; } - - // if(dcc_opt_ || extract_cycles_) - // { - // std::cout << "Existence of detached chordless cycles are impossible for reference de Bruijn graphs by definition.\n"; - // valid = false; - // } } diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index a1e42ea2..9548a4cf 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -122,7 +122,7 @@ void Read_CdBG::construct() hash_table->remove(params); std::chrono::high_resolution_clock::time_point t_extract = std::chrono::high_resolution_clock::now(); - std::cout << "Extracted the maximal unitigs and DCCs. Time taken = " << std::chrono::duration_cast>(t_extract - t_dfa).count() << " seconds.\n"; + std::cout << "Extracted the maximal unitigs. 
Time taken = " << std::chrono::duration_cast>(t_extract - t_dfa).count() << " seconds.\n"; #ifndef CF_DEVELOP_MODE const double max_disk_usage = std::max(edge_stats.temp_disk_usage(), vertex_stats.temp_disk_usage()) / (1024.0 * 1024.0 * 1024.0); diff --git a/src/kmer_Enumeration_Stats.cpp b/src/kmer_Enumeration_Stats.cpp index 0efa6053..2d4dcdb7 100644 --- a/src/kmer_Enumeration_Stats.cpp +++ b/src/kmer_Enumeration_Stats.cpp @@ -85,7 +85,7 @@ void kmer_Enumeration_Stats::log_stats() const std::cout << "\tTotal sequence length:\t" << seq_len() << ".\n"; std::cout << "\tTotal number of " << k << "-mers:\t" << total_kmer_count() << ".\n"; std::cout << "\tNumber of unique " << k << "-mers:\t" << unique_kmer_count() << ".\n"; - std::cout << "\tNumber of counted " << k << "-mers:\t" << counted_kmer_count() << ".\n"; + std::cout << "\tNumber of solid " << k << "-mers:\t" << counted_kmer_count() << ".\n"; } From aec2c0904e079d1f0284e9be23fcdf570533012b Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 4 Nov 2021 21:26:20 -0400 Subject: [PATCH 264/350] Remove unused CLI param --- include/Build_Params.hpp | 6 ++---- src/Build_Params.cpp | 6 ++---- src/main.cpp | 5 +---- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index 84fb2251..37d0245c 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -35,7 +35,6 @@ class Build_Params const std::string mph_file_path_; // Optional path to file storing an MPH over the k-mer set. const std::string buckets_file_path_; // Optional path to file storing the hash table buckets for the k-mer set. const bool save_vertices_; // Option to save the vertex set of the de Bruijn graph (in KMC database format). - const std::string json_file_path_; // Optional path to file storing meta-information about the graph and cuttlefish executions. #ifdef CF_DEVELOP_MODE const double gamma_; // The gamma parameter for the BBHash MPHF. #endif @@ -61,8 +60,7 @@ class Build_Params const bool remove_kmc_db, const std::string& mph_file_path, const std::string& buckets_file_path, - const bool save_vertices, - const std::string& json_file_path + const bool save_vertices #ifdef CF_DEVELOP_MODE , const double gamma #endif @@ -191,7 +189,7 @@ class Build_Params // Returns the path to the optional file storing meta-information about the graph and cuttlefish executions. const std::string json_file_path() const { - return is_read_graph() ? 
(output_file_path_ + cuttlefish::file_ext::json_ext) : json_file_path_; + return output_file_path_ + cuttlefish::file_ext::json_ext; } diff --git a/src/Build_Params.cpp b/src/Build_Params.cpp index 93545f39..42713fca 100644 --- a/src/Build_Params.cpp +++ b/src/Build_Params.cpp @@ -20,8 +20,7 @@ Build_Params::Build_Params( const bool is_read_graph, const bool remove_kmc_db, const std::string& mph_file_path, const std::string& buckets_file_path, - const bool save_vertices, - const std::string& json_file_path + const bool save_vertices #ifdef CF_DEVELOP_MODE , const double gamma #endif @@ -41,8 +40,7 @@ Build_Params::Build_Params( const bool is_read_graph, remove_kmc_db_(remove_kmc_db), mph_file_path_(mph_file_path), buckets_file_path_(buckets_file_path), - save_vertices_(save_vertices), - json_file_path_(json_file_path) + save_vertices_(save_vertices) #ifdef CF_DEVELOP_MODE , gamma_(gamma) #endif diff --git a/src/main.cpp b/src/main.cpp index cbe79428..d3c001ec 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -42,8 +42,6 @@ void build(int argc, char** argv) ("mph", "minimal perfect hash (BBHash) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("buckets", "hash table buckets (cuttlefish) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("save-vertices", "save the vertex set of the graph") - // TODO: remove the following arg - ("json", "meta-info (JSON) file", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) #ifdef CF_DEVELOP_MODE ("gamma", "gamma for the BBHash MPHF", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::GAMMA))) #endif @@ -76,7 +74,6 @@ void build(int argc, char** argv) const auto mph_file = result["mph"].as(); const auto buckets_file = result["buckets"].as(); const auto save_vertices = result["save-vertices"].as(); - const auto json_file = result["json"].as(); #ifdef CF_DEVELOP_MODE const double gamma = result["gamma"].as(); #endif @@ -85,7 +82,7 @@ void build(int argc, char** argv) refs, lists, dirs, k, cutoff, kmer_database, edge_database, thread_count, max_memory, strict_memory, output_file, format, working_dir, - remove_kmc_db, mph_file, buckets_file, save_vertices, json_file + remove_kmc_db, mph_file, buckets_file, save_vertices #ifdef CF_DEVELOP_MODE , gamma #endif From fc079823c2d9c0001a21a25f411722f183bb74ef Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 4 Nov 2021 21:44:22 -0400 Subject: [PATCH 265/350] Bump min memory req --- include/Input_Defaults.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/Input_Defaults.hpp b/include/Input_Defaults.hpp index cf4ee5a9..2358b190 100644 --- a/include/Input_Defaults.hpp +++ b/include/Input_Defaults.hpp @@ -16,7 +16,7 @@ namespace cuttlefish constexpr uint16_t K = 25; // Set as per the KMC3 default. constexpr uint32_t CUTOFF_FREQ = 2; // Typical practice constexpr uint16_t THREAD_COUNT = 1; - constexpr std::size_t MAX_MEMORY = 2; // Set as per KMC3 library requirement. + constexpr std::size_t MAX_MEMORY = 3; // Set as per KMC3 stage 1 performance. 
#ifdef CF_DEVELOP_MODE constexpr double GAMMA = 0; #endif From debb08cc1a3d67b90d60f97e05a5dcd373b77f36 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 4 Nov 2021 22:18:32 -0400 Subject: [PATCH 266/350] Remove more redundant code --- src/Read_CdBG.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 9548a4cf..35b29451 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -114,12 +114,10 @@ void Read_CdBG::construct() extract_maximal_unitigs(); #ifdef CF_DEVELOP_MODE - if(params.vertex_db_path().empty()) + if(params.vertex_db_path().empty()) #endif - if(!params.save_vertices()) - Kmer_Container::remove(vertex_db_path()); - - hash_table->remove(params); + if(!params.save_vertices()) + Kmer_Container::remove(vertex_db_path()); std::chrono::high_resolution_clock::time_point t_extract = std::chrono::high_resolution_clock::now(); std::cout << "Extracted the maximal unitigs. Time taken = " << std::chrono::duration_cast>(t_extract - t_dfa).count() << " seconds.\n"; From 6a829c583dda2f69e149f346b33446d277b05cae Mon Sep 17 00:00:00 2001 From: jamshed Date: Sat, 6 Nov 2021 20:26:58 -0400 Subject: [PATCH 267/350] Add dir-name getter --- include/utility.hpp | 3 +++ src/utility.cpp | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/include/utility.hpp b/include/utility.hpp index 55d55c10..12633223 100644 --- a/include/utility.hpp +++ b/include/utility.hpp @@ -42,6 +42,9 @@ void clear_file(const std::string& file_path); // Returns the name of the file present at the path `file_path`. const std::string filename(const std::string& file_path); +// Returns the directory of the file present at the path `file_path`. +const std::string dirname(const std::string& file_path); + // Moves the file present at path `from_path` to the path `to_path`. void move_file(const std::string& from_path, const std::string& to_path); diff --git a/src/utility.cpp b/src/utility.cpp index 037b4075..71a43f70 100644 --- a/src/utility.cpp +++ b/src/utility.cpp @@ -105,6 +105,12 @@ const std::string filename(const std::string& file_path) } +const std::string dirname(const std::string& file_path) +{ + return ghc::filesystem::path(file_path).remove_filename().string(); +} + + void move_file(const std::string& from_path, const std::string& to_path) { ghc::filesystem::copy(from_path, to_path); From 6ae97a9923136bd58b54afb690f4269fdcfdcc56 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sat, 6 Nov 2021 20:49:28 -0400 Subject: [PATCH 268/350] Add data logistics manager --- include/Data_Logistics.hpp | 46 ++++++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 1 + src/Data_Logistics.cpp | 48 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+) create mode 100644 include/Data_Logistics.hpp create mode 100644 src/Data_Logistics.cpp diff --git a/include/Data_Logistics.hpp b/include/Data_Logistics.hpp new file mode 100644 index 00000000..fa259891 --- /dev/null +++ b/include/Data_Logistics.hpp @@ -0,0 +1,46 @@ + +#ifndef DATA_LOGISTICS_HPP +#define DATA_LOGISTICS_HPP + + + +#include "Build_Params.hpp" + +#include +#include + + +// ============================================================================= +// A class to govern the logistical policies regarding the various data used— +// either as input, output, or temporary—during the lifetime of Cuttlefish. +class Data_Logistics +{ +private: + + const Build_Params& params; // The construction parameters passed to Cuttlefish. 
+ + +public: + + // Constructs a logistics manager object for the parameters in `params`. + Data_Logistics(const Build_Params& build_params); + + // Returns the collection of file paths that are input to Cuttlefish. + const std::vector input_paths_collection() const; + + // Returns the path prefix for temporary files used by Cuttlefish. + const std::string working_dir_path() const; + + // Returns the path prefix to the edge database being used by Cuttlefish. + const std::string edge_db_path() const; + + // Returns the path prefix to the vertex database being used by Cuttlefish. + const std::string vertex_db_path() const; + + // Returns the path to the final output file by Cuttlefish. + const std::string output_file_path() const; +}; + + + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 42e29296..bb36d841 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -35,6 +35,7 @@ set(PROJECT_SRC Unitig_Scratch.cpp Maximal_Unitig_Scratch.cpp Unipaths_Meta_info.cpp + Data_Logistics.cpp dBG_Utilities.cpp Character_Buffer_Flusher.cpp Progress_Tracker.cpp diff --git a/src/Data_Logistics.cpp b/src/Data_Logistics.cpp new file mode 100644 index 00000000..87ef4a3f --- /dev/null +++ b/src/Data_Logistics.cpp @@ -0,0 +1,48 @@ + +#include "Data_Logistics.hpp" +#include "utility.hpp" + + +Data_Logistics::Data_Logistics(const Build_Params& build_params): + params(build_params) +{} + + +const std::vector Data_Logistics::input_paths_collection() const +{ + return params.sequence_input().seqs(); +} + + +const std::string Data_Logistics::working_dir_path() const +{ + return dirname(params.output_prefix()); +} + + +const std::string Data_Logistics::edge_db_path() const +{ +#ifdef CF_DEVELOP_MODE + if(!params.edge_db_path().empty()) + return params.edge_db_path(); +#endif + + return params.working_dir_path() + filename(params.output_prefix()) + cuttlefish::file_ext::edges_ext; +} + + +const std::string Data_Logistics::vertex_db_path() const +{ +#ifdef CF_DEVELOP_MODE + if(!params.vertex_db_path().empty()) + return params.vertex_db_path(); +#endif + + return params.working_dir_path() + filename(params.output_prefix()) + cuttlefish::file_ext::vertices_ext; +} + + +const std::string Data_Logistics::output_file_path() const +{ + return params.output_file_path(); +} From 72940c10351a9eafe818b6d14c33ed0eadab3ba3 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sat, 6 Nov 2021 21:15:03 -0400 Subject: [PATCH 269/350] Update data logistic policies --- include/Read_CdBG.hpp | 8 ++------ src/Read_CdBG.cpp | 48 ++++++++++++------------------------------- 2 files changed, 15 insertions(+), 41 deletions(-) diff --git a/include/Read_CdBG.hpp b/include/Read_CdBG.hpp index a7e4ec07..a9b09ec0 100644 --- a/include/Read_CdBG.hpp +++ b/include/Read_CdBG.hpp @@ -6,6 +6,7 @@ #include "globals.hpp" #include "Build_Params.hpp" +#include "Data_Logistics.hpp" #include "Kmer_Hash_Table.hpp" #include "dBG_Info.hpp" @@ -22,6 +23,7 @@ class Read_CdBG private: const Build_Params params; // Required parameters (wrapped inside). + const Data_Logistics logistics; // Data logistics manager for the algorithm execution. std::unique_ptr> hash_table; // Hash table for the vertices (canonical k-mers) of the graph. dBG_Info dbg_info; // Wrapper object for structural information of the graph. @@ -45,12 +47,6 @@ class Read_CdBG // Extracts the maximal unitigs from the graph. void extract_maximal_unitigs(); - // Returns the path prefix to the edge database being used by Cuttlefish. 
- const std::string edge_db_path() const; - - // Returns the path prefix to the vertex database being used by Cuttlefish. - const std::string vertex_db_path() const; - // Returns `true` iff the compacted de Bruijn graph to be built from the parameters // collection `params` had been constructed in an earlier execution. // NB: only the existence of the output meta-info file is checked for this purpose. diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 35b29451..89c8b3f9 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -13,6 +13,7 @@ template Read_CdBG::Read_CdBG(const Build_Params& params): params(params), + logistics(this->params), hash_table(nullptr), dbg_info(params.json_file_path()) {} @@ -104,7 +105,7 @@ void Read_CdBG::construct() #ifdef CF_DEVELOP_MODE if(params.edge_db_path().empty()) #endif - Kmer_Container::remove(edge_db_path()); + Kmer_Container::remove(logistics.edge_db_path()); std::chrono::high_resolution_clock::time_point t_dfa = std::chrono::high_resolution_clock::now(); std::cout << "Computed the states of the automata. Time taken = " << std::chrono::duration_cast>(t_dfa - t_mphf).count() << " seconds.\n"; @@ -117,7 +118,7 @@ void Read_CdBG::construct() if(params.vertex_db_path().empty()) #endif if(!params.save_vertices()) - Kmer_Container::remove(vertex_db_path()); + Kmer_Container::remove(logistics.vertex_db_path()); std::chrono::high_resolution_clock::time_point t_extract = std::chrono::high_resolution_clock::now(); std::cout << "Extracted the maximal unitigs. Time taken = " << std::chrono::duration_cast>(t_extract - t_dfa).count() << " seconds.\n"; @@ -133,9 +134,9 @@ template kmer_Enumeration_Stats Read_CdBG::enumerate_edges() const { return kmer_Enumerator().enumerate( - KMC::InputFileType::FASTQ, params.sequence_input().seqs(), params.cutoff(), + KMC::InputFileType::FASTQ, logistics.input_paths_collection(), params.cutoff(), params.thread_count(), params.max_memory(), params.strict_memory(), params.strict_memory(), - params.working_dir_path(), edge_db_path()); + logistics.working_dir_path(), logistics.edge_db_path()); } @@ -143,9 +144,9 @@ template kmer_Enumeration_Stats Read_CdBG::enumerate_vertices(const std::size_t max_memory) const { return kmer_Enumerator().enumerate( - KMC::InputFileType::KMC, std::vector(1, edge_db_path()), 1, + KMC::InputFileType::KMC, std::vector(1, logistics.edge_db_path()), 1, params.thread_count(), max_memory, params.strict_memory(), false, - params.working_dir_path(), vertex_db_path()); + logistics.working_dir_path(), logistics.vertex_db_path()); } @@ -154,7 +155,7 @@ void Read_CdBG::construct_hash_table(const uint64_t vertex_count, const bool { if(load) { - hash_table = std::make_unique>(vertex_db_path(), vertex_count); + hash_table = std::make_unique>(logistics.vertex_db_path(), vertex_count); hash_table->load(params); } else @@ -165,11 +166,11 @@ void Read_CdBG::construct_hash_table(const uint64_t vertex_count, const bool hash_table = #ifdef CF_DEVELOP_MODE - std::make_unique>(vertex_db_path(), vertex_count, max_memory, params.gamma()); + std::make_unique>(logistics.vertex_db_path(), vertex_count, max_memory, params.gamma()); #else - std::make_unique>(vertex_db_path(), vertex_count, max_memory); + std::make_unique>(logistics.vertex_db_path(), vertex_count, max_memory); #endif - hash_table->construct(params.thread_count(), params.working_dir_path(), params.mph_file_path()); + hash_table->construct(params.thread_count(), logistics.working_dir_path(), params.mph_file_path()); } } @@ -178,7 +179,7 @@ template void 
Read_CdBG::compute_DFA_states() { Read_CdBG_Constructor cdBg_constructor(params, *hash_table); - cdBg_constructor.compute_DFA_states(edge_db_path()); + cdBg_constructor.compute_DFA_states(logistics.edge_db_path()); dbg_info.add_basic_info(cdBg_constructor); } @@ -188,9 +189,8 @@ template void Read_CdBG::extract_maximal_unitigs() { Read_CdBG_Extractor cdBg_extractor(params, *hash_table); - const std::string output_file_path = params.output_file_path(); - cdBg_extractor.extract_maximal_unitigs(vertex_db_path(), output_file_path); + cdBg_extractor.extract_maximal_unitigs(logistics.vertex_db_path(), logistics.output_file_path()); dbg_info.add_unipaths_info(cdBg_extractor); } @@ -202,28 +202,6 @@ bool Read_CdBG::is_constructed() const } -template -const std::string Read_CdBG::edge_db_path() const -{ -#ifdef CF_DEVELOP_MODE - return params.edge_db_path().empty()? (params.output_prefix() + cuttlefish::file_ext::edges_ext) : params.edge_db_path(); -#endif - - return params.output_prefix() + cuttlefish::file_ext::edges_ext; -} - - -template -const std::string Read_CdBG::vertex_db_path() const -{ -#ifdef CF_DEVELOP_MODE - return params.vertex_db_path().empty() ? (params.working_dir_path() + filename(params.output_prefix()) + cuttlefish::file_ext::vertices_ext) : params.vertex_db_path(); -#endif - - return params.working_dir_path() + filename(params.output_prefix()) + cuttlefish::file_ext::vertices_ext; -} - - // Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Read_CdBG) From 6b54bd3732c71928c64aea44ac3f3ab1109d2dee Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 11 Nov 2021 20:31:18 -0500 Subject: [PATCH 270/350] Support reference (FASTA) input --- include/Build_Params.hpp | 13 +++++++++++-- src/Build_Params.cpp | 14 +++++++++++--- src/Read_CdBG.cpp | 3 ++- src/main.cpp | 6 ++++-- 4 files changed, 28 insertions(+), 8 deletions(-) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index 37d0245c..9ad8c870 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -19,7 +19,8 @@ class Build_Params { private: - const bool is_read_graph_; // Whether to build a compacted read or reference de Bruijn graph. + const bool is_read_graph_; // Whether to build a compacted read de Bruijn graph or not. + const bool is_ref_graph_; // Whether to build a compacted reference de Bruijn graph or not. const Seq_Input seq_input_; // Collection of the input sequences. const uint16_t k_; // The k parameter for the edge-centric de Bruijn graph to be compacted. const uint32_t cutoff_; // Frequency cutoff for the (k + 1)-mers (for short-read set input). @@ -44,6 +45,7 @@ class Build_Params // Constructs a parameters wrapper object with the self-explanatory parameters. Build_Params( const bool is_read_graph, + const bool is_ref_graph, const std::vector& seq_paths, const std::vector& list_paths, const std::vector& dir_paths, @@ -67,13 +69,20 @@ class Build_Params ); - // Returns the boolean flag to whether to build a compacted read or reference de Bruijn graph. + // Returns the boolean flag to whether to build a compacted read de Bruijn graph or not. bool is_read_graph() const { return is_read_graph_; } + // Returns the boolean flag to whether to build a compacted reference de Bruijn graph or not. + bool is_ref_graph() const + { + return is_ref_graph_; + } + + // Returns the sequence input collection. 
const Seq_Input& sequence_input() const { diff --git a/src/Build_Params.cpp b/src/Build_Params.cpp index 42713fca..8a6168f2 100644 --- a/src/Build_Params.cpp +++ b/src/Build_Params.cpp @@ -4,6 +4,7 @@ Build_Params::Build_Params( const bool is_read_graph, + const bool is_ref_graph, const std::vector& seq_paths, const std::vector& list_paths, const std::vector& dir_paths, @@ -26,6 +27,7 @@ Build_Params::Build_Params( const bool is_read_graph, #endif ): is_read_graph_(is_read_graph), + is_ref_graph_(is_ref_graph), seq_input_(seq_paths, list_paths, dir_paths), k_(k), cutoff_(cutoff), @@ -62,7 +64,13 @@ bool Build_Params::is_valid() const // Check if read and reference de Bruijn graph parameters are being mixed with. - if(is_read_graph_) // Is a read de Bruijn graph. + if(is_read_graph_ && is_ref_graph_) + { + std::cout << "Both read and reference de Bruijn graph specified. Please select only one, or none for Cuttlefish 1.0.\n"; + valid = false; + } + + if(is_read_graph_ || is_ref_graph_) // Is Cuttlefish 2.0. { if(output_format_ != cuttlefish::Output_Format::txt) { @@ -70,9 +78,9 @@ bool Build_Params::is_valid() const valid = false; } } - else // Is a reference de Bruijn graph. + else // Is Cuttlefish 1.0. { - if(!edge_db_path_.empty()) + if(!vertex_db_path_.empty()) { std::cout << "No edge (i.e. (k + 1)-mer) database is required for a compacted reference de Bruijn graph construction.\n"; valid = false; diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 89c8b3f9..721292c4 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -133,8 +133,9 @@ void Read_CdBG::construct() template kmer_Enumeration_Stats Read_CdBG::enumerate_edges() const { + const KMC::InputFileType ip_type = (params.is_read_graph() ? KMC::InputFileType::FASTQ : KMC::InputFileType::MULTILINE_FASTA); return kmer_Enumerator().enumerate( - KMC::InputFileType::FASTQ, logistics.input_paths_collection(), params.cutoff(), + ip_type, logistics.input_paths_collection(), params.cutoff(), params.thread_count(), params.max_memory(), params.strict_memory(), params.strict_memory(), logistics.working_dir_path(), logistics.edge_db_path()); } diff --git a/src/main.cpp b/src/main.cpp index d3c001ec..45dde994 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -24,6 +24,7 @@ void build(int argc, char** argv) options.add_options() // TODO: better indent the following wall of text ("read", "construct a compacted read de Bruijn graph") + ("ref", "construct a compacted reference de Bruijn graph") ("r,refs", "reference files", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) ("l,lists", "reference file lists", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) ("d,dirs", "reference file directories", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) @@ -57,6 +58,7 @@ void build(int argc, char** argv) } const auto is_read_graph = result["read"].as(); + const auto is_ref_graph = result["ref"].as(); const auto refs = result["refs"].as>(); const auto lists = result["lists"].as>(); const auto dirs = result["dirs"].as>(); @@ -78,7 +80,7 @@ void build(int argc, char** argv) const double gamma = result["gamma"].as(); #endif - const Build_Params params( is_read_graph, + const Build_Params params( is_read_graph, is_ref_graph, refs, lists, dirs, k, cutoff, kmer_database, edge_database, thread_count, max_memory, strict_memory, output_file, format, working_dir, @@ -100,7 +102,7 @@ void build(int argc, char** argv) std::cout << "\nConstructing the compacted " << dBg_type << " de Bruijn graph for k = " << k << ".\n"; - 
params.is_read_graph() ? + (params.is_read_graph() || params.is_ref_graph()) ? Application(params).execute() : Application(params).execute(); From e91ad48610816d1e0906fdf37064556cbd9c5ba8 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 14 Nov 2021 15:08:43 -0500 Subject: [PATCH 271/350] Add file-size checker --- include/utility.hpp | 5 +++++ src/utility.cpp | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/include/utility.hpp b/include/utility.hpp index 12633223..ac168a85 100644 --- a/include/utility.hpp +++ b/include/utility.hpp @@ -3,6 +3,7 @@ #define UTILITY_HPP +#include #include #include @@ -21,6 +22,10 @@ bool is_prefix(const std::string& s, const std::string& pref); // `file_path`. bool file_exists(const std::string& file_path); +// Returns the file size is bytes of the file at path `file_path`. Returns +// `0` in case the file does not exist. +std::size_t file_size(const std::string& file_path); + // Returns `true` iff there exists some file in the file system path // `path` with its name being prefixed by `prefix`. bool file_prefix_exists(const std::string& path, const std::string& prefix); diff --git a/src/utility.cpp b/src/utility.cpp index 71a43f70..4bad50e1 100644 --- a/src/utility.cpp +++ b/src/utility.cpp @@ -10,6 +10,7 @@ #include #include #include +#include std::string get_random_string(const size_t len, const char* const alphabet) @@ -46,6 +47,14 @@ bool file_exists(const std::string& file_path) } +std::size_t file_size(const std::string& file_path) +{ + std::error_code ec; + const uintmax_t size = ghc::filesystem::file_size(file_path, ec); + return ec ? 0 : static_cast(size); +} + + bool file_prefix_exists(const std::string& path, const std::string& prefix) { for(const auto& entry: ghc::filesystem::directory_iterator(path)) From 3790d7b579dc282300af9b1c85ae32e26e0f92e0 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 14 Nov 2021 15:13:50 -0500 Subject: [PATCH 272/350] Add k-mer database size checker --- include/Kmer_Container.hpp | 3 +++ src/Kmer_Container.cpp | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/include/Kmer_Container.hpp b/include/Kmer_Container.hpp index ae44cc60..16f9fcdc 100644 --- a/include/Kmer_Container.hpp +++ b/include/Kmer_Container.hpp @@ -52,6 +52,9 @@ class Kmer_Container // Removes the KMC database at path `kmc_db_prefix` from disk. static void remove(const std::string& kmc_db_prefix); + // Returns the size in bytes of the k-mer database with path prefix `kmc_db_prefix`. + static std::size_t database_size(const std::string& kmc_db_prefix); + // Returns an iterator pointing to the beginning of the underlying k-mer // database. // buf_iterator buf_begin() const; diff --git a/src/Kmer_Container.cpp b/src/Kmer_Container.cpp index 796e2aa7..fecf7927 100644 --- a/src/Kmer_Container.cpp +++ b/src/Kmer_Container.cpp @@ -84,6 +84,26 @@ void Kmer_Container::remove(const std::string& kmc_db_path) } +template +std::size_t Kmer_Container::database_size(const std::string& kmc_db_prefix) +{ + const std::string kmc_pref_file(kmc_db_prefix + ".kmc_pre"); + const std::string kmc_suff_file(kmc_db_prefix + ".kmc_suf"); + + const std::size_t pref_sz = file_size(kmc_pref_file); + const std::size_t suff_sz = file_size(kmc_suff_file); + + if(!pref_sz || !suff_sz) + { + std::cerr << "Error computing size of KMC database at " << kmc_db_prefix << ". Possibly missing file(s). 
Aborting.\n"; + std::exit(EXIT_FAILURE); + } + + + return pref_sz + suff_sz; +} + + // template // typename Kmer_Container::iterator Kmer_Container::end() const // { From 76176c1e94a8490477c7035322946f4287786120 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 14 Nov 2021 17:05:53 -0500 Subject: [PATCH 273/350] Track output db size --- include/kmer_Enumeration_Stats.hpp | 9 +++++++-- src/kmer_Enumeration_Stats.cpp | 12 ++++++++++-- src/kmer_Enumerator.cpp | 5 +++-- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/include/kmer_Enumeration_Stats.hpp b/include/kmer_Enumeration_Stats.hpp index 6bf17b8d..1a978ed8 100644 --- a/include/kmer_Enumeration_Stats.hpp +++ b/include/kmer_Enumeration_Stats.hpp @@ -19,14 +19,16 @@ class kmer_Enumeration_Stats const KMC::Stage1Results stage1_results; // Results stats of KMC stage 1 execution. const KMC::Stage2Results stage2_results; // Results stats of KMC stage 2 execution. const std::size_t max_memory_; // Maximum memory usage allowed for the KMC executions. + const std::size_t db_size_; // Size of the output KMC database size in bytes. public: // Constructs a a k-mer enumeration stats wrapper object for a KMC execution with // first stage results in `stage1_results`, second stage results in `stage2_results`, - // and maximum allowed memory usage to be `max_memory` (in GB). - kmer_Enumeration_Stats(const KMC::Stage1Results& stage1_results, const KMC::Stage2Results& stage2_results, const std::size_t max_memory); + // maximum allowed memory usage to be `max_memory` (in GB), and output database size + // of `db_size`. + kmer_Enumeration_Stats(const KMC::Stage1Results& stage1_results, const KMC::Stage2Results& stage2_results, std::size_t max_memory, std::size_t db_size); // Returns the number of sequences in the execution input. uint64_t seq_count() const; @@ -59,6 +61,9 @@ class kmer_Enumeration_Stats // Returns the temporary disk usage (in bytes) used by the execution. std::size_t temp_disk_usage() const; + // Returns the size of the output KMC database size in bytes. + std::size_t db_size() const; + // Logs a summary statistics of the execution. 
void log_stats() const; }; diff --git a/src/kmer_Enumeration_Stats.cpp b/src/kmer_Enumeration_Stats.cpp index 2d4dcdb7..9c9baf59 100644 --- a/src/kmer_Enumeration_Stats.cpp +++ b/src/kmer_Enumeration_Stats.cpp @@ -6,10 +6,11 @@ template -kmer_Enumeration_Stats::kmer_Enumeration_Stats(const KMC::Stage1Results& stage1_results, const KMC::Stage2Results& stage2_results, const std::size_t max_memory): +kmer_Enumeration_Stats::kmer_Enumeration_Stats(const KMC::Stage1Results& stage1_results, const KMC::Stage2Results& stage2_results, const std::size_t max_memory, const std::size_t db_size): stage1_results(stage1_results), stage2_results(stage2_results), - max_memory_(max_memory) + max_memory_(max_memory), + db_size_(db_size) {} @@ -76,6 +77,13 @@ std::size_t kmer_Enumeration_Stats::temp_disk_usage() const } +template +std::size_t kmer_Enumeration_Stats::db_size() const +{ + return db_size_; +} + + template void kmer_Enumeration_Stats::log_stats() const { diff --git a/src/kmer_Enumerator.cpp b/src/kmer_Enumerator.cpp index 4c8d5006..49f701bc 100644 --- a/src/kmer_Enumerator.cpp +++ b/src/kmer_Enumerator.cpp @@ -1,5 +1,6 @@ #include "kmer_Enumerator.hpp" +#include "Kmer_Container.hpp" template @@ -48,9 +49,9 @@ kmer_Enumeration_Stats kmer_Enumerator::enumerate( stage2_params.SetMaxRamGB(memory); stage2_results = kmc.RunStage2(stage2_params); + const std::size_t db_size = Kmer_Container::database_size(output_db_path); - - return kmer_Enumeration_Stats(stage1_results, stage2_results, memory); + return kmer_Enumeration_Stats(stage1_results, stage2_results, memory, db_size); } From ab8470ceb1ba503a407262a2122c0eda273d1ff6 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 14 Nov 2021 17:30:44 -0500 Subject: [PATCH 274/350] Better track max disk usage --- include/Read_CdBG.hpp | 5 +++++ src/Read_CdBG.cpp | 15 +++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/include/Read_CdBG.hpp b/include/Read_CdBG.hpp index a9b09ec0..7f178c13 100644 --- a/include/Read_CdBG.hpp +++ b/include/Read_CdBG.hpp @@ -52,6 +52,11 @@ class Read_CdBG // NB: only the existence of the output meta-info file is checked for this purpose. bool is_constructed() const; + // Returns the maximum temporary disk-usage incurred by some execution of the algorithm, + // that has its edges-enumeration stats in `edge_stats` and vertices-enumeration stats + // in `vertex_stats`. + static std::size_t max_disk_usage(const kmer_Enumeration_Stats& edge_stats, const kmer_Enumeration_Stats& vertex_stats); + public: diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 721292c4..ce3972e0 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -124,8 +124,8 @@ void Read_CdBG::construct() std::cout << "Extracted the maximal unitigs. 
Time taken = " << std::chrono::duration_cast>(t_extract - t_dfa).count() << " seconds.\n"; #ifndef CF_DEVELOP_MODE - const double max_disk_usage = std::max(edge_stats.temp_disk_usage(), vertex_stats.temp_disk_usage()) / (1024.0 * 1024.0 * 1024.0); - std::cout << "\nMaximum temporary disk-usage: " << max_disk_usage << "GB.\n"; + const double max_disk = static_cast(max_disk_usage(edge_stats, vertex_stats)) / (1024.0 * 1024.0 * 1024.0); + std::cout << "\nMaximum temporary disk-usage: " << max_disk << "GB.\n"; #endif } @@ -203,6 +203,17 @@ bool Read_CdBG::is_constructed() const } +template +std::size_t Read_CdBG::max_disk_usage(const kmer_Enumeration_Stats& edge_stats, const kmer_Enumeration_Stats& vertex_stats) +{ + const std::size_t at_edge_enum = std::max(edge_stats.temp_disk_usage(), edge_stats.db_size()); + const std::size_t at_vertex_enum = edge_stats.db_size() + std::max(vertex_stats.temp_disk_usage(), vertex_stats.db_size()); + + const std::size_t max_disk = std::max(at_edge_enum, at_vertex_enum); + return max_disk; +} + + // Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, Read_CdBG) From da3009d5847309577504c1e7a150361e5df95337 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 14 Nov 2021 18:12:12 -0500 Subject: [PATCH 275/350] Expose gamma-setting out of developer mode --- include/Kmer_Hash_Table.hpp | 2 -- src/Kmer_Hash_Table.cpp | 2 -- 2 files changed, 4 deletions(-) diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index b6fc8dca..a61a1ec7 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -106,14 +106,12 @@ class Kmer_Hash_Table // The hash table may use at most `max_memory` bytes of memory. Kmer_Hash_Table(const std::string& kmc_db_path, uint64_t kmer_count, std::size_t max_memory); -#ifdef CF_DEVELOP_MODE // Constructs a k-mer hash table where the table is to be built over the k-mer // database having path prefix `kmer_db_path` and `kmer_count` distinct k-mers. // The gamma factor of the BBHash MPHF of the table is attempted to be set to // `gamma`, if it is non-zero. Otherwise, it is set such that the the hash // table may use at most `max_memory` bytes of memory. 
Kmer_Hash_Table(const std::string& kmc_db_path, uint64_t kmer_count, std::size_t max_memory, double gamma); -#endif // Constructs a minimal perfect hash function (specifically, the BBHash) for // the collection of k-mers present at the KMC database at path `kmc_db_path`, diff --git a/src/Kmer_Hash_Table.cpp b/src/Kmer_Hash_Table.cpp index 34775fdd..c169a4e1 100644 --- a/src/Kmer_Hash_Table.cpp +++ b/src/Kmer_Hash_Table.cpp @@ -39,7 +39,6 @@ Kmer_Hash_Table::Kmer_Hash_Table(const std::string& kmc_db_path } -#ifdef CF_DEVELOP_MODE template Kmer_Hash_Table::Kmer_Hash_Table(const std::string& kmc_db_path, const uint64_t kmer_count, const std::size_t max_memory, const double gamma): Kmer_Hash_Table(kmc_db_path, kmer_count) { @@ -48,7 +47,6 @@ Kmer_Hash_Table::Kmer_Hash_Table(const std::string& kmc_db_path else set_gamma(max_memory); } -#endif template From ce7ab2497f4206b1de216ca0d1ab89632200fd30 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 14 Nov 2021 18:16:07 -0500 Subject: [PATCH 276/350] Enlarge hash table in unrestricted-memory mode --- include/Kmer_Hash_Table.hpp | 3 +++ include/Read_CdBG.hpp | 1 + src/Kmer_Hash_Table.cpp | 6 ++++-- src/Read_CdBG.cpp | 5 ++++- 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index a61a1ec7..00e3bd5f 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -28,6 +28,9 @@ class Kmer_Hash_Table // The minimum gamma-value that we require for BBHash. static constexpr double gamma_min = 2.0; + // The maximum gamma-value that we may use with BBHash. + static constexpr double gamma_max = 10.0; + // The minimum bits per hash key we require for BBHash. static constexpr double min_bits_per_hash_key = 3.71; diff --git a/include/Read_CdBG.hpp b/include/Read_CdBG.hpp index 7f178c13..d4ca007b 100644 --- a/include/Read_CdBG.hpp +++ b/include/Read_CdBG.hpp @@ -8,6 +8,7 @@ #include "Build_Params.hpp" #include "Data_Logistics.hpp" #include "Kmer_Hash_Table.hpp" +#include "kmer_Enumeration_Stats.hpp" #include "dBG_Info.hpp" #include diff --git a/src/Kmer_Hash_Table.cpp b/src/Kmer_Hash_Table.cpp index c169a4e1..f2133a01 100644 --- a/src/Kmer_Hash_Table.cpp +++ b/src/Kmer_Hash_Table.cpp @@ -13,6 +13,7 @@ template constexpr double Kmer_Hash_Table::gamma_min; +template constexpr double Kmer_Hash_Table::gamma_max; template constexpr double Kmer_Hash_Table::min_bits_per_hash_key; template constexpr double Kmer_Hash_Table::bits_per_gamma[]; template constexpr double Kmer_Hash_Table::gamma_resolution; @@ -40,10 +41,11 @@ Kmer_Hash_Table::Kmer_Hash_Table(const std::string& kmc_db_path template -Kmer_Hash_Table::Kmer_Hash_Table(const std::string& kmc_db_path, const uint64_t kmer_count, const std::size_t max_memory, const double gamma): Kmer_Hash_Table(kmc_db_path, kmer_count) +Kmer_Hash_Table::Kmer_Hash_Table(const std::string& kmc_db_path, const uint64_t kmer_count, const std::size_t max_memory, const double gamma): + Kmer_Hash_Table(kmc_db_path, kmer_count) { if(gamma > 0) - this->gamma = gamma; + this->gamma = std::min(std::max(gamma, gamma_min), gamma_max); else set_gamma(max_memory); } diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index ce3972e0..b78829ec 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -7,6 +7,7 @@ #include "utility.hpp" #include "kmc_runner.h" +#include #include @@ -169,7 +170,9 @@ void Read_CdBG::construct_hash_table(const uint64_t vertex_count, const bool #ifdef CF_DEVELOP_MODE std::make_unique>(logistics.vertex_db_path(), vertex_count, max_memory, 
params.gamma()); #else - std::make_unique>(logistics.vertex_db_path(), vertex_count, max_memory); + (params.strict_memory() ? + std::make_unique>(logistics.vertex_db_path(), vertex_count, max_memory) : + std::make_unique>(logistics.vertex_db_path(), vertex_count, max_memory, std::numeric_limits::max())); #endif hash_table->construct(params.thread_count(), logistics.working_dir_path(), params.mph_file_path()); } From a041ee5eaa7f95bfdd575ad71e7d3805ddb16680 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 14 Nov 2021 18:25:47 -0500 Subject: [PATCH 277/350] Fix o/p extensions for ref graphs --- include/Build_Params.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index 9ad8c870..c5613546 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -149,7 +149,7 @@ class Build_Params // Returns the path to the output file. const std::string output_file_path() const { - return is_read_graph() ? (output_file_path_ + cuttlefish::file_ext::unipaths_ext) : output_file_path_; + return (is_read_graph() || is_ref_graph()) ? (output_file_path_ + cuttlefish::file_ext::unipaths_ext) : output_file_path_; } @@ -177,14 +177,14 @@ class Build_Params // Returns the path to the optional MPH file. const std::string mph_file_path() const { - return is_read_graph() ? (output_file_path_ + cuttlefish::file_ext::hash_ext) : mph_file_path_; + return (is_read_graph() || is_ref_graph()) ? (output_file_path_ + cuttlefish::file_ext::hash_ext) : mph_file_path_; } // Returns the path to the optional file storing the hash table buckets. const std::string buckets_file_path() const { - return is_read_graph() ? (output_file_path_ + cuttlefish::file_ext::buckets_ext) : buckets_file_path_; + return (is_read_graph() || is_ref_graph()) ? (output_file_path_ + cuttlefish::file_ext::buckets_ext) : buckets_file_path_; } From 0a049a5a9c71f284a8f549a70ade0c655497bf5f Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 22 Nov 2021 22:21:56 -0500 Subject: [PATCH 278/350] Fix path cover terminology --- include/Build_Params.hpp | 10 +++++----- include/Read_CdBG_Constructor.hpp | 16 ++++++++-------- src/Build_Params.cpp | 2 ++ src/Read_CdBG_Constructor.cpp | 8 ++++---- src/main.cpp | 6 +++--- 5 files changed, 22 insertions(+), 20 deletions(-) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index 9b21df01..5fe104d4 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -32,7 +32,7 @@ class Build_Params const std::string output_file_path_; // Path to the output file. const cuttlefish::Output_Format output_format_; // Output format (0: txt, 1: GFAv1, 2: GFAv2). const std::string working_dir_path_; // Path to the working directory (for temporary files). - const bool simplitigs_; // Whether to extract a set of maximal simplitigs, i.e. vertex-disjoint paths. + const bool path_cover_; // Whether to extract a maximal path cover of the de Bruijn graph. const bool remove_kmc_db_; // Option to remove the KMC database, once no longer required. const std::string mph_file_path_; // Optional path to file storing an MPH over the k-mer set. const std::string buckets_file_path_; // Optional path to file storing the hash table buckets for the k-mer set. 
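As a rough illustration of the `path_cover_` policy introduced in this patch — a maximal path cover admits an edge `{u, v}` only when the incident sides of `u` and `v` have no cover edge yet, so every vertex keeps at most one cover edge per side (see the `Read_CdBG_Constructor` comments further below) — the admission rule can be sketched as follows. This is a simplified, map-based stand-in for exposition only; `Side_State` and `add_cover_edge` are hypothetical names, not the lock-free hash-table machinery Cuttlefish actually uses:

```cpp
#include <cstdint>
#include <unordered_map>

// Per-vertex bookkeeping: whether each side already lies on a cover path.
struct Side_State { bool front_used = false; bool back_used = false; };

typedef uint64_t Vertex_ID;

// Tries to admit the edge joining side (`u`, `u_front`) to side (`v`, `v_front`)
// into the path cover; returns whether the edge was taken.
bool add_cover_edge(std::unordered_map<Vertex_ID, Side_State>& state,
                    const Vertex_ID u, const bool u_front,
                    const Vertex_ID v, const bool v_front)
{
    Side_State& s_u = state[u];
    Side_State& s_v = state[v];
    bool& u_side = (u_front ? s_u.front_used : s_u.back_used);
    bool& v_side = (v_front ? s_v.front_used : s_v.back_used);

    if(u_side || v_side)    // Some incident side already lies on a cover path.
        return false;

    u_side = v_side = true; // Occupy both sides: the edge joins two paths into one.
    return true;
}
```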
@@ -60,7 +60,7 @@ class Build_Params const std::string& output_file_path, const uint8_t output_format, const std::string& working_dir_path, - const bool simplitig, + const bool path_cover, const bool remove_kmc_db, const std::string& mph_file_path, const std::string& buckets_file_path, @@ -169,10 +169,10 @@ class Build_Params } - // Returns whether to extract a set of maximal simplitigs, i.e. vertex-disjoint paths. - bool simplitigs() const + // Returns whether to extract a maximal path cover of the de Bruijn graph. + bool path_cover() const { - return simplitigs_; + return path_cover_; } diff --git a/include/Read_CdBG_Constructor.hpp b/include/Read_CdBG_Constructor.hpp index d4d0a8d3..e777348b 100644 --- a/include/Read_CdBG_Constructor.hpp +++ b/include/Read_CdBG_Constructor.hpp @@ -42,7 +42,7 @@ class Read_CdBG_Constructor void distribute_states_computation(Kmer_SPMC_Iterator* edge_parser, Thread_Pool& thread_pool); // Processes the edges provided to the thread with id `thread_id` from the parser `edge_parser`, - // based on the end-purpose of extracting either the maximal unitigs or the simplitigs. + // based on the end-purpose of extracting either the maximal unitigs or a maximal path cover. void process_edges(Kmer_SPMC_Iterator* edge_parser, uint16_t thread_id); // Processes the edges provided to the thread with id `thread_id` from the parser `edge_parser`, @@ -52,8 +52,8 @@ class Read_CdBG_Constructor // Processes the edges provided to the thread with id `thread_id` from the parser `edge_parser`, // i.e. makes state-transitions for the DFA of the vertices `u` and `v` for each bidirected edge - // `(u, v)` provided to that thread, to construct a set of maximal simplitigs covering the dBG. - void process_simplitig_edges(Kmer_SPMC_Iterator* edge_parser, uint16_t thread_id); + // `(u, v)` provided to that thread, to construct a maximal path cover of the dBG. + void process_path_cover_edges(Kmer_SPMC_Iterator* edge_parser, uint16_t thread_id); // Adds the information of an incident edge `e` to the side `s` of some vertex `v`, all wrapped // inside the edge-endpoint object `endpoint` — making the appropriate state transitions for the @@ -73,10 +73,10 @@ class Read_CdBG_Constructor // Adds the information of the edge `e = {u, v}` to its endpoint vertices `u` and `v` iff this // edge connects sides of `u` and `v` that do not have any edges added yet, which ensures that - // neither of the vertices belong to two different simplitig paths; and makes the appropriate - // state transitions for the DFAs of `u` and `v`. Returns `false` iff the edge could not be - // added as a simplitig edge. - bool add_simplitig_edge(const Edge& e); + // neither of the vertices belong to two different paths in a path cover of the graph; and makes + // the appropriate state transitions for the DFAs of `u` and `v`. Returns `false` iff the edge + // could not be added as such. + bool add_path_cover_edge(const Edge& e); public: @@ -214,7 +214,7 @@ inline bool Read_CdBG_Constructor::add_one_sided_loop(const Endpoint& endp template -bool Read_CdBG_Constructor::add_simplitig_edge(const Edge& e) +bool Read_CdBG_Constructor::add_path_cover_edge(const Edge& e) { // Fetch the hash table entry for the vertices associated to the endpoints. 
diff --git a/src/Build_Params.cpp b/src/Build_Params.cpp index 8a6168f2..5afd11ea 100644 --- a/src/Build_Params.cpp +++ b/src/Build_Params.cpp @@ -18,6 +18,7 @@ Build_Params::Build_Params( const bool is_read_graph, const std::string& output_file_path, const uint8_t output_format, const std::string& working_dir_path, + const bool path_cover, const bool remove_kmc_db, const std::string& mph_file_path, const std::string& buckets_file_path, @@ -39,6 +40,7 @@ Build_Params::Build_Params( const bool is_read_graph, output_file_path_(output_file_path), output_format_(cuttlefish::Output_Format(output_format)), working_dir_path_(working_dir_path.back() == '/' ? working_dir_path : working_dir_path + "/"), + path_cover_(path_cover), remove_kmc_db_(remove_kmc_db), mph_file_path_(mph_file_path), buckets_file_path_(buckets_file_path), diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index 8000e7d6..e31c3c20 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -80,8 +80,8 @@ void Read_CdBG_Constructor::distribute_states_computation(Kmer_SPMC_Iterator< template void Read_CdBG_Constructor::process_edges(Kmer_SPMC_Iterator* const edge_parser, const uint16_t thread_id) { - if(params.simplitigs()) - process_simplitig_edges(edge_parser, thread_id); + if(params.path_cover()) + process_path_cover_edges(edge_parser, thread_id); else process_cdbg_edges(edge_parser, thread_id); } @@ -156,7 +156,7 @@ void Read_CdBG_Constructor::process_cdbg_edges(Kmer_SPMC_Iterator* con template -void Read_CdBG_Constructor::process_simplitig_edges(Kmer_SPMC_Iterator* const edge_parser, const uint16_t thread_id) +void Read_CdBG_Constructor::process_path_cover_edges(Kmer_SPMC_Iterator* const edge_parser, const uint16_t thread_id) { Edge e; // For the edges to be processed one-by-one; say this is between the vertices `u` and `v`. @@ -172,7 +172,7 @@ void Read_CdBG_Constructor::process_simplitig_edges(Kmer_SPMC_Iterator if(e.is_loop()) continue; else // It connects two endpoints `u` and `v` of two distinct vertex. - add_simplitig_edge(e); + add_path_cover_edge(e); edge_count++; if(progress_tracker.track_work(++progress)) diff --git a/src/main.cpp b/src/main.cpp index 4f6a69c6..e69c98d4 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -38,7 +38,7 @@ void build(int argc, char** argv) ("o,output", "output file", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("f,format", "output format (0: txt, 1: GFA 1.0, 2: GFA 2.0, 3: GFA-reduced)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::OP_FORMAT))) ("w,work-dir", "working directory", cxxopts::value()->default_value(cuttlefish::_default::WORK_DIR)) - ("simplitigs", "extract a set of maximal simplitigs, i.e. 
vertex-disjoint paths") + ("path-cover", "extract a maximal path cover of the de Bruijn graph") ("rm", "remove the KMC database") // TODO: repurpose the following two options ("mph", "minimal perfect hash (BBHash) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) @@ -74,7 +74,7 @@ void build(int argc, char** argv) const auto format = result["format"].as(); const auto remove_kmc_db = result["rm"].as(); const auto working_dir = result["work-dir"].as(); - const auto simplitigs = result["simplitigs"].as(); + const auto path_cover = result["path-cover"].as(); const auto mph_file = result["mph"].as(); const auto buckets_file = result["buckets"].as(); const auto save_vertices = result["save-vertices"].as(); @@ -86,7 +86,7 @@ void build(int argc, char** argv) refs, lists, dirs, k, cutoff, kmer_database, edge_database, thread_count, max_memory, strict_memory, output_file, format, working_dir, - simplitigs, + path_cover, remove_kmc_db, mph_file, buckets_file, save_vertices #ifdef CF_DEVELOP_MODE , gamma From f64cd46c8b448b3ed1608b2e8abd321fa75b3809 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 15 Dec 2021 22:18:45 -0500 Subject: [PATCH 279/350] Group CLI options --- src/main.cpp | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index e69c98d4..7eb3aaba 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -20,34 +20,43 @@ // Driver function for the CdBG build. void build(int argc, char** argv) { - cxxopts::Options options("cuttlefish build", "Efficiently construct the compacted de Bruijn graph from references or reads"); - options.add_options() - // TODO: better indent the following wall of text + cxxopts::Options options("cuttlefish build", "Efficiently construct the compacted de Bruijn graph from sequencing reads or reference sequences"); + + options.add_options("common") + ("r,refs", "input files", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) + ("l,lists", "input file lists", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) + ("d,dirs", "input file directories", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) + ("k,kmer-len", "k-mer length", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::K))) + ("t,threads", "number of threads to use", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::THREAD_COUNT))) + ("o,output", "output file", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) + ("w,work-dir", "working directory", cxxopts::value()->default_value(cuttlefish::_default::WORK_DIR)) + ("h,help", "print usage"); + + options.add_options("cuttlefish 2.0") ("read", "construct a compacted read de Bruijn graph") ("ref", "construct a compacted reference de Bruijn graph") - ("r,refs", "reference files", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) - ("l,lists", "reference file lists", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) - ("d,dirs", "reference file directories", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) - ("k,kmer-len", "k-mer length", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::K))) ("c,cutoff", "frequency cutoff for (k + 1)-mers (inapplicable for references)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::CUTOFF_FREQ))) - ("s,kmc-db", "set of vertices, i.e. 
k-mers (KMC database) prefix", cxxopts::value()->default_value(cuttlefish::_default::WORK_DIR)) - ("e,edge-db", "set of edges, i.e. (k + 1)-mers (KMC database) prefix", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) - ("t,threads", "number of threads to use", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::THREAD_COUNT))) ("m,max-memory", "soft maximum memory limit (in GB)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::MAX_MEMORY))) ("unrestrict-memory", "do not impose memory usage restriction") - ("o,output", "output file", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) + ("path-cover", "extract a maximal path cover of the de Bruijn graph"); + + options.add_options("cuttlefish 1.0") + ("s,kmc-db", "set of vertices, i.e. k-mers (KMC database) prefix", cxxopts::value()->default_value(cuttlefish::_default::WORK_DIR)) ("f,format", "output format (0: txt, 1: GFA 1.0, 2: GFA 2.0, 3: GFA-reduced)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::OP_FORMAT))) - ("w,work-dir", "working directory", cxxopts::value()->default_value(cuttlefish::_default::WORK_DIR)) - ("path-cover", "extract a maximal path cover of the de Bruijn graph") - ("rm", "remove the KMC database") + ("rm", "remove the KMC database"); + + options.add_options("specialized") // TODO: repurpose the following two options ("mph", "minimal perfect hash (BBHash) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("buckets", "hash table buckets (cuttlefish) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) - ("save-vertices", "save the vertex set of the graph") + ("save-vertices", "save the vertex set of the graph"); + + options.add_options("debug") + ("e,edge-db", "set of edges, i.e. (k + 1)-mers (KMC database) prefix", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) #ifdef CF_DEVELOP_MODE ("gamma", "gamma for the BBHash MPHF", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::GAMMA))) #endif - ("h,help", "print usage"); + ; try { From 46bfc240b4cdbe20fcf6e2fb4a4298f3b19c4085 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 16 Dec 2021 00:18:17 -0500 Subject: [PATCH 280/350] Update (draft) manual --- README.md | 164 +++++++++++++++++++++++++++++++++------------------ src/main.cpp | 2 +- 2 files changed, 109 insertions(+), 57 deletions(-) diff --git a/README.md b/README.md index 996a3334..a8b84158 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/cuttlefish/README.html) -Cuttlefish is a fast, parallel, and very lightweight memory tool to construct the compacted de Bruijn graph from genome reference(s). +Cuttlefish is a fast, parallel, and very lightweight memory tool to construct the compacted de Bruijn graph from raw sequencing reads or reference sequences. ## Table of contents @@ -24,55 +24,54 @@ Cuttlefish is a fast, parallel, and very lightweight memory tool to construct th ## Overview -The construction of the compacted de Bruijn graph from a large collection of reference genomes is a task of increasing interest in genomic analyses. For example, compacted colored reference de Bruijn graphs are increasingly used as sequence indices for the purposes of alignment of short and long reads. 
Also, as we sequence and assemble a greater diversity of individual genomes, the compacted colored de Bruijn graph can be used as the basis for methods aiming to perform comparative genomic analyses on these genomes. While algorithms have been developed to construct the compacted colored de Bruijn graph from reference sequences, there is still room for improvement, especially in the memory and the runtime performance as the number and the scale of the genomes over which the de Bruijn graph is built grow. +Cuttlefish is a program to produce the compacted de Bruijn graph from sequencing reads or reference sequences. -We introduce a new algorithm, implemented in the tool Cuttlefish, to construct the (colored) compacted de Bruijn graph from a collection of one or more genome references. Cuttlefish introduces a novel modeling scheme of the de Bruijn graph vertices as finite-state automata, and constrains the state-space for the automata to enable tracking of their transitioning states with very low memory usage. Cuttlefish is also fast and highly parallelizable. Experimental results demonstrate that the algorithm scales much better than existing approaches, especially as the number and scale of the input references grow. - -A pre-print of the manuscript is available in [bioRxiv](https://doi.org/10.1101/2020.10.21.349605). +The paper describing the earlier version is available [here](https://academic.oup.com/bioinformatics/article/37/Supplement_1/i177/6319696). ## Dependencies To install Cuttlefish, the following are required: -- [GCC](https://gcc.gnu.org/) compilers for C++14 and C11; -- [CMake](https://cmake.org/) (version >= 3.14); -- [zlib](https://zlib.net/). +- [GCC](https://gcc.gnu.org/) compilers for C++14 and C11 +- [CMake](https://cmake.org/) (version >= 3.14) +- [zlib](https://zlib.net/) +- [bzip2](https://www.sourceware.org/bzip2/) These should already be available in your platform; and if not, then these can be easily installed from their sources. Besides, these should also be available via some package manager for your operating system: - **Linux** ```bash - sudo apt-get install build-essential cmake zlib1g-dev + sudo apt-get install build-essential cmake zlib1g-dev libbz2-dev ``` - **MacOS** ```bash brew install --with-toolchain llvm - brew install cmake zlib + brew install cmake zlib bzip2 ``` -Cuttlefish also makes use of [KMC3](https://github.com/refresh-bio/KMC), which is a disk-based _k_-mer counting tool. To install KMC3, you may use the following: +Cuttlefish also makes use of [KMC3](https://github.com/refresh-bio/KMC) tool. If you are installing Cuttlefish from source, then it will be automatically installed. To use with Cuttlefish 1 (installed using `conda`), you may use the following to install KMC3: + - From [Bioconda](https://bioconda.github.io/user/install.html): + ```bash conda install -c bioconda kmc ``` -- From source: - ```bash - git clone https://github.com/refresh-bio/KMC.git - cd KMC && make - ``` ## Installation -- From [Bioconda](https://bioconda.github.io/user/install.html): +- From [Bioconda](https://bioconda.github.io/user/install.html), only for Cuttlefish 1 for now: + ```bash conda install -c bioconda cuttlefish ``` + The Conda package supports _k_ values up-to 127. To use larger _k_ values, please install Cuttlefish from the source. 
-- From source: +- From source (works for both Cuttlefish 1 and 2): + ```bash git clone https://github.com/COMBINE-lab/cuttlefish.git cd cuttlefish && mkdir build && cd build @@ -80,6 +79,7 @@ Cuttlefish also makes use of [KMC3](https://github.com/refresh-bio/KMC), which i make -j 8 install cd .. ``` + You may replace `8` in `make -j 8` with the preferred count for threads to use in the installation process. This compilation process installs Cuttlefish in a sub-directory named `bin`, inside the project root directory. To specify a different installation directory, its path may be passed as the value of `-DCMAKE_INSTALL_PREFIX` with the `cmake` command, i.e. you may use `cmake -DCMAKE_INSTALL_PREFIX=custom_path/ ..` . Then the installed Cuttlefish executable will be found in `custom_path/bin/`. Skipping `-DCMAKE_INSTALL_PREFIX` entirely will install Cuttlefish in `/usr/local/bin`, for which `sudo` access might be required (i.e. `sudo make -j 8 install`). @@ -88,6 +88,37 @@ Cuttlefish also makes use of [KMC3](https://github.com/refresh-bio/KMC), which i ## Usage +### Cuttlefish 2 + +To construct the compacted de Bruijn graph, use Cuttlefish as following (from the project root directory): + +```bash + ./bin/cuttlefish build -k -c -t -o -w +``` + +The arguments are set as following: + +- The `` argument should be either `--read` or `--ref`, depending on whether you are passing sequencing reads or reference sequences as input. +- The input files `` can be passed in any of the following ways (and the options can also be mixed together). In case of using sequencing reads for input, the files should be in the FASTQ format. For reference sequences, those should be in FASTA. The input files can also be possibly gzipped. + - `-r ` + - `-l ` + - `-d ` +- The _k_-mer length `k` must be odd and within `63` (see [Larger _k_-mer sizes](#larger-k-mer-sizes) to increase the _k_-mer size capacity beyond this). The default value is `25`. +- The frequency threshold `-c` is set to `2` by default, and should be set to `1` when passing reference sequences as input. +- Number of threads `t` is set to `1` by default, and the use of higher values is recommended. +- Cuttlefish 2 generates two output files with the prefix `output_prefix`: + - A FASTA file containing the maximal unitigs of the de Bruijn graph (with the extension `.fa`). + - A metadata file containing some structural characteristics of the de Bruijn graph and its compacted form (with the extension `.json`). +- The working directory `-w` is used for temporary files created by the process—it is not created by Cuttlefish, and must exist beforehand. The current directory is set as the default working directory. + +Some other useful arguments: + +- `--path-cover`: construct a maximal path cover of the de Bruijn graph, instead of its compacted variant +- `-m `: pass a soft maximum memory-limit (in GB) for the execution (default: 3) +- `--unrestrict-memory`: do not impose any memory-usage restriction + +### Cuttlefish 1 + To produce the _k_-mer set from an individual input genome reference using KMC3, the following may be used (from the KMC root directory): ```bash @@ -102,30 +133,8 @@ If working with multiple genome references, you may use: The input file `input_reference` or the files listed in `input_references_list` should be in the FASTA format, possibly gzipped. The `k` value should be odd (required by Cuttlefish), and is 25 by default. 
Having executed, KMC3 will produce two files with the same prefix `output_database`, and extensions `.kmc_pre` and `.kmc_suf`. When working within strict memory limits, you should add the arguments `-m -sm` with these invocations, where `max_memory` is your memory budget in gigabytes. -Cuttlefish has the following command line interface: -```bash -$ cuttlefish build --help -Efficiently construct the compacted de Bruijn graph from references -Usage: - cuttlefish build [OPTION...] - - -r, --refs arg reference files (default: "") - -l, --lists arg reference file lists (default: "") - -d, --dirs arg reference file directories (default: "") - -k, --kmer_len arg k-mer length (default: 25) - -s, --kmc_db arg set of k-mers (KMC database) prefix - -t, --threads arg number of threads to use (default: 1) - -o, --output arg output file (default: "") - -f, --format arg output format (0: txt, 1: GFA 1.0, 2: GFA 2.0, 3: - GFA-reduced) (default: 0) - -w, --work_dir arg working directory (default: .) - --rm remove the KMC database - --mph arg minimal perfect hash (BBHash) file (optional) (default: - "") - --buckets arg hash table buckets (cuttlefish) file (optional) - (default: "") - -h, --help print usage -``` + To build the compacted de Bruijn graph, use Cuttlefish as following (from the project root directory): @@ -147,11 +156,21 @@ The arguments are set as following: - `1`: GFA 1.0; - `2`: GFA 2.0; and - `3`: GFA-reduced (see [I/O formats](#io-formats)). -- The working directory `-w` is used for temporary files created by the process — it is not created by Cuttlefish, and must exist beforehand. The current directory is set as the default working directory. +- The working directory `-w` is used for temporary files created by the process—it is not created by Cuttlefish, and must exist beforehand. The current directory is set as the default working directory. + +## Output formats + +### Cuttlefish 2 -## I/O formats +The currently supported output format is -The input references should be in the FASTA format, possibly gzipped. Currently supported output formats are — +- The set of the maximal unitigs + +Other output formats are currently in the development roadmap. + +### Cuttlefish 1 + +The currently supported output formats are — - The set of maximal unitigs (non-branching paths) from the original de Bruijn graph, in plain text; - The compacted de Bruijn graph in the [GFA 1.0](https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md) and the [GFA 2.0](https://github.com/GFA-spec/GFA-spec/blob/master/GFA2.md) formats; @@ -175,14 +194,14 @@ The input references should be in the FASTA format, possibly gzipped. Currently - 1 CGA - 3 ATGTC - 6 CTAAGA + 1 CGA + 3 ATGTC + 6 CTAAGA - Reference:1_Sequence:ref1 1+ 3- 3+ 6- + Reference:1_Sequence:ref1 1+ 3- 3+ 6- @@ -193,17 +212,50 @@ The input references should be in the FASTA format, possibly gzipped. Currently Cuttlefish works with the canonical representations of the _k_-mers, i.e. each _k_-mer and its reverse complement are treated as the same vertex in the original graph. The maximal unitig fragments (the ''segments'' in the GFA-terminology) are always output in their canonical forms — the orientations are guaranteed to be the same across identical executions. -### ''Colored'' output +### ''Colored'' output for Cuttlefish 1 In the [GFA](https://github.com/GFA-spec/GFA-spec) output formats for the compacted de Bruijn graph, the graph is represented as a list of the vertices (i.e. the maximal unitigs) and the adjacencies between them. 
The output also includes a path-tiling for each individual sequence in the input references, i.e. an ordered list of the maximal unitig ids that completely tile that sequence. Put differently, the GFA outputs describe a colored de Bruijn graph in the sense that the color information for each vertex (maximal unitig) is encoded in the `P` (GFA 1.0) or the `O` (GFA 2.0) entries (or the tilings in the file `.cf_seq`, in the reduced output). -Throughout the [manuscript](https://doi.org/10.1101/2020.10.21.349605), when we mention the colored de Bruijn graph, we refer to a very specific definition of colors. While this definition is intuitive and natural when constructing the compacted colored de Bruijn graph from a set of reference genomes, it is not the case that the Cuttlefish algorithm allows arbitrary coloring of the _k_-mers in the de Bruijn graph. Specifically, in the definition adopted herein, the color set of a unitig is the subset of input references si1, si2, ..., sil in which the unitig appears. This color information is implicitly encoded in the path entries of the output GFA files (the `P` entries in GFA 1.0 and the `O` entries in GFA 2.0). As a result, all unitigs produced by Cuttlefish are "monochromatic" under this coloring definition, as a change to the color set internally to a unitig would imply either a branch (which would terminate the unitig) or the start or end of some reference string and a sentinel _k_-mer (which would also terminate the unitig). If one were constructing the compacted colored de Bruijn graph from raw sequencing reads or from highly-fractured assemblies, then one may wish to adopt a different notion of color, wherein color sets may vary across an individual unitig. +Throughout the [manuscript](https://academic.oup.com/bioinformatics/article/37/Supplement_1/i177/6319696), when we mention the colored de Bruijn graph, we refer to a very specific definition of colors. While this definition is intuitive and natural when constructing the compacted colored de Bruijn graph from a set of reference genomes, it is not the case that the Cuttlefish algorithm allows arbitrary coloring of the _k_-mers in the de Bruijn graph. Specifically, in the definition adopted herein, the color set of a unitig is the subset of input references si1, si2, ..., sil in which the unitig appears. This color information is implicitly encoded in the path entries of the output GFA files (the `P` entries in GFA 1.0 and the `O` entries in GFA 2.0). As a result, all unitigs produced by Cuttlefish are "monochromatic" under this coloring definition, as a change to the color set internally to a unitig would imply either a branch (which would terminate the unitig) or the start or end of some reference string and a sentinel _k_-mer (which would also terminate the unitig). If one were constructing the compacted colored de Bruijn graph from raw sequencing reads or from highly-fractured assemblies, then one may wish to adopt a different notion of color, wherein color sets may vary across an individual unitig. ## Example usage +### Cuttlefish 2 + +_To be completed_ + +### Cuttlefish 1 + Please use the `kmc` and the `cuttlefish` binaries from their respective paths in the following examples. We use _k_ = 3, and 4 CPU threads, with a working directory named `temp` in the following examples. 
+To output the compacted de Bruijn graph (in GFA 2.0) for the example FASTA files `refs1.fa` and `refs2.fa` (provided in the `data` directory), the following may be used: + +- Produce a newline-separated list of the paths of the input references. For example, + + ```bash + readlink -f refs1.fa > refs.lst + readlink -f refs2.fa >> refs.lst + ``` + +- Generate the _k_-mer set: + + ```bash + kmc -k3 -fm -ci1 -t4 @refs.lst kmers temp/ + ``` + +- Build a hash function over `kmers`, compute the states of the graph vertices, and output the compacted graph (in GFA 2.0): + + ```bash + cuttlefish build -l refs.lst -k 3 -s kmers -t 4 -o cdbg.gfa2 -f 2 -w temp/ + ``` + + Or, + + ```bash + cuttlefish build -r refs1.fa,refs2.fa -k 3 -s kmers -t 4 -o cdbg.gfa2 -f 2 -w temp/ + ``` + - **For individual input genome reference** To output the compacted de Bruijn graph (in GFA 1.0) for the example FASTA file `refs1.fa` (provided in the `data` directory), the following may be used: @@ -267,13 +319,11 @@ Cuttlefish supports only the odd `k` values within `MAX_K` due to theoretical re Note that, Cuttlefish uses only as many bytes as required (rounded up to multiples of 8) for a _k_-mer as necessary — thus increasing the maximum _k_-mer size capacity through setting large values for `MAX_K` does not affect the performance for smaller _k_-mer sizes. -## Intermediate disk usage - -The Cuttlefish pipeline uses a non-trivial amount of intermediate disk space, in the forms of — the _k_-mer set produced by KMC3, and temporary files produced during the minimal perfect hash construction and the GFA output constructions. The produced KMC3 database (the `.kmc_pre` and the `.kmc_suf` extension files) is not removed automatically (unless the flag `--rm` is passed to the graph build), and can be safely removed by the user after the Cuttlefish execution. + -Please cite Cuttlefish when using it, including— +## Citation ```bibtex @article{10.1093/bioinformatics/btab309, @@ -291,6 +341,8 @@ Please cite Cuttlefish when using it, including— } ``` +### Acknowledgements + This work is supported by _NIH R01 HG009937_, and by _NSF CCF-1750472_, and _CNS-1763680_. 
## Licenses diff --git a/src/main.cpp b/src/main.cpp index 7eb3aaba..0fd6cecd 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -35,7 +35,7 @@ void build(int argc, char** argv) options.add_options("cuttlefish 2.0") ("read", "construct a compacted read de Bruijn graph") ("ref", "construct a compacted reference de Bruijn graph") - ("c,cutoff", "frequency cutoff for (k + 1)-mers (inapplicable for references)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::CUTOFF_FREQ))) + ("c,cutoff", "frequency cutoff for (k + 1)-mers", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::CUTOFF_FREQ))) ("m,max-memory", "soft maximum memory limit (in GB)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::MAX_MEMORY))) ("unrestrict-memory", "do not impose memory usage restriction") ("path-cover", "extract a maximal path cover of the de Bruijn graph"); From 2ab07031384d03c5891f1142f5f7ffcda44e6302 Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 20 Dec 2021 21:51:53 -0500 Subject: [PATCH 281/350] Restore k-mer iterator functionality for small k functionality went missing for KMC1-format DBs with prefix-file buffering update --- include/kmc_api/Virtual_Prefix_File.hpp | 3 + src/kmc_api/Virtual_Prefix_File.cpp | 20 +++++++ src/kmc_api/kmc_file.cpp | 75 ++++++++++++++++++------- 3 files changed, 77 insertions(+), 21 deletions(-) diff --git a/include/kmc_api/Virtual_Prefix_File.hpp b/include/kmc_api/Virtual_Prefix_File.hpp index 815a1922..4d45c14e 100644 --- a/include/kmc_api/Virtual_Prefix_File.hpp +++ b/include/kmc_api/Virtual_Prefix_File.hpp @@ -53,6 +53,9 @@ class Virtual_Prefix_File // database has `kmer_count` number of k-mers. void init(std::FILE*& fptr, uint64_t lut_area_bytes, uint64_t kmer_count); + // TODO: merge with `init`. + void init_kmc1(std::FILE*& fptr, uint64_t elems, uint64_t kmer_count); + // Returns the data at index `idx` of the prefix-file. uint64_t operator[](std::size_t idx); diff --git a/src/kmc_api/Virtual_Prefix_File.cpp b/src/kmc_api/Virtual_Prefix_File.cpp index 2a7d75da..f1bab30a 100644 --- a/src/kmc_api/Virtual_Prefix_File.cpp +++ b/src/kmc_api/Virtual_Prefix_File.cpp @@ -42,3 +42,23 @@ void Virtual_Prefix_File::init(std::FILE*& fptr, const uint64_t lut_area_bytes, prefix_chunk_start_index = 0; prefix_chunk_end_index = read_prefixes(); } + + +void Virtual_Prefix_File::init_kmc1(std::FILE*& fptr, const uint64_t elems, const uint64_t kmer_count) +{ + // *Take ownership* of `fptr`. + fp = fptr; + fptr = NULL; + + // Set metadata. + lut_area_size_in_bytes = elems * sizeof(uint64_t); + prefix_file_elem_count = elems; + total_kmers = kmer_count; + + // Allocate the prefix-file buffer. + prefix_file_buf.reserve(buffer_elem_count); + + // Read in some prefix-file data, and initialize the virtual indices into the prefix-file. + prefix_chunk_start_index = 0; + prefix_chunk_end_index = read_prefixes(); +} diff --git a/src/kmc_api/kmc_file.cpp b/src/kmc_api/kmc_file.cpp index c677668f..3cea0912 100644 --- a/src/kmc_api/kmc_file.cpp +++ b/src/kmc_api/kmc_file.cpp @@ -271,7 +271,7 @@ bool CKMC_DB::ReadParamsFrom_prefix_file_buf(uint64 &size, const bool load_pref_ return false; my_fseek(file_pre, prev_pos, SEEK_SET); - if (kmc_version == 0x200) + if (kmc_version == 0x200) // Used with cuttlefish for k > 13. { my_fseek(file_pre, -8, SEEK_END); @@ -334,53 +334,86 @@ bool CKMC_DB::ReadParamsFrom_prefix_file_buf(uint64 &size, const bool load_pref_ return true; } - else if (kmc_version == 0) // Not used with cuttlefish. 
+ else if (kmc_version == 0) // Used with cuttlefish for k <= 13. { + const size_t prefix_data_pos = my_ftell(file_pre); prefix_file_buf_size = (size - 4) / sizeof(uint64); //reads without 4 bytes of a header_offset (and without markers) - prefix_file_buf = new uint64[prefix_file_buf_size]; - result = fread(prefix_file_buf, 1, (size_t)(size - 4), file_pre); - if (result == 0) - return false; + // prefix_file_buf = new uint64[prefix_file_buf_size]; + // result = fread(prefix_file_buf, 1, (size_t)(size - 4), file_pre); + // if (result == 0) + // return false; my_fseek(file_pre, -8, SEEK_END); uint64 header_offset; header_offset = fgetc(file_pre); + my_fseek(file_pre, prefix_data_pos, SEEK_SET); size = size - 4; uint64 header_index = (size - header_offset) / sizeof(uint64); uint64 last_data_index = header_index; - uint64 d = prefix_file_buf[header_index]; + // uint64 d = prefix_file_buf[header_index]; + uint64 data; + my_fseek(file_pre, header_index * sizeof(uint64), SEEK_CUR); + fread(&data, 1, sizeof(uint64), file_pre); - kmer_length = (uint32)d; //- kmer's length - mode = d >> 32; //- mode: 0 or 1 + // kmer_length = (uint32)d; //- kmer's length + // mode = d >> 32; //- mode: 0 or 1 + kmer_length = (uint32)data; + mode = data >> 32; - header_index++; - counter_size = (uint32)prefix_file_buf[header_index]; //- the size of a counter in bytes; + // header_index++; + // counter_size = (uint32)prefix_file_buf[header_index]; //- the size of a counter in bytes; + fread(&data, 1, sizeof(uint64), file_pre); + counter_size = (uint32)data; //- for mode 0 counter_size is 1, 2, 3, or 4 (or 5, 6, 7, 8 for small k values) //- for mode = 1 counter_size is 4; - lut_prefix_length = prefix_file_buf[header_index] >> 32; //- the number of prefix's symbols cut frm kmers; + // lut_prefix_length = prefix_file_buf[header_index] >> 32; //- the number of prefix's symbols cut frm kmers; + lut_prefix_length = data >> 32; //- (kmer_length - lut_prefix_length) is divisible by 4 - header_index++; - original_min_count = (uint32)prefix_file_buf[header_index]; //- the minimal number of kmer's appearances + // header_index++; + // original_min_count = (uint32)prefix_file_buf[header_index]; //- the minimal number of kmer's appearances + fread(&data, 1, sizeof(uint64), file_pre); + original_min_count = (uint32)data; min_count = original_min_count; - original_max_count = prefix_file_buf[header_index] >> 32; //- the maximal number of kmer's appearances + // original_max_count = prefix_file_buf[header_index] >> 32; //- the maximal number of kmer's appearances + original_max_count = data >> 32; //max_count = original_max_count; - header_index++; - total_kmers = prefix_file_buf[header_index]; //- the total number of kmers + // header_index++; + // total_kmers = prefix_file_buf[header_index]; //- the total number of kmers + fread(&total_kmers, 1, sizeof(uint64), file_pre); - header_index++; - both_strands = (prefix_file_buf[header_index] & 0x000000000000000F) == 1; + // header_index++; + // both_strands = (prefix_file_buf[header_index] & 0x000000000000000F) == 1; + fread(&data, 1, sizeof(uint64), file_pre); + both_strands = (data & 0x000000000000000F) == 1; both_strands = !both_strands; - original_max_count += prefix_file_buf[header_index] & 0xFFFFFFFF00000000; + // original_max_count += prefix_file_buf[header_index] & 0xFFFFFFFF00000000; + original_max_count += data & 0xFFFFFFFF00000000; max_count = original_max_count; - prefix_file_buf[last_data_index] = total_kmers + 1; + // Set auxiliary fields aiding in k-mer parsing by 
Cuttlefish. + prefix_mask_ = (1 << 2 * lut_prefix_length) - 1; + byte_alignment_ = (kmer_length % 4 != 0 ? 4 - (kmer_length % 4) : 0); + + my_fseek(file_pre, prefix_data_pos, SEEK_SET); + + if(load_pref_file) + { + prefix_file_buf = new uint64[prefix_file_buf_size]; + result = fread(prefix_file_buf, 1, (size_t)(size - 4), file_pre); + if (result == 0) + return false; + + prefix_file_buf[last_data_index] = total_kmers + 1; + } + else if(init_pref_buf) + prefix_virt_buf.init_kmc1(file_pre, prefix_file_buf_size, total_kmers); sufix_size = (kmer_length - lut_prefix_length) / 4; From 7eb9a516e62582be6730ef2cc3ba21eaf3c98cb7 Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 21 Dec 2021 11:39:03 -0500 Subject: [PATCH 282/350] Refactor virtual-prefix-file code --- include/kmc_api/Virtual_Prefix_File.hpp | 4 ++-- src/kmc_api/Virtual_Prefix_File.cpp | 9 +++------ src/kmc_api/kmc_file.cpp | 13 ++++++------- 3 files changed, 11 insertions(+), 15 deletions(-) diff --git a/include/kmc_api/Virtual_Prefix_File.hpp b/include/kmc_api/Virtual_Prefix_File.hpp index 4d45c14e..02738a94 100644 --- a/include/kmc_api/Virtual_Prefix_File.hpp +++ b/include/kmc_api/Virtual_Prefix_File.hpp @@ -49,9 +49,9 @@ class Virtual_Prefix_File ~Virtual_Prefix_File(); // Initializes the file buffer with the file handle `fptr` that is supposed to contain - // `lut_area_bytes` amount of bytes for its prefix-content, and the associated KMC3 + // `prefix_count` number of prefixes as its prefix-content, and the associated KMC3 // database has `kmer_count` number of k-mers. - void init(std::FILE*& fptr, uint64_t lut_area_bytes, uint64_t kmer_count); + void init(std::FILE*& fptr, uint64_t prefix_count, uint64_t kmer_count); // TODO: merge with `init`. void init_kmc1(std::FILE*& fptr, uint64_t elems, uint64_t kmer_count); diff --git a/src/kmc_api/Virtual_Prefix_File.cpp b/src/kmc_api/Virtual_Prefix_File.cpp index f1bab30a..109f08cc 100644 --- a/src/kmc_api/Virtual_Prefix_File.cpp +++ b/src/kmc_api/Virtual_Prefix_File.cpp @@ -1,6 +1,5 @@ #include "kmc_api/Virtual_Prefix_File.hpp" -#include "kmc_api/kmer_defs.h" Virtual_Prefix_File::Virtual_Prefix_File(): @@ -23,16 +22,14 @@ Virtual_Prefix_File::~Virtual_Prefix_File() } -void Virtual_Prefix_File::init(std::FILE*& fptr, const uint64_t lut_area_bytes, const uint64_t kmer_count) +void Virtual_Prefix_File::init(std::FILE*& fptr, const uint64_t prefix_count, const uint64_t kmer_count) { // *Take ownership* of `fptr`. fp = fptr; fptr = NULL; - // Skip the first 4 bytes of header to get to the start of the prefixes. - my_fseek(fp, +4, SEEK_CUR); - lut_area_size_in_bytes = lut_area_bytes; - prefix_file_elem_count = (lut_area_size_in_bytes + 8) / sizeof(uint64_t); // What's that extra 1 element for? KMC3 comment: reads without 4 bytes of a header_offset (and without markers) + prefix_file_elem_count = prefix_count; + lut_area_size_in_bytes = prefix_count * sizeof(uint64_t); total_kmers = kmer_count; // Allocate the prefix-file buffer. 
diff --git a/src/kmc_api/kmc_file.cpp b/src/kmc_api/kmc_file.cpp index 3cea0912..b8b2c7f6 100644 --- a/src/kmc_api/kmc_file.cpp +++ b/src/kmc_api/kmc_file.cpp @@ -301,15 +301,17 @@ bool CKMC_DB::ReadParamsFrom_prefix_file_buf(uint64 &size, const bool load_pref_ single_LUT_size = 1 << (2 * lut_prefix_length); uint64 last_data_index = lut_area_size_in_bytes / sizeof(uint64); + prefix_file_buf_size = (lut_area_size_in_bytes + 8) / sizeof(uint64); //reads without 4 bytes of a header_offset (and without markers) + // Set auxiliary fields aiding in k-mer parsing by Cuttlefish. prefix_mask_ = (1 << 2 * lut_prefix_length) - 1; byte_alignment_ = (kmer_length % 4 != 0 ? 4 - (kmer_length % 4) : 0); + std::rewind(file_pre); + my_fseek(file_pre, +4, SEEK_CUR); // Skip the first 4 bytes of header to get to the start of the prefixes. + if(load_pref_file) { - std::rewind(file_pre); - my_fseek(file_pre, +4, SEEK_CUR); - prefix_file_buf_size = (lut_area_size_in_bytes + 8) / sizeof(uint64); //reads without 4 bytes of a header_offset (and without markers) prefix_file_buf = new uint64[prefix_file_buf_size]; result = fread(prefix_file_buf, 1, (size_t)(lut_area_size_in_bytes + 8), file_pre); if (result == 0) @@ -323,10 +325,7 @@ bool CKMC_DB::ReadParamsFrom_prefix_file_buf(uint64 &size, const bool load_pref_ return false; } else if(init_pref_buf) - { - std::rewind(file_pre); - prefix_virt_buf.init(file_pre, lut_area_size_in_bytes, total_kmers); - } + prefix_virt_buf.init(file_pre, prefix_file_buf_size, total_kmers); sufix_size = (kmer_length - lut_prefix_length) / 4; From f4f0dfa74584ec065fa3c3dfb835ed60fe2a3ac5 Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 21 Dec 2021 11:48:39 -0500 Subject: [PATCH 283/350] Remove redundant field --- include/kmc_api/Virtual_Prefix_File.hpp | 1 - src/kmc_api/Virtual_Prefix_File.cpp | 3 --- 2 files changed, 4 deletions(-) diff --git a/include/kmc_api/Virtual_Prefix_File.hpp b/include/kmc_api/Virtual_Prefix_File.hpp index 02738a94..47c6354b 100644 --- a/include/kmc_api/Virtual_Prefix_File.hpp +++ b/include/kmc_api/Virtual_Prefix_File.hpp @@ -21,7 +21,6 @@ class Virtual_Prefix_File std::size_t prefix_file_elem_count; // Size of the KMC3 prefix-file (*.kmc_pre) in elements (i.e. 64-bit prefixes). std::vector prefix_file_buf; // The in-memory prefix-file buffer. - uint64_t lut_area_size_in_bytes; // From KMC3. std::size_t prefix_chunk_start_index; // The index into the prefix-file where the prefix chunk currently loaded into memory starts. std::size_t prefix_chunk_end_index; // The (non-inclusive) index into the prefix-file where the prefix chunk currently loaded into memory ends. diff --git a/src/kmc_api/Virtual_Prefix_File.cpp b/src/kmc_api/Virtual_Prefix_File.cpp index 109f08cc..c1dd4106 100644 --- a/src/kmc_api/Virtual_Prefix_File.cpp +++ b/src/kmc_api/Virtual_Prefix_File.cpp @@ -4,7 +4,6 @@ Virtual_Prefix_File::Virtual_Prefix_File(): prefix_file_elem_count(0), - lut_area_size_in_bytes(0), prefix_chunk_start_index(0), prefix_chunk_end_index(0), total_kmers(0), @@ -29,7 +28,6 @@ void Virtual_Prefix_File::init(std::FILE*& fptr, const uint64_t prefix_count, co fptr = NULL; prefix_file_elem_count = prefix_count; - lut_area_size_in_bytes = prefix_count * sizeof(uint64_t); total_kmers = kmer_count; // Allocate the prefix-file buffer. @@ -48,7 +46,6 @@ void Virtual_Prefix_File::init_kmc1(std::FILE*& fptr, const uint64_t elems, cons fptr = NULL; // Set metadata. 
- lut_area_size_in_bytes = elems * sizeof(uint64_t); prefix_file_elem_count = elems; total_kmers = kmer_count; From f6f5bc78b5ae94ed29b4d52ea14f12266392f3b0 Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 21 Dec 2021 12:09:38 -0500 Subject: [PATCH 284/350] Consolidate prefix-buffer initializers --- include/kmc_api/Virtual_Prefix_File.hpp | 3 --- src/kmc_api/Virtual_Prefix_File.cpp | 19 ------------------- src/kmc_api/kmc_file.cpp | 2 +- 3 files changed, 1 insertion(+), 23 deletions(-) diff --git a/include/kmc_api/Virtual_Prefix_File.hpp b/include/kmc_api/Virtual_Prefix_File.hpp index 47c6354b..2d7f409c 100644 --- a/include/kmc_api/Virtual_Prefix_File.hpp +++ b/include/kmc_api/Virtual_Prefix_File.hpp @@ -52,9 +52,6 @@ class Virtual_Prefix_File // database has `kmer_count` number of k-mers. void init(std::FILE*& fptr, uint64_t prefix_count, uint64_t kmer_count); - // TODO: merge with `init`. - void init_kmc1(std::FILE*& fptr, uint64_t elems, uint64_t kmer_count); - // Returns the data at index `idx` of the prefix-file. uint64_t operator[](std::size_t idx); diff --git a/src/kmc_api/Virtual_Prefix_File.cpp b/src/kmc_api/Virtual_Prefix_File.cpp index c1dd4106..8badac53 100644 --- a/src/kmc_api/Virtual_Prefix_File.cpp +++ b/src/kmc_api/Virtual_Prefix_File.cpp @@ -37,22 +37,3 @@ void Virtual_Prefix_File::init(std::FILE*& fptr, const uint64_t prefix_count, co prefix_chunk_start_index = 0; prefix_chunk_end_index = read_prefixes(); } - - -void Virtual_Prefix_File::init_kmc1(std::FILE*& fptr, const uint64_t elems, const uint64_t kmer_count) -{ - // *Take ownership* of `fptr`. - fp = fptr; - fptr = NULL; - - // Set metadata. - prefix_file_elem_count = elems; - total_kmers = kmer_count; - - // Allocate the prefix-file buffer. - prefix_file_buf.reserve(buffer_elem_count); - - // Read in some prefix-file data, and initialize the virtual indices into the prefix-file. - prefix_chunk_start_index = 0; - prefix_chunk_end_index = read_prefixes(); -} diff --git a/src/kmc_api/kmc_file.cpp b/src/kmc_api/kmc_file.cpp index b8b2c7f6..e0db6be5 100644 --- a/src/kmc_api/kmc_file.cpp +++ b/src/kmc_api/kmc_file.cpp @@ -412,7 +412,7 @@ bool CKMC_DB::ReadParamsFrom_prefix_file_buf(uint64 &size, const bool load_pref_ prefix_file_buf[last_data_index] = total_kmers + 1; } else if(init_pref_buf) - prefix_virt_buf.init_kmc1(file_pre, prefix_file_buf_size, total_kmers); + prefix_virt_buf.init(file_pre, prefix_file_buf_size, total_kmers); sufix_size = (kmer_length - lut_prefix_length) / 4; From a84a43de6bb610746e53476630c9c3acc41f3186 Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 28 Dec 2021 13:44:39 -0500 Subject: [PATCH 285/350] Update readme --- README.md | 316 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 200 insertions(+), 116 deletions(-) diff --git a/README.md b/README.md index a8b84158..f0e2b163 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/cuttlefish/README.html) -Cuttlefish is a fast, parallel, and very lightweight memory tool to construct the compacted de Bruijn graph from raw sequencing reads or reference sequences. +Cuttlefish is a fast, parallel, and very lightweight memory tool to construct the compacted de Bruijn graph from sequencing reads or reference sequences, which is highly scalable in terms of the size of the input data. 
## Table of contents @@ -18,26 +18,28 @@ Cuttlefish is a fast, parallel, and very lightweight memory tool to construct th - [''Colored'' output](#colored-output) - [Example usage](#example-usage) - [Larger _k_-mer sizes](#larger-k-mer-sizes) -- [Intermediate disk usage](#intermediate-disk-usage) -- [Acknowledgements](#acknowledgements) +- [Differences between Cuttlefish 1 & 2](#differences-between-cuttlefish-1-&-2) +- [Citations & Acknowledgement](#citations-&-acknowledgement) - [Licenses](#licenses) ## Overview Cuttlefish is a program to produce the compacted de Bruijn graph from sequencing reads or reference sequences. -The paper describing the earlier version is available [here](https://academic.oup.com/bioinformatics/article/37/Supplement_1/i177/6319696). +The papers describing the work are: [Cuttlefish 1](https://academic.oup.com/bioinformatics/article/37/Supplement_1/i177/6319696) and [Cuttlefish 2](https://doi.org/10.1101/2021.12.14.472718) (pre-print). ## Dependencies -To install Cuttlefish, the following are required: + +To install Cuttlefish from source, the following are required: - [GCC](https://gcc.gnu.org/) compilers for C++14 and C11 - [CMake](https://cmake.org/) (version >= 3.14) - [zlib](https://zlib.net/) - [bzip2](https://www.sourceware.org/bzip2/) -These should already be available in your platform; and if not, then these can be easily installed from their sources. Besides, these should also be available via some package manager for your operating system: +These should already be available in your platform; and if not, then these can be easily installed from their sources. +Besides, these should also be available via some package manager for your operating system: - **Linux** @@ -52,111 +54,178 @@ These should already be available in your platform; and if not, then these can b brew install cmake zlib bzip2 ``` -Cuttlefish also makes use of [KMC3](https://github.com/refresh-bio/KMC) tool. If you are installing Cuttlefish from source, then it will be automatically installed. To use with Cuttlefish 1 (installed using `conda`), you may use the following to install KMC3: +Cuttlefish also makes use of the [KMC 3](https://github.com/refresh-bio/KMC) tool. +If you are installing Cuttlefish from the source, then it will be automatically installed. +To use with Cuttlefish 1 while it is installed using `conda`, you may use the following to install KMC 3: - From [Bioconda](https://bioconda.github.io/user/install.html): ```bash - conda install -c bioconda kmc + conda install -c bioconda kmc ``` ## Installation -- From [Bioconda](https://bioconda.github.io/user/install.html), only for Cuttlefish 1 for now: +- From [Bioconda](https://bioconda.github.io/user/install.html) (only for Cuttlefish 1, _for now_): ```bash - conda install -c bioconda cuttlefish + conda install -c bioconda cuttlefish ``` - The Conda package supports _k_ values up-to 127. To use larger _k_ values, please install Cuttlefish from the source. + The Conda package supports _k_ values up-to 127. + To use larger _k_ values, please install Cuttlefish from the source. - From source (works for both Cuttlefish 1 and 2): ```bash - git clone https://github.com/COMBINE-lab/cuttlefish.git - cd cuttlefish && mkdir build && cd build - cmake -DCMAKE_INSTALL_PREFIX=../ .. - make -j 8 install - cd .. + git clone https://github.com/COMBINE-lab/cuttlefish.git + cd cuttlefish/ && mkdir build && cd build/ + cmake -DCMAKE_INSTALL_PREFIX=../ .. + make -j 8 install + cd .. 
``` You may replace `8` in `make -j 8` with the preferred count for threads to use in the installation process. - This compilation process installs Cuttlefish in a sub-directory named `bin`, inside the project root directory. To specify a different installation directory, its path may be passed as the value of `-DCMAKE_INSTALL_PREFIX` with the `cmake` command, i.e. you may use `cmake -DCMAKE_INSTALL_PREFIX=custom_path/ ..` . Then the installed Cuttlefish executable will be found in `custom_path/bin/`. Skipping `-DCMAKE_INSTALL_PREFIX` entirely will install Cuttlefish in `/usr/local/bin`, for which `sudo` access might be required (i.e. `sudo make -j 8 install`). + This installs Cuttlefish in a sub-directory named `bin`, inside the project root directory. + To specify a different installation directory, its path may be passed as the value of `-DCMAKE_INSTALL_PREFIX` with the `cmake` command, i.e. you may use `cmake -DCMAKE_INSTALL_PREFIX=/ ..` . + Then the installed Cuttlefish executable will be found in `/bin/`. + Skipping `-DCMAKE_INSTALL_PREFIX` entirely will install Cuttlefish in `/usr/local/bin/`, for which `sudo` access might be required (i.e. `sudo make -j 8 install`). - This installation supports _k_ values up-to 63. To ensure support for larger values, please compile the source with the slight modification described in [Larger _k_-mer sizes](#larger-k-mer-sizes). + This installation supports _k_ values up-to `63`. + To ensure support for larger values, please compile the source with the slight modification described in [Larger _k_-mer sizes](#larger-k-mer-sizes). ## Usage +`cuttlefish build --help` displays the following message: + +```bash +Efficiently construct the compacted de Bruijn graph from sequencing reads or reference sequences +Usage: + cuttlefish build [OPTION...] + + common options: + -r, --refs arg input files (default: "") + -l, --lists arg input file lists (default: "") + -d, --dirs arg input file directories (default: "") + -k, --kmer-len arg k-mer length (default: 25) + -t, --threads arg number of threads to use (default: 1) + -o, --output arg output file (default: "") + -w, --work-dir arg working directory (default: .) + -h, --help print usage + + cuttlefish 1.0 options: + -s, --kmc-db arg set of vertices, i.e. k-mers (KMC database) prefix + (default: .) + -f, --format arg output format (0: txt, 1: GFA 1.0, 2: GFA 2.0, 3: + GFA-reduced) (default: 0) + --rm remove the KMC database + + cuttlefish 2.0 options: + --read construct a compacted read de Bruijn graph + --ref construct a compacted reference de Bruijn graph + -c, --cutoff arg frequency cutoff for (k + 1)-mers (default: 2) + -m, --max-memory arg soft maximum memory limit (in GB) (default: 3) + --unrestrict-memory do not impose memory usage restriction + --path-cover extract a maximal path cover of the de Bruijn + graph + + debug options: + -e, --edge-db arg set of edges, i.e. 
(k + 1)-mers (KMC database) prefix + (default: "") + + specialized options: + --mph arg minimal perfect hash (BBHash) file (optional) + (default: "") + --buckets arg hash table buckets (cuttlefish) file (optional) + (default: "") + --save-vertices save the vertex set of the graph + +``` + ### Cuttlefish 2 -To construct the compacted de Bruijn graph, use Cuttlefish as following (from the project root directory): +To construct a compacted de Bruijn graph, use Cuttlefish as following: ```bash - ./bin/cuttlefish build -k -c -t -o -w +cuttlefish build -k -c -o -t -w ``` The arguments are set as following: -- The `` argument should be either `--read` or `--ref`, depending on whether you are passing sequencing reads or reference sequences as input. -- The input files `` can be passed in any of the following ways (and the options can also be mixed together). In case of using sequencing reads for input, the files should be in the FASTQ format. For reference sequences, those should be in FASTA. The input files can also be possibly gzipped. +- The `` argument should be either `--read` or `--ref`, based on whether you are providing sequencing reads or reference sequences as input, respectively. +- The input files `` can be passed in any of the following ways (and the options may be mixed together). - `-r ` - `-l ` - `-d ` -- The _k_-mer length `k` must be odd and within `63` (see [Larger _k_-mer sizes](#larger-k-mer-sizes) to increase the _k_-mer size capacity beyond this). The default value is `25`. -- The frequency threshold `-c` is set to `2` by default, and should be set to `1` when passing reference sequences as input. -- Number of threads `t` is set to `1` by default, and the use of higher values is recommended. -- Cuttlefish 2 generates two output files with the prefix `output_prefix`: + + In case of using sequencing reads as input, the files should be in the FASTQ format. + For reference sequences, those should be in the FASTA format. + The input files can also be possibly gzipped. +- The _k_-mer length `k` must be odd and within `63` (see [Larger _k_-mer sizes](#larger-k-mer-sizes) to increase the _k_-mer size capacity beyond this). +The default value is `25`. +- The frequency threshold `c` is set to `2` by default, and should be set to `1` when passing reference sequences as input. +- Cuttlefish 2 generates two output files with the prefix ``: - A FASTA file containing the maximal unitigs of the de Bruijn graph (with the extension `.fa`). - A metadata file containing some structural characteristics of the de Bruijn graph and its compacted form (with the extension `.json`). -- The working directory `-w` is used for temporary files created by the process—it is not created by Cuttlefish, and must exist beforehand. The current directory is set as the default working directory. +- The number of threads `t` is set to `1` by default, and the use of higher values is recommended. +- The working directory `w` is used for temporary files created by the process—it is not created by Cuttlefish, and must exist beforehand. +The current directory is set as the default working directory. 
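As a minimal illustrative sketch assembled from the argument descriptions above (the read files `reads_1.fq` and `reads_2.fq`, the output prefix `cdbg`, and the working directory `temp/` are placeholder names used only for this example, not files shipped with the repository):

```bash
# Illustrative sketch with placeholder file names: build the compacted de Bruijn graph
# from two FASTQ read files, with k = 27, the default cutoff of 2, and 8 threads.
# The working directory temp/ must already exist.
cuttlefish build --read -r reads_1.fq,reads_2.fq -k 27 -c 2 -t 8 -o cdbg -w temp/
```

With the output prefix `cdbg`, this would produce `cdbg.fa` (the maximal unitigs) and `cdbg.json` (the graph metadata).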
Some other useful arguments: -- `--path-cover`: construct a maximal path cover of the de Bruijn graph, instead of its compacted variant -- `-m `: pass a soft maximum memory-limit (in GB) for the execution (default: 3) -- `--unrestrict-memory`: do not impose any memory-usage restriction +- `--path-cover` to construct a maximal vertex-disjoint path cover of the de Bruijn graph, instead of its compacted variant +- `-m ` to pass a soft maximum memory-limit (in GB) to trade-off RAM usage for faster execution time; this will only be adhered to if the provided limit is larger than the minimum required memory for Cuttlefish, determined internally +- `--unrestrict-memory` to not impose any memory-usage restriction, trading off RAM usage for faster execution time ### Cuttlefish 1 -To produce the _k_-mer set from an individual input genome reference using KMC3, the following may be used (from the KMC root directory): +Unlike Cuttlefish 2, Cuttlefish 1 does not execute KMC 3 by itself (_for now_). +To produce the _k_-mer set from an individual input reference sequence using KMC 3, the following may be used: ```bash - ./bin/kmc -k -fm -ci1 -t +kmc -k -fm -ci1 -t ``` -If working with multiple genome references, you may use: +If working with multiple references, you may use: ```bash - ./bin/kmc -k -fm -ci1 -t @ +kmc -k -fm -ci1 -t @ ``` -The input file `input_reference` or the files listed in `input_references_list` should be in the FASTA format, possibly gzipped. The `k` value should be odd (required by Cuttlefish), and is 25 by default. Having executed, KMC3 will produce two files with the same prefix `output_database`, and extensions `.kmc_pre` and `.kmc_suf`. When working within strict memory limits, you should add the arguments `-m -sm` with these invocations, where `max_memory` is your memory budget in gigabytes. +The input file `` or the files listed in `` should be in the FASTA format, possibly gzipped. +The `k` value should be odd (required by Cuttlefish), and is `25` by default. +Having executed, KMC 3 will produce two files with the same prefix ``, and extensions `.kmc_pre` and `.kmc_suf`. +When working within strict memory limits, you should add the arguments `-m -sm` with these invocations, where `` is your memory budget in gigabytes. -To build the compacted de Bruijn graph, use Cuttlefish as following (from the project root directory): +Then to build the compacted de Bruijn graph, use Cuttlefish as following: ```bash - ./bin/cuttlefish build -k -s -t -o -f -w +cuttlefish build -k -s -o -f -t -w ``` The arguments are set as following: -- The input references can be passed in any of the following ways (and the options can also be mixed together). Each input reference should be in the FASTA format, possibly gzipped. +- The input references can be passed in any of the following ways (and the options may be mixed together). - `-r ` - `-l ` - `-d ` -- The _k_-mer length `k` must be odd and within 63 (see [Larger _k_-mer sizes](#larger-k-mer-sizes) to increase the _k_-mer size capacity beyond this). The default value is 25. -- The _k_-mer set prefix `s` must match exactly the output value used in the `kmc` invocation, i.e. it should be the `output_set` argument from the `kmc` invocation. -- Number of threads `t` is set to `1` by default, and the use of higher values is recommended. + + Each input reference should be in the FASTA format, possibly gzipped. 
+- The _k_-mer length `k` must be odd and within `63` (or, `127` if you install Cuttlefish using `conda`; see [Larger _k_-mer sizes](#larger-k-mer-sizes) to increase the _k_-mer size capacity beyond these). +The default value is `25`. +- The _k_-mer set prefix `s` must match exactly the output path used in the `kmc` invocation, i.e. it should be the `` argument from the `kmc` invocation. - The output formats (`f`) are — - `0`: only the maximal unitig (non-branching path) fragments; - `1`: GFA 1.0; - `2`: GFA 2.0; and - `3`: GFA-reduced (see [I/O formats](#io-formats)). -- The working directory `-w` is used for temporary files created by the process—it is not created by Cuttlefish, and must exist beforehand. The current directory is set as the default working directory. +- The number of threads `t` is set to `1` by default, and the use of higher values is recommended. +- The working directory `-w` is used for temporary files created by the process—it is not created by Cuttlefish, and must exist beforehand. +The current directory is set as the default working directory. ## Output formats @@ -164,7 +233,7 @@ The arguments are set as following: The currently supported output format is -- The set of the maximal unitigs +- The set of the maximal unitigs (non-branching paths) from the original de Bruijn graph, in FASTA Other output formats are currently in the development roadmap. @@ -172,11 +241,14 @@ Other output formats are currently in the development roadmap. The currently supported output formats are — -- The set of maximal unitigs (non-branching paths) from the original de Bruijn graph, in plain text; -- The compacted de Bruijn graph in the [GFA 1.0](https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md) and the [GFA 2.0](https://github.com/GFA-spec/GFA-spec/blob/master/GFA2.md) formats; -- The compacted de Bruijn graph in a ''reduced'' GFA format. It consists of two files, with the extensions — `.cf_seg` and `.cf_seq`. - - The `.cf_seg` file contains all the maximal unitig fragments of the graph (the segment outputs from GFA, i.e. the `S`-tagged entries), each one with a unique id. This file is a list of pairs ``. - - The `.cf_seq` file contains the ''tiling'' of each input sequence, made by the maximal unitig fragments (the paths (GFA 1) / ordered groups (GFA 2), i.e. the `P`- / `O`-tagged entries). Each line of the file is of the format ``, where `id` is a unique identifier (name) of this sequence, and `tiling` is a space-separated list of the unitig ids, completely covering the sequence. Each unitig id also has a `+` or `-` sign following it, depending on whether the corresponding unitig is present in the canonical or the reverse-complemented form in this tiling order. +- The set of the maximal unitigs (non-branching paths) from the original de Bruijn graph, in plain text +- The compacted de Bruijn graph in the [GFA 1.0](https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md) and the [GFA 2.0](https://github.com/GFA-spec/GFA-spec/blob/master/GFA2.md) formats +- The compacted de Bruijn graph in a ''reduced'' GFA format. It consists of two files, with the extensions: `.cf_seg` and `.cf_seq`: + - The `.cf_seg` file contains all the maximal unitig fragments of the graph (the segment outputs from GFA, i.e. the `S`-tagged entries), each one with a unique id. + This file is a list of pairs ``. + - The `.cf_seq` file contains the ''tiling'' of each input sequence, made by the maximal unitig fragments (the paths in GFA 1 / ordered groups in GFA 2, i.e. the `P`- / `O`-tagged entries). 
+ Each line of the file is of the format ``, where `id` is a unique identifier (name) of this sequence, and `tiling` is a space-separated list of the unitig ids, completely covering the sequence. + Each unitig id also has a `+` or `-` sign following it, depending on whether the corresponding unitig is present in the canonical or the reverse-complemented form in this tiling order. For the example reference file `refs1.fa` (provided in the `data` directory), the output files _may_ look like the following. @@ -206,18 +278,32 @@ The currently supported output formats are — - The only GFA information missing _explictly_ in this format is the links (GFA 1) / edges and gaps (GFA 2), i.e. the `L`- or the `E`- and the `G`-tagged entries. These can be readily inferred from the sequence-tilings. For example, a tiling 0 u1 ... un> corresponds to the edge and gap multi-set {(u0, u1), (u1 u2), ... , (un-1, un)}. Whether a pair (ui, ui+1) is an edge or a gap can be inferred by checking the suffix and the prefix (of length `k - 1`) of the unitigs ui and ui+1, respectively (in their correct orientations, based on their following `+`/`-` signs). Note that, a gap is possible in a sequence-tiling only if the sequence contains characters outside of `A`, `C`, `G`, and `T`. + The only GFA information missing _explictly_ in this format is the links (GFA 1) / edges and gaps (GFA 2), i.e. the `L`- or the `E`- and the `G`-tagged entries. + These can be readily inferred from the sequence-tilings. + For example, a tiling 0 u1 ... un> corresponds to the edge and gap multi-set {(u0, u1), (u1 u2), ... , (un-1, un)}. + Whether a pair (ui, ui+1) is an edge or a gap can be inferred by checking the suffix and the prefix (of length `k - 1`) of the unitigs ui and ui+1, respectively (in their correct orientations, based on their following `+`/`-` signs). + Note that, a gap is possible in a sequence-tiling only if the sequence contains characters outside of `A`, `C`, `G`, and `T`. - For moderate to large sized genomes, this output format is preferrable to the GFA ones — the GFA formats can be quite verbose for this particular scenario, while the reduced representation provides effitively the same information, while taking much lesser space. For example, for the 7-human genomes (experimented with in the manuscript) and using `k` = 31, the compacted graph takes 112GB in GFA2, while 29.3GB in this reduced format. + For moderate to large sized genomes, this output format is preferrable to the GFA ones—the GFA formats can be quite verbose for this particular scenario, while the reduced representation provides effitively the same information, while taking much lesser space. + For example, for the 7-human genomes (experimented with in the manuscripts) and using `k = 31`, the compacted graph takes 112 GB in GFA2, while 29.3 GB in this reduced format. + +### Orientation of the output -Cuttlefish works with the canonical representations of the _k_-mers, i.e. each _k_-mer and its reverse complement are treated as the same vertex in the original graph. The maximal unitig fragments (the ''segments'' in the GFA-terminology) are always output in their canonical forms — the orientations are guaranteed to be the same across identical executions. +Cuttlefish works with the canonical representations of the _k_-mers, i.e. each _k_-mer and its reverse complement are treated as the same vertex in the original graph. 
+The maximal unitig fragments (the ''segments'' in the GFA-terminology) are always output in their canonical forms—the orientations are guaranteed to be the same across identical executions. ### ''Colored'' output for Cuttlefish 1 -In the [GFA](https://github.com/GFA-spec/GFA-spec) output formats for the compacted de Bruijn graph, the graph is represented as a list of the vertices (i.e. the maximal unitigs) and the adjacencies between them. The output also includes a path-tiling for each individual sequence in the input references, i.e. an ordered list of the maximal unitig ids that completely tile that sequence. Put differently, the GFA outputs describe a colored de Bruijn graph in the sense that the color information for each vertex (maximal unitig) is encoded in the `P` (GFA 1.0) or the `O` (GFA 2.0) entries (or the tilings in the file `.cf_seq`, in the reduced output). +In the [GFA](https://github.com/GFA-spec/GFA-spec) output formats for the compacted de Bruijn graph, the graph is represented as a list of the vertices (i.e. the maximal unitigs) and the adjacencies between them. +The output also includes a path-tiling for each individual sequence in the input references, i.e. an ordered list of the maximal unitig ids that completely tile that sequence. +Put differently, the GFA outputs describe a colored de Bruijn graph in the sense that the color information for each vertex (maximal unitig) is encoded in the `P` (GFA 1.0) or the `O` (GFA 2.0) entries (or the tilings in the `.cf_seq` file, in the reduced output). -Throughout the [manuscript](https://academic.oup.com/bioinformatics/article/37/Supplement_1/i177/6319696), when we mention the colored de Bruijn graph, we refer to a very specific definition of colors. While this definition is intuitive and natural when constructing the compacted colored de Bruijn graph from a set of reference genomes, it is not the case that the Cuttlefish algorithm allows arbitrary coloring of the _k_-mers in the de Bruijn graph. Specifically, in the definition adopted herein, the color set of a unitig is the subset of input references si1, si2, ..., sil in which the unitig appears. This color information is implicitly encoded in the path entries of the output GFA files (the `P` entries in GFA 1.0 and the `O` entries in GFA 2.0). As a result, all unitigs produced by Cuttlefish are "monochromatic" under this coloring definition, as a change to the color set internally to a unitig would imply either a branch (which would terminate the unitig) or the start or end of some reference string and a sentinel _k_-mer (which would also terminate the unitig). If one were constructing the compacted colored de Bruijn graph from raw sequencing reads or from highly-fractured assemblies, then one may wish to adopt a different notion of color, wherein color sets may vary across an individual unitig. - +Throughout the [manuscript (Cuttlefish 1)](https://academic.oup.com/bioinformatics/article/37/Supplement_1/i177/6319696), when we mention the colored de Bruijn graph, we refer to a specific definition of colors. +While this definition is intuitive and natural when constructing the compacted colored de Bruijn graph from a set of reference genomes, it is not the case that the Cuttlefish algorithm allows arbitrary coloring of the _k_-mers in the de Bruijn graph. +Specifically, in the definition adopted herein, the color set of a unitig is the subset of input references si1, si2, ..., sil in which the unitig appears. 
+This color information is implicitly encoded in the path entries of the output GFA files (the `P` entries in GFA 1.0 and the `O` entries in GFA 2.0). +As a result, all unitigs produced by Cuttlefish are ''monochromatic'' under this coloring definition, as a change to the color set internally to a unitig would imply either a branch (which would terminate the unitig) or the start or end of some reference string and a sentinel _k_-mer (which would also terminate the unitig). +If one were constructing the compacted colored de Bruijn graph from raw sequencing reads or from highly-fractured assemblies, then one may wish to adopt a different notion of color, wherein color sets may vary across an individual unitig. ## Example usage @@ -227,34 +313,8 @@ _To be completed_ ### Cuttlefish 1 -Please use the `kmc` and the `cuttlefish` binaries from their respective paths in the following examples. We use _k_ = 3, and 4 CPU threads, with a working directory named `temp` in the following examples. - -To output the compacted de Bruijn graph (in GFA 2.0) for the example FASTA files `refs1.fa` and `refs2.fa` (provided in the `data` directory), the following may be used: - -- Produce a newline-separated list of the paths of the input references. For example, - - ```bash - readlink -f refs1.fa > refs.lst - readlink -f refs2.fa >> refs.lst - ``` - -- Generate the _k_-mer set: - - ```bash - kmc -k3 -fm -ci1 -t4 @refs.lst kmers temp/ - ``` - -- Build a hash function over `kmers`, compute the states of the graph vertices, and output the compacted graph (in GFA 2.0): - - ```bash - cuttlefish build -l refs.lst -k 3 -s kmers -t 4 -o cdbg.gfa2 -f 2 -w temp/ - ``` - - Or, - - ```bash - cuttlefish build -r refs1.fa,refs2.fa -k 3 -s kmers -t 4 -o cdbg.gfa2 -f 2 -w temp/ - ``` +Please use the `kmc` and the `cuttlefish` executables from their respective paths in the following examples. +We use _k_ = 3, and 4 CPU threads, with a working directory named `temp` in the following examples. - **For individual input genome reference** @@ -263,20 +323,20 @@ To output the compacted de Bruijn graph (in GFA 2.0) for the example FASTA files - Generate the _k_-mer set: ```bash - kmc -k3 -fm -ci1 -t4 refs1.fa kmers temp/ + kmc -k3 -fa -ci1 -t4 refs1.fa kmers temp/ ``` - - Build a hash function over `kmers`, compute the states of the graph vertices, and output the compacted graph (in GFA 1.0): + - Output the compacted graph (in GFA 1.0): ```bash - cuttlefish build -r refs1.fa -k 3 -s kmers -t 4 -o cdbg.gfa1 -f 1 -w temp/ + cuttlefish build -r refs1.fa -k 3 -s kmers -t 4 -o cdbg.gfa1 -f 1 -w temp/ ``` - To get only the maximal unitig fragments (which is `-f 0` by default): + To get only the maximal unitig fragments (which is `-f 0` by default): - ```bash - cuttlefish build -r refs1.fa -k 3 -s kmers -t 4 -o cdbg.txt -w temp/ - ``` + ```bash + cuttlefish build -r refs1.fa -k 3 -s kmers -t 4 -o cdbg.txt -w temp/ + ``` - **For multiple input genome references** @@ -285,63 +345,87 @@ To output the compacted de Bruijn graph (in GFA 2.0) for the example FASTA files - Produce a newline-separated list of the paths of the input references. 
For example, ```bash - readlink -f refs1.fa > refs.lst - readlink -f refs2.fa >> refs.lst + readlink -f refs1.fa > refs.lst + readlink -f refs2.fa >> refs.lst ``` - Generate the _k_-mer set: ```bash - kmc -k3 -fm -ci1 -t4 @refs.lst kmers temp/ + kmc -k3 -fa -ci1 -t4 @refs.lst kmers temp/ ``` - - Build a hash function over `kmers`, compute the states of the graph vertices, and output the compacted graph (in GFA 2.0): + - Output the compacted graph (in GFA 2.0): ```bash - cuttlefish build -l refs.lst -k 3 -s kmers -t 4 -o cdbg.gfa2 -f 2 -w temp/ + cuttlefish build -l refs.lst -k 3 -s kmers -t 4 -o cdbg.gfa2 -f 2 -w temp/ ``` - Or, + Or, - ```bash - cuttlefish build -r refs1.fa,refs2.fa -k 3 -s kmers -t 4 -o cdbg.gfa2 -f 2 -w temp/ - ``` + ```bash + cuttlefish build -r refs1.fa,refs2.fa -k 3 -s kmers -t 4 -o cdbg.gfa2 -f 2 -w temp/ + ``` ## Larger _k_-mer sizes -The default maximum _k_-mer size supported with the installation is 63. To set the maximum _k_-mer size capacity to some `MAX_K`, add `-DINSTANCE_COUNT=` with the `cmake` command — where `instance_count` is the number of `k`-values that are to be supported by Cuttlefish, and should be set to `(MAX_K + 1) / 2`. For example, to support a `MAX_K` of 127, use the following: +The default maximum _k_-mer size supported with the installation from source is `63`. +To set the maximum _k_-mer size capacity to some `MAX_K`, add `-DINSTANCE_COUNT=` with the `cmake` command—where `` is the number of `k`-values that are to be supported by Cuttlefish, and should be set to `(MAX_K + 1) / 2`. +For example, to support a `MAX_K` of 127, use the following: ```bash - cmake -DINSTANCE_COUNT=64 .. +cmake -DINSTANCE_COUNT=64 .. ``` -Cuttlefish supports only the odd `k` values within `MAX_K` due to theoretical reasons. Increasing the `MAX_K` bound incurs additional compilation cost, slowing down the installation. Currently, KMC3 supports a `MAX_K` of 255. Please note that, the second step of the pipeline, i.e. the construction of a minimal perfect hash function (using [BBHash](https://github.com/rizkg/BBHash)) gets less efficient (time-wise) with increasing _k_, due to disk-read throughput bottlenecks associated with reading the _k_-mers from the KMC database. +Cuttlefish supports only the odd `k` values within `MAX_K` due to theoretical reasons. +Currently, KMC3 supports a `MAX_K` of `255`. + + -Note that, Cuttlefish uses only as many bytes as required (rounded up to multiples of 8) for a _k_-mer as necessary — thus increasing the maximum _k_-mer size capacity through setting large values for `MAX_K` does not affect the performance for smaller _k_-mer sizes. +Note that, Cuttlefish uses only as many bytes as required (rounded up to multiples of 8) for a _k_-mer as necessary—thus increasing the maximum _k_-mer size capacity through setting large values for `MAX_K` does not affect the performance for smaller _k_-mer sizes. -## Citation +## Differences between Cuttlefish 1 & 2 + +- Cuttlefish 1 is applicable only for (whole-genome or transcriptome) reference sequences. +Whereas Cuttlefish 2 is applicable for both sequencing reads and reference sequences. +- For reference sequences, Cuttlefish 1 supports outputting the compacted graph in the GFA formats, whereas Cuttlefish 2 does not support this _yet_. +- Cuttlefish 2 can be used by passing either one of the following arguments to the `cuttlefish build` command: `--read` or `--ref`. +Passing neither of these invokes Cuttlefish 1. 
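As an illustrative summary of the mode selection described above — all paths and prefixes below are placeholders, and the Cuttlefish 1 line assumes a KMC database `kmers` has already been generated (with the same _k_) as described in the [Usage](#usage) section:

```bash
# Illustrative sketch (placeholder paths): the mode flag selects the algorithm.
cuttlefish build --read -r reads.fq -k 27 -o out_read -w temp/    # Cuttlefish 2, read dBG
cuttlefish build --ref -r refs.fa -k 27 -c 1 -o out_ref -w temp/  # Cuttlefish 2, reference dBG
cuttlefish build -r refs.fa -k 27 -s kmers -o out_cf1 -w temp/    # Cuttlefish 1 (no mode flag)
```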
+ +## Citations & Acknowledgement ```bibtex - @article{10.1093/bioinformatics/btab309, - author = {Khan, Jamshed and Patro, Rob}, - title = "{Cuttlefish: fast, parallel and low-memory compaction of de Bruijn graphs from large-scale genome collections}", - journal = {Bioinformatics}, - volume = {37}, - number = {Supplement\_1}, - pages = {i177-i186}, - year = {2021}, - month = {07}, - issn = {1367-4803}, - doi = {10.1093/bioinformatics/btab309}, - url = {https://doi.org/10.1093/bioinformatics/btab309}, +@article{10.1093/bioinformatics/btab309, + author = {Khan, Jamshed and Patro, Rob}, + title = "{Cuttlefish: fast, parallel and low-memory compaction of de Bruijn graphs from large-scale genome collections}", + journal = {Bioinformatics}, + volume = {37}, + number = {Supplement\_1}, + pages = {i177-i186}, + year = {2021}, + month = {07}, + issn = {1367-4803}, + doi = {10.1093/bioinformatics/btab309}, + url = {https://doi.org/10.1093/bioinformatics/btab309}, } ``` -### Acknowledgements +```bibtex +@article{Khan2021.12.14.472718, + author = {Khan, Jamshed and Kokot, Marek and Deorowicz, Sebastian and Patro, Rob}, + title = "{Scalable, ultra-fast, and low-memory construction of compacted de Bruijn graphs with Cuttlefish 2}", + elocation-id = {2021.12.14.472718}, + year = {2021}, + doi = {10.1101/2021.12.14.472718}, + publisher = {Cold Spring Harbor Laboratory}, + URL = {https://www.biorxiv.org/content/early/2021/12/16/2021.12.14.472718}, + journal = {bioRxiv} +} +``` This work is supported by _NIH R01 HG009937_, and by _NSF CCF-1750472_, and _CNS-1763680_. From 0d6da88dd8df58200957a4b36e179bdccfd40aba Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 28 Dec 2021 14:03:52 -0500 Subject: [PATCH 286/350] Correct readme-section links --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index f0e2b163..686facbd 100644 --- a/README.md +++ b/README.md @@ -14,12 +14,12 @@ Cuttlefish is a fast, parallel, and very lightweight memory tool to construct th - [Dependencies](#dependencies) - [Installation](#installation) - [Usage](#usage) -- [I/O formats](#io-formats) - - [''Colored'' output](#colored-output) +- [Output formats](#output-formats) + - [''Colored'' output for Cuttlefish 1](#colored-output-for-cuttlefish-1) - [Example usage](#example-usage) - [Larger _k_-mer sizes](#larger-k-mer-sizes) -- [Differences between Cuttlefish 1 & 2](#differences-between-cuttlefish-1-&-2) -- [Citations & Acknowledgement](#citations-&-acknowledgement) +- [Differences between Cuttlefish 1 & 2](#differences-between-cuttlefish-1--2) +- [Citations & Acknowledgement](#citations--acknowledgement) - [Licenses](#licenses) ## Overview From bc4eab0724a6b0403af541d3cdae4881ce8e141b Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 29 Dec 2021 12:37:21 -0500 Subject: [PATCH 287/350] Keep low-k operatiblity open for read dBGs needs update in KMC-to-KMC operation for k < 13 --- src/kmer_Enumerator.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/kmer_Enumerator.cpp b/src/kmer_Enumerator.cpp index 49f701bc..78647e72 100644 --- a/src/kmer_Enumerator.cpp +++ b/src/kmer_Enumerator.cpp @@ -3,6 +3,9 @@ #include "Kmer_Container.hpp" +template const std::size_t kmer_Enumerator::min_memory; + + template kmer_Enumeration_Stats kmer_Enumerator::enumerate( const KMC::InputFileType input_file_type, const std::vector& seqs, const uint32_t cutoff, @@ -11,6 +14,7 @@ kmer_Enumeration_Stats kmer_Enumerator::enumerate( { // FunnyProgress progress; + const bool 
estimate_mem = (k > 13 && estimate_mem_usage); std::size_t memory = std::max(max_memory, min_memory); stage1_params .SetInputFileType(input_file_type) @@ -18,7 +22,7 @@ kmer_Enumeration_Stats kmer_Enumerator::enumerate( .SetKmerLen(k) .SetNThreads(thread_count) .SetTmpPath(working_dir_path) - .SetEstimateHistogramCfg(estimate_mem_usage ? KMC::EstimateHistogramCfg::ESTIMATE_AND_COUNT_KMERS : KMC::EstimateHistogramCfg::DONT_ESTIMATE) + .SetEstimateHistogramCfg(estimate_mem ? KMC::EstimateHistogramCfg::ESTIMATE_AND_COUNT_KMERS : KMC::EstimateHistogramCfg::DONT_ESTIMATE) // .SetPercentProgressObserver(&progress) ; @@ -33,7 +37,7 @@ kmer_Enumeration_Stats kmer_Enumerator::enumerate( memory = std::max( - (estimate_mem_usage ? std::max(memory_limit(solid_kmer_count_approx(cutoff)), max_memory) : max_memory), + (estimate_mem ? std::max(memory_limit(solid_kmer_count_approx(cutoff)), max_memory) : max_memory), min_memory); stage2_params .SetCutoffMin(cutoff) From 2c7fbf14f7848a63f1a87aa2bb37acf125f95145 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 29 Dec 2021 14:02:07 -0500 Subject: [PATCH 288/350] Support low k's for read dBGs potentially makeshift --- src/Read_CdBG.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index b78829ec..4a722552 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -145,6 +145,16 @@ kmer_Enumeration_Stats Read_CdBG::enumerate_edges() const template kmer_Enumeration_Stats Read_CdBG::enumerate_vertices(const std::size_t max_memory) const { + // KMC-to-KMC operation isn't supported for k < 13, yet. + if(k < 13) + { + const KMC::InputFileType ip_type = (params.is_read_graph() ? KMC::InputFileType::FASTQ : KMC::InputFileType::MULTILINE_FASTA); + return kmer_Enumerator().enumerate( + ip_type, logistics.input_paths_collection(), 1, + params.thread_count(), max_memory, params.strict_memory(), false, + logistics.working_dir_path(), logistics.vertex_db_path()); + } + return kmer_Enumerator().enumerate( KMC::InputFileType::KMC, std::vector(1, logistics.edge_db_path()), 1, params.thread_count(), max_memory, params.strict_memory(), false, From 81a185f0c97b9c9fef4a2122bc6a0792caaf70b0 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 29 Dec 2021 14:51:33 -0500 Subject: [PATCH 289/350] Centralize small k threshold (KMC) --- include/kmer_Enumerator.hpp | 2 ++ src/Read_CdBG.cpp | 4 ++-- src/kmer_Enumerator.cpp | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/kmer_Enumerator.hpp b/include/kmer_Enumerator.hpp index 5ba8ae76..aab6dc7b 100644 --- a/include/kmer_Enumerator.hpp +++ b/include/kmer_Enumerator.hpp @@ -43,6 +43,8 @@ class kmer_Enumerator public: + static constexpr uint16_t small_k_threshold = 13; // KMC's internal threshold to switch modes for small-k optimizations. + // Enumerates the k-mers from the sequences (of type `input_file_type`) present is `seqs`, that // are present at least `cutoff` times. Employs `thread_count` number of processor threads and // uses a soft memory-cap of `max_memory` (in GB). If `strict_memory` is `true`, then the memory diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 4a722552..13bc43c9 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -145,8 +145,8 @@ kmer_Enumeration_Stats Read_CdBG::enumerate_edges() const template kmer_Enumeration_Stats Read_CdBG::enumerate_vertices(const std::size_t max_memory) const { - // KMC-to-KMC operation isn't supported for k < 13, yet. - if(k < 13) + // KMC-to-KMC operation isn't supported for small enough k's yet. 
+ if(k < kmer_Enumerator::small_k_threshold) { const KMC::InputFileType ip_type = (params.is_read_graph() ? KMC::InputFileType::FASTQ : KMC::InputFileType::MULTILINE_FASTA); return kmer_Enumerator().enumerate( diff --git a/src/kmer_Enumerator.cpp b/src/kmer_Enumerator.cpp index 78647e72..af6d8e36 100644 --- a/src/kmer_Enumerator.cpp +++ b/src/kmer_Enumerator.cpp @@ -14,7 +14,7 @@ kmer_Enumeration_Stats kmer_Enumerator::enumerate( { // FunnyProgress progress; - const bool estimate_mem = (k > 13 && estimate_mem_usage); + const bool estimate_mem = (k > small_k_threshold && estimate_mem_usage); // Histogram estimation is not supported for small enough k's yet. std::size_t memory = std::max(max_memory, min_memory); stage1_params .SetInputFileType(input_file_type) From 2684fba29f1d3019801dc99b897d5679b3f4b498 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 30 Dec 2021 10:52:13 -0500 Subject: [PATCH 290/350] Add dir-existence checker --- include/utility.hpp | 4 ++++ src/utility.cpp | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/include/utility.hpp b/include/utility.hpp index ac168a85..a047a052 100644 --- a/include/utility.hpp +++ b/include/utility.hpp @@ -22,6 +22,10 @@ bool is_prefix(const std::string& s, const std::string& pref); // `file_path`. bool file_exists(const std::string& file_path); +// Returns `true` iff these exists a directory in the file system with the +// path `dir_path`. +bool dir_exists(const std::string& dir_path); + // Returns the file size is bytes of the file at path `file_path`. Returns // `0` in case the file does not exist. std::size_t file_size(const std::string& file_path); diff --git a/src/utility.cpp b/src/utility.cpp index 4bad50e1..ccb0343f 100644 --- a/src/utility.cpp +++ b/src/utility.cpp @@ -47,6 +47,12 @@ bool file_exists(const std::string& file_path) } +bool dir_exists(const std::string& dir_path) +{ + return ghc::filesystem::is_directory(dir_path); +} + + std::size_t file_size(const std::string& file_path) { std::error_code ec; From 5eb1f7e274148dee7c3a16dbeb9924ec195154a7 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 30 Dec 2021 11:55:53 -0500 Subject: [PATCH 291/350] Validate input better --- src/Build_Params.cpp | 109 +++++++++++++++++++++++++++++++------------ src/main.cpp | 2 +- 2 files changed, 79 insertions(+), 32 deletions(-) diff --git a/src/Build_Params.cpp b/src/Build_Params.cpp index 5afd11ea..d5fa3103 100644 --- a/src/Build_Params.cpp +++ b/src/Build_Params.cpp @@ -1,5 +1,6 @@ #include "Build_Params.hpp" +#include "Input_Defaults.hpp" #include "utility.hpp" @@ -53,43 +54,17 @@ Build_Params::Build_Params( const bool is_read_graph, bool Build_Params::is_valid() const { - // TODO: do better — is a mess. - bool valid = true; + // Input data need to be non-empty. if(seq_input_.empty()) { std::cout << "No sequence input provided for compacted de Bruijn graph construction.\n"; valid = false; } - - // Check if read and reference de Bruijn graph parameters are being mixed with. - if(is_read_graph_ && is_ref_graph_) - { - std::cout << "Both read and reference de Bruijn graph specified. Please select only one, or none for Cuttlefish 1.0.\n"; - valid = false; - } - - if(is_read_graph_ || is_ref_graph_) // Is Cuttlefish 2.0. - { - if(output_format_ != cuttlefish::Output_Format::txt) - { - std::cout << "(Currently) Unsupported output file format requested for the compacted read de Bruijn graph.\n"; - valid = false; - } - } - else // Is Cuttlefish 1.0. - { - if(!vertex_db_path_.empty()) - { - std::cout << "No edge (i.e. 
(k + 1)-mer) database is required for a compacted reference de Bruijn graph construction.\n"; - valid = false; - } - } - - + // Even `k` values are not consistent with the theory. // Also, `k` needs to be in the range `[1, MAX_K]`. if((k_ & 1U) == 0 || (k_ > cuttlefish::MAX_K)) @@ -107,14 +82,86 @@ bool Build_Params::is_valid() const valid = false; } + + // Output directory must exist. + const std::string op_dir = dirname(output_file_path_); + if(!dir_exists(op_dir)) + { + std::cout << "Output directory " << op_dir << " does not exist.\n"; + valid = false; + } + - // Discard invalid output formats. - if(output_format_ >= cuttlefish::num_op_formats) + // Working directory must exist. + const std::string work_dir = dirname(working_dir_path_); + if(!dir_exists(work_dir)) { - std::cout << "Invalid output file format.\n"; + std::cout << "Working directory " << work_dir << " does not exist.\n"; valid = false; } + if(is_read_graph_ || is_ref_graph_) // Validate Cuttlefish 2 specific arguments. + { + // Read and reference de Bruijn graph parameters can not be mixed with. + if(is_read_graph_ && is_ref_graph_) + { + std::cout << "Both read and reference de Bruijn graph specified. Please select only one for Cuttlefish 2, or none to use Cuttlefish 1.\n"; + valid = false; + } + + + // A cutoff frequency of 0 is theoretically inconsistent. + if(cutoff_ == 0) + { + std::cout << "Cutoff frequency specified to be 0, which is theoretically inconsistent. Please use 1 if you wish to retain all the k-mers without filtering.\n"; + valid = false; + } + + // Cutoff frequency _should be_ 1 for reference de Bruijn graphs. + if(is_ref_graph_ && cutoff_ != 1) + std::cout << "WARNING: cutoff frequency specified not to be 1 on reference sequences.\n"; + + + // Memory budget options are being mixed with. + if(max_memory_ != cuttlefish::_default::MAX_MEMORY && !strict_memory_) + std::cout << "Both a memory bound and the option for unrestricted memory usage specified. Unrestricted memory mode will be used.\n"; + + + // Cuttlefish 1 specific arguments can not be specified. + if(vertex_db_path_ != cuttlefish::_default::WORK_DIR || output_format_ != cuttlefish::Output_Format::txt || remove_kmc_db_) + { + std::cout << "Cuttlefish 1 specific arguments specified while using Cuttlefish 2.\n"; + valid = false; + } + } + else // Validate Cuttlefish 1 specific arguments. + { + // Directory containing vertex database must exist. + const std::string vertex_db_dir = dirname(vertex_db_path_); + if(!dir_exists(vertex_db_dir)) + { + std::cout << "Vertex database directory " << vertex_db_dir << " does not exist.\n"; + valid = false; + } + + + // Discard invalid output formats. + if(output_format_ >= cuttlefish::num_op_formats) + { + std::cout << "Invalid output file format.\n"; + valid = false; + } + + + // Cuttlefish 2 specific arguments can not be specified. 
+        if(cutoff_ != cuttlefish::_default::CUTOFF_FREQ || max_memory_ != cuttlefish::_default::MAX_MEMORY || !strict_memory_ || path_cover_ || !edge_db_path_.empty())
+        {
+            std::cout << "Cuttlefish 2 specific arguments specified while using Cuttlefish 1.\n";
+            valid = false;
+        }
+    }
+
+
     return valid;
 }
diff --git a/src/main.cpp b/src/main.cpp
index 0fd6cecd..0b51c84e 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -28,7 +28,7 @@ void build(int argc, char** argv)
         ("d,dirs", "input file directories", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY))
         ("k,kmer-len", "k-mer length", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::K)))
         ("t,threads", "number of threads to use", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::THREAD_COUNT)))
-        ("o,output", "output file", cxxopts::value()->default_value(cuttlefish::_default::EMPTY))
+        ("o,output", "output file", cxxopts::value())
         ("w,work-dir", "working directory", cxxopts::value()->default_value(cuttlefish::_default::WORK_DIR))
         ("h,help", "print usage");
 
From eae3cef896fd426b65ffe85bbdb2cfb214f01c7a Mon Sep 17 00:00:00 2001
From: jamshed
Date: Mon, 10 Jan 2022 16:34:42 -0500
Subject: [PATCH 292/350] Fix empty dir-name bug

---
 src/utility.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/utility.cpp b/src/utility.cpp
index ccb0343f..55348f48 100644
--- a/src/utility.cpp
+++ b/src/utility.cpp
@@ -122,7 +122,8 @@ const std::string filename(const std::string& file_path)
 
 const std::string dirname(const std::string& file_path)
 {
-    return ghc::filesystem::path(file_path).remove_filename().string();
+    const std::string path = ghc::filesystem::path(file_path).remove_filename().string();
+    return path.empty() ? "." : path;
 }
 
From 6836613154e9d6110ef4ceffa767bdb027c53cb0 Mon Sep 17 00:00:00 2001
From: jamshed
Date: Mon, 17 Jan 2022 16:40:38 -0500
Subject: [PATCH 293/350] Rename private field per practice

---
 include/dBG_Info.hpp | 2 +-
 src/dBG_Info.cpp | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/dBG_Info.hpp b/include/dBG_Info.hpp
index ff4f6f38..e063849b 100644
--- a/include/dBG_Info.hpp
+++ b/include/dBG_Info.hpp
@@ -24,7 +24,7 @@ class dBG_Info
 
     nlohmann::ordered_json dBg_info;    // A JSON object wrapping all the information.
 
-    const std::string file_path;    // Path to the disk-file to store the JSON object.
+    const std::string file_path_;   // Path to the disk-file to store the JSON object.
 
     static constexpr const char* basic_field = "basic info";   // Category header for basic graph information.
     static constexpr const char* contigs_field = "contigs info";   // Category header for information about the contigs (maximal unitigs).
diff --git a/src/dBG_Info.cpp b/src/dBG_Info.cpp
index 11957ac5..4404ad5f 100644
--- a/src/dBG_Info.cpp
+++ b/src/dBG_Info.cpp
@@ -11,7 +11,7 @@
 
 template
 dBG_Info::dBG_Info(const std::string& file_path):
-    file_path(file_path)
+    file_path_(file_path)
 {
     if(file_exists(file_path))
         load_from_file();
@@ -21,13 +21,13 @@ dBG_Info::dBG_Info(const std::string& file_path):
 template
 void dBG_Info::load_from_file()
 {
-    std::ifstream input(file_path.c_str());
+    std::ifstream input(file_path_.c_str());
 
     input >> dBg_info;
 
     if(input.fail())
     {
-        std::cerr << "Error loading JSON object from file " << file_path << ". Aborting.\n";
+        std::cerr << "Error loading JSON object from file " << file_path_ << ". 
Aborting.\n"; std::exit(EXIT_FAILURE); } @@ -78,18 +78,18 @@ void dBG_Info::add_build_params(const Build_Params& params) template void dBG_Info::dump_info() const { - std::ofstream output(file_path.c_str()); + std::ofstream output(file_path_.c_str()); output << std::setw(4) << dBg_info << "\n"; // Pretty-print the JSON wrapper with overloaded `std::setw`. if(output.fail()) { - std::cerr << "Error writing to the information file " << file_path << ". Aborting.\n"; + std::cerr << "Error writing to the information file " << file_path_ << ". Aborting.\n"; std::exit(EXIT_FAILURE); } output.close(); - std::cout << "\nStructural information for the de Bruijn graph is written to " << file_path << ".\n"; + std::cout << "\nStructural information for the de Bruijn graph is written to " << file_path_ << ".\n"; } From 99e598597faa2ba2717d5f3d61d81f2cae082c18 Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 17 Jan 2022 16:57:52 -0500 Subject: [PATCH 294/350] Clarify prev-execution message better --- include/dBG_Info.hpp | 3 +++ src/Read_CdBG.cpp | 2 +- src/dBG_Info.cpp | 7 +++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/include/dBG_Info.hpp b/include/dBG_Info.hpp index e063849b..a14ccad8 100644 --- a/include/dBG_Info.hpp +++ b/include/dBG_Info.hpp @@ -42,6 +42,9 @@ class dBG_Info // path `file_path`. dBG_Info(const std::string& file_path); + // Returns the path to the disk-file storing the corresponding JSON object. + std::string file_path() const; + // Adds build parameters information of the Cuttlefish algorithm from `params`. void add_build_params(const Build_Params& params); diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 13bc43c9..4dec2b6c 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -35,7 +35,7 @@ void Read_CdBG::construct() { if(is_constructed()) { - std::cout << "\nThe compacted de Bruijn graph has been constructed earlier.\n"; + std::cout << "\nThe compacted de Bruijn graph has been constructed earlier. Check " << dbg_info.file_path() << " for results.\n"; return; } diff --git a/src/dBG_Info.cpp b/src/dBG_Info.cpp index 4404ad5f..0c74b21a 100644 --- a/src/dBG_Info.cpp +++ b/src/dBG_Info.cpp @@ -18,6 +18,13 @@ dBG_Info::dBG_Info(const std::string& file_path): } +template +std::string dBG_Info::file_path() const +{ + return file_path_; +} + + template void dBG_Info::load_from_file() { From 4078d9c642a1fedf56e55a3a41c12069cb6318b8 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 19 Jan 2022 15:40:55 -0500 Subject: [PATCH 295/350] Add 'at' accessor to hash table --- include/Kmer_Hash_Table.hpp | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index 1de372eb..bcde795b 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -142,6 +142,15 @@ class Kmer_Hash_Table // Returns the value (in the hash-table) for the key `kmer`. const State operator[](const Kmer& kmer) const; + // Returns an API to the entry (in the hash table) for the key `kmer`. The API + // wraps the hash table position and the state value at that position. + Kmer_Hash_Entry_API at(const Kmer& kmer); + + // Returns an API to the entry (in the hash table) for a k-mer hashing + // to the bucket number `bucket_id` of the hash table. The API wraps + // the hash table position and the state value at that position. 
+    Kmer_Hash_Entry_API at(uint64_t bucket_id);
+
     // Attempts to update the entry (in the hash-table) for the API object according
     // to its wrapped state values, and returns `true` or `false` as per success
     // status. If the corresponding hash table position now contains a different
@@ -237,6 +246,20 @@ inline const State Kmer_Hash_Table::operator[](const Kmer& k
 }
 
 
+template
+inline Kmer_Hash_Entry_API Kmer_Hash_Table::at(const Kmer& kmer)
+{
+    return operator[](kmer);
+}
+
+
+template
+inline Kmer_Hash_Entry_API Kmer_Hash_Table::at(const uint64_t bucket_id)
+{
+    return operator[](bucket_id);
+}
+
+
 template
 inline bool Kmer_Hash_Table::update(Kmer_Hash_Entry_API& api)
 {

From 9edbd916a19fc1a5b145977bfe0b9cff0a06f616 Mon Sep 17 00:00:00 2001
From: jamshed
Date: Wed, 19 Jan 2022 18:28:52 -0500
Subject: [PATCH 296/350] Abstract out bits/k-mer from enumerator

---
 include/Read_CdBG.hpp | 2 ++
 include/kmer_Enumerator.hpp | 10 +++++-----
 src/Read_CdBG.cpp | 12 ++++++------
 src/kmer_Enumerator.cpp | 8 ++++----
 4 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/include/Read_CdBG.hpp b/include/Read_CdBG.hpp
index d4ca007b..3b89ed72 100644
--- a/include/Read_CdBG.hpp
+++ b/include/Read_CdBG.hpp
@@ -29,6 +29,8 @@ class Read_CdBG
 
     dBG_Info dbg_info;  // Wrapper object for structural information of the graph.
 
+    static constexpr double bits_per_vertex = 9.71; // Expected number of bits required per vertex by Cuttlefish 2.
+
     // Enumerates the edges of the de Bruijn graph and returns summary statistics of the
     // enumeration.
diff --git a/include/kmer_Enumerator.hpp b/include/kmer_Enumerator.hpp
index aab6dc7b..8c7bed8e 100644
--- a/include/kmer_Enumerator.hpp
+++ b/include/kmer_Enumerator.hpp
@@ -21,7 +21,6 @@ class kmer_Enumerator
     static constexpr std::size_t min_memory = 3;    // In GB; set as per the KMC3 library requirement.
     static constexpr uint16_t bin_count = 2000;
     static constexpr uint16_t signature_len = 11;
-    static constexpr double bits_per_kmer = 9.71;
     static constexpr uint64_t counter_max = 1;  // The `-cs` argument for KMC3; we're not interested in the counts and `cs = 1` will trigger skipping the counts.
 
     KMC::Stage1Params stage1_params;    // Parameters collection for the k-mer statistics approximation step of KMC3.
@@ -37,8 +36,9 @@ class kmer_Enumerator
     uint64_t solid_kmer_count_approx(uint16_t cutoff) const;
 
     // Returns the strict memory limit (in GB) for the actual KMC3 execution, based on the number of
-    // unique k-mers `unique_kmer_count` (typically approximated earlier).
-    std::size_t memory_limit(uint64_t unique_kmer_count) const;
+    // unique k-mers `unique_kmer_count` (typically approximated earlier) and the expected bits/kmer
+    // `bits_per_kmer` requested.
+    std::size_t memory_limit(uint64_t unique_kmer_count, double bits_per_kmer) const;
 
 
 public:
@@ -54,8 +54,8 @@ class kmer_Enumerator
     // written to `working_dir_path`. The output database is stored at path prefix `output_db_path`.
     // Returns summary statistics of the enumeration. 
kmer_Enumeration_Stats enumerate( - KMC::InputFileType input_file_type, const std::vector& seqs, uint32_t cutoff, - uint16_t thread_count, std::size_t max_memory, bool strict_memory, bool estimate_mem_usage, + KMC::InputFileType input_file_type, const std::vector& seqs, uint32_t cutoff, uint16_t thread_count, + std::size_t max_memory, bool strict_memory, bool estimate_mem_usage, double bits_per_kmer, const std::string& working_dir_path, const std::string& output_db_path); }; diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 4dec2b6c..e0d61a58 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -136,8 +136,8 @@ kmer_Enumeration_Stats Read_CdBG::enumerate_edges() const { const KMC::InputFileType ip_type = (params.is_read_graph() ? KMC::InputFileType::FASTQ : KMC::InputFileType::MULTILINE_FASTA); return kmer_Enumerator().enumerate( - ip_type, logistics.input_paths_collection(), params.cutoff(), - params.thread_count(), params.max_memory(), params.strict_memory(), params.strict_memory(), + ip_type, logistics.input_paths_collection(), params.cutoff(), params.thread_count(), + params.max_memory(), params.strict_memory(), params.strict_memory(), bits_per_vertex, logistics.working_dir_path(), logistics.edge_db_path()); } @@ -150,14 +150,14 @@ kmer_Enumeration_Stats Read_CdBG::enumerate_vertices(const std::size_t max { const KMC::InputFileType ip_type = (params.is_read_graph() ? KMC::InputFileType::FASTQ : KMC::InputFileType::MULTILINE_FASTA); return kmer_Enumerator().enumerate( - ip_type, logistics.input_paths_collection(), 1, - params.thread_count(), max_memory, params.strict_memory(), false, + ip_type, logistics.input_paths_collection(), 1, params.thread_count(), + max_memory, params.strict_memory(), false, bits_per_vertex, logistics.working_dir_path(), logistics.vertex_db_path()); } return kmer_Enumerator().enumerate( - KMC::InputFileType::KMC, std::vector(1, logistics.edge_db_path()), 1, - params.thread_count(), max_memory, params.strict_memory(), false, + KMC::InputFileType::KMC, std::vector(1, logistics.edge_db_path()), 1, params.thread_count(), + max_memory, params.strict_memory(), false, bits_per_vertex, logistics.working_dir_path(), logistics.vertex_db_path()); } diff --git a/src/kmer_Enumerator.cpp b/src/kmer_Enumerator.cpp index af6d8e36..3bc9d9e7 100644 --- a/src/kmer_Enumerator.cpp +++ b/src/kmer_Enumerator.cpp @@ -8,8 +8,8 @@ template const std::size_t kmer_Enumerator::min_memory; template kmer_Enumeration_Stats kmer_Enumerator::enumerate( - const KMC::InputFileType input_file_type, const std::vector& seqs, const uint32_t cutoff, - const uint16_t thread_count, const std::size_t max_memory, const bool strict_memory, const bool estimate_mem_usage, + const KMC::InputFileType input_file_type, const std::vector& seqs, const uint32_t cutoff, const uint16_t thread_count, + const std::size_t max_memory, const bool strict_memory, const bool estimate_mem_usage, const double bits_per_kmer, const std::string& working_dir_path, const std::string& output_db_path) { // FunnyProgress progress; @@ -37,7 +37,7 @@ kmer_Enumeration_Stats kmer_Enumerator::enumerate( memory = std::max( - (estimate_mem ? std::max(memory_limit(solid_kmer_count_approx(cutoff)), max_memory) : max_memory), + (estimate_mem ? 
std::max(memory_limit(solid_kmer_count_approx(cutoff), bits_per_kmer), max_memory) : max_memory),
             min_memory);
 
     stage2_params
         .SetCutoffMin(cutoff)
@@ -71,7 +71,7 @@ uint64_t kmer_Enumerator::solid_kmer_count_approx(const uint16_t cutoff) cons
 
 
 template
-std::size_t kmer_Enumerator::memory_limit(const uint64_t unique_kmer_count) const
+std::size_t kmer_Enumerator::memory_limit(const uint64_t unique_kmer_count, const double bits_per_kmer) const
 {
     const double memory_in_bits = bits_per_kmer * unique_kmer_count;
     const double memory_in_bytes = memory_in_bits / 8.0;

From 8cefa589e354b1bfb818eb714db49480b8ed98ac Mon Sep 17 00:00:00 2001
From: jamshed
Date: Wed, 19 Jan 2022 18:56:58 -0500
Subject: [PATCH 297/350] Integrate kmc-lib to cuttlefish 1

---
 include/CdBG.hpp | 33 ++++++++++-
 src/CdBG.cpp | 116 ++++++++++++++++++++++++++++++++++----
 src/CdBG_Builder.cpp | 24 ++++----
 src/CdBG_GFA_Writer.cpp | 16 +++---
 src/CdBG_Plain_Writer.cpp | 14 ++---
 5 files changed, 163 insertions(+), 40 deletions(-)

diff --git a/include/CdBG.hpp b/include/CdBG.hpp
index 0b0e1831..cb282397 100644
--- a/include/CdBG.hpp
+++ b/include/CdBG.hpp
@@ -9,10 +9,14 @@
 #include "Annotated_Kmer.hpp"
 #include "Oriented_Unitig.hpp"
 #include "Build_Params.hpp"
+#include "Data_Logistics.hpp"
+#include "kmer_Enumeration_Stats.hpp"
+#include "dBG_Info.hpp"
 #include "Thread_Pool.hpp"
 #include "Job_Queue.hpp"
 
 #include "spdlog/async_logger.h"
+#include 
 #include 
 
 
@@ -25,7 +29,13 @@ class CdBG
 private:
 
     const Build_Params params;  // Required parameters wrapped in one object.
-    Kmer_Hash_Table Vertices;   // The hash table for the vertices (canonical k-mers) of the de Bruijn graph.
+    const Data_Logistics logistics; // Data logistics manager for the algorithm execution.
+    std::unique_ptr> hash_table;    // Hash table for the vertices (canonical k-mers) of the graph.
+
+    dBG_Info dbg_info;  // Wrapper object for structural information of the graph.
+
+    static constexpr double bits_per_vertex = 8.71; // Expected number of bits required per vertex by Cuttlefish 2.
+    static constexpr std::size_t parser_memory = 256 * 1024U * 1024U;  // An empirical estimation of the memory used by the sequence parser. 256 MB.
 
     // Minimum size of a partition to be processed by one thread.
     static constexpr uint16_t PARTITION_SIZE_THRESHOLD = 1;
@@ -95,11 +105,28 @@ class CdBG
 
     /* Build methods */
 
+    // Returns `true` iff the compacted de Bruijn graph to be built from the parameters
+    // collection `params` had been constructed in an earlier execution.
+    // NB: only the existence of the output meta-info file is checked for this purpose.
+    bool is_constructed() const;
+
+    // Enumerates the vertices of the de Bruijn graph and returns summary statistics of the
+    // enumeration.
+    kmer_Enumeration_Stats enumerate_vertices() const;
+
+    // Constructs the Cuttlefish hash table for the `vertex_count` vertices of the graph.
+    // If `load` is specified, then it is loaded from disk.
+    void construct_hash_table(uint64_t vertex_count, bool load = false);
+
     // TODO: rename the "classify" methods with appropriate terminology that are consistent with the theory.
 
     // Classifies the vertices into different types (or, classes).
     void classify_vertices();
 
+    // Returns the maximum temporary disk-usage incurred by some execution of the algorithm,
+    // that has its vertices-enumeration stats in `vertex_stats`. 
+ static std::size_t max_disk_usage(const kmer_Enumeration_Stats& vertex_stats); + // Distributes the classification task for the sequence `seq` of length // `seq_len` to the thread pool `thread_pool`. void distribute_classification(const char* seq, size_t seq_len, Thread_Pool& thread_pool); @@ -414,6 +441,10 @@ class CdBG // compacted representation of the underlying reference de Bruijn graph wrapped in `params`. CdBG(const Build_Params& params); + // Destructs the compacted graph builder object, freeing its hash table and dumping the + // graph information to disk. + ~CdBG(); + // Constructs the compacted reference de Bruijn graph, employing the parameters received // with the object-constructor. void construct(); diff --git a/src/CdBG.cpp b/src/CdBG.cpp index 71e18e7f..3f848345 100644 --- a/src/CdBG.cpp +++ b/src/CdBG.cpp @@ -1,39 +1,124 @@ #include "CdBG.hpp" +#include "kmer_Enumerator.hpp" +#include "Kmer_SPMC_Iterator.hpp" #include "utility.hpp" template CdBG::CdBG(const Build_Params& params): params(params), - Vertices(params.vertex_db_path()) + logistics(this->params), + hash_table(nullptr), + dbg_info(params.json_file_path()) { Kmer::set_k(params.k()); } +template +CdBG::~CdBG() +{ + if(hash_table != nullptr) + hash_table->clear(); + + dbg_info.dump_info(); +} + + template void CdBG::construct() { - std::cout << "\nConstructing the minimal perfect hash function (MPHF).\n"; - Vertices.construct(params.thread_count(), params.working_dir_path(), params.mph_file_path()); + if(is_constructed()) + { + std::cout << "\nThe compacted de Bruijn graph has been constructed earlier. Check " << dbg_info.file_path() << " for results.\n"; + return; + } + + dbg_info.add_build_params(params); + + std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); + + + std::cout << "\nEnumerating the vertices of the de Bruijn graph.\n"; + kmer_Enumeration_Stats vertex_stats = enumerate_vertices(); + vertex_stats.log_stats(); + + std::chrono::high_resolution_clock::time_point t_vertex = std::chrono::high_resolution_clock::now(); + std::cout << "Enumerated the vertex set of the graph. Time taken = " << std::chrono::duration_cast>(t_vertex - t_start).count() << " seconds.\n"; + + + const uint64_t vertex_count = vertex_stats.counted_kmer_count(); + std::cout << "Number of vertices: " << vertex_count << ".\n"; + + + std::cout << "\nConstructing the minimal perfect hash function (MPHF) over the vertex set.\n"; + construct_hash_table(vertex_count); if(params.remove_kmc_db()) + Kmer_Container::remove(logistics.vertex_db_path()); + + std::chrono::high_resolution_clock::time_point t_mphf = std::chrono::high_resolution_clock::now(); + std::cout << "Constructed the minimal perfect hash function for the vertices. Time taken = " << std::chrono::duration_cast>(t_mphf - t_vertex).count() << " seconds.\n"; + + + std::cout << "\nComputing the DFA states.\n"; + classify_vertices(); + + std::chrono::high_resolution_clock::time_point t_dfa = std::chrono::high_resolution_clock::now(); + std::cout << "Computed the states of the automata. Time taken = " << std::chrono::duration_cast>(t_dfa - t_mphf).count() << " seconds.\n"; + + + std::cout << "\nExtracting the maximal unitigs.\n"; + output_maximal_unitigs(); + + std::chrono::high_resolution_clock::time_point t_extract = std::chrono::high_resolution_clock::now(); + std::cout << "Extracted the maximal unitigs. 
Time taken = " << std::chrono::duration_cast>(t_extract - t_dfa).count() << " seconds.\n"; + + + const double max_disk = static_cast(max_disk_usage(vertex_stats)) / (1024.0 * 1024.0 * 1024.0); + std::cout << "\nMaximum temporary disk-usage: " << max_disk << "GB.\n"; +} + + +template +kmer_Enumeration_Stats CdBG::enumerate_vertices() const +{ + const KMC::InputFileType ip_type = KMC::InputFileType::MULTILINE_FASTA; + return kmer_Enumerator().enumerate( + ip_type, logistics.input_paths_collection(), 1, params.thread_count(), + params.max_memory(), params.strict_memory(), params.strict_memory(), bits_per_vertex, + logistics.working_dir_path(), logistics.vertex_db_path() + ); +} + + +template +void CdBG::construct_hash_table(const uint64_t vertex_count, const bool load) +{ + if(load) { - Kmer_Container::remove(params.vertex_db_path()); - std::cout << "\nRemoved the KMC database from disk.\n"; + hash_table = std::make_unique>(logistics.vertex_db_path(), vertex_count); + hash_table->load(params); } - - if(!params.output_file_path().empty()) + else { - std::cout << "\nComputing the vertex-states.\n"; - classify_vertices(); + std::size_t max_memory = std::max(process_peak_memory(), params.max_memory() * 1024U * 1024U * 1024U); + max_memory = (max_memory > parser_memory ? max_memory - parser_memory : 0); + + hash_table = (params.strict_memory() ? + std::make_unique>(logistics.vertex_db_path(), vertex_count, max_memory) : + std::make_unique>(logistics.vertex_db_path(), vertex_count, max_memory, std::numeric_limits::max())); - std::cout << "\nOutputting the compacted de Bruijn graph.\n"; - output_maximal_unitigs(); + hash_table->construct(params.thread_count(), logistics.working_dir_path(), params.mph_file_path()); } +} + - Vertices.clear(); +template +bool CdBG::is_constructed() const +{ + return file_exists(params.json_file_path()); } @@ -67,6 +152,13 @@ size_t CdBG::search_valid_kmer(const char* const seq, const size_t left_end, } +template +std::size_t CdBG::max_disk_usage(const kmer_Enumeration_Stats& vertex_stats) +{ + return std::max(vertex_stats.temp_disk_usage(), vertex_stats.db_size()); +} + + // Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, CdBG) diff --git a/src/CdBG_Builder.cpp b/src/CdBG_Builder.cpp index 33a4af2a..49e8a180 100644 --- a/src/CdBG_Builder.cpp +++ b/src/CdBG_Builder.cpp @@ -23,7 +23,7 @@ void CdBG::classify_vertices() std::cout << "Found the hash table buckets at file " << buckets_file_path << "\n"; std::cout << "Loading the buckets.\n"; - Vertices.load_hash_buckets(buckets_file_path); + hash_table->load_hash_buckets(buckets_file_path); std::cout << "Loaded the buckets into memory.\n"; } @@ -84,7 +84,7 @@ void CdBG::classify_vertices() { std::cout << "Saving the hash table buckets into file " << buckets_file_path << "\n"; - Vertices.save_hash_buckets(buckets_file_path); + hash_table->save_hash_buckets(buckets_file_path); std::cout << "Saved the buckets in disk.\n"; } @@ -249,11 +249,11 @@ bool CdBG::process_loop(const Directed_Kmer& kmer, const Directed_Kmer& { // Fetch the entry for `kmer_hat`. 
const Kmer& kmer_hat = kmer.canonical(); - Kmer_Hash_Entry_API hash_table_entry = Vertices[kmer_hat]; + Kmer_Hash_Entry_API hash_table_entry = hash_table->at(kmer_hat); State& state = hash_table_entry.get_state(); state = State(Vertex(cuttlefish::State_Class::multi_in_multi_out)); - return Vertices.update(hash_table_entry); + return hash_table->update(hash_table_entry); } @@ -272,7 +272,7 @@ bool CdBG::process_leftmost_kmer(const Directed_Kmer& kmer, const Directed const Kmer& next_kmer_hat = next_kmer.canonical(); // Fetch the entry for `kmer_hat`. - Kmer_Hash_Entry_API hash_table_entry = Vertices[kmer_hat]; + Kmer_Hash_Entry_API hash_table_entry = hash_table->at(kmer_hat); State& state = hash_table_entry.get_state(); // The k-mer is already classified as a complex node. @@ -363,7 +363,7 @@ bool CdBG::process_leftmost_kmer(const Directed_Kmer& kmer, const Directed // in the hash table by the time this method completes, making no updates at this point is theoretically // equivalent to returning instantaneously as soon as the hash table value had been read; and also (2) the // ordering of the edges processed does not matter in the algorithm. - return state == old_state ? true : Vertices.update(hash_table_entry); + return state == old_state ? true : hash_table->update(hash_table_entry); } @@ -374,7 +374,7 @@ bool CdBG::process_rightmost_kmer(const Directed_Kmer& kmer, const char pr const cuttlefish::dir_t dir = kmer.dir(); // Fetch the entry for `kmer_hat`. - Kmer_Hash_Entry_API hash_table_entry = Vertices[kmer_hat]; + Kmer_Hash_Entry_API hash_table_entry = hash_table->at(kmer_hat); State& state = hash_table_entry.get_state(); // The k-mer is already classified as a complex node. @@ -462,7 +462,7 @@ bool CdBG::process_rightmost_kmer(const Directed_Kmer& kmer, const char pr // in the hash table by the time this method completes, making no updates at this point is theoretically // equivalent to returning instantaneously as soon as the hash table value had been read; and also (2) the // ordering of the edges processed does not matter in the algorithm. - return state == old_state ? true : Vertices.update(hash_table_entry); + return state == old_state ? true : hash_table->update(hash_table_entry); } @@ -474,7 +474,7 @@ bool CdBG::process_internal_kmer(const Directed_Kmer& kmer, const Directed const Kmer& next_kmer_hat = next_kmer.canonical(); // Fetch the hash table entry for `kmer_hat`. - Kmer_Hash_Entry_API hash_table_entry = Vertices[kmer_hat]; + Kmer_Hash_Entry_API hash_table_entry = hash_table->at(kmer_hat); State& state = hash_table_entry.get_state(); // The k-mer is already classified as a complex node. @@ -580,7 +580,7 @@ bool CdBG::process_internal_kmer(const Directed_Kmer& kmer, const Directed // in the hash table by the time this method completes, making no updates at this point is theoretically // equivalent to returning instantaneously as soon as the hash table value had been read; and also (2) the // ordering of the edges processed does not matter in the algorithm. - return state == old_state ? true : Vertices.update(hash_table_entry); + return state == old_state ? true : hash_table->update(hash_table_entry); } @@ -590,7 +590,7 @@ bool CdBG::process_isolated_kmer(const Directed_Kmer& kmer) const Kmer& kmer_hat = kmer.canonical(); // Fetch the hash table entry for `kmer_hat`. 
- Kmer_Hash_Entry_API hash_table_entry = Vertices[kmer_hat]; + Kmer_Hash_Entry_API hash_table_entry = hash_table->at(kmer_hat); State& state = hash_table_entry.get_state(); @@ -600,7 +600,7 @@ bool CdBG::process_isolated_kmer(const Directed_Kmer& kmer) // Classify the isolated k-mer as a complex node. state = State(Vertex(cuttlefish::State_Class::multi_in_multi_out)); - return Vertices.update(hash_table_entry); + return hash_table->update(hash_table_entry); } diff --git a/src/CdBG_GFA_Writer.cpp b/src/CdBG_GFA_Writer.cpp index ca69a6d1..1f369a6a 100644 --- a/src/CdBG_GFA_Writer.cpp +++ b/src/CdBG_GFA_Writer.cpp @@ -165,7 +165,7 @@ size_t CdBG::output_maximal_unitigs_gfa(const uint16_t thread_id, const char* // assert(kmer_idx <= seq_len - k); - Annotated_Kmer curr_kmer(Kmer(seq, kmer_idx), kmer_idx, Vertices); + Annotated_Kmer curr_kmer(Kmer(seq, kmer_idx), kmer_idx, *hash_table); // The subsequence contains only an isolated k-mer, i.e. there's no valid left or right // neighboring k-mer to this k-mer. So it's a maximal unitig by itself. @@ -178,7 +178,7 @@ size_t CdBG::output_maximal_unitigs_gfa(const uint16_t thread_id, const char* if(kmer_idx + k == seq_len || Kmer::is_placeholder(seq[kmer_idx + k])) { // A valid left neighbor exists as it's not an isolated k-mer. - Annotated_Kmer prev_kmer(Kmer(seq, kmer_idx - 1), kmer_idx, Vertices); + Annotated_Kmer prev_kmer(Kmer(seq, kmer_idx - 1), kmer_idx, *hash_table); if(is_unipath_start(curr_kmer.state_class(), curr_kmer.dir(), prev_kmer.state_class(), prev_kmer.dir())) // A maximal unitig ends at the ending of a maximal valid subsequence. @@ -191,7 +191,7 @@ size_t CdBG::output_maximal_unitigs_gfa(const uint16_t thread_id, const char* // A valid right neighbor exists for the k-mer. Annotated_Kmer next_kmer = curr_kmer; - next_kmer.roll_to_next_kmer(seq[kmer_idx + k], Vertices); + next_kmer.roll_to_next_kmer(seq[kmer_idx + k], *hash_table); bool on_unipath = false; Annotated_Kmer unipath_start_kmer; @@ -207,7 +207,7 @@ size_t CdBG::output_maximal_unitigs_gfa(const uint16_t thread_id, const char* // Both left and right valid neighbors exist for this k-mer. else { - prev_kmer = Annotated_Kmer(Kmer(seq, kmer_idx - 1), kmer_idx, Vertices); + prev_kmer = Annotated_Kmer(Kmer(seq, kmer_idx - 1), kmer_idx, *hash_table); if(is_unipath_start(curr_kmer.state_class(), curr_kmer.dir(), prev_kmer.state_class(), prev_kmer.dir())) { on_unipath = true; @@ -250,7 +250,7 @@ size_t CdBG::output_maximal_unitigs_gfa(const uint16_t thread_id, const char* } else // A valid right neighbor exists. { - next_kmer.roll_to_next_kmer(seq[kmer_idx + k], Vertices); + next_kmer.roll_to_next_kmer(seq[kmer_idx + k], *hash_table); if(on_unipath && is_unipath_end(curr_kmer.state_class(), curr_kmer.dir(), next_kmer.state_class(), next_kmer.dir())) { @@ -276,8 +276,8 @@ void CdBG::output_gfa_unitig(const uint16_t thread_id, const char* const seq, // For a particular unitig, always query the same well-defined canonical flanking // k-mer, irrespective of which direction the unitig may be traversed at. 
const Kmer min_flanking_kmer = std::min(start_kmer.canonical(), end_kmer.canonical()); - const uint64_t bucket_id = Vertices.bucket_id(min_flanking_kmer); - Kmer_Hash_Entry_API hash_table_entry = Vertices[bucket_id]; + const uint64_t bucket_id = hash_table->bucket_id(min_flanking_kmer); + Kmer_Hash_Entry_API hash_table_entry = hash_table->at(bucket_id); State& state = hash_table_entry.get_state(); // Name the GFA segment with the hash value of the first k-mer of the canonical form unitig. @@ -293,7 +293,7 @@ void CdBG::output_gfa_unitig(const uint16_t thread_id, const char* const seq, state = state.outputted(); // If the hash table update is successful, only then this thread may output this unitig. - if(Vertices.update(hash_table_entry)) + if(hash_table->update(hash_table_entry)) params.output_format() == cuttlefish::Output_Format::gfa_reduced ? write_segment(thread_id, seq, unitig_id, start_kmer.idx(), end_kmer.idx(), unitig_dir) : write_gfa_segment(thread_id, seq, unitig_id, start_kmer.idx(), end_kmer.idx(), unitig_dir); diff --git a/src/CdBG_Plain_Writer.cpp b/src/CdBG_Plain_Writer.cpp index c9e22ff9..ede69115 100644 --- a/src/CdBG_Plain_Writer.cpp +++ b/src/CdBG_Plain_Writer.cpp @@ -29,7 +29,7 @@ size_t CdBG::output_maximal_unitigs_plain(const uint16_t thread_id, const cha // assert(kmer_idx <= seq_len - k); - Annotated_Kmer curr_kmer(Kmer(seq, kmer_idx), kmer_idx, Vertices); + Annotated_Kmer curr_kmer(Kmer(seq, kmer_idx), kmer_idx, *hash_table); // The subsequence contains only an isolated k-mer, i.e. there's no valid left or right // neighboring k-mer to this k-mer. So it's a maximal unitig by itself. @@ -42,7 +42,7 @@ size_t CdBG::output_maximal_unitigs_plain(const uint16_t thread_id, const cha if(kmer_idx + k == seq_len || Kmer::is_placeholder(seq[kmer_idx + k])) { // A valid left neighbor exists as it's not an isolated k-mer. - Annotated_Kmer prev_kmer(Kmer(seq, kmer_idx - 1), kmer_idx, Vertices); + Annotated_Kmer prev_kmer(Kmer(seq, kmer_idx - 1), kmer_idx, *hash_table); if(is_unipath_start(curr_kmer.state_class(), curr_kmer.dir(), prev_kmer.state_class(), prev_kmer.dir())) // A maximal unitig ends at the ending of a maximal valid subsequence. @@ -55,7 +55,7 @@ size_t CdBG::output_maximal_unitigs_plain(const uint16_t thread_id, const cha // A valid right neighbor exists for the k-mer. Annotated_Kmer next_kmer = curr_kmer; - next_kmer.roll_to_next_kmer(seq[kmer_idx + k], Vertices); + next_kmer.roll_to_next_kmer(seq[kmer_idx + k], *hash_table); bool on_unipath = false; Annotated_Kmer unipath_start_kmer; @@ -71,7 +71,7 @@ size_t CdBG::output_maximal_unitigs_plain(const uint16_t thread_id, const cha // Both left and right valid neighbors exist for this k-mer. else { - prev_kmer = Annotated_Kmer(Kmer(seq, kmer_idx - 1), kmer_idx, Vertices); + prev_kmer = Annotated_Kmer(Kmer(seq, kmer_idx - 1), kmer_idx, *hash_table); if(is_unipath_start(curr_kmer.state_class(), curr_kmer.dir(), prev_kmer.state_class(), prev_kmer.dir())) { on_unipath = true; @@ -114,7 +114,7 @@ size_t CdBG::output_maximal_unitigs_plain(const uint16_t thread_id, const cha } else // A valid right neighbor exists. 
{ - next_kmer.roll_to_next_kmer(seq[kmer_idx + k], Vertices); + next_kmer.roll_to_next_kmer(seq[kmer_idx + k], *hash_table); if(on_unipath && is_unipath_end(curr_kmer.state_class(), curr_kmer.dir(), next_kmer.state_class(), next_kmer.dir())) { @@ -140,7 +140,7 @@ void CdBG::output_plain_unitig(const uint16_t thread_id, const char* const se // For a particular unitig, always query the same well-defined canonical flanking // k-mer, irrespective of which direction the unitig may be traversed at. const Kmer min_flanking_kmer = std::min(start_kmer.canonical(), end_kmer.canonical()); - Kmer_Hash_Entry_API hash_table_entry = Vertices[min_flanking_kmer]; + Kmer_Hash_Entry_API hash_table_entry = hash_table->at(min_flanking_kmer); State& state = hash_table_entry.get_state(); if(state.is_outputted()) @@ -150,7 +150,7 @@ void CdBG::output_plain_unitig(const uint16_t thread_id, const char* const se state = state.outputted(); // If the hash table update is successful, only then this thread may output this unitig. - if(Vertices.update(hash_table_entry)) + if(hash_table->update(hash_table_entry)) write_path(thread_id, seq, start_kmer.idx(), end_kmer.idx(), start_kmer.kmer() < end_kmer.rev_compl()); } From eed9c0c2e93223fcbc04dd155c9d3465462ec145 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 20 Jan 2022 14:04:29 -0500 Subject: [PATCH 298/350] Track unipath-info w/ size only too --- include/Unipaths_Meta_info.hpp | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/include/Unipaths_Meta_info.hpp b/include/Unipaths_Meta_info.hpp index 94200140..510cdf85 100644 --- a/include/Unipaths_Meta_info.hpp +++ b/include/Unipaths_Meta_info.hpp @@ -37,6 +37,9 @@ class Unipaths_Meta_info // to the tracker. void add_maximal_unitig(const Maximal_Unitig_Scratch& maximal_unitig); + // Adds information of a maximal unitig with vertex count `size` to the tracker. + void add_maximal_unitig(std::size_t size); + // Aggregates the information of the tracker `other` to this tracker. void aggregate(const Unipaths_Meta_info& other); @@ -73,11 +76,11 @@ class Unipaths_Meta_info template -inline void Unipaths_Meta_info::add_maximal_unitig(const Maximal_Unitig_Scratch& maximal_unitig) +inline void Unipaths_Meta_info::add_maximal_unitig(const std::size_t size) { unipath_count_++; - const std::size_t vertex_count = maximal_unitig.size(); + const std::size_t vertex_count = size; const std::size_t unipath_size = vertex_count + (k - 1); kmer_count_ += vertex_count; @@ -88,13 +91,19 @@ inline void Unipaths_Meta_info::add_maximal_unitig(const Maximal_Unitig_Scrat min_len_ = unipath_size; sum_len_ += unipath_size; +} + +template +inline void Unipaths_Meta_info::add_maximal_unitig(const Maximal_Unitig_Scratch& maximal_unitig) +{ + add_maximal_unitig(maximal_unitig.size()); if(maximal_unitig.is_cycle()) { dcc_count_++; - dcc_kmer_count_ += vertex_count; - dcc_sum_len_ += unipath_size; + dcc_kmer_count_ += maximal_unitig.size(); + dcc_sum_len_ += maximal_unitig.size() + (k - 1); } } From c19a38adf3f9490bbbfe37154de6f0f250853457 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 20 Jan 2022 14:08:45 -0500 Subject: [PATCH 299/350] Factor out common unipaths-info addition --- include/dBG_Info.hpp | 4 ++++ src/dBG_Info.cpp | 11 +++++++++++ 2 files changed, 15 insertions(+) diff --git a/include/dBG_Info.hpp b/include/dBG_Info.hpp index a14ccad8..8f09e9f1 100644 --- a/include/dBG_Info.hpp +++ b/include/dBG_Info.hpp @@ -12,6 +12,7 @@ // Forward declarations. 
template class Read_CdBG_Constructor; template class Read_CdBG_Extractor; +template class Unipaths_Meta_info; class Build_Params; @@ -35,6 +36,9 @@ class dBG_Info // Loads the JSON file from disk, if the corresponding file exists. void load_from_file(); + // Adds information about maximal unitigs tracked in `unipaths_info`. + void add_unipaths_info(const Unipaths_Meta_info& unipaths_info); + public: diff --git a/src/dBG_Info.cpp b/src/dBG_Info.cpp index 0c74b21a..a735a774 100644 --- a/src/dBG_Info.cpp +++ b/src/dBG_Info.cpp @@ -2,6 +2,7 @@ #include "dBG_Info.hpp" #include "Read_CdBG_Constructor.hpp" #include "Read_CdBG_Extractor.hpp" +#include "Unipaths_Meta_info.hpp" #include "Build_Params.hpp" #include "utility.hpp" @@ -55,6 +56,9 @@ void dBG_Info::add_unipaths_info(const Read_CdBG_Extractor& cdbg_extractor { const Unipaths_Meta_info& unipaths_info = cdbg_extractor.unipaths_meta_info(); +template +void dBG_Info::add_unipaths_info(const Unipaths_Meta_info& unipaths_info) +{ dBg_info[contigs_field]["maximal unitig count"] = unipaths_info.unipath_count(); dBg_info[contigs_field]["vertex count in the maximal unitigs"] = unipaths_info.kmer_count(); dBg_info[contigs_field]["shortest maximal unitig length"] = unipaths_info.min_len(); @@ -62,7 +66,14 @@ void dBG_Info::add_unipaths_info(const Read_CdBG_Extractor& cdbg_extractor dBg_info[contigs_field]["sum maximal unitig length"] = unipaths_info.sum_len(); dBg_info[contigs_field]["avg. maximal unitig length"] = unipaths_info.avg_len(); dBg_info[contigs_field]["_comment"] = "lengths are in bases"; +} + +template +void dBG_Info::add_unipaths_info(const Read_CdBG_Extractor& cdbg_extractor) +{ + const Unipaths_Meta_info& unipaths_info = cdbg_extractor.unipaths_meta_info(); + add_unipaths_info(unipaths_info); dBg_info[dcc_field]["DCC count"] = unipaths_info.dcc_count(); if(unipaths_info.dcc_count() > 0) From 6064e44951b4ad3ebbf72fe6d6fce4940b6e88a9 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 20 Jan 2022 14:18:42 -0500 Subject: [PATCH 300/350] Track meta-info for cuttlefish 1 --- include/CdBG.hpp | 11 +++++++++++ include/dBG_Info.hpp | 7 +++++++ src/CdBG.cpp | 14 ++++++++++++++ src/CdBG_Builder.cpp | 2 ++ src/CdBG_GFA_Writer.cpp | 4 ++++ src/CdBG_Plain_Writer.cpp | 4 ++++ src/CdBG_Writer.cpp | 7 +++++++ src/dBG_Info.cpp | 15 +++++++++++++-- 8 files changed, 62 insertions(+), 2 deletions(-) diff --git a/include/CdBG.hpp b/include/CdBG.hpp index cb282397..dd36780b 100644 --- a/include/CdBG.hpp +++ b/include/CdBG.hpp @@ -11,6 +11,7 @@ #include "Build_Params.hpp" #include "Data_Logistics.hpp" #include "kmer_Enumeration_Stats.hpp" +#include "Unipaths_Meta_info.hpp" #include "dBG_Info.hpp" #include "Thread_Pool.hpp" #include "Job_Queue.hpp" @@ -18,6 +19,7 @@ #include #include +#include // De Bruijn graph class to support the compaction algorithm. @@ -32,6 +34,9 @@ class CdBG const Data_Logistics logistics; // Data logistics manager for the algorithm execution. std::unique_ptr> hash_table; // Hash table for the vertices (canonical k-mers) of the graph. + Unipaths_Meta_info unipaths_meta_info_; // Meta-information over the extracted maximal unitigs. + std::vector> unipaths_info_local; // Meta-information over the extracted maximal unitigs per thread. + dBG_Info dbg_info; // Wrapper object for structural information of the graph. static constexpr double bits_per_vertex = 8.71; // Expected number of bits required per vertex by Cuttlefish 2. 
@@ -448,6 +453,12 @@ class CdBG // Constructs the compacted reference de Bruijn graph, employing the parameters received // with the object-constructor. void construct(); + + // Returns a wrapper over the meta-information of the extracted unitigs. + const Unipaths_Meta_info& unipaths_meta_info() const; + + // Returns the number of distinct vertices in the underlying graph. + uint64_t vertex_count() const; }; diff --git a/include/dBG_Info.hpp b/include/dBG_Info.hpp index 8f09e9f1..1cebcf8b 100644 --- a/include/dBG_Info.hpp +++ b/include/dBG_Info.hpp @@ -12,6 +12,7 @@ // Forward declarations. template class Read_CdBG_Constructor; template class Read_CdBG_Extractor; +template class CdBG; template class Unipaths_Meta_info; class Build_Params; @@ -55,9 +56,15 @@ class dBG_Info // Adds basic graph structural information from `cdbg_constructor`. void add_basic_info(const Read_CdBG_Constructor& cdbg_constructor); + // Adds basic graph structural information from `cdbg`. + void add_basic_info(const CdBG& cdbg); + // Adds information about the extracted maximal unitigs from `cdbg_extractor`. void add_unipaths_info(const Read_CdBG_Extractor& cdbg_extractor); + // Adds information about the extracted maximal unitigs from `cdbg`. + void add_unipaths_info(const CdBG& cdbg); + // Writes the JSON object to its corresponding disk-file. void dump_info() const; }; diff --git a/src/CdBG.cpp b/src/CdBG.cpp index 3f848345..aef7195f 100644 --- a/src/CdBG.cpp +++ b/src/CdBG.cpp @@ -159,6 +159,20 @@ std::size_t CdBG::max_disk_usage(const kmer_Enumeration_Stats& vertex_stat } +template +const Unipaths_Meta_info& CdBG::unipaths_meta_info() const +{ + return unipaths_meta_info_; +} + + +template +uint64_t CdBG::vertex_count() const +{ + return hash_table->size(); +} + + // Template instantiations for the required instances. ENUMERATE(INSTANCE_COUNT, INSTANTIATE, CdBG) diff --git a/src/CdBG_Builder.cpp b/src/CdBG_Builder.cpp index 49e8a180..091f9dd9 100644 --- a/src/CdBG_Builder.cpp +++ b/src/CdBG_Builder.cpp @@ -91,6 +91,8 @@ void CdBG::classify_vertices() } + dbg_info.add_basic_info(*this); + std::chrono::high_resolution_clock::time_point t_end = std::chrono::high_resolution_clock::now(); double elapsed_seconds = std::chrono::duration_cast>(t_end - t_start).count(); std::cout << "Done computing the vertex-states. Time taken = " << elapsed_seconds << " seconds.\n"; diff --git a/src/CdBG_GFA_Writer.cpp b/src/CdBG_GFA_Writer.cpp index 1f369a6a..1c4a1923 100644 --- a/src/CdBG_GFA_Writer.cpp +++ b/src/CdBG_GFA_Writer.cpp @@ -294,9 +294,13 @@ void CdBG::output_gfa_unitig(const uint16_t thread_id, const char* const seq, // If the hash table update is successful, only then this thread may output this unitig. if(hash_table->update(hash_table_entry)) + { params.output_format() == cuttlefish::Output_Format::gfa_reduced ? write_segment(thread_id, seq, unitig_id, start_kmer.idx(), end_kmer.idx(), unitig_dir) : write_gfa_segment(thread_id, seq, unitig_id, start_kmer.idx(), end_kmer.idx(), unitig_dir); + + unipaths_info_local[thread_id].add_maximal_unitig(end_kmer.idx() - start_kmer.idx() + 1); + } } diff --git a/src/CdBG_Plain_Writer.cpp b/src/CdBG_Plain_Writer.cpp index ede69115..fc516f0b 100644 --- a/src/CdBG_Plain_Writer.cpp +++ b/src/CdBG_Plain_Writer.cpp @@ -151,7 +151,11 @@ void CdBG::output_plain_unitig(const uint16_t thread_id, const char* const se // If the hash table update is successful, only then this thread may output this unitig. 
if(hash_table->update(hash_table_entry)) + { write_path(thread_id, seq, start_kmer.idx(), end_kmer.idx(), start_kmer.kmer() < end_kmer.rev_compl()); + + unipaths_info_local[thread_id].add_maximal_unitig(end_kmer.idx() - start_kmer.idx() + 1); + } } diff --git a/src/CdBG_Writer.cpp b/src/CdBG_Writer.cpp index bed78766..46cfb1f4 100644 --- a/src/CdBG_Writer.cpp +++ b/src/CdBG_Writer.cpp @@ -20,6 +20,7 @@ template void CdBG::output_maximal_unitigs() { const uint8_t output_format = params.output_format(); + unipaths_info_local.resize(params.thread_count()); if(output_format == cuttlefish::txt) output_maximal_unitigs_plain(); @@ -27,6 +28,12 @@ void CdBG::output_maximal_unitigs() output_maximal_unitigs_gfa(); else if(output_format == cuttlefish::gfa_reduced) output_maximal_unitigs_gfa_reduced(); + + + for(uint16_t t_id = 0; t_id < params.thread_count(); ++t_id) + unipaths_meta_info_.aggregate(unipaths_info_local[t_id]); + + dbg_info.add_unipaths_info(*this); } diff --git a/src/dBG_Info.cpp b/src/dBG_Info.cpp index a735a774..0c5c75b1 100644 --- a/src/dBG_Info.cpp +++ b/src/dBG_Info.cpp @@ -2,6 +2,7 @@ #include "dBG_Info.hpp" #include "Read_CdBG_Constructor.hpp" #include "Read_CdBG_Extractor.hpp" +#include "CdBG.hpp" #include "Unipaths_Meta_info.hpp" #include "Build_Params.hpp" #include "utility.hpp" @@ -52,9 +53,11 @@ void dBG_Info::add_basic_info(const Read_CdBG_Constructor& cdbg_constructo template -void dBG_Info::add_unipaths_info(const Read_CdBG_Extractor& cdbg_extractor) +void dBG_Info::add_basic_info(const CdBG& cdbg) { - const Unipaths_Meta_info& unipaths_info = cdbg_extractor.unipaths_meta_info(); + dBg_info[basic_field]["vertex count"] = cdbg.vertex_count(); +} + template void dBG_Info::add_unipaths_info(const Unipaths_Meta_info& unipaths_info) @@ -84,6 +87,14 @@ void dBG_Info::add_unipaths_info(const Read_CdBG_Extractor& cdbg_extractor } +template +void dBG_Info::add_unipaths_info(const CdBG& cdbg) +{ + const Unipaths_Meta_info& unipaths_info = cdbg.unipaths_meta_info(); + add_unipaths_info(unipaths_info); +} + + template void dBG_Info::add_build_params(const Build_Params& params) { From b8c9b5aadb035c223a2311b28023ef3e245bf8cf Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 20 Jan 2022 18:46:18 -0500 Subject: [PATCH 301/350] Indent better --- src/main.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 0b51c84e..c3d62509 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -30,7 +30,8 @@ void build(int argc, char** argv) ("t,threads", "number of threads to use", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::THREAD_COUNT))) ("o,output", "output file", cxxopts::value()) ("w,work-dir", "working directory", cxxopts::value()->default_value(cuttlefish::_default::WORK_DIR)) - ("h,help", "print usage"); + ("h,help", "print usage") + ; options.add_options("cuttlefish 2.0") ("read", "construct a compacted read de Bruijn graph") @@ -38,18 +39,21 @@ void build(int argc, char** argv) ("c,cutoff", "frequency cutoff for (k + 1)-mers", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::CUTOFF_FREQ))) ("m,max-memory", "soft maximum memory limit (in GB)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::MAX_MEMORY))) ("unrestrict-memory", "do not impose memory usage restriction") - ("path-cover", "extract a maximal path cover of the de Bruijn graph"); + ("path-cover", "extract a maximal path cover of the de Bruijn graph") + ; options.add_options("cuttlefish 1.0") ("s,kmc-db", "set 
of vertices, i.e. k-mers (KMC database) prefix", cxxopts::value()->default_value(cuttlefish::_default::WORK_DIR)) ("f,format", "output format (0: txt, 1: GFA 1.0, 2: GFA 2.0, 3: GFA-reduced)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::OP_FORMAT))) - ("rm", "remove the KMC database"); + ("rm", "remove the KMC database") // TODO: rename it to `keep` and move to a common option. + ; options.add_options("specialized") // TODO: repurpose the following two options ("mph", "minimal perfect hash (BBHash) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("buckets", "hash table buckets (cuttlefish) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) - ("save-vertices", "save the vertex set of the graph"); + ("save-vertices", "save the vertex set of the graph") + ; options.add_options("debug") ("e,edge-db", "set of edges, i.e. (k + 1)-mers (KMC database) prefix", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) From 2c21637b72388bd4c5b9872e27dd766108715993 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 20 Jan 2022 18:57:21 -0500 Subject: [PATCH 302/350] Remove --rm option for vertex DB --- include/Build_Params.hpp | 9 --------- src/Build_Params.cpp | 4 +--- src/CdBG.cpp | 5 ++++- src/main.cpp | 4 +--- 4 files changed, 6 insertions(+), 16 deletions(-) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index 5fe104d4..4453eba9 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -33,7 +33,6 @@ class Build_Params const cuttlefish::Output_Format output_format_; // Output format (0: txt, 1: GFAv1, 2: GFAv2). const std::string working_dir_path_; // Path to the working directory (for temporary files). const bool path_cover_; // Whether to extract a maximal path cover of the de Bruijn graph. - const bool remove_kmc_db_; // Option to remove the KMC database, once no longer required. const std::string mph_file_path_; // Optional path to file storing an MPH over the k-mer set. const std::string buckets_file_path_; // Optional path to file storing the hash table buckets for the k-mer set. const bool save_vertices_; // Option to save the vertex set of the de Bruijn graph (in KMC database format). @@ -61,7 +60,6 @@ class Build_Params const uint8_t output_format, const std::string& working_dir_path, const bool path_cover, - const bool remove_kmc_db, const std::string& mph_file_path, const std::string& buckets_file_path, const bool save_vertices @@ -176,13 +174,6 @@ class Build_Params } - // Returns the boolean flag for removing the KMC database. - bool remove_kmc_db() const - { - return remove_kmc_db_; - } - - // Returns the path to the optional MPH file. const std::string mph_file_path() const { diff --git a/src/Build_Params.cpp b/src/Build_Params.cpp index d5fa3103..1812d0db 100644 --- a/src/Build_Params.cpp +++ b/src/Build_Params.cpp @@ -20,7 +20,6 @@ Build_Params::Build_Params( const bool is_read_graph, const uint8_t output_format, const std::string& working_dir_path, const bool path_cover, - const bool remove_kmc_db, const std::string& mph_file_path, const std::string& buckets_file_path, const bool save_vertices @@ -42,7 +41,6 @@ Build_Params::Build_Params( const bool is_read_graph, output_format_(cuttlefish::Output_Format(output_format)), working_dir_path_(working_dir_path.back() == '/' ? 
working_dir_path : working_dir_path + "/"), path_cover_(path_cover), - remove_kmc_db_(remove_kmc_db), mph_file_path_(mph_file_path), buckets_file_path_(buckets_file_path), save_vertices_(save_vertices) @@ -129,7 +127,7 @@ bool Build_Params::is_valid() const // Cuttlefish 1 specific arguments can not be specified. - if(vertex_db_path_ != cuttlefish::_default::WORK_DIR || output_format_ != cuttlefish::Output_Format::txt || remove_kmc_db_) + if(vertex_db_path_ != cuttlefish::_default::WORK_DIR || output_format_ != cuttlefish::Output_Format::txt) { std::cout << "Cuttlefish 1 specific arguments specified while using Cuttlefish 2.\n"; valid = false; diff --git a/src/CdBG.cpp b/src/CdBG.cpp index aef7195f..4667f628 100644 --- a/src/CdBG.cpp +++ b/src/CdBG.cpp @@ -55,7 +55,10 @@ void CdBG::construct() std::cout << "\nConstructing the minimal perfect hash function (MPHF) over the vertex set.\n"; construct_hash_table(vertex_count); - if(params.remove_kmc_db()) +#ifdef CF_DEVELOP_MODE + if(params.vertex_db_path().empty()) +#endif + if(!params.save_vertices()) Kmer_Container::remove(logistics.vertex_db_path()); std::chrono::high_resolution_clock::time_point t_mphf = std::chrono::high_resolution_clock::now(); diff --git a/src/main.cpp b/src/main.cpp index c3d62509..0186735a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -45,7 +45,6 @@ void build(int argc, char** argv) options.add_options("cuttlefish 1.0") ("s,kmc-db", "set of vertices, i.e. k-mers (KMC database) prefix", cxxopts::value()->default_value(cuttlefish::_default::WORK_DIR)) ("f,format", "output format (0: txt, 1: GFA 1.0, 2: GFA 2.0, 3: GFA-reduced)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::OP_FORMAT))) - ("rm", "remove the KMC database") // TODO: rename it to `keep` and move to a common option. ; options.add_options("specialized") @@ -85,7 +84,6 @@ void build(int argc, char** argv) const auto strict_memory = !result["unrestrict-memory"].as(); const auto output_file = result["output"].as(); const auto format = result["format"].as(); - const auto remove_kmc_db = result["rm"].as(); const auto working_dir = result["work-dir"].as(); const auto path_cover = result["path-cover"].as(); const auto mph_file = result["mph"].as(); @@ -100,7 +98,7 @@ void build(int argc, char** argv) k, cutoff, kmer_database, edge_database, thread_count, max_memory, strict_memory, output_file, format, working_dir, path_cover, - remove_kmc_db, mph_file, buckets_file, save_vertices + mph_file, buckets_file, save_vertices #ifdef CF_DEVELOP_MODE , gamma #endif From 2e3b74b326cc4cb462295f374e60b7122a509120 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 20 Jan 2022 19:07:37 -0500 Subject: [PATCH 303/350] Mark vertex DB path as special option --- src/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.cpp b/src/main.cpp index 0186735a..6d5a0375 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -43,7 +43,6 @@ void build(int argc, char** argv) ; options.add_options("cuttlefish 1.0") - ("s,kmc-db", "set of vertices, i.e. k-mers (KMC database) prefix", cxxopts::value()->default_value(cuttlefish::_default::WORK_DIR)) ("f,format", "output format (0: txt, 1: GFA 1.0, 2: GFA 2.0, 3: GFA-reduced)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::OP_FORMAT))) ; @@ -55,6 +54,7 @@ void build(int argc, char** argv) ; options.add_options("debug") + ("s,kmc-db", "set of vertices, i.e. k-mers (KMC database) prefix", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("e,edge-db", "set of edges, i.e. 
(k + 1)-mers (KMC database) prefix", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) #ifdef CF_DEVELOP_MODE ("gamma", "gamma for the BBHash MPHF", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::GAMMA))) From 90aeaac7d3880e7ffa48338b6ec647ce04704b71 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 20 Jan 2022 19:13:18 -0500 Subject: [PATCH 304/350] Rename DB-input options --- src/main.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 6d5a0375..1f48b3a3 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -54,8 +54,8 @@ void build(int argc, char** argv) ; options.add_options("debug") - ("s,kmc-db", "set of vertices, i.e. k-mers (KMC database) prefix", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) - ("e,edge-db", "set of edges, i.e. (k + 1)-mers (KMC database) prefix", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) + ("vertex-set", "set of vertices, i.e. k-mers (KMC database) prefix", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) + ("edge-set", "set of edges, i.e. (k + 1)-mers (KMC database) prefix", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) #ifdef CF_DEVELOP_MODE ("gamma", "gamma for the BBHash MPHF", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::GAMMA))) #endif @@ -77,8 +77,8 @@ void build(int argc, char** argv) const auto dirs = result["dirs"].as>(); const auto k = result["kmer-len"].as(); const auto cutoff = result["cutoff"].as(); - const auto kmer_database = result["kmc-db"].as(); - const auto edge_database = result["edge-db"].as(); + const auto vertex_db = result["vertex-set"].as(); + const auto edge_db = result["edge-set"].as(); const auto thread_count = result["threads"].as(); const auto max_memory = result["max-memory"].as(); const auto strict_memory = !result["unrestrict-memory"].as(); @@ -95,7 +95,7 @@ void build(int argc, char** argv) const Build_Params params( is_read_graph, is_ref_graph, refs, lists, dirs, - k, cutoff, kmer_database, edge_database, thread_count, max_memory, strict_memory, + k, cutoff, vertex_db, edge_db, thread_count, max_memory, strict_memory, output_file, format, working_dir, path_cover, mph_file, buckets_file, save_vertices From ec9e2092a65400e02f0e8a66f3d3415f6e8490fa Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 20 Jan 2022 19:53:49 -0500 Subject: [PATCH 305/350] Make memory-limit options common --- src/Build_Params.cpp | 4 ++-- src/main.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Build_Params.cpp b/src/Build_Params.cpp index 1812d0db..ffae7c50 100644 --- a/src/Build_Params.cpp +++ b/src/Build_Params.cpp @@ -127,7 +127,7 @@ bool Build_Params::is_valid() const // Cuttlefish 1 specific arguments can not be specified. - if(vertex_db_path_ != cuttlefish::_default::WORK_DIR || output_format_ != cuttlefish::Output_Format::txt) + if(output_format_ != cuttlefish::Output_Format::txt) { std::cout << "Cuttlefish 1 specific arguments specified while using Cuttlefish 2.\n"; valid = false; @@ -153,7 +153,7 @@ bool Build_Params::is_valid() const // Cuttlefish 2 specific arguments can not be specified. 
-        if(cutoff_ != cuttlefish::_default::CUTOFF_FREQ || max_memory_ != cuttlefish::_default::MAX_MEMORY || !strict_memory_ || path_cover_ || !edge_db_path_.empty())
+        if(cutoff_ != cuttlefish::_default::CUTOFF_FREQ || path_cover_)
         {
             std::cout << "Cuttlefish 2 specific arguments specified while using Cuttlefish 1.\n";
             valid = false;
         }
diff --git a/src/main.cpp b/src/main.cpp
index 1f48b3a3..8162a929 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -30,6 +30,8 @@ void build(int argc, char** argv)
         ("t,threads", "number of threads to use", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::THREAD_COUNT)))
         ("o,output", "output file", cxxopts::value())
         ("w,work-dir", "working directory", cxxopts::value()->default_value(cuttlefish::_default::WORK_DIR))
+        ("m,max-memory", "soft maximum memory limit (in GB)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::MAX_MEMORY)))
+        ("unrestrict-memory", "do not impose memory usage restriction")
         ("h,help", "print usage")
         ;
 
@@ -39,8 +39,6 @@ void build(int argc, char** argv)
         ("read", "construct a compacted read de Bruijn graph")
         ("ref", "construct a compacted reference de Bruijn graph")
         ("c,cutoff", "frequency cutoff for (k + 1)-mers", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::CUTOFF_FREQ)))
-        ("m,max-memory", "soft maximum memory limit (in GB)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::MAX_MEMORY)))
-        ("unrestrict-memory", "do not impose memory usage restriction")
         ("path-cover", "extract a maximal path cover of the de Bruijn graph")
         ;
 
From cb5a34390f81495765ab3d8624eb6cfb7913c604 Mon Sep 17 00:00:00 2001
From: jamshed
Date: Fri, 21 Jan 2022 14:05:10 -0500
Subject: [PATCH 306/350] Check develop mode opts in regular use

---
 src/Build_Params.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/Build_Params.cpp b/src/Build_Params.cpp
index ffae7c50..0c9bf3e6 100644
--- a/src/Build_Params.cpp
+++ b/src/Build_Params.cpp
@@ -161,5 +161,15 @@ bool Build_Params::is_valid() const
     }
 
 
+    // Develop-mode options are not to be provided in regular use.
+#ifndef CF_DEVELOP_MODE
+    if(!vertex_db_path_.empty() || !edge_db_path_.empty())
+    {
+        std::cout << "Paths to vertex- and edge-databases are supported only in debug mode.\n";
+        valid = false;
+    }
+#endif
+
+
     return valid;
 }

From d12c3b925d0e4eb0d4dca2287aebbf8ef9f6abf4 Mon Sep 17 00:00:00 2001
From: jamshed
Date: Fri, 21 Jan 2022 14:16:19 -0500
Subject: [PATCH 307/350] Clean CLI validation

---
 src/Build_Params.cpp | 25 ++++++++-----------------
 1 file changed, 8 insertions(+), 17 deletions(-)

diff --git a/src/Build_Params.cpp b/src/Build_Params.cpp
index 0c9bf3e6..e95b5f68 100644
--- a/src/Build_Params.cpp
+++ b/src/Build_Params.cpp
@@ -72,7 +72,7 @@ bool Build_Params::is_valid() const
     }
 
 
-    // Discard unsupported thread counts.
+    // Unsupported thread counts are to be discarded.
     const auto num_threads = std::thread::hardware_concurrency();
     if(num_threads > 0 && thread_count_ > num_threads)
     {
@@ -99,6 +99,11 @@ bool Build_Params::is_valid() const
     }
 
 
+    // Memory budget options should not be mixed with.
+    if(max_memory_ != cuttlefish::_default::MAX_MEMORY && !strict_memory_)
+        std::cout << "Both a memory bound and the option for unrestricted memory usage specified. Unrestricted memory mode will be used.\n";
+
+
     if(is_read_graph_ || is_ref_graph_) // Validate Cuttlefish 2 specific arguments.
     {
         // Read and reference de Bruijn graph parameters can not be mixed with. 
@@ -121,11 +126,6 @@ bool Build_Params::is_valid() const std::cout << "WARNING: cutoff frequency specified not to be 1 on reference sequences.\n"; - // Memory budget options are being mixed with. - if(max_memory_ != cuttlefish::_default::MAX_MEMORY && !strict_memory_) - std::cout << "Both a memory bound and the option for unrestricted memory usage specified. Unrestricted memory mode will be used.\n"; - - // Cuttlefish 1 specific arguments can not be specified. if(output_format_ != cuttlefish::Output_Format::txt) { @@ -135,16 +135,7 @@ bool Build_Params::is_valid() const } else // Validate Cuttlefish 1 specific arguments. { - // Directory containing vertex database must exist. - const std::string vertex_db_dir = dirname(vertex_db_path_); - if(!dir_exists(vertex_db_dir)) - { - std::cout << "Vertex database directory " << vertex_db_dir << " does not exist.\n"; - valid = false; - } - - - // Discard invalid output formats. + // Invalid output formats are to be discarded. if(output_format_ >= cuttlefish::num_op_formats) { std::cout << "Invalid output file format.\n"; @@ -161,7 +152,7 @@ bool Build_Params::is_valid() const } - // Develop-mode options are not to be provided in regular use. + // Develop-mode options can not to be provided in regular use. #ifndef CF_DEVELOP_MODE if(!vertex_db_path_.empty() || !edge_db_path_.empty()) { From 76df1515cf15c5440a9751b419afe474dcd38367 Mon Sep 17 00:00:00 2001 From: jamshed Date: Fri, 21 Jan 2022 16:14:09 -0500 Subject: [PATCH 308/350] Rename refs / reads input option --- src/main.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 8162a929..510079d6 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -23,9 +23,9 @@ void build(int argc, char** argv) cxxopts::Options options("cuttlefish build", "Efficiently construct the compacted de Bruijn graph from sequencing reads or reference sequences"); options.add_options("common") - ("r,refs", "input files", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) - ("l,lists", "input file lists", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) - ("d,dirs", "input file directories", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) + ("s,seq", "input files", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) + ("l,list", "input file lists", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) + ("d,dir", "input file directories", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) ("k,kmer-len", "k-mer length", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::K))) ("t,threads", "number of threads to use", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::THREAD_COUNT))) ("o,output", "output file", cxxopts::value()) @@ -72,9 +72,9 @@ void build(int argc, char** argv) const auto is_read_graph = result["read"].as(); const auto is_ref_graph = result["ref"].as(); - const auto refs = result["refs"].as>(); - const auto lists = result["lists"].as>(); - const auto dirs = result["dirs"].as>(); + const auto seqs = result["seq"].as>(); + const auto lists = result["list"].as>(); + const auto dirs = result["dir"].as>(); const auto k = result["kmer-len"].as(); const auto cutoff = result["cutoff"].as(); const auto vertex_db = result["vertex-set"].as(); @@ -94,7 +94,7 @@ void build(int argc, char** argv) #endif const Build_Params params( is_read_graph, is_ref_graph, - refs, lists, dirs, + seqs, lists, dirs, k, cutoff, vertex_db, edge_db, thread_count, max_memory, 
strict_memory, output_file, format, working_dir, path_cover, From fb20d7da4cee0807266d9d812c992ddb587d88f6 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 23 Jan 2022 17:51:48 -0500 Subject: [PATCH 309/350] Fix an edge case in k-mer iteration --- include/kmc_api/kmc_file.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/kmc_api/kmc_file.h b/include/kmc_api/kmc_file.h index 70c9b8a0..e765d657 100644 --- a/include/kmc_api/kmc_file.h +++ b/include/kmc_api/kmc_file.h @@ -20,6 +20,7 @@ #include #include #include +#include struct CKMCFileInfo @@ -491,7 +492,8 @@ inline uint64_t CKMC_DB::read_raw_suffixes(uint8_t* const suff_buf, std::vector< if(is_opened != opened_for_listing) return 0; - const size_t max_suff_count = max_bytes_to_read / suff_record_size(); + const size_t max_suff_count = (suff_record_size() > 0 ? max_bytes_to_read / suff_record_size() : + std::numeric_limits::max()); uint64_t suff_read_count = 0; // Count of suffixes to be read into the buffer `suff_buf`. pref_buf.clear(); From 25c6f94e1ef17cc23738820b00837b7f67789d66 Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 23 Jan 2022 18:29:41 -0500 Subject: [PATCH 310/350] Fix vertex frequency for small k --- src/Read_CdBG.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index e0d61a58..aad07b9d 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -150,7 +150,7 @@ kmer_Enumeration_Stats Read_CdBG::enumerate_vertices(const std::size_t max { const KMC::InputFileType ip_type = (params.is_read_graph() ? KMC::InputFileType::FASTQ : KMC::InputFileType::MULTILINE_FASTA); return kmer_Enumerator().enumerate( - ip_type, logistics.input_paths_collection(), 1, params.thread_count(), + ip_type, logistics.input_paths_collection(), params.cutoff(), params.thread_count(), max_memory, params.strict_memory(), false, bits_per_vertex, logistics.working_dir_path(), logistics.vertex_db_path()); } From 5b202d8d65ed719d076a4fd75be1f47b7041c60e Mon Sep 17 00:00:00 2001 From: jamshed Date: Sun, 23 Jan 2022 19:00:45 -0500 Subject: [PATCH 311/350] Revert workaround for small k vertex enumeration KMC-to-KMC is now supported for small k --- src/Read_CdBG.cpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index aad07b9d..75b22e74 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -145,16 +145,6 @@ kmer_Enumeration_Stats Read_CdBG::enumerate_edges() const template kmer_Enumeration_Stats Read_CdBG::enumerate_vertices(const std::size_t max_memory) const { - // KMC-to-KMC operation isn't supported for small enough k's yet. - if(k < kmer_Enumerator::small_k_threshold) - { - const KMC::InputFileType ip_type = (params.is_read_graph() ? 
KMC::InputFileType::FASTQ : KMC::InputFileType::MULTILINE_FASTA); - return kmer_Enumerator().enumerate( - ip_type, logistics.input_paths_collection(), params.cutoff(), params.thread_count(), - max_memory, params.strict_memory(), false, bits_per_vertex, - logistics.working_dir_path(), logistics.vertex_db_path()); - } - return kmer_Enumerator().enumerate( KMC::InputFileType::KMC, std::vector(1, logistics.edge_db_path()), 1, params.thread_count(), max_memory, params.strict_memory(), false, bits_per_vertex, From 98ef299181d43128f64cd8c290bae24284ca0246 Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 24 Jan 2022 18:00:42 -0500 Subject: [PATCH 312/350] Output fasta from cuttlefish 1 --- include/Build_Params.hpp | 4 ++-- include/CdBG.hpp | 10 +++++----- include/Input_Defaults.hpp | 2 +- include/Output_Format.hpp | 2 +- src/Build_Params.cpp | 2 +- src/CdBG_Plain_Writer.cpp | 14 ++++++++++---- src/CdBG_Writer.cpp | 4 ++-- src/main.cpp | 2 +- 8 files changed, 23 insertions(+), 17 deletions(-) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index 4453eba9..4afca149 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -30,7 +30,7 @@ class Build_Params const std::size_t max_memory_; // Soft maximum memory limit (in GB). const bool strict_memory_; // Whether strict memory limit restriction is specifiied. const std::string output_file_path_; // Path to the output file. - const cuttlefish::Output_Format output_format_; // Output format (0: txt, 1: GFAv1, 2: GFAv2). + const cuttlefish::Output_Format output_format_; // Output format (0: FASTA, 1: GFAv1, 2: GFAv2). const std::string working_dir_path_; // Path to the working directory (for temporary files). const bool path_cover_; // Whether to extract a maximal path cover of the de Bruijn graph. const std::string mph_file_path_; // Optional path to file storing an MPH over the k-mer set. @@ -149,7 +149,7 @@ class Build_Params // Returns the path to the output file. const std::string output_file_path() const { - return (is_read_graph() || is_ref_graph()) ? (output_file_path_ + cuttlefish::file_ext::unipaths_ext) : output_file_path_; + return output_file_path_ + cuttlefish::file_ext::unipaths_ext; } diff --git a/include/CdBG.hpp b/include/CdBG.hpp index dd36780b..694437d6 100644 --- a/include/CdBG.hpp +++ b/include/CdBG.hpp @@ -285,10 +285,10 @@ class CdBG // Writes the path in the sequence `seq` with its starting and ending k-mers // located at the indices `start_kmer_idx` and `end_kmer_idx` respectively to // the output buffer of the thread number `thread_id`, putting into the logger - // of the thread, if necessary. If `dir` is `FWD`, then the string spelled by the - // path is written; otherwise its reverse complement is written. - // Note that, the output operation appends a newline at the end. - void write_path(uint16_t thread_id, const char* seq, size_t start_kmer_idx, size_t end_kmer_idx, cuttlefish::dir_t dir); + // of the thread, if necessary. The unitig is named as `unitig_id`. If `dir` is + // `FWD`, then the string spelled by the path is written; otherwise its reverse + // complement is written. Note that, the output operation appends a newline at the end. + void write_path(uint16_t thread_id, const char* seq, const uint64_t unitig_id, size_t start_kmer_idx, size_t end_kmer_idx, cuttlefish::dir_t dir); // Writes the maximal unitigs from the sequence `seq` (of length `seq_len`) that // have their starting indices between (inclusive) `left_end` and `right_end`. 
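The new `unitig_id` parameter means each maximal unitig is now emitted as a named FASTA record instead of a bare line of sequence. Below is a stripped-down sketch of the record layout implied by the updated `write_path` contract, free of the project's buffering and logging machinery; the function name and the use of std::to_string are illustrative, while the patch itself formats the id with fmt::format_int.

#include <cstdint>
#include <string>

// Append one FASTA record to a growing output buffer:
// '>' + numeric unitig id + newline, then the unitig sequence + newline.
void append_fasta_record(std::string& buffer, const std::uint64_t unitig_id, const std::string& unitig_seq)
{
    buffer += '>';
    buffer += std::to_string(unitig_id);
    buffer += '\n';
    buffer += unitig_seq;
    buffer += '\n';
}

The fixed header-length constant introduced later in the patch only feeds the output-buffer space reservation; the record itself is just these five appends.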
@@ -393,7 +393,7 @@ class CdBG // Ensures that the string `buf` has enough free space to append a log of length // `log_len` at its end without overflowing its capacity by flushing its content // to the logger `log` if necessary. The request is non-binding in the sense that - // if the capacity of the buffer `str` is smaller than `log_len`, then this method + // if the capacity of the buffer `buf` is smaller than `log_len`, then this method // does not ensure enough buffer space. static void ensure_buffer_space(std::string& buf, size_t log_len, const cuttlefish::logger_t& log); diff --git a/include/Input_Defaults.hpp b/include/Input_Defaults.hpp index 2358b190..c664cb25 100644 --- a/include/Input_Defaults.hpp +++ b/include/Input_Defaults.hpp @@ -20,7 +20,7 @@ namespace cuttlefish #ifdef CF_DEVELOP_MODE constexpr double GAMMA = 0; #endif - constexpr uint16_t OP_FORMAT = Output_Format::txt; + constexpr uint16_t OP_FORMAT = Output_Format::fa; constexpr char WORK_DIR[] = "."; } } diff --git a/include/Output_Format.hpp b/include/Output_Format.hpp index e967a86a..ad0ad32f 100644 --- a/include/Output_Format.hpp +++ b/include/Output_Format.hpp @@ -12,7 +12,7 @@ namespace cuttlefish // Output format options for the algorithm. enum Output_Format: uint8_t { - txt = 0, + fa = 0, gfa1 = 1, gfa2 = 2, gfa_reduced = 3, diff --git a/src/Build_Params.cpp b/src/Build_Params.cpp index e95b5f68..1d9e9486 100644 --- a/src/Build_Params.cpp +++ b/src/Build_Params.cpp @@ -127,7 +127,7 @@ bool Build_Params::is_valid() const // Cuttlefish 1 specific arguments can not be specified. - if(output_format_ != cuttlefish::Output_Format::txt) + if(output_format_ != cuttlefish::Output_Format::fa) { std::cout << "Cuttlefish 1 specific arguments specified while using Cuttlefish 2.\n"; valid = false; diff --git a/src/CdBG_Plain_Writer.cpp b/src/CdBG_Plain_Writer.cpp index fc516f0b..a27494af 100644 --- a/src/CdBG_Plain_Writer.cpp +++ b/src/CdBG_Plain_Writer.cpp @@ -140,7 +140,8 @@ void CdBG::output_plain_unitig(const uint16_t thread_id, const char* const se // For a particular unitig, always query the same well-defined canonical flanking // k-mer, irrespective of which direction the unitig may be traversed at. const Kmer min_flanking_kmer = std::min(start_kmer.canonical(), end_kmer.canonical()); - Kmer_Hash_Entry_API hash_table_entry = hash_table->at(min_flanking_kmer); + const uint64_t bucket_id = hash_table->bucket_id(min_flanking_kmer); + Kmer_Hash_Entry_API hash_table_entry = hash_table->at(bucket_id); State& state = hash_table_entry.get_state(); if(state.is_outputted()) @@ -152,7 +153,7 @@ void CdBG::output_plain_unitig(const uint16_t thread_id, const char* const se // If the hash table update is successful, only then this thread may output this unitig. 
if(hash_table->update(hash_table_entry)) { - write_path(thread_id, seq, start_kmer.idx(), end_kmer.idx(), start_kmer.kmer() < end_kmer.rev_compl()); + write_path(thread_id, seq, bucket_id, start_kmer.idx(), end_kmer.idx(), start_kmer.kmer() < end_kmer.rev_compl()); unipaths_info_local[thread_id].add_maximal_unitig(end_kmer.idx() - start_kmer.idx() + 1); } @@ -160,15 +161,20 @@ void CdBG::output_plain_unitig(const uint16_t thread_id, const char* const se template -void CdBG::write_path(const uint16_t thread_id, const char* const seq, const size_t start_kmer_idx, const size_t end_kmer_idx, const cuttlefish::dir_t dir) +void CdBG::write_path(const uint16_t thread_id, const char* const seq, const uint64_t unitig_id, const size_t start_kmer_idx, const size_t end_kmer_idx, const cuttlefish::dir_t dir) { std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); std::string& buffer = output_buffer[thread_id]; const size_t path_len = end_kmer_idx - start_kmer_idx + k; + constexpr std::size_t header_len = 12; // FASTA header len: '>' + + '\n' - ensure_buffer_space(buffer, path_len, output_[thread_id]); + ensure_buffer_space(buffer, path_len + header_len, output_[thread_id]); + + buffer += ">"; + buffer += fmt::format_int(unitig_id).c_str(); + buffer += "\n"; if(dir == cuttlefish::FWD) for(size_t offset = 0; offset < path_len; ++offset) diff --git a/src/CdBG_Writer.cpp b/src/CdBG_Writer.cpp index 46cfb1f4..4d9afbca 100644 --- a/src/CdBG_Writer.cpp +++ b/src/CdBG_Writer.cpp @@ -22,7 +22,7 @@ void CdBG::output_maximal_unitigs() const uint8_t output_format = params.output_format(); unipaths_info_local.resize(params.thread_count()); - if(output_format == cuttlefish::txt) + if(output_format == cuttlefish::fa) output_maximal_unitigs_plain(); else if(output_format == cuttlefish::gfa1 || output_format == cuttlefish::gfa2) output_maximal_unitigs_gfa(); @@ -482,7 +482,7 @@ void CdBG::clear_output_file() const const cuttlefish::Output_Format op_format = params.output_format(); const std::string& output_file_path = params.output_file_path(); - if(op_format == cuttlefish::txt || op_format == cuttlefish::gfa1 || op_format == cuttlefish::gfa2) + if(op_format == cuttlefish::fa || op_format == cuttlefish::gfa1 || op_format == cuttlefish::gfa2) clear_file(output_file_path); else if(op_format == cuttlefish::gfa_reduced) { diff --git a/src/main.cpp b/src/main.cpp index 510079d6..1a23ad74 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -43,7 +43,7 @@ void build(int argc, char** argv) ; options.add_options("cuttlefish 1.0") - ("f,format", "output format (0: txt, 1: GFA 1.0, 2: GFA 2.0, 3: GFA-reduced)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::OP_FORMAT))) + ("f,format", "output format (0: FASTA, 1: GFA 1.0, 2: GFA 2.0, 3: GFA-reduced)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::OP_FORMAT))) ; options.add_options("specialized") From 2c95bb40cb31c4940817a2c58197e02ca9810a3b Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 25 Jan 2022 16:17:24 -0500 Subject: [PATCH 313/350] Fix GFA1 corrupted overlaps --- include/CdBG.hpp | 3 +++ src/CdBG_GFA_Writer.cpp | 23 ++++++++++++++++------- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/include/CdBG.hpp b/include/CdBG.hpp index 8bd2a039..9412a644 100644 --- a/include/CdBG.hpp +++ b/include/CdBG.hpp @@ -37,6 +37,9 @@ class CdBG // output content yet to be written to the disk from the thread number `t_id`. 
std::vector path_buffer, overlap_buffer; + // `link_added[t_id]` is `true` iff at least one link has been added to the output for thread id `t_id`. + std::vector link_added; + // Capacities for the memory pre-allocation of each output buffer, and the threshold buffer size // that triggers a disk-flush. static constexpr size_t BUFFER_THRESHOLD = 100 * 1024; // 100 KB. diff --git a/src/CdBG_GFA_Writer.cpp b/src/CdBG_GFA_Writer.cpp index 0513c046..96b38a9f 100644 --- a/src/CdBG_GFA_Writer.cpp +++ b/src/CdBG_GFA_Writer.cpp @@ -53,11 +53,13 @@ void CdBG::reset_path_loggers(const uint64_t file_id) path_output_.clear(); if(gfa_v == cuttlefish::Output_Format::gfa1) - overlap_output_.clear(); + overlap_output_.clear(), + link_added.clear(); path_output_.resize(thread_count); if(gfa_v == cuttlefish::Output_Format::gfa1) - overlap_output_.resize(thread_count); + overlap_output_.resize(thread_count), + link_added.resize(thread_count); // Instantiate a `spdlog` thread pool for outputting paths and overlaps. @@ -460,6 +462,9 @@ void CdBG::write_gfa_link(const uint16_t thread_id, const Oriented_Unitig& le // Append a link to the growing path for this thread. append_link_to_path(thread_id, left_unitig, right_unitig); + + // Mark the addition of a link for this thread. + link_added[thread_id] = true; } @@ -591,7 +596,8 @@ void CdBG::append_link_to_path(const uint16_t thread_id, const Oriented_Uniti p_buffer += (right_unitig.dir == cuttlefish::FWD ? "+" : "-"); std::string& o_buffer = overlap_buffer[thread_id]; - o_buffer += ","; + if(link_added[thread_id]) + o_buffer += ","; o_buffer += fmt::format_int(right_unitig.start_kmer_idx == left_unitig.end_kmer_idx + 1 ? k - 1 : 0).c_str(); o_buffer += "M"; @@ -797,11 +803,8 @@ void CdBG::write_gfa_path(const std::string& path_name) output << "*"; // Write an empty CIGAR string at the 'Overlaps' field. else { - // The first overlap of the path (not inferrable from the path output files). - const uint16_t overlap = (right_unitig.start_kmer_idx == left_unitig.end_kmer_idx + 1 ? k - 1 : 0); - output << overlap << "M"; - // Copy the thread-specific overlap output file contents to the GFA output file. + bool overlap_written = false; // Whether some overlap information has been written to the final output. for(uint16_t t_id = 0; t_id < thread_count; ++t_id) { const std::string overlap_file_name = (overlap_file_prefix + std::to_string(t_id)); @@ -815,7 +818,13 @@ void CdBG::write_gfa_path(const std::string& path_name) // Copy the overlaps output for thread number `t_id` to the end of the output GFA file. if(input.peek() != EOF) + { + if(overlap_written) + output << ","; + output << input.rdbuf(); + overlap_written = true; + } input.close(); } From 7b1fdaefc951917201b96530dee2e9c581fcce08 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 27 Jan 2022 16:12:01 -0500 Subject: [PATCH 314/350] Bump compiler req. to c++17 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 04a8ced1..b5c606bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,7 +14,7 @@ project(${PROJECT_NAME} # Fix language standards, and set hard requirements for such. # All targets defined from this point onward will pick up these requirements. 
-set(CMAKE_CXX_STANDARD 14) # Bump to 17 +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_C_STANDARD 11) From 7805cb439e016e055cec9b24c0c3e428c8447b6d Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 27 Jan 2022 16:28:58 -0500 Subject: [PATCH 315/350] Make frequency cutoff optional --- include/Build_Params.hpp | 8 +++++--- include/Input_Defaults.hpp | 3 ++- src/Build_Params.cpp | 8 ++++---- src/main.cpp | 5 +++-- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index 4afca149..916b9d59 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -8,11 +8,13 @@ #include "Seq_Input.hpp" #include "Output_Format.hpp" #include "File_Extensions.hpp" +#include "Input_Defaults.hpp" #include #include #include #include +#include class Build_Params @@ -23,7 +25,7 @@ class Build_Params const bool is_ref_graph_; // Whether to build a compacted reference de Bruijn graph or not. const Seq_Input seq_input_; // Collection of the input sequences. const uint16_t k_; // The k parameter for the edge-centric de Bruijn graph to be compacted. - const uint32_t cutoff_; // Frequency cutoff for the (k + 1)-mers (for short-read set input). + const std::optional cutoff_; // Frequency cutoff for the (k + 1)-mers. const std::string vertex_db_path_; // Path to the KMC database containing the vertices (canonical k-mers). const std::string edge_db_path_; // Path to the KMC database containing the edges (canonical (k + 1)-mers). const uint16_t thread_count_; // Number of threads to work with. @@ -50,7 +52,7 @@ class Build_Params const std::vector& list_paths, const std::vector& dir_paths, const uint16_t k, - const uint32_t cutoff, + const std::optional cutoff, const std::string& vertex_db_path, const std::string& edge_db_path, const uint16_t thread_count, @@ -100,7 +102,7 @@ class Build_Params // Returns the frequency cutoff for the (k + 1)-mers (for short-reads set input). uint32_t cutoff() const { - return cutoff_; + return cutoff_.value_or(is_read_graph() ? cuttlefish::_default::CUTOFF_FREQ_READS : cuttlefish::_default::CUTOFF_FREQ_REFS); } diff --git a/include/Input_Defaults.hpp b/include/Input_Defaults.hpp index c664cb25..4b8d7582 100644 --- a/include/Input_Defaults.hpp +++ b/include/Input_Defaults.hpp @@ -14,7 +14,8 @@ namespace cuttlefish { constexpr char EMPTY[] = ""; constexpr uint16_t K = 25; // Set as per the KMC3 default. - constexpr uint32_t CUTOFF_FREQ = 2; // Typical practice + constexpr uint32_t CUTOFF_FREQ_READS = 2; // Typical practice + constexpr uint32_t CUTOFF_FREQ_REFS = 1; // Typical assumption constexpr uint16_t THREAD_COUNT = 1; constexpr std::size_t MAX_MEMORY = 3; // Set as per KMC3 stage 1 performance. #ifdef CF_DEVELOP_MODE diff --git a/src/Build_Params.cpp b/src/Build_Params.cpp index 1d9e9486..1311ef45 100644 --- a/src/Build_Params.cpp +++ b/src/Build_Params.cpp @@ -10,7 +10,7 @@ Build_Params::Build_Params( const bool is_read_graph, const std::vector& list_paths, const std::vector& dir_paths, const uint16_t k, - const uint32_t cutoff, + const std::optional cutoff, const std::string& vertex_db_path, const std::string& edge_db_path, const uint16_t thread_count, @@ -115,14 +115,14 @@ bool Build_Params::is_valid() const // A cutoff frequency of 0 is theoretically inconsistent. - if(cutoff_ == 0) + if(cutoff() == 0) { std::cout << "Cutoff frequency specified to be 0, which is theoretically inconsistent. 
Please use 1 if you wish to retain all the k-mers without filtering.\n"; valid = false; } // Cutoff frequency _should be_ 1 for reference de Bruijn graphs. - if(is_ref_graph_ && cutoff_ != 1) + if(is_ref_graph_ && cutoff() != 1) std::cout << "WARNING: cutoff frequency specified not to be 1 on reference sequences.\n"; @@ -144,7 +144,7 @@ bool Build_Params::is_valid() const // Cuttlefish 2 specific arguments can not be specified. - if(cutoff_ != cuttlefish::_default::CUTOFF_FREQ || path_cover_) + if(cutoff_ || path_cover_) { std::cout << "Cuttelfish 2 specific arguments specified while using Cuttlefish 1.\n"; valid = false; diff --git a/src/main.cpp b/src/main.cpp index 1a23ad74..71f06f69 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -15,6 +15,7 @@ #include #include #include +#include // Driver function for the CdBG build. @@ -35,10 +36,11 @@ void build(int argc, char** argv) ("h,help", "print usage") ; + std::optional cutoff; options.add_options("cuttlefish 2.0") ("read", "construct a compacted read de Bruijn graph") ("ref", "construct a compacted reference de Bruijn graph") - ("c,cutoff", "frequency cutoff for (k + 1)-mers", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::CUTOFF_FREQ))) + ("c,cutoff", "frequency cutoff for (k + 1)-mers (default: refs: 1, reads: 2)", cxxopts::value>(cutoff)) ("path-cover", "extract a maximal path cover of the de Bruijn graph") ; @@ -76,7 +78,6 @@ void build(int argc, char** argv) const auto lists = result["list"].as>(); const auto dirs = result["dir"].as>(); const auto k = result["kmer-len"].as(); - const auto cutoff = result["cutoff"].as(); const auto vertex_db = result["vertex-set"].as(); const auto edge_db = result["edge-set"].as(); const auto thread_count = result["threads"].as(); From 44705cb6f51537bf5d340202b0db21749e2767e4 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 27 Jan 2022 16:52:46 -0500 Subject: [PATCH 316/350] Bump default k to 27 --- include/Input_Defaults.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/Input_Defaults.hpp b/include/Input_Defaults.hpp index 4b8d7582..4abf618f 100644 --- a/include/Input_Defaults.hpp +++ b/include/Input_Defaults.hpp @@ -13,7 +13,7 @@ namespace cuttlefish namespace _default { constexpr char EMPTY[] = ""; - constexpr uint16_t K = 25; // Set as per the KMC3 default. 
+ constexpr uint16_t K = 27; constexpr uint32_t CUTOFF_FREQ_READS = 2; // Typical practice constexpr uint32_t CUTOFF_FREQ_REFS = 1; // Typical assumption constexpr uint16_t THREAD_COUNT = 1; From f36b92df7200211b70fda2a1c8a24c6c1a5945f2 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 27 Jan 2022 16:54:05 -0500 Subject: [PATCH 317/350] Clarify help message --- src/main.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 71f06f69..ec785351 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -37,14 +37,14 @@ void build(int argc, char** argv) ; std::optional cutoff; - options.add_options("cuttlefish 2.0") - ("read", "construct a compacted read de Bruijn graph") - ("ref", "construct a compacted reference de Bruijn graph") + options.add_options("cuttlefish_2") + ("read", "construct a compacted read de Bruijn graph (for FASTQ input)") + ("ref", "construct a compacted reference de Bruijn graph (for FASTA input)") ("c,cutoff", "frequency cutoff for (k + 1)-mers (default: refs: 1, reads: 2)", cxxopts::value>(cutoff)) ("path-cover", "extract a maximal path cover of the de Bruijn graph") ; - options.add_options("cuttlefish 1.0") + options.add_options("cuttlefish_1") ("f,format", "output format (0: FASTA, 1: GFA 1.0, 2: GFA 2.0, 3: GFA-reduced)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::OP_FORMAT))) ; From b0d14db924fddc58825cc37feeca93958895aa3e Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 27 Jan 2022 18:38:56 -0500 Subject: [PATCH 318/350] Avoid hard-coded value in CLI message --- src/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.cpp b/src/main.cpp index ec785351..6803e5f3 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -40,7 +40,7 @@ void build(int argc, char** argv) options.add_options("cuttlefish_2") ("read", "construct a compacted read de Bruijn graph (for FASTQ input)") ("ref", "construct a compacted reference de Bruijn graph (for FASTA input)") - ("c,cutoff", "frequency cutoff for (k + 1)-mers (default: refs: 1, reads: 2)", cxxopts::value>(cutoff)) + ("c,cutoff", "frequency cutoff for (k + 1)-mers (default: refs: " + std::to_string(cuttlefish::_default::CUTOFF_FREQ_REFS) + ", reads: " + std::to_string(cuttlefish::_default::CUTOFF_FREQ_READS) + ")", cxxopts::value>(cutoff)) ("path-cover", "extract a maximal path cover of the de Bruijn graph") ; From dead272421574148b50314ea2be611ec6ed50a1f Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 27 Jan 2022 19:04:19 -0500 Subject: [PATCH 319/350] Make max-memory optional --- include/Build_Params.hpp | 6 +++--- src/Build_Params.cpp | 4 ++-- src/main.cpp | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index 916b9d59..a007d132 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -29,7 +29,7 @@ class Build_Params const std::string vertex_db_path_; // Path to the KMC database containing the vertices (canonical k-mers). const std::string edge_db_path_; // Path to the KMC database containing the edges (canonical (k + 1)-mers). const uint16_t thread_count_; // Number of threads to work with. - const std::size_t max_memory_; // Soft maximum memory limit (in GB). + const std::optional max_memory_; // Soft maximum memory limit (in GB). const bool strict_memory_; // Whether strict memory limit restriction is specifiied. const std::string output_file_path_; // Path to the output file. 
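The max-memory handling mirrors the cutoff change from a few patches back: the member stores exactly what the user supplied (or nothing), and the accessor injects the default only at read time, which is also what lets the validation code recognise a genuinely user-given bound. A pared-down, hypothetical sketch of the pattern (class and accessor names are placeholders; the 3 GB fallback matches cuttlefish::_default::MAX_MEMORY):

#include <cstddef>
#include <optional>

class Params_Sketch
{
    std::optional<std::size_t> max_memory_;  // empty unless -m / --max-memory was passed

public:
    explicit Params_Sketch(const std::optional<std::size_t> max_memory): max_memory_(max_memory) {}

    // Effective memory bound: the user's value if present, otherwise the default (3 GB).
    std::size_t max_memory() const { return max_memory_.value_or(3); }

    // Whether a bound was given explicitly, which is what the conflict check
    // against --unrestrict-memory needs to know.
    bool max_memory_given() const { return max_memory_.has_value(); }
};

Storing the raw optional is what allows the memory-conflict check later in this patch to shrink to `max_memory_ && !strict_memory_`: an explicit request for the default value no longer looks identical to no request at all.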
const cuttlefish::Output_Format output_format_; // Output format (0: FASTA, 1: GFAv1, 2: GFAv2). @@ -56,7 +56,7 @@ class Build_Params const std::string& vertex_db_path, const std::string& edge_db_path, const uint16_t thread_count, - const std::size_t max_memory, + const std::optional max_memory, const bool strict_memory, const std::string& output_file_path, const uint8_t output_format, @@ -130,7 +130,7 @@ class Build_Params // Returns the soft maximum memory limit (in GB). std::size_t max_memory() const { - return max_memory_; + return max_memory_.value_or(cuttlefish::_default::MAX_MEMORY); } diff --git a/src/Build_Params.cpp b/src/Build_Params.cpp index 1311ef45..3bb6aced 100644 --- a/src/Build_Params.cpp +++ b/src/Build_Params.cpp @@ -14,7 +14,7 @@ Build_Params::Build_Params( const bool is_read_graph, const std::string& vertex_db_path, const std::string& edge_db_path, const uint16_t thread_count, - const std::size_t max_memory, + const std::optional max_memory, const bool strict_memory, const std::string& output_file_path, const uint8_t output_format, @@ -100,7 +100,7 @@ bool Build_Params::is_valid() const // Memory budget options should not be mixed with. - if(max_memory_ != cuttlefish::_default::MAX_MEMORY && !strict_memory_) + if(max_memory_ && !strict_memory_) std::cout << "Both a memory bound and the option for unrestricted memory usage specified. Unrestricted memory mode will be used.\n"; diff --git a/src/main.cpp b/src/main.cpp index 6803e5f3..05872e28 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -23,6 +23,7 @@ void build(int argc, char** argv) { cxxopts::Options options("cuttlefish build", "Efficiently construct the compacted de Bruijn graph from sequencing reads or reference sequences"); + std::optional max_memory; options.add_options("common") ("s,seq", "input files", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) ("l,list", "input file lists", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) @@ -31,7 +32,7 @@ void build(int argc, char** argv) ("t,threads", "number of threads to use", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::THREAD_COUNT))) ("o,output", "output file", cxxopts::value()) ("w,work-dir", "working directory", cxxopts::value()->default_value(cuttlefish::_default::WORK_DIR)) - ("m,max-memory", "soft maximum memory limit (in GB)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::MAX_MEMORY))) + ("m,max-memory", "soft maximum memory limit in GB (default: " + std::to_string(cuttlefish::_default::MAX_MEMORY) + ")", cxxopts::value>(max_memory)) ("unrestrict-memory", "do not impose memory usage restriction") ("h,help", "print usage") ; @@ -81,7 +82,6 @@ void build(int argc, char** argv) const auto vertex_db = result["vertex-set"].as(); const auto edge_db = result["edge-set"].as(); const auto thread_count = result["threads"].as(); - const auto max_memory = result["max-memory"].as(); const auto strict_memory = !result["unrestrict-memory"].as(); const auto output_file = result["output"].as(); const auto format = result["format"].as(); From d197c14712c8e1b49a06ed03f9fe52a01292e187 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 27 Jan 2022 19:09:47 -0500 Subject: [PATCH 320/350] Better indent wall of CLI code --- src/main.cpp | 72 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 24 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 05872e28..bd13bdce 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -25,14 +25,22 @@ void build(int argc, char** argv) 
std::optional max_memory; options.add_options("common") - ("s,seq", "input files", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) - ("l,list", "input file lists", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) - ("d,dir", "input file directories", cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) - ("k,kmer-len", "k-mer length", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::K))) - ("t,threads", "number of threads to use", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::THREAD_COUNT))) - ("o,output", "output file", cxxopts::value()) - ("w,work-dir", "working directory", cxxopts::value()->default_value(cuttlefish::_default::WORK_DIR)) - ("m,max-memory", "soft maximum memory limit in GB (default: " + std::to_string(cuttlefish::_default::MAX_MEMORY) + ")", cxxopts::value>(max_memory)) + ("s,seq", "input files", + cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) + ("l,list", "input file lists", + cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) + ("d,dir", "input file directories", + cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) + ("k,kmer-len", "k-mer length", + cxxopts::value()->default_value(std::to_string(cuttlefish::_default::K))) + ("t,threads", "number of threads to use", + cxxopts::value()->default_value(std::to_string(cuttlefish::_default::THREAD_COUNT))) + ("o,output", "output file", + cxxopts::value()) + ("w,work-dir", "working directory", + cxxopts::value()->default_value(cuttlefish::_default::WORK_DIR)) + ("m,max-memory", "soft maximum memory limit in GB (default: " + std::to_string(cuttlefish::_default::MAX_MEMORY) + ")", + cxxopts::value>(max_memory)) ("unrestrict-memory", "do not impose memory usage restriction") ("h,help", "print usage") ; @@ -41,26 +49,33 @@ void build(int argc, char** argv) options.add_options("cuttlefish_2") ("read", "construct a compacted read de Bruijn graph (for FASTQ input)") ("ref", "construct a compacted reference de Bruijn graph (for FASTA input)") - ("c,cutoff", "frequency cutoff for (k + 1)-mers (default: refs: " + std::to_string(cuttlefish::_default::CUTOFF_FREQ_REFS) + ", reads: " + std::to_string(cuttlefish::_default::CUTOFF_FREQ_READS) + ")", cxxopts::value>(cutoff)) + ("c,cutoff", "frequency cutoff for (k + 1)-mers (default: refs: " + std::to_string(cuttlefish::_default::CUTOFF_FREQ_REFS) + ", reads: " + std::to_string(cuttlefish::_default::CUTOFF_FREQ_READS) + ")", + cxxopts::value>(cutoff)) ("path-cover", "extract a maximal path cover of the de Bruijn graph") ; options.add_options("cuttlefish_1") - ("f,format", "output format (0: FASTA, 1: GFA 1.0, 2: GFA 2.0, 3: GFA-reduced)", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::OP_FORMAT))) + ("f,format", "output format (0: FASTA, 1: GFA 1.0, 2: GFA 2.0, 3: GFA-reduced)", + cxxopts::value()->default_value(std::to_string(cuttlefish::_default::OP_FORMAT))) ; options.add_options("specialized") // TODO: repurpose the following two options - ("mph", "minimal perfect hash (BBHash) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) - ("buckets", "hash table buckets (cuttlefish) file (optional)", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) + ("mph", "minimal perfect hash (BBHash) file (optional)", + cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) + ("buckets", "hash table buckets (cuttlefish) file (optional)", + cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) ("save-vertices", 
"save the vertex set of the graph") ; options.add_options("debug") - ("vertex-set", "set of vertices, i.e. k-mers (KMC database) prefix", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) - ("edge-set", "set of edges, i.e. (k + 1)-mers (KMC database) prefix", cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) + ("vertex-set", "set of vertices, i.e. k-mers (KMC database) prefix", + cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) + ("edge-set", "set of edges, i.e. (k + 1)-mers (KMC database) prefix", + cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) #ifdef CF_DEVELOP_MODE - ("gamma", "gamma for the BBHash MPHF", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::GAMMA))) + ("gamma", "gamma for the BBHash MPHF", + cxxopts::value()->default_value(std::to_string(cuttlefish::_default::GAMMA))) #endif ; @@ -137,15 +152,24 @@ void validate(int argc, char** argv) { cxxopts::Options options("cuttlefish validate", "Validate a compacted de Bruijn graph constructed by cuttlefish"); options.add_options() - ("r,refs", "reference files", cxxopts::value>()->default_value("")) - ("l,lists", "reference file lists", cxxopts::value>()->default_value("")) - ("d,dirs", "reference file directories", cxxopts::value>()->default_value("")) - ("k,kmer_len", "k-mer length", cxxopts::value()) - ("s,kmc_db", "set of k-mers (KMC database) prefix", cxxopts::value()) - ("g,cdbg", "compacted de Bruijn graph file", cxxopts::value()) - ("t,threads", "number of threads to use", cxxopts::value()->default_value("1")) - ("w,work_dir", "working directory", cxxopts::value()->default_value(".")) - ("mph", "minimal perfect hash (BBHash) file (optional)", cxxopts::value()->default_value("")) + ("r,refs", "reference files", + cxxopts::value>()->default_value("")) + ("l,lists", "reference file lists", + cxxopts::value>()->default_value("")) + ("d,dirs", "reference file directories", + cxxopts::value>()->default_value("")) + ("k,kmer_len", "k-mer length", + cxxopts::value()) + ("s,kmc_db", "set of k-mers (KMC database) prefix", + cxxopts::value()) + ("g,cdbg", "compacted de Bruijn graph file", + cxxopts::value()) + ("t,threads", "number of threads to use", + cxxopts::value()->default_value("1")) + ("w,work_dir", "working directory", + cxxopts::value()->default_value(".")) + ("mph", "minimal perfect hash (BBHash) file (optional)", + cxxopts::value()->default_value("")) ("h,help", "print usage"); try From 307757396ce1a6f8d5076a68b36ef0d8dc4ed739 Mon Sep 17 00:00:00 2001 From: jamshed Date: Thu, 27 Jan 2022 19:36:48 -0500 Subject: [PATCH 321/350] Make output format optional --- include/Build_Params.hpp | 6 +++--- include/Input_Defaults.hpp | 2 +- src/Build_Params.cpp | 6 +++--- src/main.cpp | 6 ++++-- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index a007d132..4a31bf9a 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -32,7 +32,7 @@ class Build_Params const std::optional max_memory_; // Soft maximum memory limit (in GB). const bool strict_memory_; // Whether strict memory limit restriction is specifiied. const std::string output_file_path_; // Path to the output file. - const cuttlefish::Output_Format output_format_; // Output format (0: FASTA, 1: GFAv1, 2: GFAv2). + const std::optional output_format_; // Output format (0: FASTA, 1: GFAv1, 2: GFAv2, 3: GFA-reduced). const std::string working_dir_path_; // Path to the working directory (for temporary files). 
const bool path_cover_; // Whether to extract a maximal path cover of the de Bruijn graph. const std::string mph_file_path_; // Optional path to file storing an MPH over the k-mer set. @@ -59,7 +59,7 @@ class Build_Params const std::optional max_memory, const bool strict_memory, const std::string& output_file_path, - const uint8_t output_format, + const std::optional output_format, const std::string& working_dir_path, const bool path_cover, const std::string& mph_file_path, @@ -158,7 +158,7 @@ class Build_Params // Returns the output format. cuttlefish::Output_Format output_format() const { - return output_format_; + return output_format_.value_or(cuttlefish::_default::OP_FORMAT); } diff --git a/include/Input_Defaults.hpp b/include/Input_Defaults.hpp index 4abf618f..79b45846 100644 --- a/include/Input_Defaults.hpp +++ b/include/Input_Defaults.hpp @@ -21,7 +21,7 @@ namespace cuttlefish #ifdef CF_DEVELOP_MODE constexpr double GAMMA = 0; #endif - constexpr uint16_t OP_FORMAT = Output_Format::fa; + constexpr Output_Format OP_FORMAT = Output_Format::fa; constexpr char WORK_DIR[] = "."; } } diff --git a/src/Build_Params.cpp b/src/Build_Params.cpp index 3bb6aced..05d01c67 100644 --- a/src/Build_Params.cpp +++ b/src/Build_Params.cpp @@ -17,7 +17,7 @@ Build_Params::Build_Params( const bool is_read_graph, const std::optional max_memory, const bool strict_memory, const std::string& output_file_path, - const uint8_t output_format, + const std::optional output_format, const std::string& working_dir_path, const bool path_cover, const std::string& mph_file_path, @@ -38,7 +38,7 @@ Build_Params::Build_Params( const bool is_read_graph, max_memory_(max_memory), strict_memory_(strict_memory), output_file_path_(output_file_path), - output_format_(cuttlefish::Output_Format(output_format)), + output_format_(output_format), working_dir_path_(working_dir_path.back() == '/' ? working_dir_path : working_dir_path + "/"), path_cover_(path_cover), mph_file_path_(mph_file_path), @@ -127,7 +127,7 @@ bool Build_Params::is_valid() const // Cuttlefish 1 specific arguments can not be specified. - if(output_format_ != cuttlefish::Output_Format::fa) + if(output_format_) { std::cout << "Cuttlefish 1 specific arguments specified while using Cuttlefish 2.\n"; valid = false; diff --git a/src/main.cpp b/src/main.cpp index bd13bdce..d625fbf5 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -54,9 +54,10 @@ void build(int argc, char** argv) ("path-cover", "extract a maximal path cover of the de Bruijn graph") ; + std::optional format_code; options.add_options("cuttlefish_1") ("f,format", "output format (0: FASTA, 1: GFA 1.0, 2: GFA 2.0, 3: GFA-reduced)", - cxxopts::value()->default_value(std::to_string(cuttlefish::_default::OP_FORMAT))) + cxxopts::value>(format_code)) ; options.add_options("specialized") @@ -99,7 +100,8 @@ void build(int argc, char** argv) const auto thread_count = result["threads"].as(); const auto strict_memory = !result["unrestrict-memory"].as(); const auto output_file = result["output"].as(); - const auto format = result["format"].as(); + const auto format = format_code ? 
std::optional(cuttlefish::Output_Format(format_code.value())) : + std::optional(); const auto working_dir = result["work-dir"].as(); const auto path_cover = result["path-cover"].as(); const auto mph_file = result["mph"].as(); From 7d81f6a9d052bcda5bb94f909260d15dbe11fb3f Mon Sep 17 00:00:00 2001 From: jamshed Date: Fri, 28 Jan 2022 11:52:24 -0500 Subject: [PATCH 322/350] Make input paths optional --- include/Build_Params.hpp | 6 +++--- include/Seq_Input.hpp | 6 ++++++ src/Build_Params.cpp | 6 +++--- src/Seq_Input.cpp | 10 ++++++++++ src/main.cpp | 12 ++++++------ 5 files changed, 28 insertions(+), 12 deletions(-) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index 4a31bf9a..8ee948fb 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -48,9 +48,9 @@ class Build_Params // Constructs a parameters wrapper object with the self-explanatory parameters. Build_Params( const bool is_read_graph, const bool is_ref_graph, - const std::vector& seq_paths, - const std::vector& list_paths, - const std::vector& dir_paths, + const std::optional>& seq_paths, + const std::optional>& list_paths, + const std::optional>& dir_paths, const uint16_t k, const std::optional cutoff, const std::string& vertex_db_path, diff --git a/include/Seq_Input.hpp b/include/Seq_Input.hpp index aa5976c7..a85948ba 100644 --- a/include/Seq_Input.hpp +++ b/include/Seq_Input.hpp @@ -6,6 +6,7 @@ #include #include +#include // A class to pack the input sequences. @@ -17,12 +18,17 @@ class Seq_Input const std::vector list_paths_; // Collection of paths to lists containing sequence file paths. const std::vector dir_paths_; // Collection of paths to directories containing sequence files. + static const std::vector empty_collection; // A representative empty collection of sequences. + public: // Constructs a collection of input sequences. Seq_Input(const std::vector& seqs, const std::vector& lists, const std::vector& dirs); + // Constructs a collection of input sequences. + Seq_Input(const std::optional>& seqs, const std::optional>& lists, const std::optional>& dirs); + // Returns the collection of paths to raw sequences. 
const std::vector& seq_paths() const; diff --git a/src/Build_Params.cpp b/src/Build_Params.cpp index 05d01c67..884c1be4 100644 --- a/src/Build_Params.cpp +++ b/src/Build_Params.cpp @@ -6,9 +6,9 @@ Build_Params::Build_Params( const bool is_read_graph, const bool is_ref_graph, - const std::vector& seq_paths, - const std::vector& list_paths, - const std::vector& dir_paths, + const std::optional>& seq_paths, + const std::optional>& list_paths, + const std::optional>& dir_paths, const uint16_t k, const std::optional cutoff, const std::string& vertex_db_path, diff --git a/src/Seq_Input.cpp b/src/Seq_Input.cpp index 4e57496c..6866af56 100644 --- a/src/Seq_Input.cpp +++ b/src/Seq_Input.cpp @@ -6,6 +6,9 @@ #include +const std::vector Seq_Input::empty_collection; + + Seq_Input::Seq_Input( const std::vector& seqs, const std::vector& lists, const std::vector& dirs): @@ -15,6 +18,13 @@ Seq_Input::Seq_Input( const std::vector& seqs, {} +Seq_Input::Seq_Input( const std::optional>& seqs, + const std::optional>& lists, + const std::optional>& dirs): + Seq_Input(seqs.value_or(empty_collection), lists.value_or(empty_collection), dirs.value_or(empty_collection)) +{} + + const std::vector& Seq_Input::seq_paths() const { return seq_paths_; diff --git a/src/main.cpp b/src/main.cpp index d625fbf5..094839bc 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -23,14 +23,17 @@ void build(int argc, char** argv) { cxxopts::Options options("cuttlefish build", "Efficiently construct the compacted de Bruijn graph from sequencing reads or reference sequences"); + std::optional> seqs; + std::optional> lists; + std::optional> dirs; std::optional max_memory; options.add_options("common") ("s,seq", "input files", - cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) + cxxopts::value>>(seqs)) ("l,list", "input file lists", - cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) + cxxopts::value>>(lists)) ("d,dir", "input file directories", - cxxopts::value>()->default_value(cuttlefish::_default::EMPTY)) + cxxopts::value>>(dirs)) ("k,kmer-len", "k-mer length", cxxopts::value()->default_value(std::to_string(cuttlefish::_default::K))) ("t,threads", "number of threads to use", @@ -91,9 +94,6 @@ void build(int argc, char** argv) const auto is_read_graph = result["read"].as(); const auto is_ref_graph = result["ref"].as(); - const auto seqs = result["seq"].as>(); - const auto lists = result["list"].as>(); - const auto dirs = result["dir"].as>(); const auto k = result["kmer-len"].as(); const auto vertex_db = result["vertex-set"].as(); const auto edge_db = result["edge-set"].as(); From b9d76cbf0b893f4ca06cd4d7e89454247651ea89 Mon Sep 17 00:00:00 2001 From: jamshed Date: Fri, 28 Jan 2022 15:49:30 -0500 Subject: [PATCH 323/350] Repurpose hash table CLI params option to save, instead of passing direct paths --- include/Build_Params.hpp | 25 +++++++++++++++++++------ include/Kmer_Hash_Table.hpp | 13 +++++++------ src/Build_Params.cpp | 10 +++++----- src/CdBG.cpp | 2 +- src/CdBG_Builder.cpp | 4 +--- src/Kmer_Hash_Table.cpp | 14 ++++++-------- src/Read_CdBG.cpp | 2 +- src/Read_CdBG_Constructor.cpp | 9 +++++++++ src/main.cpp | 13 +++++-------- 9 files changed, 54 insertions(+), 38 deletions(-) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index 8ee948fb..00c072a8 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -35,8 +35,8 @@ class Build_Params const std::optional output_format_; // Output format (0: FASTA, 1: GFAv1, 2: GFAv2, 3: GFA-reduced). 
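The Seq_Input change above applies the same normalization one level earlier: optional path collections collapse to empty lists at construction, so nothing downstream has to branch on presence. A condensed, hypothetical rendering of that idea (names are placeholders; the real class instead forwards to its existing three-vector constructor through a shared static empty collection):

#include <optional>
#include <string>
#include <vector>

class Seq_Input_Sketch
{
    const std::vector<std::string> seqs_, lists_, dirs_;

public:
    // An absent option degenerates to an empty collection, so callers can
    // iterate over the path lists unconditionally.
    Seq_Input_Sketch(const std::optional<std::vector<std::string>>& seqs,
                     const std::optional<std::vector<std::string>>& lists,
                     const std::optional<std::vector<std::string>>& dirs):
        seqs_(seqs.value_or(std::vector<std::string>())),
        lists_(lists.value_or(std::vector<std::string>())),
        dirs_(dirs.value_or(std::vector<std::string>()))
    {}

    bool empty() const { return seqs_.empty() && lists_.empty() && dirs_.empty(); }
};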
const std::string working_dir_path_; // Path to the working directory (for temporary files). const bool path_cover_; // Whether to extract a maximal path cover of the de Bruijn graph. - const std::string mph_file_path_; // Optional path to file storing an MPH over the k-mer set. - const std::string buckets_file_path_; // Optional path to file storing the hash table buckets for the k-mer set. + const bool save_mph_; // Option to save the MPH over the vertex set of the de Bruijn graph. + const bool save_buckets_; // Option to save the DFA-states collection of the vertices of the de Bruijn graph. const bool save_vertices_; // Option to save the vertex set of the de Bruijn graph (in KMC database format). #ifdef CF_DEVELOP_MODE const double gamma_; // The gamma parameter for the BBHash MPHF. @@ -62,8 +62,8 @@ class Build_Params const std::optional output_format, const std::string& working_dir_path, const bool path_cover, - const std::string& mph_file_path, - const std::string& buckets_file_path, + const bool save_mph, + const bool save_buckets, const bool save_vertices #ifdef CF_DEVELOP_MODE , const double gamma @@ -179,14 +179,27 @@ class Build_Params // Returns the path to the optional MPH file. const std::string mph_file_path() const { - return (is_read_graph() || is_ref_graph()) ? (output_file_path_ + cuttlefish::file_ext::hash_ext) : mph_file_path_; + return output_file_path_ + cuttlefish::file_ext::hash_ext; } // Returns the path to the optional file storing the hash table buckets. const std::string buckets_file_path() const { - return (is_read_graph() || is_ref_graph()) ? (output_file_path_ + cuttlefish::file_ext::buckets_ext) : buckets_file_path_; + return output_file_path_ + cuttlefish::file_ext::buckets_ext; + } + + // Returns whether the option to save the MPH over the vertex set of the de Bruijn graph is specified ot not. + bool save_mph() const + { + return save_mph_; + } + + + // Returns whether the option to save the DFA-states collection of the vertices of the de Bruijn graph. + bool save_buckets() const + { + return save_buckets_; } diff --git a/include/Kmer_Hash_Table.hpp b/include/Kmer_Hash_Table.hpp index bcde795b..618e563f 100644 --- a/include/Kmer_Hash_Table.hpp +++ b/include/Kmer_Hash_Table.hpp @@ -82,9 +82,10 @@ class Kmer_Hash_Table // Builds the minimal perfect hash function `mph` over the set of // k-mers present at the KMC database container `kmer_container`, - // with `mph_file_path` being the file to use for BBHash build // using `thread_count` number of threads. Uses the directory - // at `working_dir_path` to store temporary files. + // at `working_dir_path` to store temporary files. If the MPHF is + // found present at the file `mph_file_path`, then it is loaded + // instead. void build_mph_function(uint16_t thread_count, const std::string& working_dir_path, const std::string& mph_file_path); // Loads an MPH function from the file at `file_path` into `mph`. @@ -118,10 +119,10 @@ class Kmer_Hash_Table // Constructs a minimal perfect hash function (specifically, the BBHash) for // the collection of k-mers present at the KMC database at path `kmc_db_path`, - // using up-to `thread_count` number of threads. If a non-empty path is passed - // with `mph_file_path`, either an MPH is loaded from there (instead of building - // from scratch), or the newly built MPH is saved there. - void construct(uint16_t thread_count, const std::string& working_dir_path, const std::string& mph_file_path); + // using up-to `thread_count` number of threads. 
The existence of an MPHF is + // checked at the path `mph_file_path`—if found, it is loaded from the file. + // If `save_mph` is specified, then the MPHF is saved into the file `mph_file_path`. + void construct(uint16_t thread_count, const std::string& working_dir_path, const std::string& mph_file_path, const bool save_mph = false); // Returns the id / number of the bucket in the hash table that is // supposed to store value items for the key `kmer`. diff --git a/src/Build_Params.cpp b/src/Build_Params.cpp index 884c1be4..91a7a697 100644 --- a/src/Build_Params.cpp +++ b/src/Build_Params.cpp @@ -20,11 +20,11 @@ Build_Params::Build_Params( const bool is_read_graph, const std::optional output_format, const std::string& working_dir_path, const bool path_cover, - const std::string& mph_file_path, - const std::string& buckets_file_path, + const bool save_mph, + const bool save_buckets, const bool save_vertices #ifdef CF_DEVELOP_MODE - , const double gamma + , const double gamma #endif ): is_read_graph_(is_read_graph), @@ -41,8 +41,8 @@ Build_Params::Build_Params( const bool is_read_graph, output_format_(output_format), working_dir_path_(working_dir_path.back() == '/' ? working_dir_path : working_dir_path + "/"), path_cover_(path_cover), - mph_file_path_(mph_file_path), - buckets_file_path_(buckets_file_path), + save_mph_(save_mph), + save_buckets_(save_buckets), save_vertices_(save_vertices) #ifdef CF_DEVELOP_MODE , gamma_(gamma) diff --git a/src/CdBG.cpp b/src/CdBG.cpp index 4667f628..52e29720 100644 --- a/src/CdBG.cpp +++ b/src/CdBG.cpp @@ -113,7 +113,7 @@ void CdBG::construct_hash_table(const uint64_t vertex_count, const bool load) std::make_unique>(logistics.vertex_db_path(), vertex_count, max_memory) : std::make_unique>(logistics.vertex_db_path(), vertex_count, max_memory, std::numeric_limits::max())); - hash_table->construct(params.thread_count(), logistics.working_dir_path(), params.mph_file_path()); + hash_table->construct(params.thread_count(), logistics.working_dir_path(), params.mph_file_path(), params.save_mph()); } } diff --git a/src/CdBG_Builder.cpp b/src/CdBG_Builder.cpp index 091f9dd9..fa9bfa3b 100644 --- a/src/CdBG_Builder.cpp +++ b/src/CdBG_Builder.cpp @@ -80,12 +80,10 @@ void CdBG::classify_vertices() // Save the hash table buckets, if a file path is provided. - if(!buckets_file_path.empty()) + if(params.save_buckets()) { std::cout << "Saving the hash table buckets into file " << buckets_file_path << "\n"; - hash_table->save_hash_buckets(buckets_file_path); - std::cout << "Saved the buckets in disk.\n"; } } diff --git a/src/Kmer_Hash_Table.cpp b/src/Kmer_Hash_Table.cpp index f2133a01..79f5e459 100644 --- a/src/Kmer_Hash_Table.cpp +++ b/src/Kmer_Hash_Table.cpp @@ -69,8 +69,7 @@ template void Kmer_Hash_Table::build_mph_function(const uint16_t thread_count, const std::string& working_dir_path, const std::string& mph_file_path) { // The serialized BBHash file (saved from some earlier execution) exists. - struct stat buffer; - if(!mph_file_path.empty() && stat(mph_file_path.c_str(), &buffer) == 0) + if(!mph_file_path.empty() && file_exists(mph_file_path)) { std::cout << "Found the MPHF at file " << mph_file_path << ".\n"; std::cout << "Loading the MPHF.\n"; @@ -79,7 +78,7 @@ void Kmer_Hash_Table::build_mph_function(const uint16_t thread_ std::cout << "Loaded the MPHF into memory.\n"; } - else // No BBHash file name provided, or does not exist. Build and save (if specified) one now. + else // No BBHash file name provided, or does not exist. Build one now. 
{ // Open a container over the k-mer database. const Kmer_Container kmer_container(kmc_db_path); @@ -95,10 +94,6 @@ void Kmer_Hash_Table::build_mph_function(const uint16_t thread_ std::cout << "Built the MPHF in memory.\n"; // std::cout << "Total data copy time to BBHash buffers " << mph->data_copy_time << "\n\n"; - - - // Save the MPHF if specified. - // TODO: add `--save-hash` CL-parameter in main, replacing `--mph`. } } @@ -201,7 +196,7 @@ void Kmer_Hash_Table::remove(const Build_Params& params) const template -void Kmer_Hash_Table::construct(const uint16_t thread_count, const std::string& working_dir_path, const std::string& mph_file_path) +void Kmer_Hash_Table::construct(const uint16_t thread_count, const std::string& working_dir_path, const std::string& mph_file_path, const bool save_mph) { // std::chrono::high_resolution_clock::time_point t_start = std::chrono::high_resolution_clock::now(); @@ -212,6 +207,9 @@ void Kmer_Hash_Table::construct(const uint16_t thread_count, co // Build the minimal perfect hash function. build_mph_function(thread_count, working_dir_path, mph_file_path); + if(save_mph) + save_mph_function(mph_file_path); + const uint64_t total_bits = mph->totalBitSize(); std::cout << "\nTotal MPHF size: " << total_bits / (8 * 1024 * 1024) << " MB." " Bits per k-mer: " << static_cast(total_bits) / kmer_count << ".\n"; diff --git a/src/Read_CdBG.cpp b/src/Read_CdBG.cpp index 75b22e74..4e85062c 100644 --- a/src/Read_CdBG.cpp +++ b/src/Read_CdBG.cpp @@ -174,7 +174,7 @@ void Read_CdBG::construct_hash_table(const uint64_t vertex_count, const bool std::make_unique>(logistics.vertex_db_path(), vertex_count, max_memory) : std::make_unique>(logistics.vertex_db_path(), vertex_count, max_memory, std::numeric_limits::max())); #endif - hash_table->construct(params.thread_count(), logistics.working_dir_path(), params.mph_file_path()); + hash_table->construct(params.thread_count(), logistics.working_dir_path(), params.mph_file_path(), params.save_mph()); } } diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index e31c3c20..f7c8ad1a 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -55,6 +55,15 @@ void Read_CdBG_Constructor::compute_DFA_states(const std::string& edge_db_pat thread_pool.close(); std::cout << "\nNumber of processed edges: " << edges_processed << "\n"; + + + // Save the hash table buckets, if a file path is provided. 
+ if(params.save_buckets()) + { + std::cout << "Saving the hash table buckets into file " << buckets_file_path << "\n"; + hash_table.save_hash_buckets(buckets_file_path); + std::cout << "Saved the buckets in disk.\n"; + } } diff --git a/src/main.cpp b/src/main.cpp index 094839bc..174c9734 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -64,11 +64,8 @@ void build(int argc, char** argv) ; options.add_options("specialized") - // TODO: repurpose the following two options - ("mph", "minimal perfect hash (BBHash) file (optional)", - cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) - ("buckets", "hash table buckets (cuttlefish) file (optional)", - cxxopts::value()->default_value(cuttlefish::_default::EMPTY)) + ("save-mph", "save the minimal perfect hash (BBHash) over the vertex set") + ("save-buckets", "save the DFA-states collection of the vertices") ("save-vertices", "save the vertex set of the graph") ; @@ -104,8 +101,8 @@ void build(int argc, char** argv) std::optional(); const auto working_dir = result["work-dir"].as(); const auto path_cover = result["path-cover"].as(); - const auto mph_file = result["mph"].as(); - const auto buckets_file = result["buckets"].as(); + const auto save_mph = result["save-mph"].as(); + const auto save_buckets = result["save-buckets"].as(); const auto save_vertices = result["save-vertices"].as(); #ifdef CF_DEVELOP_MODE const double gamma = result["gamma"].as(); @@ -116,7 +113,7 @@ void build(int argc, char** argv) k, cutoff, vertex_db, edge_db, thread_count, max_memory, strict_memory, output_file, format, working_dir, path_cover, - mph_file, buckets_file, save_vertices + save_mph, save_buckets, save_vertices #ifdef CF_DEVELOP_MODE , gamma #endif From a793c45574648bcc06e818f894db15043718b55e Mon Sep 17 00:00:00 2001 From: jamshed Date: Fri, 28 Jan 2022 15:58:23 -0500 Subject: [PATCH 324/350] Remove a redundant param --- include/CdBG.hpp | 3 +-- src/CdBG.cpp | 22 +++++++--------------- 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/include/CdBG.hpp b/include/CdBG.hpp index 694437d6..5de669ae 100644 --- a/include/CdBG.hpp +++ b/include/CdBG.hpp @@ -120,8 +120,7 @@ class CdBG kmer_Enumeration_Stats enumerate_vertices() const; // Constructs the Cuttlefish hash table for the `vertex_count` vertices of the graph. - // If `load` is specified, then it is loaded from disk. - void construct_hash_table(uint64_t vertex_count, bool load = false); + void construct_hash_table(uint64_t vertex_count); // TODO: rename the "classify" methods with appropriate terminology that are consistent with the theory. diff --git a/src/CdBG.cpp b/src/CdBG.cpp index 52e29720..dba7502e 100644 --- a/src/CdBG.cpp +++ b/src/CdBG.cpp @@ -97,24 +97,16 @@ kmer_Enumeration_Stats CdBG::enumerate_vertices() const template -void CdBG::construct_hash_table(const uint64_t vertex_count, const bool load) +void CdBG::construct_hash_table(const uint64_t vertex_count) { - if(load) - { - hash_table = std::make_unique>(logistics.vertex_db_path(), vertex_count); - hash_table->load(params); - } - else - { - std::size_t max_memory = std::max(process_peak_memory(), params.max_memory() * 1024U * 1024U * 1024U); - max_memory = (max_memory > parser_memory ? max_memory - parser_memory : 0); + std::size_t max_memory = std::max(process_peak_memory(), params.max_memory() * 1024U * 1024U * 1024U); + max_memory = (max_memory > parser_memory ? max_memory - parser_memory : 0); - hash_table = (params.strict_memory() ? 
- std::make_unique>(logistics.vertex_db_path(), vertex_count, max_memory) : - std::make_unique>(logistics.vertex_db_path(), vertex_count, max_memory, std::numeric_limits::max())); + hash_table = (params.strict_memory() ? + std::make_unique>(logistics.vertex_db_path(), vertex_count, max_memory) : + std::make_unique>(logistics.vertex_db_path(), vertex_count, max_memory, std::numeric_limits::max())); - hash_table->construct(params.thread_count(), logistics.working_dir_path(), params.mph_file_path(), params.save_mph()); - } + hash_table->construct(params.thread_count(), logistics.working_dir_path(), params.mph_file_path(), params.save_mph()); } From 56e6fb660f40975915d12d1160e2008c3d674767 Mon Sep 17 00:00:00 2001 From: jamshed Date: Fri, 28 Jan 2022 17:04:05 -0500 Subject: [PATCH 325/350] Display hash table save messages --- src/CdBG_Builder.cpp | 3 +-- src/Kmer_Hash_Table.cpp | 3 +++ src/Read_CdBG_Constructor.cpp | 3 +-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/CdBG_Builder.cpp b/src/CdBG_Builder.cpp index fa9bfa3b..a2aaf47f 100644 --- a/src/CdBG_Builder.cpp +++ b/src/CdBG_Builder.cpp @@ -82,9 +82,8 @@ void CdBG::classify_vertices() // Save the hash table buckets, if a file path is provided. if(params.save_buckets()) { - std::cout << "Saving the hash table buckets into file " << buckets_file_path << "\n"; hash_table->save_hash_buckets(buckets_file_path); - std::cout << "Saved the buckets in disk.\n"; + std::cout << "Saved the hash buckets at " << buckets_file_path << "\n"; } } diff --git a/src/Kmer_Hash_Table.cpp b/src/Kmer_Hash_Table.cpp index 79f5e459..dda61741 100644 --- a/src/Kmer_Hash_Table.cpp +++ b/src/Kmer_Hash_Table.cpp @@ -208,7 +208,10 @@ void Kmer_Hash_Table::construct(const uint16_t thread_count, co build_mph_function(thread_count, working_dir_path, mph_file_path); if(save_mph) + { save_mph_function(mph_file_path); + std::cout << "Saved the hash function at " << mph_file_path << "\n"; + } const uint64_t total_bits = mph->totalBitSize(); std::cout << "\nTotal MPHF size: " << total_bits / (8 * 1024 * 1024) << " MB." diff --git a/src/Read_CdBG_Constructor.cpp b/src/Read_CdBG_Constructor.cpp index f7c8ad1a..4eb20158 100644 --- a/src/Read_CdBG_Constructor.cpp +++ b/src/Read_CdBG_Constructor.cpp @@ -60,9 +60,8 @@ void Read_CdBG_Constructor::compute_DFA_states(const std::string& edge_db_pat // Save the hash table buckets, if a file path is provided. if(params.save_buckets()) { - std::cout << "Saving the hash table buckets into file " << buckets_file_path << "\n"; hash_table.save_hash_buckets(buckets_file_path); - std::cout << "Saved the buckets in disk.\n"; + std::cout << "Saved the hash buckets at " << buckets_file_path << "\n"; } } From 58c8e409cfd6feca7825661eeecba709898a8fdd Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 31 Jan 2022 14:40:30 -0500 Subject: [PATCH 326/350] Bump default thread-count --- include/Input_Defaults.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/Input_Defaults.hpp b/include/Input_Defaults.hpp index 79b45846..854fa943 100644 --- a/include/Input_Defaults.hpp +++ b/include/Input_Defaults.hpp @@ -6,6 +6,8 @@ #include "Output_Format.hpp" +#include + namespace cuttlefish { @@ -16,7 +18,8 @@ namespace cuttlefish constexpr uint16_t K = 27; constexpr uint32_t CUTOFF_FREQ_READS = 2; // Typical practice constexpr uint32_t CUTOFF_FREQ_REFS = 1; // Typical assumption - constexpr uint16_t THREAD_COUNT = 1; + const uint16_t THREAD_COUNT = (std::thread::hardware_concurrency() ? 
+ (std::thread::hardware_concurrency()/ 4) : 8); // A quarter of the total thread-count. constexpr std::size_t MAX_MEMORY = 3; // Set as per KMC3 stage 1 performance. #ifdef CF_DEVELOP_MODE constexpr double GAMMA = 0; From 180fa903033e2b1ac7800f3069d4e5aaa59352bd Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 31 Jan 2022 14:50:30 -0500 Subject: [PATCH 327/350] Misc. tweak input validation --- src/Build_Params.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Build_Params.cpp b/src/Build_Params.cpp index 91a7a697..7766a6f8 100644 --- a/src/Build_Params.cpp +++ b/src/Build_Params.cpp @@ -65,7 +65,7 @@ bool Build_Params::is_valid() const // Even `k` values are not consistent with the theory. // Also, `k` needs to be in the range `[1, MAX_K]`. - if((k_ & 1U) == 0 || (k_ > cuttlefish::MAX_K)) + if((k_ & static_cast(1)) == 0 || (k_ > cuttlefish::MAX_K)) { std::cout << "The k-mer length (k) needs to be odd and within " << cuttlefish::MAX_K << ".\n"; valid = false; @@ -76,7 +76,7 @@ bool Build_Params::is_valid() const const auto num_threads = std::thread::hardware_concurrency(); if(num_threads > 0 && thread_count_ > num_threads) { - std::cout << "At most " << num_threads << " concurrent threads are supported at the machine.\n"; + std::cout << "At most " << num_threads << " concurrent threads are supported by the machine.\n"; valid = false; } @@ -136,7 +136,7 @@ bool Build_Params::is_valid() const else // Validate Cuttlefish 1 specific arguments. { // Invalid output formats are to be discarded. - if(output_format_ >= cuttlefish::num_op_formats) + if(output_format() >= cuttlefish::num_op_formats) { std::cout << "Invalid output file format.\n"; valid = false; From 9e33f851e13d02fd40aefb880803ca2770f5ee7d Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 31 Jan 2022 15:28:16 -0500 Subject: [PATCH 328/350] Add comands indicator help message --- src/main.cpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 174c9734..4b7a2cb5 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -212,6 +212,15 @@ void validate(int argc, char** argv) } +void display_help_message() +{ + std::cout << "Supported commands: `build`, `help`.\n"; + + std::cout << "Usage:\n"; + std::cout << "\tcuttlefish build [options]\n"; +} + + int main(int argc, char** argv) { #ifdef CF_DEVELOP_MODE @@ -219,10 +228,7 @@ int main(int argc, char** argv) #endif if(argc < 2) - { - std::cout << "Usage:\ncuttlefish [OPTIONS]" << std::endl; - std::cout << "Supported commands: `build` and `validate`." << std::endl; - } + display_help_message(); else { std::string command(argv[1]); @@ -232,8 +238,10 @@ int main(int argc, char** argv) build(argc - 1, argv + 1); else if(command == "validate") validate(argc - 1, argv + 1); + else if(command == "help") + display_help_message(); else - std::cout << "Invalid command. 
Supported commands: `build` and `validate`" << std::endl; + display_help_message(); } return EXIT_SUCCESS; From 726df43ee22aeb2cf186f28634ec8c036bfcaaaa Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 31 Jan 2022 15:42:42 -0500 Subject: [PATCH 329/350] Add version displayer --- CMakeLists.txt | 2 ++ include/version.hpp | 14 ++++++++++++++ src/main.cpp | 12 +++++++++++- 3 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 include/version.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index b5c606bc..7523782c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,6 +41,8 @@ set(OPTIMIZE_FLAGS -funroll-loops) # TODO: find out what are `__STDC_FORMAT_MACROS` and `SPDLOG_FMT_EXTERNAL_HO` for. add_compile_definitions(__STDC_FORMAT_MACROS SPDLOG_FMT_EXTERNAL_HO FMT_HEADER_ONLY XXH_INLINE_ALL) +add_compile_definitions(PROJECT_VERSION=${CMAKE_PROJECT_VERSION}) + if(INSTANCE_COUNT) add_compile_definitions(INSTANCE_COUNT=${INSTANCE_COUNT}) endif() diff --git a/include/version.hpp b/include/version.hpp new file mode 100644 index 00000000..934761e0 --- /dev/null +++ b/include/version.hpp @@ -0,0 +1,14 @@ + + +#include + + +// https://stackoverflow.com/a/20632065/2007834 +#define STRINGIFY2(X) #X +#define STRINGIFY(X) STRINGIFY2(X) + + +inline std::string version() +{ + return STRINGIFY(PROJECT_VERSION); +} diff --git a/src/main.cpp b/src/main.cpp index 4b7a2cb5..49991499 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -6,6 +6,7 @@ #include "Build_Params.hpp" #include "Validation_Params.hpp" #include "Application.hpp" +#include "version.hpp" #include "cxxopts/cxxopts.hpp" #include "spdlog/sinks/stdout_color_sinks.h" @@ -212,9 +213,16 @@ void validate(int argc, char** argv) } +std::string executable_version() +{ + return "cuttlefish " + version(); +} + + void display_help_message() { - std::cout << "Supported commands: `build`, `help`.\n"; + std::cout << executable_version() << "\n"; + std::cout << "Supported commands: `build`, `help`, `version`.\n"; std::cout << "Usage:\n"; std::cout << "\tcuttlefish build [options]\n"; @@ -240,6 +248,8 @@ int main(int argc, char** argv) validate(argc - 1, argv + 1); else if(command == "help") display_help_message(); + else if(command == "version") + std::cout << executable_version() << "\n"; else display_help_message(); } From 6b72e47ad4f2882c4cfb95116d64216fcec4f15d Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 31 Jan 2022 16:07:18 -0500 Subject: [PATCH 330/350] Update usage-manual --- README.md | 274 ++++++++++++++++--------------------------- data/reads.fq | 12 ++ data/ust-example.fa | 6 - data/ust-example.lst | 3 - 4 files changed, 111 insertions(+), 184 deletions(-) create mode 100644 data/reads.fq delete mode 100644 data/ust-example.fa delete mode 100644 data/ust-example.lst diff --git a/README.md b/README.md index 686facbd..ffd0705a 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/cuttlefish/README.html) -Cuttlefish is a fast, parallel, and very lightweight memory tool to construct the compacted de Bruijn graph from sequencing reads or reference sequences, which is highly scalable in terms of the size of the input data. +Cuttlefish is a fast, parallel, and very lightweight memory tool to construct the compacted de Bruijn graph from sequencing reads or reference sequences, and is highly scalable in terms of the size of the input data. 
## Table of contents @@ -26,14 +26,14 @@ Cuttlefish is a fast, parallel, and very lightweight memory tool to construct th Cuttlefish is a program to produce the compacted de Bruijn graph from sequencing reads or reference sequences. -The papers describing the work are: [Cuttlefish 1](https://academic.oup.com/bioinformatics/article/37/Supplement_1/i177/6319696) and [Cuttlefish 2](https://doi.org/10.1101/2021.12.14.472718) (pre-print). +The papers describing the work are: [Cuttlefish (original)](https://academic.oup.com/bioinformatics/article/37/Supplement_1/i177/6319696) and [Cuttlefish 2](https://doi.org/10.1101/2021.12.14.472718) (pre-print). ## Dependencies - -To install Cuttlefish from source, the following are required: +Cuttlefish can be installed using Bioconda (check [Installation](#installation)). +If installing _from source_, the following are required: -- [GCC](https://gcc.gnu.org/) compilers for C++14 and C11 +- [GCC](https://gcc.gnu.org/) compilers for C++17 and C11 - [CMake](https://cmake.org/) (version >= 3.14) - [zlib](https://zlib.net/) - [bzip2](https://www.sourceware.org/bzip2/) @@ -54,19 +54,9 @@ Besides, these should also be available via some package manager for your operat brew install cmake zlib bzip2 ``` -Cuttlefish also makes use of the [KMC 3](https://github.com/refresh-bio/KMC) tool. -If you are installing Cuttlefish from the source, then it will be automatically installed. -To use with Cuttlefish 1 while it is installed using `conda`, you may use the following to install KMC 3: - -- From [Bioconda](https://bioconda.github.io/user/install.html): - - ```bash - conda install -c bioconda kmc - ``` - ## Installation -- From [Bioconda](https://bioconda.github.io/user/install.html) (only for Cuttlefish 1, _for now_): +- From [Bioconda](https://bioconda.github.io/user/install.html): ```bash conda install -c bioconda cuttlefish @@ -75,17 +65,18 @@ To use with Cuttlefish 1 while it is installed using `conda`, you may use the fo The Conda package supports _k_ values up-to 127. To use larger _k_ values, please install Cuttlefish from the source. -- From source (works for both Cuttlefish 1 and 2): +- From source: ```bash git clone https://github.com/COMBINE-lab/cuttlefish.git - cd cuttlefish/ && mkdir build && cd build/ + cd cuttlefish/ && git checkout develop + mkdir build && cd build/ cmake -DCMAKE_INSTALL_PREFIX=../ .. make -j 8 install cd .. ``` - You may replace `8` in `make -j 8` with the preferred count for threads to use in the installation process. + You may replace `8` in `make -j 8` with the preferred count of threads to use in the installation process. This installs Cuttlefish in a sub-directory named `bin`, inside the project root directory. To specify a different installation directory, its path may be passed as the value of `-DCMAKE_INSTALL_PREFIX` with the `cmake` command, i.e. you may use `cmake -DCMAKE_INSTALL_PREFIX=/ ..` . @@ -97,151 +88,111 @@ To use with Cuttlefish 1 while it is installed using `conda`, you may use the fo ## Usage -`cuttlefish build --help` displays the following message: +`cuttlefish build --help` displays the following message (the default `threads` argument is machine-configuration specific): -```bash +```txt Efficiently construct the compacted de Bruijn graph from sequencing reads or reference sequences Usage: cuttlefish build [OPTION...] 
common options: - -r, --refs arg input files (default: "") - -l, --lists arg input file lists (default: "") - -d, --dirs arg input file directories (default: "") - -k, --kmer-len arg k-mer length (default: 25) - -t, --threads arg number of threads to use (default: 1) - -o, --output arg output file (default: "") - -w, --work-dir arg working directory (default: .) - -h, --help print usage - - cuttlefish 1.0 options: - -s, --kmc-db arg set of vertices, i.e. k-mers (KMC database) prefix - (default: .) - -f, --format arg output format (0: txt, 1: GFA 1.0, 2: GFA 2.0, 3: - GFA-reduced) (default: 0) - --rm remove the KMC database - - cuttlefish 2.0 options: - --read construct a compacted read de Bruijn graph - --ref construct a compacted reference de Bruijn graph - -c, --cutoff arg frequency cutoff for (k + 1)-mers (default: 2) - -m, --max-memory arg soft maximum memory limit (in GB) (default: 3) + -s, --seq arg input files + -l, --list arg input file lists + -d, --dir arg input file directories + -k, --kmer-len arg k-mer length (default: 27) + -t, --threads arg number of threads to use (default: 22) + -o, --output arg output file + -w, --work-dir arg working directory (default: .) + -m, --max-memory arg soft maximum memory limit in GB (default: 3) --unrestrict-memory do not impose memory usage restriction - --path-cover extract a maximal path cover of the de Bruijn - graph + -h, --help print usage + + cuttlefish_1 options: + -f, --format arg output format (0: FASTA, 1: GFA 1.0, 2: GFA 2.0, 3: + GFA-reduced) + + cuttlefish_2 options: + --read construct a compacted read de Bruijn graph (for FASTQ + input) + --ref construct a compacted reference de Bruijn graph (for + FASTA input) + -c, --cutoff arg frequency cutoff for (k + 1)-mers (default: refs: 1, + reads: 2) + --path-cover extract a maximal path cover of the de Bruijn graph debug options: - -e, --edge-db arg set of edges, i.e. (k + 1)-mers (KMC database) prefix - (default: "") + --vertex-set arg set of vertices, i.e. k-mers (KMC database) prefix + (default: "") + --edge-set arg set of edges, i.e. (k + 1)-mers (KMC database) prefix + (default: "") specialized options: - --mph arg minimal perfect hash (BBHash) file (optional) - (default: "") - --buckets arg hash table buckets (cuttlefish) file (optional) - (default: "") + --save-mph save the minimal perfect hash (BBHash) over the vertex + set + --save-buckets save the DFA-states collection of the vertices --save-vertices save the vertex set of the graph ``` -### Cuttlefish 2 +It supports GNU style arguments, `--` for long options, and `-` for short options. +Long options `opt` taking a parameter can be written as `--opt=parameter` or as `--opt parameter`. +Short options `o` taking a parameter is written as `-o parameter`. -To construct a compacted de Bruijn graph, use Cuttlefish as following: - -```bash -cuttlefish build -k -c -o -t -w -``` +The common arguments (for Cuttlefish 1 and 2) are set as following. -The arguments are set as following: +- The input files can be passed in any of the following ways (and the options may be mixed together). + - `-s ` + - `-l ` + - `-d ` -- The `` argument should be either `--read` or `--ref`, based on whether you are providing sequencing reads or reference sequences as input, respectively. -- The input files `` can be passed in any of the following ways (and the options may be mixed together). 
- - `-r ` - - `-l ` - - `-d ` + Multiple values for each option can be passed as `--seq=s1,s2,...`, `--seq s1 --seq s2 ...`, `-s s1,s2 ...`, or `-s s1 -s s2` (similarly for `list` and `dir`). In case of using sequencing reads as input, the files should be in the FASTQ format. For reference sequences, those should be in the FASTA format. The input files can also be possibly gzipped. -- The _k_-mer length `k` must be odd and within `63` (see [Larger _k_-mer sizes](#larger-k-mer-sizes) to increase the _k_-mer size capacity beyond this). -The default value is `25`. -- The frequency threshold `c` is set to `2` by default, and should be set to `1` when passing reference sequences as input. -- Cuttlefish 2 generates two output files with the prefix ``: - - A FASTA file containing the maximal unitigs of the de Bruijn graph (with the extension `.fa`). +- The _k_-mer length `k` must be odd and within `127` (and `63` if installed from source; see [Larger _k_-mer sizes](#larger-k-mer-sizes) to increase the _k_-mer size capacity beyond these). +The default value is `27`. +- The number of threads `t` is set to a quarter of the number of concurrent threads supported, by default. +The use of high-enough values is recommended. +- Cuttlefish generates two output files: + - A FASTA / GFA1 / GFA2 file containing the maximal unitigs of the de Bruijn graph (with the extension `.fa` / `.gfa1` / `.gfa2`). + The GFA output formats are exclusive for Cuttlefish 1. - A metadata file containing some structural characteristics of the de Bruijn graph and its compacted form (with the extension `.json`). -- The number of threads `t` is set to `1` by default, and the use of higher values is recommended. - The working directory `w` is used for temporary files created by the process—it is not created by Cuttlefish, and must exist beforehand. The current directory is set as the default working directory. +- A soft maximum memory-limit `m` (in GB) can be provided to trade-off the RAM usage for faster execution time; +this will only be adhered to if the provided limit is at least the minimum required memory for Cuttlefish, determined internally. +- No memory-usage restriction can be imposed using `unrestrict-memory`, trading off RAM usage for faster execution time. -Some other useful arguments: - -- `--path-cover` to construct a maximal vertex-disjoint path cover of the de Bruijn graph, instead of its compacted variant -- `-m ` to pass a soft maximum memory-limit (in GB) to trade-off RAM usage for faster execution time; this will only be adhered to if the provided limit is larger than the minimum required memory for Cuttlefish, determined internally -- `--unrestrict-memory` to not impose any memory-usage restriction, trading off RAM usage for faster execution time - -### Cuttlefish 1 - -Unlike Cuttlefish 2, Cuttlefish 1 does not execute KMC 3 by itself (_for now_). -To produce the _k_-mer set from an individual input reference sequence using KMC 3, the following may be used: - -```bash -kmc -k -fm -ci1 -t -``` - -If working with multiple references, you may use: - -```bash -kmc -k -fm -ci1 -t @ -``` - -The input file `` or the files listed in `` should be in the FASTA format, possibly gzipped. -The `k` value should be odd (required by Cuttlefish), and is `25` by default. -Having executed, KMC 3 will produce two files with the same prefix ``, and extensions `.kmc_pre` and `.kmc_suf`. 
-When working within strict memory limits, you should add the arguments `-m -sm` with these invocations, where `` is your memory budget in gigabytes. - - +Cuttlefish 1 specific arguments are set as following. -Then to build the compacted de Bruijn graph, use Cuttlefish as following: - -```bash -cuttlefish build -k -s -o -f -t -w -``` +- The output formats (`f`) are — + - `0`: only the maximal unitig (non-branching path) fragments, in FASTA; + - `1`: the maximal unitigs, their connectivities, and the input sequence tilings, in GFA 1.0; + - `2`: the maximal unitigs, their connectivities, and the input sequence tilings, in GFA 2.0; and + - `3`: the maximal unitigs and the input sequence tilings, in GFA-reduced (see [I/O formats](#io-formats)). -The arguments are set as following: +Cuttlefish 2 specific arguments are set as following. -- The input references can be passed in any of the following ways (and the options may be mixed together). - - `-r ` - - `-l ` - - `-d ` - - Each input reference should be in the FASTA format, possibly gzipped. -- The _k_-mer length `k` must be odd and within `63` (or, `127` if you install Cuttlefish using `conda`; see [Larger _k_-mer sizes](#larger-k-mer-sizes) to increase the _k_-mer size capacity beyond these). -The default value is `25`. -- The _k_-mer set prefix `s` must match exactly the output path used in the `kmc` invocation, i.e. it should be the `` argument from the `kmc` invocation. -- The output formats (`f`) are — - - `0`: only the maximal unitig (non-branching path) fragments; - - `1`: GFA 1.0; - - `2`: GFA 2.0; and - - `3`: GFA-reduced (see [I/O formats](#io-formats)). -- The number of threads `t` is set to `1` by default, and the use of higher values is recommended. -- The working directory `-w` is used for temporary files created by the process—it is not created by Cuttlefish, and must exist beforehand. -The current directory is set as the default working directory. +- `read` and `ref` are ''input type'' arguments, based on whether you are providing sequencing reads or reference sequences as input, respectively. +- The frequency threshold `c` (of (k + 1)-mers) is set to `2` for read inputs, and `1` for reference inputs, by default. +- `path-cover` is used to construct a maximal vertex-disjoint path cover of the de Bruijn graph, instead of its compacted variant. ## Output formats -### Cuttlefish 2 +### Cuttlefish 2 output The currently supported output format is -- The set of the maximal unitigs (non-branching paths) from the original de Bruijn graph, in FASTA +- The set of the maximal unitigs (non-branching paths) of the de Bruijn graph, in FASTA Other output formats are currently in the development roadmap. -### Cuttlefish 1 +### Cuttlefish 1 output The currently supported output formats are — -- The set of the maximal unitigs (non-branching paths) from the original de Bruijn graph, in plain text +- The set of the maximal unitigs (non-branching paths) of the de Bruijn graph, in FASTA - The compacted de Bruijn graph in the [GFA 1.0](https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md) and the [GFA 2.0](https://github.com/GFA-spec/GFA-spec/blob/master/GFA2.md) formats - The compacted de Bruijn graph in a ''reduced'' GFA format. It consists of two files, with the extensions: `.cf_seg` and `.cf_seq`: - The `.cf_seg` file contains all the maximal unitig fragments of the graph (the segment outputs from GFA, i.e. the `S`-tagged entries), each one with a unique id. 
@@ -307,65 +258,39 @@ If one were constructing the compacted colored de Bruijn graph from raw sequenci ## Example usage -### Cuttlefish 2 - -_To be completed_ - -### Cuttlefish 1 - -Please use the `kmc` and the `cuttlefish` executables from their respective paths in the following examples. We use _k_ = 3, and 4 CPU threads, with a working directory named `temp` in the following examples. -- **For individual input genome reference** - - To output the compacted de Bruijn graph (in GFA 1.0) for the example FASTA file `refs1.fa` (provided in the `data` directory), the following may be used: - - - Generate the _k_-mer set: - - ```bash - kmc -k3 -fa -ci1 -t4 refs1.fa kmers temp/ - ``` +### Using Cuttlefish 2 - - Output the compacted graph (in GFA 1.0): +- **From FASTQ files** - ```bash - cuttlefish build -r refs1.fa -k 3 -s kmers -t 4 -o cdbg.gfa1 -f 1 -w temp/ - ``` +To construct the maximal unitigs of the example FASTQ file `reads.fq` (provided in the `data` directory) with frequency cutoff `c = 1`, the following may be used. - To get only the maximal unitig fragments (which is `-f 0` by default): - - ```bash - cuttlefish build -r refs1.fa -k 3 -s kmers -t 4 -o cdbg.txt -w temp/ - ``` - -- **For multiple input genome references** +```bash +cuttlefish build -s reads.fq -k 3 -t 4 -o cdbg -w temp/ --read -c 1 +``` - To output the compacted de Bruijn graph (in GFA 2.0) for the example FASTA files `refs1.fa` and `refs2.fa` (provided in the `data` directory), the following may be used: +- **From FASTA files** - - Produce a newline-separated list of the paths of the input references. For example, +To construct the maximal unitigs of the example FASTA file `refs1.fa` (provided in the `data` directory), the following may be used. - ```bash - readlink -f refs1.fa > refs.lst - readlink -f refs2.fa >> refs.lst - ``` +```bash +cuttlefish build -s refs1.fa -k 3 -t 4 -o cdbg -w temp/ --ref +``` - - Generate the _k_-mer set: +These executions will produce two output files each: `cdbg.fa`, containing the maximal unitigs of the graph; and `cdbg.json`, a metadata file with some structural characteristics of the graph. - ```bash - kmc -k3 -fa -ci1 -t4 @refs.lst kmers temp/ - ``` +Multiple seq-files, lists of seq-files, or directories of seq-files may also be passed, as described in [Usage](#usage). - - Output the compacted graph (in GFA 2.0): +### Using Cuttlefish 1 - ```bash - cuttlefish build -l refs.lst -k 3 -s kmers -t 4 -o cdbg.gfa2 -f 2 -w temp/ - ``` +To output the compacted de Bruijn graph (in GFA 2.0) for the example FASTA files `refs1.fa` and `refs2.fa` (provided in the `data` directory), the following may be used: - Or, +```bash +cuttlefish build -r refs1.fa,refs2.fa -k 3 -s kmers -t 4 -o cdbg.gfa2 -f 2 -w temp/ +``` - ```bash - cuttlefish build -r refs1.fa,refs2.fa -k 3 -s kmers -t 4 -o cdbg.gfa2 -f 2 -w temp/ - ``` +You may also provide lists or directories of reference files as input, as described in [Usage](#usage). ## Larger _k_-mer sizes @@ -378,16 +303,11 @@ cmake -DINSTANCE_COUNT=64 .. ``` Cuttlefish supports only the odd `k` values within `MAX_K` due to theoretical reasons. -Currently, KMC3 supports a `MAX_K` of `255`. - - +Currently, `MAX_K` is supported upto 255. +Please contact the authors if support for a larger `MAX_K` is required. 
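For instance, a complete from-source rebuild with a larger `MAX_K`, followed by a run at a bigger _k_, could look as follows. This is only a sketch: the `INSTANCE_COUNT` value mirrors the command above, while the _k_ value of `75` and the input file name `large-refs.fa` are illustrative placeholders.

```bash
# Configure with a larger instance count (for a higher MAX_K) and the usual install prefix, then build.
cmake -DINSTANCE_COUNT=64 -DCMAKE_INSTALL_PREFIX=../ ..
make -j 8 install

# Run with an odd k beyond the default ceiling (the input file name is hypothetical).
cuttlefish build -s large-refs.fa -k 75 -t 8 -o cdbg -w temp/ --ref
```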
Note that, Cuttlefish uses only as many bytes as required (rounded up to multiples of 8) for a _k_-mer as necessary—thus increasing the maximum _k_-mer size capacity through setting large values for `MAX_K` does not affect the performance for smaller _k_-mer sizes. - - ## Differences between Cuttlefish 1 & 2 - Cuttlefish 1 is applicable only for (whole-genome or transcriptome) reference sequences. @@ -398,6 +318,8 @@ Passing neither of these invokes Cuttlefish 1. ## Citations & Acknowledgement +### [Cuttlefish (original)](https://doi.org/10.1093/bioinformatics/btab309) + ```bibtex @article{10.1093/bioinformatics/btab309, author = {Khan, Jamshed and Patro, Rob}, @@ -414,6 +336,8 @@ Passing neither of these invokes Cuttlefish 1. } ``` +### [Cuttlefish 2](https://doi.org/10.1101/2021.12.14.472718) + ```bibtex @article{Khan2021.12.14.472718, author = {Khan, Jamshed and Kokot, Marek and Deorowicz, Sebastian and Patro, Rob}, diff --git a/data/reads.fq b/data/reads.fq new file mode 100644 index 00000000..49e21316 --- /dev/null +++ b/data/reads.fq @@ -0,0 +1,12 @@ +@sample-read-1 +CTAAGAT ++ +zzzzzzz +@sample-read-2 +CGATGCA ++ +zzzzzzz +@sample-read-3 +TAAGAGG ++ +zzzzzzz diff --git a/data/ust-example.fa b/data/ust-example.fa deleted file mode 100644 index bd484921..00000000 --- a/data/ust-example.fa +++ /dev/null @@ -1,6 +0,0 @@ ->example1 -AAACGGA ->example2 -AAACTGGA ->example3 -AAACTGGT diff --git a/data/ust-example.lst b/data/ust-example.lst deleted file mode 100644 index 6b32f206..00000000 --- a/data/ust-example.lst +++ /dev/null @@ -1,3 +0,0 @@ -AAACGGA -AAACTGGA -AAACTGGT From 75b88af2fda80a91d08ac551426c650222a42197 Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 31 Jan 2022 16:26:41 -0500 Subject: [PATCH 331/350] Bump version --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7523782c..68538237 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ cmake_minimum_required(VERSION 3.14) # Name the project, its version, and languages used in it. 
set(PROJECT_NAME cuttlefish) project(${PROJECT_NAME} - VERSION 1.0.0 + VERSION 2.0.0 LANGUAGES CXX C ) From de0a1ca9e368e5008eccf5f6533e232a526a0e29 Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 31 Jan 2022 17:02:12 -0500 Subject: [PATCH 332/350] Update version fetcher --- include/version.hpp | 11 +---------- src/main.cpp | 2 +- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/include/version.hpp b/include/version.hpp index 934761e0..bc2f4d70 100644 --- a/include/version.hpp +++ b/include/version.hpp @@ -1,14 +1,5 @@ - -#include - - // https://stackoverflow.com/a/20632065/2007834 #define STRINGIFY2(X) #X #define STRINGIFY(X) STRINGIFY2(X) - - -inline std::string version() -{ - return STRINGIFY(PROJECT_VERSION); -} +#define VERSION STRINGIFY(PROJECT_VERSION) diff --git a/src/main.cpp b/src/main.cpp index 49991499..de7762a2 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -215,7 +215,7 @@ void validate(int argc, char** argv) std::string executable_version() { - return "cuttlefish " + version(); + return "cuttlefish " VERSION; } From 6a7c26cd61e0d948e221b1b73d3965f02c3dde79 Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 31 Jan 2022 17:02:45 -0500 Subject: [PATCH 333/350] Use KMC-3.2.1 --- CMakeLists.txt | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 68538237..4322a6b2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -115,14 +115,11 @@ add_dependencies(jemalloc prj_jemalloc) message("Build system will fetch and install KMC3") ExternalProject_Add(prj_kmc DOWNLOAD_DIR ${CMAKE_SOURCE_DIR}/external - # DOWNLOAD_COMMAND curl -k -L https://github.com/refresh-bio/KMC/archive/refs/heads/master.zip -o KMC-master.zip && - # unzip -qq KMC-master.zip && - # rm KMC-master.zip - DOWNLOAD_COMMAND git clone https://github.com/refresh-bio/kmc && - cd kmc - - # SOURCE_DIR ${CMAKE_SOURCE_DIR}/external/KMC-master - SOURCE_DIR ${CMAKE_SOURCE_DIR}/external/kmc + DOWNLOAD_COMMAND curl -k -L https://github.com/refresh-bio/KMC/archive/refs/tags/v3.2.1.tar.gz -o KMC-3.2.1.tar.gz && + tar -xzf KMC-3.2.1.tar.gz && + rm KMC-3.2.1.tar.gz + + SOURCE_DIR ${CMAKE_SOURCE_DIR}/external/KMC-3.2.1 BUILD_IN_SOURCE TRUE INSTALL_DIR ${CMAKE_SOURCE_DIR}/external/ CONFIGURE_COMMAND "" From cf9bafff769cd3c8ea1c93ac92481a19b664ef47 Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 31 Jan 2022 17:16:24 -0500 Subject: [PATCH 334/350] Fix outdated example --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ffd0705a..0a53c747 100644 --- a/README.md +++ b/README.md @@ -287,7 +287,7 @@ Multiple seq-files, lists of seq-files, or directories of seq-files may also be To output the compacted de Bruijn graph (in GFA 2.0) for the example FASTA files `refs1.fa` and `refs2.fa` (provided in the `data` directory), the following may be used: ```bash -cuttlefish build -r refs1.fa,refs2.fa -k 3 -s kmers -t 4 -o cdbg.gfa2 -f 2 -w temp/ +cuttlefish build -s refs1.fa,refs2.fa -k 3 -t 4 -o cdbg.gfa2 -f 2 -w temp/ ``` You may also provide lists or directories of reference files as input, as described in [Usage](#usage). 
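For instance, a newline-separated list of reference paths can be prepared and passed with `-l`. This is only a sketch: the list-file name `refs.lst` is arbitrary, and the remaining parameters simply mirror the example above.

```bash
# Collect the paths of the references into a list file.
readlink -f refs1.fa > refs.lst
readlink -f refs2.fa >> refs.lst

# Pass the list (instead of the individual files) to Cuttlefish.
cuttlefish build -l refs.lst -k 3 -t 4 -o cdbg.gfa2 -f 2 -w temp/
```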
From 787e0946f17891ab3329ab8c312f661264374ca6 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Mon, 31 Jan 2022 17:18:10 -0500 Subject: [PATCH 335/350] small tweaks to readme --- README.md | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 0a53c747..2ebece88 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/cuttlefish/README.html) -Cuttlefish is a fast, parallel, and very lightweight memory tool to construct the compacted de Bruijn graph from sequencing reads or reference sequences, and is highly scalable in terms of the size of the input data. +Cuttlefish is a fast, parallel, and very lightweight memory tool to construct the compacted de Bruijn graph from sequencing reads or reference sequences. It is highly scalable in terms of the size of the input data. ## Table of contents @@ -33,7 +33,7 @@ The papers describing the work are: [Cuttlefish (original)](https://academic.oup Cuttlefish can be installed using Bioconda (check [Installation](#installation)). If installing _from source_, the following are required: -- [GCC](https://gcc.gnu.org/) compilers for C++17 and C11 +- [GCC](https://gcc.gnu.org/) **or** [Clang](https://clang.llvm.org) compilers for C++17 and C11 - [CMake](https://cmake.org/) (version >= 3.14) - [zlib](https://zlib.net/) - [bzip2](https://www.sourceware.org/bzip2/) @@ -149,11 +149,12 @@ The common arguments (for Cuttlefish 1 and 2) are set as following. In case of using sequencing reads as input, the files should be in the FASTQ format. For reference sequences, those should be in the FASTA format. - The input files can also be possibly gzipped. + The input files can also be gzipped. - The _k_-mer length `k` must be odd and within `127` (and `63` if installed from source; see [Larger _k_-mer sizes](#larger-k-mer-sizes) to increase the _k_-mer size capacity beyond these). The default value is `27`. - The number of threads `t` is set to a quarter of the number of concurrent threads supported, by default. -The use of high-enough values is recommended. +The use of high-enough values is recommended. See the Cuttlefish and Cuttlefish 2 papers for experiments detailing the scaling +behavior of this tool with increasing thread counts. - Cuttlefish generates two output files: - A FASTA / GFA1 / GFA2 file containing the maximal unitigs of the de Bruijn graph (with the extension `.fa` / `.gfa1` / `.gfa2`). The GFA output formats are exclusive for Cuttlefish 1. @@ -162,7 +163,7 @@ The use of high-enough values is recommended. The current directory is set as the default working directory. - A soft maximum memory-limit `m` (in GB) can be provided to trade-off the RAM usage for faster execution time; this will only be adhered to if the provided limit is at least the minimum required memory for Cuttlefish, determined internally. -- No memory-usage restriction can be imposed using `unrestrict-memory`, trading off RAM usage for faster execution time. +- Memory-usage restrictions can be lifted by using `unrestrict-memory`, trading off extra RAM usage for faster execution time. Cuttlefish 1 specific arguments are set as following. 
@@ -235,8 +236,8 @@ The currently supported output formats are — Whether a pair (ui, ui+1) is an edge or a gap can be inferred by checking the suffix and the prefix (of length `k - 1`) of the unitigs ui and ui+1, respectively (in their correct orientations, based on their following `+`/`-` signs). Note that, a gap is possible in a sequence-tiling only if the sequence contains characters outside of `A`, `C`, `G`, and `T`. - For moderate to large sized genomes, this output format is preferrable to the GFA ones—the GFA formats can be quite verbose for this particular scenario, while the reduced representation provides effitively the same information, while taking much lesser space. - For example, for the 7-human genomes (experimented with in the manuscripts) and using `k = 31`, the compacted graph takes 112 GB in GFA2, while 29.3 GB in this reduced format. + For moderate to large sized genomes, this output format is preferrable to the GFA ones as the GFA formats can be quite verbose for this particular scenario, while the reduced representation provides effitively the same information, while taking much less space. + For example, for the 7-human genome dataset (experimented with in the manuscripts) and using `k = 31`, the compacted graph takes 112 GB in GFA2, but only 29.3 GB in this reduced format. ### Orientation of the output @@ -287,7 +288,7 @@ Multiple seq-files, lists of seq-files, or directories of seq-files may also be To output the compacted de Bruijn graph (in GFA 2.0) for the example FASTA files `refs1.fa` and `refs2.fa` (provided in the `data` directory), the following may be used: ```bash -cuttlefish build -s refs1.fa,refs2.fa -k 3 -t 4 -o cdbg.gfa2 -f 2 -w temp/ +cuttlefish build -r refs1.fa,refs2.fa -k 3 -s kmers -t 4 -o cdbg.gfa2 -f 2 -w temp/ ``` You may also provide lists or directories of reference files as input, as described in [Usage](#usage). @@ -306,11 +307,11 @@ Cuttlefish supports only the odd `k` values within `MAX_K` due to theoretical re Currently, `MAX_K` is supported upto 255. Please contact the authors if support for a larger `MAX_K` is required. -Note that, Cuttlefish uses only as many bytes as required (rounded up to multiples of 8) for a _k_-mer as necessary—thus increasing the maximum _k_-mer size capacity through setting large values for `MAX_K` does not affect the performance for smaller _k_-mer sizes. +Note that, Cuttlefish uses only as many bytes as required (rounded up to multiples of 8) for a _k_-mer. Thus, increasing the maximum _k_-mer size capacity through setting large values for `MAX_K` does not affect the performance for smaller _k_-mer sizes. ## Differences between Cuttlefish 1 & 2 -- Cuttlefish 1 is applicable only for (whole-genome or transcriptome) reference sequences. +- Cuttlefish 1 is applicable only for assembled reference sequences. Whereas Cuttlefish 2 is applicable for both sequencing reads and reference sequences. - For reference sequences, Cuttlefish 1 supports outputting the compacted graph in the GFA formats, whereas Cuttlefish 2 does not support this _yet_. - Cuttlefish 2 can be used by passing either one of the following arguments to the `cuttlefish build` command: `--read` or `--ref`. @@ -318,6 +319,8 @@ Passing neither of these invokes Cuttlefish 1. 
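As a quick illustration of how the mode is selected, the following sketch mirrors the examples from the "Example usage" section; the file names and parameter values are only placeholders.

```bash
# Cuttlefish 2 on sequencing reads (FASTQ input):
cuttlefish build -s reads.fq -k 3 -t 4 -o cdbg -w temp/ --read -c 1

# Cuttlefish 2 on reference sequences (FASTA input):
cuttlefish build -s refs1.fa -k 3 -t 4 -o cdbg -w temp/ --ref

# Neither flag: Cuttlefish 1, which also supports GFA output (e.g. GFA 1.0 with -f 1):
cuttlefish build -s refs1.fa -k 3 -t 4 -o cdbg -w temp/ -f 1
```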
## Citations & Acknowledgement +If you use Cuttlefish or Cuttlefish 2 in your work, please include the following citations, as appropriate: + ### [Cuttlefish (original)](https://doi.org/10.1093/bioinformatics/btab309) ```bibtex @@ -365,4 +368,4 @@ This work is supported by _NIH R01 HG009937_, and by _NSF CCF-1750472_, and _CNS - The [kseq](http://lh3lh3.users.sourceforge.net/kseq.shtml) library is MIT licensed. - The [spdlog](https://github.com/gabime/spdlog) library is MIT licensed. - The [xxHash](https://github.com/Cyan4973/xxHash) library is BSD licensed. -- Cuttlefish is Revised BSD licensed. +- Cuttlefish itself is licensed under a Revised BSD license. From 76072497488abeffaeef8f72f985fd420a50a1da Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 31 Jan 2022 18:36:14 -0500 Subject: [PATCH 336/350] Misc. tweak readme --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 2ebece88..68e8c170 100644 --- a/README.md +++ b/README.md @@ -153,8 +153,7 @@ The common arguments (for Cuttlefish 1 and 2) are set as following. - The _k_-mer length `k` must be odd and within `127` (and `63` if installed from source; see [Larger _k_-mer sizes](#larger-k-mer-sizes) to increase the _k_-mer size capacity beyond these). The default value is `27`. - The number of threads `t` is set to a quarter of the number of concurrent threads supported, by default. -The use of high-enough values is recommended. See the Cuttlefish and Cuttlefish 2 papers for experiments detailing the scaling -behavior of this tool with increasing thread counts. +The use of high-enough values is recommended. - Cuttlefish generates two output files: - A FASTA / GFA1 / GFA2 file containing the maximal unitigs of the de Bruijn graph (with the extension `.fa` / `.gfa1` / `.gfa2`). The GFA output formats are exclusive for Cuttlefish 1. @@ -288,7 +287,7 @@ Multiple seq-files, lists of seq-files, or directories of seq-files may also be To output the compacted de Bruijn graph (in GFA 2.0) for the example FASTA files `refs1.fa` and `refs2.fa` (provided in the `data` directory), the following may be used: ```bash -cuttlefish build -r refs1.fa,refs2.fa -k 3 -s kmers -t 4 -o cdbg.gfa2 -f 2 -w temp/ +cuttlefish build -s refs1.fa,refs2.fa -k 3 -t 4 -o cdbg.gfa2 -f 2 -w temp/ ``` You may also provide lists or directories of reference files as input, as described in [Usage](#usage). @@ -368,4 +367,4 @@ This work is supported by _NIH R01 HG009937_, and by _NSF CCF-1750472_, and _CNS - The [kseq](http://lh3lh3.users.sourceforge.net/kseq.shtml) library is MIT licensed. - The [spdlog](https://github.com/gabime/spdlog) library is MIT licensed. - The [xxHash](https://github.com/Cyan4973/xxHash) library is BSD licensed. -- Cuttlefish itself is licensed under a Revised BSD license. +- Cuttlefish itself is Revised BSD licensed. From 0fdb6755673429cd77dd652a9c6a657ed74b9763 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Mon, 31 Jan 2022 23:07:34 -0500 Subject: [PATCH 337/350] alterations to work with clang on OSX --- CMakeLists.txt | 13 +++- include/DNA_Utility.hpp | 1 + patches/kmc_patch.diff | 142 ++++++++++++++++++++++++++++++++++++++++ src/utility.cpp | 2 +- 4 files changed, 156 insertions(+), 2 deletions(-) create mode 100644 patches/kmc_patch.diff diff --git a/CMakeLists.txt b/CMakeLists.txt index 4322a6b2..a7a3c25a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,5 @@ + # Specify the minimum version of CMake to use. 
# CMake can have different behaviors (policies) based on its version used. cmake_minimum_required(VERSION 3.14) @@ -55,11 +56,21 @@ if(CF_DEVELOP_MODE) add_compile_definitions(CF_DEVELOP_MODE) endif() +if(CMAKE_SYSTEM_NAME STREQUAL "Linux") execute_process( COMMAND getconf LEVEL1_DCACHE_LINESIZE COMMAND tr -d '\n' OUTPUT_VARIABLE L1_CACHE_LINE_SIZE ) +elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") +execute_process( + COMMAND sysctl machdep.cpu.cache.linesize + COMMAND awk "{print $2}" + COMMAND tr -d '\n' + OUTPUT_VARIABLE L1_CACHE_LINE_SIZE +) +endif() + add_compile_definitions(L1_CACHE_LINE_SIZE=${L1_CACHE_LINE_SIZE}) @@ -118,7 +129,7 @@ ExternalProject_Add(prj_kmc DOWNLOAD_COMMAND curl -k -L https://github.com/refresh-bio/KMC/archive/refs/tags/v3.2.1.tar.gz -o KMC-3.2.1.tar.gz && tar -xzf KMC-3.2.1.tar.gz && rm KMC-3.2.1.tar.gz - + PATCH_COMMAND patch --strip 1 < ${CMAKE_SOURCE_DIR}/patches/kmc_patch.diff SOURCE_DIR ${CMAKE_SOURCE_DIR}/external/KMC-3.2.1 BUILD_IN_SOURCE TRUE INSTALL_DIR ${CMAKE_SOURCE_DIR}/external/ diff --git a/include/DNA_Utility.hpp b/include/DNA_Utility.hpp index 9f96e3b1..1adfd1dd 100644 --- a/include/DNA_Utility.hpp +++ b/include/DNA_Utility.hpp @@ -5,6 +5,7 @@ #include "DNA.hpp" +#include class DNA_Utility diff --git a/patches/kmc_patch.diff b/patches/kmc_patch.diff new file mode 100644 index 00000000..56a44630 --- /dev/null +++ b/patches/kmc_patch.diff @@ -0,0 +1,142 @@ +diff --git a/Makefile b/Makefile +index bc07d35..4bfd785 100644 +--- a/Makefile ++++ b/Makefile +@@ -13,10 +13,10 @@ OUT_BIN_DIR = bin + OUT_INCLUDE_DIR = include + + ifeq ($(UNAME_S),Darwin) +- CC = /usr/local/bin/g++-10 ++ CC = clang++ + +- CFLAGS = -Wall -O3 -m64 -static-libgcc -static-libstdc++ -pthread -std=c++14 +- CLINK = -lm -static-libgcc -static-libstdc++ -O3 -pthread -std=c++14 ++ CFLAGS = -Wall -O3 -m64 -pthread -std=c++14 ++ CLINK = -lm -O3 -pthread -std=c++14 + + PY_KMC_API_CFLAGS = -Wl,-undefined,dynamic_lookup -fPIC -Wall -shared -std=c++14 -O3 + else +@@ -61,7 +61,7 @@ ifeq ($(UNAME_S),Darwin) + $(KMC_TOOLS_DIR)/libs/libz.1.2.5.dylib \ + $(KMC_TOOLS_DIR)/libs/libbz2.1.0.5.dylib + +- LIB_KMC_CORE = $(OUT_BIN_DIR)/libkmc_core.mac.a ++ LIB_KMC_CORE = $(OUT_BIN_DIR)/libkmc_core.a + else + RADULS_OBJS = \ + $(KMC_MAIN_DIR)/raduls_sse2.o \ +diff --git a/kmc_CLI/kmc.cpp b/kmc_CLI/kmc.cpp +index 15e579e..1d62040 100644 +--- a/kmc_CLI/kmc.cpp ++++ b/kmc_CLI/kmc.cpp +@@ -1,5 +1,7 @@ + #define _CRT_SECURE_NO_WARNINGS + #include "../kmc_core/kmc_runner.h" ++#include ++#include + #include + #include + #include +@@ -246,6 +248,9 @@ bool parse_parameters(int argc, char* argv[], Params& params) + stage2Params.SetOutputFileName(argv[i++]); + stage1Params.SetTmpPath(argv[i++]); + ++ std::random_device rd; ++ std::mt19937 g(rd()); ++ + std::vector input_file_names; + if (input_file_name[0] != '@') + input_file_names.push_back(input_file_name); +@@ -264,7 +269,7 @@ bool parse_parameters(int argc, char* argv[], Params& params) + input_file_names.push_back(s); + + in.close(); +- std::random_shuffle(input_file_names.begin(), input_file_names.end()); ++ std::shuffle(input_file_names.begin(), input_file_names.end(), g); + } + stage1Params.SetInputFiles(input_file_names); + +diff --git a/kmc_api/kmer_defs.h b/kmc_api/kmer_defs.h +index 90b0db4..4fe8aaf 100644 +--- a/kmc_api/kmer_defs.h ++++ b/kmc_api/kmer_defs.h +@@ -34,7 +34,7 @@ + + + #include +- #include ++ #include + #include + + #else +diff --git a/kmc_core/defs.h b/kmc_core/defs.h +index 75afc97..e0ec1df 100644 +--- a/kmc_core/defs.h ++++ 
b/kmc_core/defs.h +@@ -8,6 +8,7 @@ + Date : 2022-01-04 + */ + ++ + #ifndef _DEFS_H + #define _DEFS_H + +@@ -32,6 +33,7 @@ + + #define COMPACT_CUMSUM_PART_SIZE (1<<10) + ++ + #define KMER_X 3 + + #define STATS_FASTQ_SIZE (1 << 28) +@@ -66,6 +68,7 @@ + #define MAX_SR 128 + + //Range of number of sorter threads pre sorter in strict memory mode ++ + #define MIN_SMSO 1 + #define MAX_SMSO 16 + +@@ -109,8 +112,8 @@ using uint64 = uint64_t; + #include + + #include +-#include +-using __gnu_cxx::copy_n; ++#include ++//using __gnu_cxx::copy_n; + + #endif + +diff --git a/kmc_core/queues.h b/kmc_core/queues.h +index 17a0465..2c0e587 100644 +--- a/kmc_core/queues.h ++++ b/kmc_core/queues.h +@@ -11,6 +11,8 @@ + #ifndef _QUEUES_H + #define _QUEUES_H + ++#include ++#include + #include "defs.h" + #include + #include +@@ -571,6 +573,9 @@ public: + void init_random() + { + lock_guard lck(mtx); ++ std::random_device rd; ++ std::mt19937 g(rd()); ++ + vector> bin_sizes; + + for (auto& p : m) +@@ -589,7 +594,7 @@ public: + for (uint32 i = no_sort_end; i < bin_sizes.size(); ++i) + random_bins.push_back(bin_sizes[i].first); + +- random_shuffle(random_bins.begin(), random_bins.end()); ++ shuffle(random_bins.begin(), random_bins.end(), g); + + for (uint32 i = no_sort_start; i < no_sort_end; ++i) + random_bins.push_back(bin_sizes[i].first); diff --git a/src/utility.cpp b/src/utility.cpp index 55348f48..122ffd7c 100644 --- a/src/utility.cpp +++ b/src/utility.cpp @@ -138,7 +138,7 @@ std::size_t process_peak_memory() { constexpr const char* process_file = "/proc/self/status"; constexpr const char* peak_mem_field = "VmHWM:"; - constexpr std::size_t field_len = std::strlen(peak_mem_field); + const std::size_t field_len = std::strlen(peak_mem_field); std::FILE* fp = std::fopen(process_file, "r"); if(fp == NULL) From c904e584a7277b29a88e64adf8f14247c1f402be Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Mon, 31 Jan 2022 23:22:54 -0500 Subject: [PATCH 338/350] clean up --- CMakeLists.txt | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a7a3c25a..7a02a9d8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,19 +56,29 @@ if(CF_DEVELOP_MODE) add_compile_definitions(CF_DEVELOP_MODE) endif() + +# Here, we have some platform-specific considerations +# of which we must take care if(CMAKE_SYSTEM_NAME STREQUAL "Linux") -execute_process( - COMMAND getconf LEVEL1_DCACHE_LINESIZE - COMMAND tr -d '\n' - OUTPUT_VARIABLE L1_CACHE_LINE_SIZE -) + execute_process( + COMMAND getconf LEVEL1_DCACHE_LINESIZE + COMMAND tr -d '\n' + OUTPUT_VARIABLE L1_CACHE_LINE_SIZE + ) elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") -execute_process( - COMMAND sysctl machdep.cpu.cache.linesize - COMMAND awk "{print $2}" - COMMAND tr -d '\n' - OUTPUT_VARIABLE L1_CACHE_LINE_SIZE -) + # OSX has getconf, but doesn't report LEVEL1_DCACHE_LINESIZE + # so we instead use sysctl and the corresponding variable + execute_process( + COMMAND sysctl machdep.cpu.cache.linesize + COMMAND awk "{print $2}" + COMMAND tr -d '\n' + OUTPUT_VARIABLE L1_CACHE_LINE_SIZE + ) + + # later on, jemalloc will complain if the C compiler + # hasn't been properly set + set(CMAKE_C_COMPILER clang) + set(CMAKE_CXX_COMPILER clang++) endif() add_compile_definitions(L1_CACHE_LINE_SIZE=${L1_CACHE_LINE_SIZE}) From ef33f11176d67bd0d975109d80b61927c58bbd4c Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 31 Jan 2022 23:38:26 -0500 Subject: [PATCH 339/350] Comment out test-code --- src/test.cpp | 9 +++++---- 1 file 
changed, 5 insertions(+), 4 deletions(-) diff --git a/src/test.cpp b/src/test.cpp index 562d653f..23a28c52 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -26,6 +26,7 @@ #include +/* // STEP 1: declare the type of file handler and the read() function KSEQ_INIT(int, read) @@ -268,7 +269,6 @@ void test_kmer_iterator(const char* file_name) } -/* void check_uint64_BBHash(const char* file_name, uint16_t thread_count) { typedef boomphf::SingleHashFunctor hasher_t; @@ -287,7 +287,6 @@ void check_uint64_BBHash(const char* file_name, uint16_t thread_count) boophf_t * bphf = new boomphf::mphf(input_keys.size(), input_keys, ".", thread_count); delete bphf; } -*/ void test_async_writer(const char* log_file_name) @@ -597,6 +596,8 @@ void write_kmers(const std::string& kmc_db_path, const uint16_t thread_count, co output.close(); } +*/ + int main(int argc, char** argv) { @@ -628,8 +629,8 @@ int main(int argc, char** argv) // count_kmers_in_unitigs(argv[1], atoi(argv[2])); - static constexpr uint16_t k = 28; - static const size_t consumer_count = std::atoi(argv[2]); + // static constexpr uint16_t k = 28; + // static const size_t consumer_count = std::atoi(argv[2]); // test_buffered_iterator_performance(argv[1]); // test_SPMC_iterator_performance(argv[1], consumer_count); From 4dae13da51fa5c9e8d7121c25a53e14b0e4efd9e Mon Sep 17 00:00:00 2001 From: jamshed Date: Mon, 31 Jan 2022 23:44:42 -0500 Subject: [PATCH 340/350] Misc. tweaks --- CMakeLists.txt | 40 ++++++++++++++++++++-------------------- include/Spin_Lock.hpp | 2 +- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7a02a9d8..0a539cc6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,27 +58,27 @@ endif() # Here, we have some platform-specific considerations -# of which we must take care +# of which we must take care. if(CMAKE_SYSTEM_NAME STREQUAL "Linux") - execute_process( - COMMAND getconf LEVEL1_DCACHE_LINESIZE - COMMAND tr -d '\n' - OUTPUT_VARIABLE L1_CACHE_LINE_SIZE - ) + execute_process( + COMMAND getconf LEVEL1_DCACHE_LINESIZE + COMMAND tr -d '\n' + OUTPUT_VARIABLE L1_CACHE_LINE_SIZE + ) elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") - # OSX has getconf, but doesn't report LEVEL1_DCACHE_LINESIZE - # so we instead use sysctl and the corresponding variable - execute_process( - COMMAND sysctl machdep.cpu.cache.linesize - COMMAND awk "{print $2}" - COMMAND tr -d '\n' - OUTPUT_VARIABLE L1_CACHE_LINE_SIZE - ) - - # later on, jemalloc will complain if the C compiler - # hasn't been properly set - set(CMAKE_C_COMPILER clang) - set(CMAKE_CXX_COMPILER clang++) + # OSX has `getconf`, but doesn't report `LEVEL1_DCACHE_LINESIZE`; + # so we instead use `sysctl` for the corresponding variable. 
+ execute_process( + COMMAND sysctl machdep.cpu.cache.linesize + COMMAND awk "{print $2}" + COMMAND tr -d '\n' + OUTPUT_VARIABLE L1_CACHE_LINE_SIZE + ) + + # Later on, jemalloc will complain if the C compiler + # hasn't been properly set + set(CMAKE_C_COMPILER clang) + set(CMAKE_CXX_COMPILER clang++) endif() add_compile_definitions(L1_CACHE_LINE_SIZE=${L1_CACHE_LINE_SIZE}) @@ -139,7 +139,7 @@ ExternalProject_Add(prj_kmc DOWNLOAD_COMMAND curl -k -L https://github.com/refresh-bio/KMC/archive/refs/tags/v3.2.1.tar.gz -o KMC-3.2.1.tar.gz && tar -xzf KMC-3.2.1.tar.gz && rm KMC-3.2.1.tar.gz - PATCH_COMMAND patch --strip 1 < ${CMAKE_SOURCE_DIR}/patches/kmc_patch.diff + PATCH_COMMAND patch --strip 1 < ${CMAKE_SOURCE_DIR}/patches/kmc_patch.diff SOURCE_DIR ${CMAKE_SOURCE_DIR}/external/KMC-3.2.1 BUILD_IN_SOURCE TRUE INSTALL_DIR ${CMAKE_SOURCE_DIR}/external/ diff --git a/include/Spin_Lock.hpp b/include/Spin_Lock.hpp index 7836b543..d4140067 100644 --- a/include/Spin_Lock.hpp +++ b/include/Spin_Lock.hpp @@ -14,7 +14,7 @@ class Spin_Lock { private: - std::atomic_flag lock_{ATOMIC_FLAG_INIT}; + std::atomic_flag lock_ = ATOMIC_FLAG_INIT; public: From fde178beb466a64045d5a27ed310f61796fbf8f0 Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 1 Feb 2022 14:23:59 -0500 Subject: [PATCH 341/350] Fix GFA file extensions --- include/Build_Params.hpp | 42 ++++++++++++++++++++++++++++++++- include/CdBG.hpp | 3 --- include/File_Extensions.hpp | 3 ++- src/CdBG_GFA_Reduced_Writer.cpp | 7 +----- src/CdBG_Writer.cpp | 14 ++++------- 5 files changed, 48 insertions(+), 21 deletions(-) diff --git a/include/Build_Params.hpp b/include/Build_Params.hpp index 00c072a8..499c34df 100644 --- a/include/Build_Params.hpp +++ b/include/Build_Params.hpp @@ -43,6 +43,32 @@ class Build_Params #endif + // Returns the extension of the output file, depending on the output format requested. + const std::string output_file_ext() const + { + if(is_read_graph() || is_ref_graph()) + return cuttlefish::file_ext::unipaths_ext; + + switch(output_format()) + { + case cuttlefish::Output_Format::fa: + return cuttlefish::file_ext::unipaths_ext; + + case cuttlefish::Output_Format::gfa1: + return cuttlefish::file_ext::gfa1_ext; + + case cuttlefish::Output_Format::gfa2: + return cuttlefish::file_ext::gfa2_ext; + + default: + break; + } + + + return ""; + } + + public: // Constructs a parameters wrapper object with the self-explanatory parameters. @@ -151,7 +177,7 @@ class Build_Params // Returns the path to the output file. const std::string output_file_path() const { - return output_file_path_ + cuttlefish::file_ext::unipaths_ext; + return output_file_path_ + output_file_ext(); } @@ -162,6 +188,20 @@ class Build_Params } + // Returns the path to the output segment-file for the GFA-reduced format. + const std::string segment_file_path() const + { + return output_file_path_ + cuttlefish::file_ext::seg_ext; + } + + + // Returns the path to the output sequence-file for the GFA-reduced format. + const std::string sequence_file_path() const + { + return output_file_path_ + cuttlefish::file_ext::seq_ext; + } + + // Returns the working directory (for temporary files). const std::string& working_dir_path() const { diff --git a/include/CdBG.hpp b/include/CdBG.hpp index b6d3550c..c1fccff5 100644 --- a/include/CdBG.hpp +++ b/include/CdBG.hpp @@ -98,9 +98,6 @@ class CdBG std::string overlap_file_prefix = "cuttlefish-overlap-output-"; static constexpr size_t TEMP_FILE_PREFIX_LEN = 10; - // File extensions for the GFA-reduced output format files. 
- const static std::string SEG_FILE_EXT, SEQ_FILE_EXT; - // Debug std::vector seg_write_time; std::vector link_write_time; diff --git a/include/File_Extensions.hpp b/include/File_Extensions.hpp index 9d164ac7..f63c59ea 100644 --- a/include/File_Extensions.hpp +++ b/include/File_Extensions.hpp @@ -19,7 +19,8 @@ namespace cuttlefish // For reference dBGs only: - // TODO: use these to replace the corresponding constants from `CdBG_Writer`. + constexpr char gfa1_ext[] = ".gfa1"; + constexpr char gfa2_ext[] = ".gfa2"; constexpr char seg_ext[] = ".cf_seg"; constexpr char seq_ext[] = ".cf_seq"; } diff --git a/src/CdBG_GFA_Reduced_Writer.cpp b/src/CdBG_GFA_Reduced_Writer.cpp index 629c5004..7800abd0 100644 --- a/src/CdBG_GFA_Reduced_Writer.cpp +++ b/src/CdBG_GFA_Reduced_Writer.cpp @@ -2,11 +2,6 @@ #include "CdBG.hpp" -// Define the static fields required for the GFA-reduced output. -template const std::string CdBG::SEG_FILE_EXT = ".cf_seg"; -template const std::string CdBG::SEQ_FILE_EXT = ".cf_seq"; - - template void CdBG::write_segment(const uint16_t thread_id, const char* const seq, const uint64_t segment_name, const size_t start_kmer_idx, const size_t end_kmer_idx, const cuttlefish::dir_t dir) { @@ -54,7 +49,7 @@ void CdBG::write_sequence_tiling(Job_Queue& job const uint16_t thread_count = params.thread_count(); - const std::string& seq_file_path = params.output_file_path() + SEQ_FILE_EXT; + const std::string& seq_file_path = params.sequence_file_path(); // Open the output file in append mode. std::ofstream output(seq_file_path.c_str(), std::ios_base::app); diff --git a/src/CdBG_Writer.cpp b/src/CdBG_Writer.cpp index 4d9afbca..d3cc161c 100644 --- a/src/CdBG_Writer.cpp +++ b/src/CdBG_Writer.cpp @@ -11,11 +11,6 @@ #include -// Define the static fields required for the GFA-reduced output. -template const std::string CdBG::SEG_FILE_EXT = ".cf_seg"; -template const std::string CdBG::SEQ_FILE_EXT = ".cf_seq"; - - template void CdBG::output_maximal_unitigs() { @@ -480,14 +475,13 @@ template void CdBG::clear_output_file() const { const cuttlefish::Output_Format op_format = params.output_format(); - const std::string& output_file_path = params.output_file_path(); if(op_format == cuttlefish::fa || op_format == cuttlefish::gfa1 || op_format == cuttlefish::gfa2) - clear_file(output_file_path); + clear_file(params.output_file_path()); else if(op_format == cuttlefish::gfa_reduced) { - const std::string seg_file_path(output_file_path + SEG_FILE_EXT); - const std::string seq_file_path(output_file_path + SEQ_FILE_EXT); + const std::string seg_file_path(params.segment_file_path()); + const std::string seq_file_path(params.sequence_file_path()); clear_file(seg_file_path); clear_file(seq_file_path); @@ -500,7 +494,7 @@ void CdBG::init_output_loggers() { const cuttlefish::Output_Format gfa_v = params.output_format(); const std::string& output_file_path = (gfa_v == cuttlefish::Output_Format::gfa_reduced ? 
- params.output_file_path() + SEG_FILE_EXT : params.output_file_path()); + params.segment_file_path() : params.output_file_path()); const uint16_t thread_count = params.thread_count(); From 9dfcc2c28eae75ba6e9bc5027a19f4b499cb10a1 Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 1 Feb 2022 17:00:52 -0500 Subject: [PATCH 342/350] Pass compiler vars to KMC; avoid explicit setting for jemalloc --- CMakeLists.txt | 6 +++--- patches/kmc_patch.diff | 12 ++++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0a539cc6..734a43aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,8 +77,8 @@ elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") # Later on, jemalloc will complain if the C compiler # hasn't been properly set - set(CMAKE_C_COMPILER clang) - set(CMAKE_CXX_COMPILER clang++) + # set(CMAKE_C_COMPILER clang) + # set(CMAKE_CXX_COMPILER clang++) endif() add_compile_definitions(L1_CACHE_LINE_SIZE=${L1_CACHE_LINE_SIZE}) @@ -144,7 +144,7 @@ ExternalProject_Add(prj_kmc BUILD_IN_SOURCE TRUE INSTALL_DIR ${CMAKE_SOURCE_DIR}/external/ CONFIGURE_COMMAND "" - BUILD_COMMAND make -j + BUILD_COMMAND make -j CC=${CMAKE_CXX_COMPILER} INSTALL_COMMAND cp bin/libkmc_core.a ${EXT_LIB} && cp include/kmc_runner.h ${EXT_INCLUDE} ) diff --git a/patches/kmc_patch.diff b/patches/kmc_patch.diff index 56a44630..b47e0205 100644 --- a/patches/kmc_patch.diff +++ b/patches/kmc_patch.diff @@ -2,13 +2,12 @@ diff --git a/Makefile b/Makefile index bc07d35..4bfd785 100644 --- a/Makefile +++ b/Makefile -@@ -13,10 +13,10 @@ OUT_BIN_DIR = bin +@@ -13,15 +13,11 @@ OUT_INCLUDE_DIR = include ifeq ($(UNAME_S),Darwin) - CC = /usr/local/bin/g++-10 -+ CC = clang++ - +- - CFLAGS = -Wall -O3 -m64 -static-libgcc -static-libstdc++ -pthread -std=c++14 - CLINK = -lm -static-libgcc -static-libstdc++ -O3 -pthread -std=c++14 + CFLAGS = -Wall -O3 -m64 -pthread -std=c++14 @@ -16,7 +15,12 @@ index bc07d35..4bfd785 100644 PY_KMC_API_CFLAGS = -Wl,-undefined,dynamic_lookup -fPIC -Wall -shared -std=c++14 -O3 else -@@ -61,7 +61,7 @@ ifeq ($(UNAME_S),Darwin) +- CC = g++ +- + CFLAGS = -Wall -O3 -m64 -static -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -std=c++14 + CLINK = -lm -static -O3 -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -std=c++14 + +@@ -61,7 +57,7 @@ $(KMC_TOOLS_DIR)/libs/libz.1.2.5.dylib \ $(KMC_TOOLS_DIR)/libs/libbz2.1.0.5.dylib From 731c691654176b901740666f12dc252908341233 Mon Sep 17 00:00:00 2001 From: jamshed Date: Tue, 1 Feb 2022 17:35:06 -0500 Subject: [PATCH 343/350] Make external sub-dirs explicitly --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 734a43aa..495558bf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,9 +106,12 @@ endif() # Module required to download and install external projects. include(ExternalProject) + set(EXT_LIB ${CMAKE_SOURCE_DIR}/external/lib/) set(EXT_INCLUDE ${CMAKE_SOURCE_DIR}/external/include/) +file(MAKE_DIRECTORY ${EXT_LIB} ${EXT_INCLUDE}) + # Prepare the `jemalloc` library. It provides scalable concurrency support and better avoidance # of fragmentation. 
From 1780f88d9dbd46f8fb1f007e34fae77367a74489 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 2 Feb 2022 11:43:15 -0500 Subject: [PATCH 344/350] Fix jemalloc installation --- CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 495558bf..14b9b77c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,8 +126,7 @@ ExternalProject_Add(prj_jemalloc BUILD_IN_SOURCE TRUE INSTALL_DIR ${CMAKE_SOURCE_DIR}/external CONFIGURE_COMMAND sh -c "CC=${CMAKE_C_COMPILER} ./autogen.sh --disable-debug --enable-static --prefix= --silent" - INSTALL_COMMAND cp -r lib ${EXT_LIB} && - cp -r include ${EXT_INCLUDE} + INSTALL_COMMAND cp lib/libjemalloc.a ${EXT_LIB} ) add_library(jemalloc STATIC IMPORTED) From 1ec63696900a50b78f8fcc5fb5c1c23cf402704a Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 2 Feb 2022 12:13:02 -0500 Subject: [PATCH 345/350] Use existing jemalloc if found --- CMakeLists.txt | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 14b9b77c..6344bee9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -115,23 +115,39 @@ file(MAKE_DIRECTORY ${EXT_LIB} ${EXT_INCLUDE}) # Prepare the `jemalloc` library. It provides scalable concurrency support and better avoidance # of fragmentation. -message("Build system will fetch and install jemalloc") -ExternalProject_Add(prj_jemalloc - DOWNLOAD_DIR ${CMAKE_SOURCE_DIR}/external - DOWNLOAD_COMMAND curl -k -L https://github.com/jemalloc/jemalloc/archive/5.2.1.tar.gz -o jemalloc-5.2.1.tar.gz && - tar -xzf jemalloc-5.2.1.tar.gz && - rm jemalloc-5.2.1.tar.gz +set(MALLOC_LIB "") +set(JEMALLOC_MIN_VERSION "5.2.1") +find_package(Jemalloc) +if(JEMALLOC_FOUND) + if(${JEMALLOC_VERSION} VERSION_GREATER_EQUAL ${JEMALLOC_MIN_VERSION}) + message("Found the jemalloc library (v${JEMALLOC_VERSION}) in the system.") + set(MALLOC_LIB ${JEMALLOC_LIBRARIES}) + endif() +endif() - SOURCE_DIR ${CMAKE_SOURCE_DIR}/external/jemalloc-5.2.1 - BUILD_IN_SOURCE TRUE - INSTALL_DIR ${CMAKE_SOURCE_DIR}/external - CONFIGURE_COMMAND sh -c "CC=${CMAKE_C_COMPILER} ./autogen.sh --disable-debug --enable-static --prefix= --silent" - INSTALL_COMMAND cp lib/libjemalloc.a ${EXT_LIB} -) +if(MALLOC_LIB STREQUAL "") + message("Build system will fetch and install jemalloc") + ExternalProject_Add(prj_jemalloc + DOWNLOAD_DIR ${CMAKE_SOURCE_DIR}/external + DOWNLOAD_COMMAND curl -k -L https://github.com/jemalloc/jemalloc/archive/5.2.1.tar.gz -o jemalloc-5.2.1.tar.gz && + tar -xzf jemalloc-5.2.1.tar.gz && + rm jemalloc-5.2.1.tar.gz + + SOURCE_DIR ${CMAKE_SOURCE_DIR}/external/jemalloc-5.2.1 + BUILD_IN_SOURCE TRUE + INSTALL_DIR ${CMAKE_SOURCE_DIR}/external + CONFIGURE_COMMAND sh -c "CC=${CMAKE_C_COMPILER} ./autogen.sh --disable-debug --enable-static --prefix= --silent" + INSTALL_COMMAND cp lib/libjemalloc.a ${EXT_LIB} + ) + + set(MALLOC_LIB ${EXT_LIB}/libjemalloc.a) +endif() add_library(jemalloc STATIC IMPORTED) -set_target_properties(jemalloc PROPERTIES IMPORTED_LOCATION ${EXT_LIB}/libjemalloc.a) -add_dependencies(jemalloc prj_jemalloc) +set_target_properties(jemalloc PROPERTIES IMPORTED_LOCATION ${MALLOC_LIB}) +if(NOT JEMALLOC_FOUND) + add_dependencies(jemalloc prj_jemalloc) +endif() # Prepare the `kmc` library — required by the Cuttlefish algorithm implementation. 
From dbe10edcf24c7dad4188f5be056816a1adf4ff11 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 2 Feb 2022 14:25:20 -0500 Subject: [PATCH 346/350] Add FindJemalloc module --- CMakeLists.txt | 4 +++ cmake/Modules/FindJemalloc.cmake | 53 ++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 cmake/Modules/FindJemalloc.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 6344bee9..d6ca9e0b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -104,6 +104,10 @@ if(NOT BZIP2_FOUND) endif() +# Set path for modules required to search for existing packages in the system. +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") + + # Module required to download and install external projects. include(ExternalProject) diff --git a/cmake/Modules/FindJemalloc.cmake b/cmake/Modules/FindJemalloc.cmake new file mode 100644 index 00000000..9141467f --- /dev/null +++ b/cmake/Modules/FindJemalloc.cmake @@ -0,0 +1,53 @@ +# From: https://raw.githubusercontent.com/STEllAR-GROUP/hpx/master/cmake/FindJemalloc.cmake +# Copyright (c) 2014 Thomas Heller +# Copyright (c) 2007-2012 Hartmut Kaiser +# Copyright (c) 2010-2011 Matt Anderson +# Copyright (c) 2011 Bryce Lelbach +# +# Distributed under the Boost Software License, Version 1.0. (See accompanying +# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +find_package(PkgConfig) +pkg_check_modules(PC_JEMALLOC QUIET libjemalloc) + +find_path(JEMALLOC_INCLUDE_DIR jemalloc/jemalloc.h + HINTS + ${JEMALLOC_ROOT} ENV JEMALLOC_ROOT + ${PC_JEMALLOC_MINIMAL_INCLUDEDIR} + ${PC_JEMALLOC_MINIMAL_INCLUDE_DIRS} + ${PC_JEMALLOC_INCLUDEDIR} + ${PC_JEMALLOC_INCLUDE_DIRS} + PATH_SUFFIXES include) + +find_library(JEMALLOC_LIBRARY NAMES jemalloc libjemalloc + HINTS + ${JEMALLOC_ROOT} ENV JEMALLOC_ROOT + ${PC_JEMALLOC_MINIMAL_LIBDIR} + ${PC_JEMALLOC_MINIMAL_LIBRARY_DIRS} + ${PC_JEMALLOC_LIBDIR} + ${PC_JEMALLOC_LIBRARY_DIRS} + PATH_SUFFIXES lib lib64) + +if(JEMALLOC_INCLUDE_DIR) + set(_version_regex "^#define[ \t]+JEMALLOC_VERSION[ \t]+\"([^\"]+)\".*") + file(STRINGS "${JEMALLOC_INCLUDE_DIR}/jemalloc/jemalloc.h" + JEMALLOC_VERSION REGEX "${_version_regex}") + string(REGEX REPLACE "${_version_regex}" "\\1" + JEMALLOC_VERSION "${JEMALLOC_VERSION}") + unset(_version_regex) +endif() + +include(FindPackageHandleStandardArgs) +# handle the QUIETLY and REQUIRED arguments and set JEMALLOC_FOUND to TRUE +# if all listed variables are TRUE and the requested version matches. 
+find_package_handle_standard_args(Jemalloc REQUIRED_VARS + JEMALLOC_LIBRARY JEMALLOC_INCLUDE_DIR + VERSION_VAR JEMALLOC_VERSION) + + +if(JEMALLOC_FOUND) + set(JEMALLOC_LIBRARIES ${JEMALLOC_LIBRARY}) + set(JEMALLOC_INCLUDE_DIRS ${JEMALLOC_INCLUDE_DIR}) +endif() + +mark_as_advanced(JEMALLOC_INCLUDE_DIR JEMALLOC_LIBRARY) From 5b22833f8120e3276a9106afa853f06f8b0399de Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 2 Feb 2022 16:12:26 -0500 Subject: [PATCH 347/350] Fix mismatching include-guard --- include/Read_CdBG_Extractor.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/Read_CdBG_Extractor.hpp b/include/Read_CdBG_Extractor.hpp index 0bb38842..c903db04 100644 --- a/include/Read_CdBG_Extractor.hpp +++ b/include/Read_CdBG_Extractor.hpp @@ -1,5 +1,5 @@ -#ifndef READ_CDBG_EXTRACTPR_HPP +#ifndef READ_CDBG_EXTRACTOR_HPP #define READ_CDBG_EXTRACTOR_HPP From d07f952d9c78c61459dde588379f15cbff45f12f Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 2 Feb 2022 16:23:02 -0500 Subject: [PATCH 348/350] Improve jemalloc conditional --- CMakeLists.txt | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d6ca9e0b..74efe2db 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,15 +121,11 @@ file(MAKE_DIRECTORY ${EXT_LIB} ${EXT_INCLUDE}) # of fragmentation. set(MALLOC_LIB "") set(JEMALLOC_MIN_VERSION "5.2.1") -find_package(Jemalloc) +find_package(Jemalloc ${JEMALLOC_MIN_VERSION}) if(JEMALLOC_FOUND) - if(${JEMALLOC_VERSION} VERSION_GREATER_EQUAL ${JEMALLOC_MIN_VERSION}) - message("Found the jemalloc library (v${JEMALLOC_VERSION}) in the system.") - set(MALLOC_LIB ${JEMALLOC_LIBRARIES}) - endif() -endif() - -if(MALLOC_LIB STREQUAL "") + message("Found jemalloc (v${JEMALLOC_VERSION}) in the system") + set(MALLOC_LIB ${JEMALLOC_LIBRARIES}) +else() message("Build system will fetch and install jemalloc") ExternalProject_Add(prj_jemalloc DOWNLOAD_DIR ${CMAKE_SOURCE_DIR}/external From c0b4c9d078b2edc6cbf478df8a7b228b25b02787 Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 2 Feb 2022 16:28:12 -0500 Subject: [PATCH 349/350] Reinstate jemalloc source-build hack (for osx) --- CMakeLists.txt | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 74efe2db..191709cf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -76,9 +76,11 @@ elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") ) # Later on, jemalloc will complain if the C compiler - # hasn't been properly set - # set(CMAKE_C_COMPILER clang) - # set(CMAKE_CXX_COMPILER clang++) + # hasn't been properly set. + if(NOT CONDA_BUILD) + set(CMAKE_C_COMPILER clang) + set(CMAKE_CXX_COMPILER clang++) + endif() endif() add_compile_definitions(L1_CACHE_LINE_SIZE=${L1_CACHE_LINE_SIZE}) From 4cb8daa2605aab047fbf77e075112e90825e305d Mon Sep 17 00:00:00 2001 From: jamshed Date: Wed, 2 Feb 2022 16:29:30 -0500 Subject: [PATCH 350/350] Help conda-build get around its old SDK-based checks --- CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 191709cf..641e9226 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -187,5 +187,11 @@ set(CMAKE_C_FLAGS_DEBUG "-g") set(CMAKE_C_FLAGS_RELEASE "-O3") +# Help `conda` build for OSX through circumventing some of its old SDK-based checks. +if(CONDA_BUILD) + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -D_LIBCPP_DISABLE_AVAILABILITY") +endif() + + # Add subdirectory `src` to the build; CMake will open `src/CMakeLists.txt` for such. 
add_subdirectory(src)
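
A note on two details touched by the patches above: the CMake logic exports the detected cache-line size as the `L1_CACHE_LINE_SIZE` compile definition, and the `Spin_Lock.hpp` tweak declares its flag as `std::atomic_flag lock_ = ATOMIC_FLAG_INIT;`. The following is a minimal, illustrative sketch of how such a definition and flag are commonly used together; the class name `Toy_Spin_Lock`, the fallback value of 64, and the `lock()`/`unlock()` bodies are assumptions for illustration only — the patches show only the member declaration, not the project's actual lock/unlock code.

    // Illustrative sketch only — not taken from the Cuttlefish sources.
    #include <atomic>

    // The build system (see the CMake hunks above) normally defines this macro;
    // the fallback value here is an assumption, for a standalone compile.
    #ifndef L1_CACHE_LINE_SIZE
    #define L1_CACHE_LINE_SIZE 64
    #endif

    // Aligning the lock object to a cache line keeps adjacent locks from sharing
    // a line, avoiding false sharing between threads spinning on different locks.
    class alignas(L1_CACHE_LINE_SIZE) Toy_Spin_Lock
    {
    private:
        // Same initialization form as in the Spin_Lock.hpp hunk above.
        std::atomic_flag lock_ = ATOMIC_FLAG_INIT;

    public:
        void lock()
        {
            // Spin until test_and_set observes the flag clear, i.e. the lock is acquired.
            while(lock_.test_and_set(std::memory_order_acquire))
                ;
        }

        void unlock()
        {
            // Release the lock, publishing the critical-section writes to the next acquirer.
            lock_.clear(std::memory_order_release);
        }
    };

A caller would simply bracket its critical section with `lock()`/`unlock()`; the acquire/release pairing is what makes the protected writes visible to the next thread that takes the lock.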