From df0a7aed83ac6bfa858169482f4d74653b2b5c43 Mon Sep 17 00:00:00 2001 From: NPSDC Date: Tue, 23 Jul 2024 23:15:47 -0400 Subject: [PATCH 1/6] adding cuttlefish parsing --- include/builder/parse_file.hpp | 130 ++++++++++++++++++++++++++++++++- include/util.hpp | 32 +++++++- 2 files changed, 159 insertions(+), 3 deletions(-) diff --git a/include/builder/parse_file.hpp b/include/builder/parse_file.hpp index 3b7aa2c..968fc46 100644 --- a/include/builder/parse_file.hpp +++ b/include/builder/parse_file.hpp @@ -12,7 +12,7 @@ struct parse_data { weights::builder weights_builder; }; -void parse_file(std::istream& is, parse_data& data, build_configuration const& build_config) { +void parse_file_from_fasta(std::istream& is, parse_data& data, build_configuration const& build_config) { uint64_t k = build_config.k; uint64_t m = build_config.m; uint64_t seed = build_config.seed; @@ -205,6 +205,134 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b } } +void parse_file_from_cuttlefish(std::istream& is, parse_data& data, build_configuration const& build_config) { + uint64_t k = build_config.k; + uint64_t m = build_config.m; + uint64_t seed = build_config.seed; + uint64_t max_num_kmers_in_super_kmer = k - m + 1; + uint64_t block_size = 2 * k - m; // max_num_kmers_in_super_kmer + k - 1 + + if (max_num_kmers_in_super_kmer >= (1ULL << (sizeof(num_kmers_in_super_kmer_uint_type) * 8))) { + throw std::runtime_error( + "max_num_kmers_in_super_kmer " + std::to_string(max_num_kmers_in_super_kmer) + + " does not fit into " + std::to_string(sizeof(num_kmers_in_super_kmer_uint_type) * 8) + + " bits"); + } + + /* fit into the wanted number of bits */ + assert(max_num_kmers_in_super_kmer < (1ULL << (sizeof(num_kmers_in_super_kmer_uint_type) * 8))); + + compact_string_pool::builder builder(k); + + std::string sequence; + uint64_t prev_minimizer = constants::invalid_uint64; + + uint64_t begin = 0; // begin of parsed super_kmer in sequence + uint64_t end = 0; // end of parsed super_kmer in sequence + uint64_t num_sequences = 0; + uint64_t num_bases = 0; + bool glue = false; + + auto append_super_kmer = [&]() { + if (sequence.empty() or prev_minimizer == constants::invalid_uint64 or begin == end) return; + + assert(end > begin); + char const* super_kmer = sequence.data() + begin; + uint64_t size = (end - begin) + k - 1; + assert(util::is_valid(super_kmer, size)); + + /* if num_kmers_in_super_kmer > k - m + 1, then split the super_kmer into blocks */ + uint64_t num_kmers_in_super_kmer = end - begin; + uint64_t num_blocks = num_kmers_in_super_kmer / max_num_kmers_in_super_kmer + + (num_kmers_in_super_kmer % max_num_kmers_in_super_kmer != 0); + assert(num_blocks > 0); + for (uint64_t i = 0; i != num_blocks; ++i) { + uint64_t n = block_size; + if (i == num_blocks - 1) n = size; + uint64_t num_kmers_in_block = n - k + 1; + assert(num_kmers_in_block <= max_num_kmers_in_super_kmer); + data.minimizers.emplace_back(prev_minimizer, builder.offset, num_kmers_in_block); + builder.append(super_kmer + i * max_num_kmers_in_super_kmer, n, glue); + if (glue) { + assert(data.minimizers.back().offset > k - 1); + data.minimizers.back().offset -= k - 1; + } + size -= max_num_kmers_in_super_kmer; + glue = true; + } + }; + + uint64_t seq_len = 0; + uint64_t sum_of_weights = 0; + data.weights_builder.init(); + + /* intervals of weights */ + uint64_t weight_value = constants::invalid_uint64; + uint64_t weight_length = 0; + + while (!is.eof()) { + std::getline(is, sequence); // header sequence + auto tsep = sequence.find('\t'); + sequence = sequence.substr(tsep + 1); + if (sequence.size() < k) continue; + + begin = 0; + end = 0; + glue = false; // start a new piece + prev_minimizer = constants::invalid_uint64; + num_bases += sequence.size(); + + if (build_config.weighted and seq_len != sequence.size()) { + throw std::runtime_error("file is malformed"); + } + + while (end != sequence.size() - k + 1) { + char const* kmer = sequence.data() + end; + assert(util::is_valid(kmer, k)); + uint64_t uint64_kmer = util::string_to_uint64_no_reverse(kmer, k); + uint64_t minimizer = util::compute_minimizer(uint64_kmer, k, m, seed); + + if (build_config.canonical_parsing) { + uint64_t uint64_kmer_rc = util::compute_reverse_complement(uint64_kmer, k); + uint64_t minimizer_rc = util::compute_minimizer(uint64_kmer_rc, k, m, seed); + minimizer = std::min(minimizer, minimizer_rc); + } + + if (prev_minimizer == constants::invalid_uint64) prev_minimizer = minimizer; + if (minimizer != prev_minimizer) { + append_super_kmer(); + begin = end; + prev_minimizer = minimizer; + glue = true; + } + + ++data.num_kmers; + ++end; + } + + append_super_kmer(); + } + + data.minimizers.finalize(); + builder.finalize(); + builder.build(data.strings); + + assert(data.strings.pieces.size() == num_sequences + 1); + + if (build_config.weighted) { + data.weights_builder.push_weight_interval(weight_value, weight_length); + data.weights_builder.finalize(data.num_kmers); + } +} + +void parse_file(std::istream& is, parse_data& data, build_configuration const& build_config) { + if (build_config.input_type == input_build_type::cfseg) { + parse_file_from_cuttlefish(is, data, build_config); + } + else { + parse_file_from_fasta(is, data, build_config); + } +} parse_data parse_file(std::string const& filename, build_configuration const& build_config) { std::ifstream is(filename.c_str()); if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'"); diff --git a/include/util.hpp b/include/util.hpp index 19fb0af..5194782 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -76,6 +76,8 @@ struct neighbourhood { return good; } +enum input_build_type {fasta, cfseg}; + struct build_configuration { build_configuration() : k(31) @@ -89,7 +91,9 @@ struct build_configuration { , weighted(false) , verbose(true) - , tmp_dirname(constants::default_tmp_dirname) {} + , tmp_dirname(constants::default_tmp_dirname) + , input_type(input_build_type::fasta) {} + uint64_t k; // kmer size uint64_t m; // minimizer size @@ -103,12 +107,14 @@ struct build_configuration { bool verbose; std::string tmp_dirname; + input_build_type input_type; void print() const { std::cout << "k = " << k << ", m = " << m << ", seed = " << seed << ", l = " << l << ", c = " << c << ", canonical_parsing = " << (canonical_parsing ? "true" : "false") - << ", weighted = " << (weighted ? "true" : "false") << std::endl; + << ", weighted = " << (weighted ? "true" : "false") + << ", file type = " << (input_type==input_build_type::fasta ? "fasta" : "cfseg") << std::endl; } }; @@ -208,6 +214,28 @@ static void uint_kmer_to_string(kmer_t x, char* str, uint64_t k) { } } +static inline uint64_t char_to_uint64(char c) { + switch (c) { + case 'A': + return 0; + case 'C': + return 1; + case 'G': + return 2; + case 'T': + return 3; + } + assert(false); + return -1; +} + +[[maybe_unused]] static uint64_t string_to_uint64_no_reverse(char const* str, uint64_t k) { + assert(k <= 32); + uint64_t x = 0; + for (uint64_t i = 0; i != k; ++i) x += char_to_uint64(str[i]) << (2 * i); + return x; +} + [[maybe_unused]] static std::string uint_kmer_to_string(kmer_t x, uint64_t k) { assert(k <= constants::max_k); std::string str; From 8c0c865fd0c02e4537031fb0b8748dc84114588a Mon Sep 17 00:00:00 2001 From: NPSDC Date: Wed, 24 Jul 2024 01:06:28 -0400 Subject: [PATCH 2/6] added seg parser --- include/builder/parse_file.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/builder/parse_file.hpp b/include/builder/parse_file.hpp index 968fc46..fd8798a 100644 --- a/include/builder/parse_file.hpp +++ b/include/builder/parse_file.hpp @@ -317,9 +317,17 @@ void parse_file_from_cuttlefish(std::istream& is, parse_data& data, build_config builder.finalize(); builder.build(data.strings); + std::cout << "read " << num_sequences << " sequences, " << num_bases << " bases, " + << data.num_kmers << " kmers" << std::endl; + std::cout << "num_kmers " << data.num_kmers << std::endl; + std::cout << "num_super_kmers " << data.strings.num_super_kmers() << std::endl; + std::cout << "num_pieces " << data.strings.pieces.size() << " (+" + << (2.0 * data.strings.pieces.size() * (k - 1)) / data.num_kmers << " [bits/kmer])" + << std::endl; assert(data.strings.pieces.size() == num_sequences + 1); if (build_config.weighted) { + std::cout << "sum_of_weights " << sum_of_weights << std::endl; data.weights_builder.push_weight_interval(weight_value, weight_length); data.weights_builder.finalize(data.num_kmers); } From e802c97d5cf8997697e152e3ee035ea09b8e99e3 Mon Sep 17 00:00:00 2001 From: rob-p Date: Wed, 24 Jul 2024 09:44:09 -0400 Subject: [PATCH 3/6] fix cf_seg build --- include/builder/parse_file.hpp | 10 +++++----- src/build.cpp | 16 +++++++++++++++- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/include/builder/parse_file.hpp b/include/builder/parse_file.hpp index fd8798a..a0b474a 100644 --- a/include/builder/parse_file.hpp +++ b/include/builder/parse_file.hpp @@ -289,12 +289,12 @@ void parse_file_from_cuttlefish(std::istream& is, parse_data& data, build_config while (end != sequence.size() - k + 1) { char const* kmer = sequence.data() + end; assert(util::is_valid(kmer, k)); - uint64_t uint64_kmer = util::string_to_uint64_no_reverse(kmer, k); - uint64_t minimizer = util::compute_minimizer(uint64_kmer, k, m, seed); + kmer_t uint_kmer = util::string_to_uint_kmer(kmer, k); + uint64_t minimizer = util::compute_minimizer(uint_kmer, k, m, seed); if (build_config.canonical_parsing) { - uint64_t uint64_kmer_rc = util::compute_reverse_complement(uint64_kmer, k); - uint64_t minimizer_rc = util::compute_minimizer(uint64_kmer_rc, k, m, seed); + kmer_t uint_kmer_rc = util::compute_reverse_complement(uint_kmer, k); + uint64_t minimizer_rc = util::compute_minimizer(uint_kmer_rc, k, m, seed); minimizer = std::min(minimizer, minimizer_rc); } @@ -356,4 +356,4 @@ parse_data parse_file(std::string const& filename, build_configuration const& bu return data; } -} // namespace sshash \ No newline at end of file +} // namespace sshash diff --git a/src/build.cpp b/src/build.cpp index 9006558..af38bf3 100644 --- a/src/build.cpp +++ b/src/build.cpp @@ -13,6 +13,7 @@ int build(int argc, char** argv) { parser.add("k", "K-mer length (must be <= " + std::to_string(constants::max_k) + ").", "-k", true); parser.add("m", "Minimizer length (must be < k).", "-m", true); + parser.add("f", "Format of input (must be fasta | cuttlefish).", "-f", true); /* Optional arguments. */ parser.add("seed", @@ -52,6 +53,13 @@ int build(int argc, char** argv) { auto input_filename = parser.get("input_filename"); auto k = parser.get("k"); auto m = parser.get("m"); + auto fmt = parser.get("f"); + + if (fmt != "fasta" && fmt != "cuttlefish") { + std::cerr << "unknown input format selected\n"; + std::cerr << "[" << fmt << "]\n"; + std::exit(1); + } dictionary dict; @@ -59,6 +67,12 @@ int build(int argc, char** argv) { build_config.k = k; build_config.m = m; + if (fmt == "fasta") { + build_config.input_type = sshash::input_build_type::fasta; + } else if (fmt == "cuttlefish") { + build_config.input_type = sshash::input_build_type::cfseg; + } + if (parser.parsed("seed")) build_config.seed = parser.get("seed"); if (parser.parsed("l")) build_config.l = parser.get("l"); if (parser.parsed("c")) build_config.c = parser.get("c"); @@ -97,4 +111,4 @@ int build(int argc, char** argv) { } return 0; -} \ No newline at end of file +} From 61d04a82d449db9d247e4cfb9d75ebf446e249e7 Mon Sep 17 00:00:00 2001 From: NPSDC Date: Thu, 25 Jul 2024 12:11:45 -0400 Subject: [PATCH 4/6] check for cfseg added --- include/builder/parse_file.hpp | 8 +- src/build.cpp | 35 ++-- src/check_utils.hpp | 291 +++++++++++++++++++++++++++++++-- 3 files changed, 305 insertions(+), 29 deletions(-) diff --git a/include/builder/parse_file.hpp b/include/builder/parse_file.hpp index a0b474a..17fc251 100644 --- a/include/builder/parse_file.hpp +++ b/include/builder/parse_file.hpp @@ -205,7 +205,7 @@ void parse_file_from_fasta(std::istream& is, parse_data& data, build_configurati } } -void parse_file_from_cuttlefish(std::istream& is, parse_data& data, build_configuration const& build_config) { +void parse_file_from_cfseg(std::istream& is, parse_data& data, build_configuration const& build_config) { uint64_t k = build_config.k; uint64_t m = build_config.m; uint64_t seed = build_config.seed; @@ -276,6 +276,10 @@ void parse_file_from_cuttlefish(std::istream& is, parse_data& data, build_config sequence = sequence.substr(tsep + 1); if (sequence.size() < k) continue; + if (++num_sequences % 100000 == 0) { + std::cout << "read " << num_sequences << " sequences, " << num_bases << " bases, " + << data.num_kmers << " kmers" << std::endl; + } begin = 0; end = 0; glue = false; // start a new piece @@ -335,7 +339,7 @@ void parse_file_from_cuttlefish(std::istream& is, parse_data& data, build_config void parse_file(std::istream& is, parse_data& data, build_configuration const& build_config) { if (build_config.input_type == input_build_type::cfseg) { - parse_file_from_cuttlefish(is, data, build_config); + parse_file_from_cfseg(is, data, build_config); } else { parse_file_from_fasta(is, data, build_config); diff --git a/src/build.cpp b/src/build.cpp index af38bf3..6b94d9d 100644 --- a/src/build.cpp +++ b/src/build.cpp @@ -5,15 +5,16 @@ int build(int argc, char** argv) { /* Required arguments. */ parser.add("input_filename", - "Must be a FASTA file (.fa/fasta extension) compressed with gzip (.gz) or not:\n" - "\t- without duplicate nor invalid kmers\n" + "Must be a FASTA file (.fa/fasta extension) or cf_seg file compressed with gzip (.gz) or not:\n" + "\t- FASTA file should be without duplicate nor invalid kmers\n" "\t- one DNA sequence per line.\n" - "\tFor example, it could be the de Bruijn graph topology output by BCALM.", + "\tFor example, it could be the de Bruijn graph topology output by BCALM.\n" + "\t- cfseg file is the output file produced by CUTTLEFISH with -f 3\n", "-i", true); parser.add("k", "K-mer length (must be <= " + std::to_string(constants::max_k) + ").", "-k", true); parser.add("m", "Minimizer length (must be < k).", "-m", true); - parser.add("f", "Format of input (must be fasta | cuttlefish).", "-f", true); + parser.add("f", "Format of input (must be fasta | cfseg).", "-f", false); /* Optional arguments. */ parser.add("seed", @@ -55,10 +56,10 @@ int build(int argc, char** argv) { auto m = parser.get("m"); auto fmt = parser.get("f"); - if (fmt != "fasta" && fmt != "cuttlefish") { - std::cerr << "unknown input format selected\n"; - std::cerr << "[" << fmt << "]\n"; - std::exit(1); + if (fmt != "fasta" && fmt != "cfseg" && fmt != "") { + std::cerr << "unknown input format selected, should be either `fasta` or `cfseg` \n"; + std::cerr << "[" << fmt << "]\n"; + std::exit(1); } dictionary dict; @@ -67,10 +68,10 @@ int build(int argc, char** argv) { build_config.k = k; build_config.m = m; - if (fmt == "fasta") { - build_config.input_type = sshash::input_build_type::fasta; - } else if (fmt == "cuttlefish") { - build_config.input_type = sshash::input_build_type::cfseg; + if (fmt == "fasta" || fmt == "") { + build_config.input_type = sshash::input_build_type::fasta; + } else if (fmt == "cfseg") { + build_config.input_type = sshash::input_build_type::cfseg; } if (parser.parsed("seed")) build_config.seed = parser.get("seed"); @@ -90,12 +91,12 @@ int build(int argc, char** argv) { bool check = parser.get("check"); if (check) { - check_correctness_lookup_access(dict, input_filename); - check_correctness_navigational_kmer_query(dict, input_filename); - check_correctness_navigational_contig_query(dict); + // check_correctness_lookup_access(dict, input_filename, fmt); + // check_correctness_navigational_kmer_query(dict, input_filename, fmt); + // check_correctness_navigational_contig_query(dict); if (build_config.weighted) check_correctness_weights(dict, input_filename); - check_correctness_kmer_iterator(dict); - check_correctness_contig_iterator(dict); + // check_correctness_kmer_iterator(dict); + // check_correctness_contig_iterator(dict); } bool bench = parser.get("bench"); if (bench) { diff --git a/src/check_utils.hpp b/src/check_utils.hpp index 54fcafb..4045a10 100644 --- a/src/check_utils.hpp +++ b/src/check_utils.hpp @@ -1,7 +1,6 @@ #pragma once -#include // for std::transform - +#include #include "include/gz/zip_stream.hpp" namespace sshash { @@ -25,7 +24,7 @@ bool check_correctness_negative_lookup(dictionary const& dict) { return true; } -bool check_correctness_lookup_access(std::istream& is, dictionary const& dict) { +bool check_correctness_lookup_access_fasta(std::istream& is, dictionary const& dict) { const uint64_t k = dict.k(); std::string line; uint64_t pos = 0; @@ -177,7 +176,170 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& dict) { return check_correctness_negative_lookup(dict); } -bool check_correctness_navigational_kmer_query(std::istream& is, dictionary const& dict) { +bool check_correctness_lookup_access_cfseg(std::istream& is, dictionary const& dict, std::string const& fmt) { + const uint64_t k = dict.k(); + std::string line; + uint64_t pos = 0; + uint64_t num_kmers = 0; + uint64_t num_lines = 0; + lookup_result prev; + prev.contig_id = 0; + + std::string got_kmer_str(k, 0); + std::string expected_kmer_str(k, 0); + + std::cout << "checking correctness of access and positive lookup..." << std::endl; + int j = 0; + while (!is.eof()) { + + std::getline(is, line); + std::stringstream ss(line); + std::string sequence; + char delim = '\t'; + std::getline(ss, sequence, delim); + std::getline(ss, sequence, delim); + /* transform 50% of the read nucleotides into lower-case letters + (assuming the input is upper-case): + lower-case kmers must be found anyway in the index */ + if ((num_lines & 1) == 0) { + std::transform(sequence.begin(), sequence.end(), sequence.begin(), + [](char c) { return std::tolower(c); }); + } + ++num_lines; + + for (uint64_t i = 0; i + k <= sequence.size(); ++i) { + assert(util::is_valid(sequence.data() + i, k)); + + kmer_t uint_kmer = util::string_to_uint_kmer(sequence.data() + i, k); + bool orientation = constants::forward_orientation; + + if (num_kmers != 0 and num_kmers % 5000000 == 0) { + std::cout << "checked " << num_kmers << " kmers" << std::endl; + } + + /* transform 50% of the kmers into their reverse complements */ + if ((num_kmers & 1) == 0) { + uint_kmer = util::compute_reverse_complement(uint_kmer, k); + orientation = constants::backward_orientation; + } + + util::uint_kmer_to_string(uint_kmer, expected_kmer_str.data(), k); + uint64_t id = dict.lookup(expected_kmer_str.c_str()); + + /* + Since we assume that we stream through the file from which the index was built, + ids are assigned sequentially to kmers, so it must be id == num_kmers. + */ + if (id != num_kmers) std::cout << "wrong id assigned" << std::endl; + + if (id == constants::invalid_uint64) { + std::cout << "kmer '" << expected_kmer_str << "' not found!" << std::endl; + } + assert(id != constants::invalid_uint64); + + auto curr = dict.lookup_advanced(expected_kmer_str.c_str()); + assert(curr.kmer_id == id); + + if (curr.kmer_orientation != orientation) { + std::cout << "ERROR: got orientation " << int(curr.kmer_orientation) + << " but expected " << int(orientation) << std::endl; + } + assert(curr.kmer_orientation == orientation); + + if (num_kmers == 0) { + if (curr.contig_id != 0) { + std::cout << "contig_id " << curr.contig_id << " but expected 0" << std::endl; + } + assert(curr.contig_id == 0); // at the beginning, contig_id must be 0 + } else { + if (curr.kmer_id != prev.kmer_id + 1) { + std::cout << "ERROR: got curr.kmer_id " << curr.kmer_id << " but expected " + << prev.kmer_id + 1 << std::endl; + } + assert(curr.kmer_id == prev.kmer_id + 1); // kmer_id must be sequential + + if (curr.kmer_id_in_contig >= curr.contig_size) { + std::cout << "ERROR: got curr.kmer_id_in_contig " << curr.kmer_id_in_contig + << " but expected something < " << curr.contig_size << std::endl; + } + assert(curr.kmer_id_in_contig < + curr.contig_size); // kmer_id_in_contig must always be < contig_size + + if (curr.contig_id == prev.contig_id) { + /* same contig */ + if (curr.contig_size != prev.contig_size) { + std::cout << "ERROR: got curr.contig_size " << curr.contig_size + << " but expected " << prev.contig_size << std::endl; + } + assert(curr.contig_size == prev.contig_size); // contig_size must be same + if (curr.kmer_id_in_contig != prev.kmer_id_in_contig + 1) { + std::cout << "ERROR: got curr.kmer_id_in_contig " << curr.kmer_id_in_contig + << " but expected " << prev.kmer_id_in_contig + 1 << std::endl; + } + assert(curr.kmer_id_in_contig == + prev.kmer_id_in_contig + 1); // kmer_id_in_contig must be sequential + } else { + /* we have changed contig */ + if (curr.contig_id != prev.contig_id + 1) { + std::cout << "ERROR: got curr.contig_id " << curr.contig_id + << " but expected " << prev.contig_id + 1 << std::endl; + } + assert(curr.contig_id == + prev.contig_id + 1); // contig_id must be sequential since we stream + if (curr.kmer_id_in_contig != 0) { + std::cout << "ERROR: got curr.kmer_id_in_contig " << curr.kmer_id_in_contig + << " but expected 0" << std::endl; + } + assert(curr.kmer_id_in_contig == + 0); // kmer_id_in_contig must be 0 when we change contig + } + } + + /* check also contig_size() */ + uint64_t contig_size = dict.contig_size(curr.contig_id); + if (contig_size != curr.contig_size) { + std::cout << "ERROR: got contig_size " << contig_size << " but expected " + << curr.contig_size << std::endl; + } + assert(contig_size == curr.contig_size); + + prev = curr; + + // check access + dict.access(id, got_kmer_str.data()); + kmer_t got_uint_kmer = util::string_to_uint_kmer(got_kmer_str.data(), k); + kmer_t got_uint_kmer_rc = util::compute_reverse_complement(got_uint_kmer, k); + if (got_uint_kmer != uint_kmer and got_uint_kmer_rc != uint_kmer) { + std::cout << "ERROR: got '" << got_kmer_str << "' but expected '" + << expected_kmer_str << "'" << std::endl; + } + ++num_kmers; + } + if (sequence.size() > k - 1) { + std::copy(sequence.data() + sequence.size() - (k - 1), sequence.data() + sequence.size(), sequence.data()); + sequence.resize(k - 1); + pos = sequence.size(); + } else { + pos = 0; + } + } + std::cout << "checked " << num_kmers << " kmers" << std::endl; + std::cout << "EVERYTHING OK!" << std::endl; + + return check_correctness_negative_lookup(dict); +} + + +bool check_correctness_lookup_access(std::istream& is, dictionary const& dict, std::string const& fmt) { + if (fmt == "fasta") { + return check_correctness_lookup_access_fasta(is, dict); + } + else { + return check_correctness_lookup_access_cfseg(is, dict); + } +} + +bool check_correctness_navigational_kmer_query_fasta(std::istream& is, dictionary const& dict) { uint64_t k = dict.k(); std::string line; uint64_t pos = 0; @@ -272,6 +434,112 @@ bool check_correctness_navigational_kmer_query(std::istream& is, dictionary cons return true; } +bool check_correctness_navigational_kmer_query_cfseg(std::istream& is, dictionary const& dict, std::string const& fmt) { + uint64_t k = dict.k(); + std::string line_tab; + uint64_t pos = 0; + uint64_t num_kmers = 0; + + std::cout << "checking correctness of navigational queries for kmers..." << std::endl; + while (!is.eof()) { + std::getline(is, line_tab); + std::stringstream ss(line_tab); + std::string line; + char delim = '\t'; + std::getline(ss, line, delim); + std::getline(ss, line, delim); + for (uint64_t i = 0; i + k <= line.size(); ++i) { + assert(util::is_valid(line.data() + i, k)); + if (num_kmers != 0 and num_kmers % 5000000 == 0) { + std::cout << "checked " << num_kmers << " kmers" << std::endl; + } + + neighbourhood curr = dict.kmer_neighbours(line.data() + i); + + char next_nuc = line[i + k]; + switch (next_nuc) { + case 'A': + if (curr.forward_A.kmer_id == constants::invalid_uint64) { + std::cout << "expected forward_A" << std::endl; + } + assert(curr.forward_A.kmer_id != constants::invalid_uint64); + break; + case 'C': + if (curr.forward_C.kmer_id == constants::invalid_uint64) { + std::cout << "expected forward_C" << std::endl; + } + assert(curr.forward_C.kmer_id != constants::invalid_uint64); + break; + case 'G': + if (curr.forward_G.kmer_id == constants::invalid_uint64) { + std::cout << "expected forward_G" << std::endl; + } + assert(curr.forward_G.kmer_id != constants::invalid_uint64); + break; + case 'T': + if (curr.forward_T.kmer_id == constants::invalid_uint64) { + std::cout << "expected forward_T" << std::endl; + } + assert(curr.forward_T.kmer_id != constants::invalid_uint64); + break; + } + + if (i != 0) { + char prev_nuc = line[i - 1]; + switch (prev_nuc) { + case 'A': + if (curr.backward_A.kmer_id == constants::invalid_uint64) { + std::cout << "expected backward_A" << std::endl; + } + assert(curr.backward_A.kmer_id != constants::invalid_uint64); + break; + case 'C': + if (curr.backward_C.kmer_id == constants::invalid_uint64) { + std::cout << "expected backward_C" << std::endl; + } + assert(curr.backward_C.kmer_id != constants::invalid_uint64); + break; + case 'G': + if (curr.backward_G.kmer_id == constants::invalid_uint64) { + std::cout << "expected backward_G" << std::endl; + } + assert(curr.backward_G.kmer_id != constants::invalid_uint64); + break; + case 'T': + if (curr.backward_T.kmer_id == constants::invalid_uint64) { + std::cout << "expected backward_T" << std::endl; + } + assert(curr.backward_T.kmer_id != constants::invalid_uint64); + break; + } + } + + ++num_kmers; + } + if (line.size() > k - 1) { + std::copy(line.data() + line.size() - (k - 1), line.data() + line.size(), line.data()); + line.resize(k - 1); + pos = line.size(); + } else { + pos = 0; + } + } + std::cout << "checked " << num_kmers << " kmers" << std::endl; + + std::cout << "EVERYTHING OK!" << std::endl; + return true; +} + +bool check_correctness_navigational_kmer_query(std::istream& is, + dictionary const& dict, + std::string const& fmt) { + if(fmt == "fasta") { + return check_correctness_navigational_kmer_query_fasta(is, dict); + } else { + return check_correctness_navigational_kmer_query_cfseg(is, dict); + } +} + bool check_correctness_navigational_contig_query(dictionary const& dict) { std::cout << "checking correctness of navigational queries for contigs..." << std::endl; uint64_t num_contigs = dict.num_contigs(); @@ -324,6 +592,8 @@ bool check_correctness_weights(std::istream& is, dictionary const& dict) { while (!is.eof()) { std::getline(is, line); // header line + std::cout << "line is " << line << std::endl; + if (line.empty()) break; uint64_t i = 0; @@ -365,15 +635,15 @@ bool check_correctness_weights(std::istream& is, dictionary const& dict) { The input file must be the one the index was built from. Throughout the code, we assume the input does not contain any duplicate. */ -bool check_correctness_lookup_access(dictionary const& dict, std::string const& filename) { +bool check_correctness_lookup_access(dictionary const& dict, std::string const& filename, std::string const& fmt) { std::ifstream is(filename.c_str()); if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'"); bool good = true; if (util::ends_with(filename, ".gz")) { zip_istream zis(is); - good = check_correctness_lookup_access(zis, dict); + good = check_correctness_lookup_access(zis, dict, fmt); } else { - good = check_correctness_lookup_access(is, dict); + good = check_correctness_lookup_access(is, dict, fmt); } is.close(); return good; @@ -384,15 +654,16 @@ bool check_correctness_lookup_access(dictionary const& dict, std::string const& Throughout the code, we assume the input does not contain any duplicate. */ bool check_correctness_navigational_kmer_query(dictionary const& dict, - std::string const& filename) { + std::string const& filename, + std::string const& fmt) { std::ifstream is(filename.c_str()); if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'"); bool good = true; if (util::ends_with(filename, ".gz")) { zip_istream zis(is); - good = check_correctness_navigational_kmer_query(zis, dict); + good = check_correctness_navigational_kmer_query(zis, dict, fmt); } else { - good = check_correctness_navigational_kmer_query(is, dict); + good = check_correctness_navigational_kmer_query(is, dict, fmt); } is.close(); return good; From 691e26af066c3147bdc06090d809bc88c9ff412f Mon Sep 17 00:00:00 2001 From: NPSDC Date: Thu, 25 Jul 2024 21:29:16 -0400 Subject: [PATCH 5/6] generalized support for both fasta and cfseg --- src/build.cpp | 11 +- src/check_utils.hpp | 328 ++++++-------------------------------------- 2 files changed, 49 insertions(+), 290 deletions(-) diff --git a/src/build.cpp b/src/build.cpp index 6b94d9d..d010f07 100644 --- a/src/build.cpp +++ b/src/build.cpp @@ -69,6 +69,7 @@ int build(int argc, char** argv) { build_config.m = m; if (fmt == "fasta" || fmt == "") { + fmt = "fasta"; build_config.input_type = sshash::input_build_type::fasta; } else if (fmt == "cfseg") { build_config.input_type = sshash::input_build_type::cfseg; @@ -79,6 +80,12 @@ int build(int argc, char** argv) { if (parser.parsed("c")) build_config.c = parser.get("c"); build_config.canonical_parsing = parser.get("canonical_parsing"); build_config.weighted = parser.get("weighted"); + + if (build_config.weighted && fmt=="cfseg") { + std::cerr << "weighted index file for cfseg is not supported\n"; + std::exit(1); + } + build_config.verbose = parser.get("verbose"); if (parser.parsed("tmp_dirname")) { build_config.tmp_dirname = parser.get("tmp_dirname"); @@ -91,10 +98,10 @@ int build(int argc, char** argv) { bool check = parser.get("check"); if (check) { - // check_correctness_lookup_access(dict, input_filename, fmt); + check_correctness_lookup_access(dict, input_filename, fmt); // check_correctness_navigational_kmer_query(dict, input_filename, fmt); // check_correctness_navigational_contig_query(dict); - if (build_config.weighted) check_correctness_weights(dict, input_filename); + // if (build_config.weighted) check_correctness_weights(dict, input_filename); // check_correctness_kmer_iterator(dict); // check_correctness_contig_iterator(dict); } diff --git a/src/check_utils.hpp b/src/check_utils.hpp index 4045a10..3ab29b1 100644 --- a/src/check_utils.hpp +++ b/src/check_utils.hpp @@ -24,180 +24,36 @@ bool check_correctness_negative_lookup(dictionary const& dict) { return true; } -bool check_correctness_lookup_access_fasta(std::istream& is, dictionary const& dict) { +bool check_correctness_lookup_access(std::istream& is, dictionary const& dict, std::string const& fmt) { const uint64_t k = dict.k(); std::string line; uint64_t pos = 0; uint64_t num_kmers = 0; uint64_t num_lines = 0; lookup_result prev; + std::string sequence; prev.contig_id = 0; std::string got_kmer_str(k, 0); std::string expected_kmer_str(k, 0); std::cout << "checking correctness of access and positive lookup..." << std::endl; - while (appendline(is, line)) { - if (line.size() == pos || line[pos] == '>' || line[pos] == ';') { + // std::getline(is, line); + if (fmt == "fasta") { + std::cout << "pos " << line[0] << std::endl; + if (line.size() == pos || line[pos] == '>' || line[pos] == ';') { // comment or empty line restart the term buffer - line.clear(); - continue; - } - - /* transform 50% of the read nucleotides into lower-case letters - (assuming the input is upper-case): - lower-case kmers must be found anyway in the index */ - if ((num_lines & 1) == 0) { - std::transform(line.begin(), line.end(), line.begin(), - [](char c) { return std::tolower(c); }); - } - ++num_lines; - - for (uint64_t i = 0; i + k <= line.size(); ++i) { - assert(util::is_valid(line.data() + i, k)); - - kmer_t uint_kmer = util::string_to_uint_kmer(line.data() + i, k); - bool orientation = constants::forward_orientation; - - if (num_kmers != 0 and num_kmers % 5000000 == 0) { - std::cout << "checked " << num_kmers << " kmers" << std::endl; - } - - /* transform 50% of the kmers into their reverse complements */ - if ((num_kmers & 1) == 0) { - uint_kmer = util::compute_reverse_complement(uint_kmer, k); - orientation = constants::backward_orientation; - } - - util::uint_kmer_to_string(uint_kmer, expected_kmer_str.data(), k); - uint64_t id = dict.lookup(expected_kmer_str.c_str()); - - /* - Since we assume that we stream through the file from which the index was built, - ids are assigned sequentially to kmers, so it must be id == num_kmers. - */ - if (id != num_kmers) std::cout << "wrong id assigned" << std::endl; - - if (id == constants::invalid_uint64) { - std::cout << "kmer '" << expected_kmer_str << "' not found!" << std::endl; - } - assert(id != constants::invalid_uint64); - - auto curr = dict.lookup_advanced(expected_kmer_str.c_str()); - assert(curr.kmer_id == id); - - if (curr.kmer_orientation != orientation) { - std::cout << "ERROR: got orientation " << int(curr.kmer_orientation) - << " but expected " << int(orientation) << std::endl; - } - assert(curr.kmer_orientation == orientation); - - if (num_kmers == 0) { - if (curr.contig_id != 0) { - std::cout << "contig_id " << curr.contig_id << " but expected 0" << std::endl; - } - assert(curr.contig_id == 0); // at the beginning, contig_id must be 0 - } else { - if (curr.kmer_id != prev.kmer_id + 1) { - std::cout << "ERROR: got curr.kmer_id " << curr.kmer_id << " but expected " - << prev.kmer_id + 1 << std::endl; - } - assert(curr.kmer_id == prev.kmer_id + 1); // kmer_id must be sequential - - if (curr.kmer_id_in_contig >= curr.contig_size) { - std::cout << "ERROR: got curr.kmer_id_in_contig " << curr.kmer_id_in_contig - << " but expected something < " << curr.contig_size << std::endl; - } - assert(curr.kmer_id_in_contig < - curr.contig_size); // kmer_id_in_contig must always be < contig_size - - if (curr.contig_id == prev.contig_id) { - /* same contig */ - if (curr.contig_size != prev.contig_size) { - std::cout << "ERROR: got curr.contig_size " << curr.contig_size - << " but expected " << prev.contig_size << std::endl; - } - assert(curr.contig_size == prev.contig_size); // contig_size must be same - if (curr.kmer_id_in_contig != prev.kmer_id_in_contig + 1) { - std::cout << "ERROR: got curr.kmer_id_in_contig " << curr.kmer_id_in_contig - << " but expected " << prev.kmer_id_in_contig + 1 << std::endl; - } - assert(curr.kmer_id_in_contig == - prev.kmer_id_in_contig + 1); // kmer_id_in_contig must be sequential - } else { - /* we have changed contig */ - if (curr.contig_id != prev.contig_id + 1) { - std::cout << "ERROR: got curr.contig_id " << curr.contig_id - << " but expected " << prev.contig_id + 1 << std::endl; - } - assert(curr.contig_id == - prev.contig_id + 1); // contig_id must be sequential since we stream - if (curr.kmer_id_in_contig != 0) { - std::cout << "ERROR: got curr.kmer_id_in_contig " << curr.kmer_id_in_contig - << " but expected 0" << std::endl; - } - assert(curr.kmer_id_in_contig == - 0); // kmer_id_in_contig must be 0 when we change contig - } + line.clear(); + continue; } - - /* check also contig_size() */ - uint64_t contig_size = dict.contig_size(curr.contig_id); - if (contig_size != curr.contig_size) { - std::cout << "ERROR: got contig_size " << contig_size << " but expected " - << curr.contig_size << std::endl; - } - assert(contig_size == curr.contig_size); - - prev = curr; - - // check access - dict.access(id, got_kmer_str.data()); - kmer_t got_uint_kmer = util::string_to_uint_kmer(got_kmer_str.data(), k); - kmer_t got_uint_kmer_rc = util::compute_reverse_complement(got_uint_kmer, k); - if (got_uint_kmer != uint_kmer and got_uint_kmer_rc != uint_kmer) { - std::cout << "ERROR: got '" << got_kmer_str << "' but expected '" - << expected_kmer_str << "'" << std::endl; - } - ++num_kmers; - } - if (line.size() > k - 1) { - std::copy(line.data() + line.size() - (k - 1), line.data() + line.size(), line.data()); - line.resize(k - 1); - pos = line.size(); + sequence = line; } else { - pos = 0; + std::stringstream ss(line); + char delim = '\t'; + std::getline(ss, sequence, delim); + std::getline(ss, sequence, delim); } - } - std::cout << "checked " << num_kmers << " kmers" << std::endl; - std::cout << "EVERYTHING OK!" << std::endl; - - return check_correctness_negative_lookup(dict); -} - -bool check_correctness_lookup_access_cfseg(std::istream& is, dictionary const& dict, std::string const& fmt) { - const uint64_t k = dict.k(); - std::string line; - uint64_t pos = 0; - uint64_t num_kmers = 0; - uint64_t num_lines = 0; - lookup_result prev; - prev.contig_id = 0; - - std::string got_kmer_str(k, 0); - std::string expected_kmer_str(k, 0); - - std::cout << "checking correctness of access and positive lookup..." << std::endl; - int j = 0; - while (!is.eof()) { - - std::getline(is, line); - std::stringstream ss(line); - std::string sequence; - char delim = '\t'; - std::getline(ss, sequence, delim); - std::getline(ss, sequence, delim); /* transform 50% of the read nucleotides into lower-case letters (assuming the input is upper-case): lower-case kmers must be found anyway in the index */ @@ -206,7 +62,7 @@ bool check_correctness_lookup_access_cfseg(std::istream& is, dictionary const& d [](char c) { return std::tolower(c); }); } ++num_lines; - + std::cout << "seq " << sequence << std::endl; for (uint64_t i = 0; i + k <= sequence.size(); ++i) { assert(util::is_valid(sequence.data() + i, k)); @@ -326,137 +182,43 @@ bool check_correctness_lookup_access_cfseg(std::istream& is, dictionary const& d std::cout << "checked " << num_kmers << " kmers" << std::endl; std::cout << "EVERYTHING OK!" << std::endl; - return check_correctness_negative_lookup(dict); -} - - -bool check_correctness_lookup_access(std::istream& is, dictionary const& dict, std::string const& fmt) { - if (fmt == "fasta") { - return check_correctness_lookup_access_fasta(is, dict); - } - else { - return check_correctness_lookup_access_cfseg(is, dict); - } + return check_correctness_negative_lookup(dict); } -bool check_correctness_navigational_kmer_query_fasta(std::istream& is, dictionary const& dict) { +bool check_correctness_navigational_kmer_query(std::istream& is, + dictionary const& dict, + std::string const& fmt) { uint64_t k = dict.k(); std::string line; uint64_t pos = 0; uint64_t num_kmers = 0; + std::string sequence; std::cout << "checking correctness of navigational queries for kmers..." << std::endl; - while (appendline(is, line)) { - if (line.size() == pos || line[pos] == '>' || line[pos] == ';') { - // comment or empty line restart the term buffer - line.clear(); - continue; - } - for (uint64_t i = 0; i + k <= line.size(); ++i) { - assert(util::is_valid(line.data() + i, k)); - if (num_kmers != 0 and num_kmers % 5000000 == 0) { - std::cout << "checked " << num_kmers << " kmers" << std::endl; - } - - neighbourhood curr = dict.kmer_neighbours(line.data() + i); - - char next_nuc = line[i + k]; - switch (next_nuc) { - case 'A': - if (curr.forward_A.kmer_id == constants::invalid_uint64) { - std::cout << "expected forward_A" << std::endl; - } - assert(curr.forward_A.kmer_id != constants::invalid_uint64); - break; - case 'C': - if (curr.forward_C.kmer_id == constants::invalid_uint64) { - std::cout << "expected forward_C" << std::endl; - } - assert(curr.forward_C.kmer_id != constants::invalid_uint64); - break; - case 'G': - if (curr.forward_G.kmer_id == constants::invalid_uint64) { - std::cout << "expected forward_G" << std::endl; - } - assert(curr.forward_G.kmer_id != constants::invalid_uint64); - break; - case 'T': - if (curr.forward_T.kmer_id == constants::invalid_uint64) { - std::cout << "expected forward_T" << std::endl; - } - assert(curr.forward_T.kmer_id != constants::invalid_uint64); - break; - } - - if (i != 0) { - char prev_nuc = line[i - 1]; - switch (prev_nuc) { - case 'A': - if (curr.backward_A.kmer_id == constants::invalid_uint64) { - std::cout << "expected backward_A" << std::endl; - } - assert(curr.backward_A.kmer_id != constants::invalid_uint64); - break; - case 'C': - if (curr.backward_C.kmer_id == constants::invalid_uint64) { - std::cout << "expected backward_C" << std::endl; - } - assert(curr.backward_C.kmer_id != constants::invalid_uint64); - break; - case 'G': - if (curr.backward_G.kmer_id == constants::invalid_uint64) { - std::cout << "expected backward_G" << std::endl; - } - assert(curr.backward_G.kmer_id != constants::invalid_uint64); - break; - case 'T': - if (curr.backward_T.kmer_id == constants::invalid_uint64) { - std::cout << "expected backward_T" << std::endl; - } - assert(curr.backward_T.kmer_id != constants::invalid_uint64); - break; - } + while (!is.eof()) { + std::getline(is, line); + if (fmt == "fasta") { + if (line.size() == pos || line[pos] == '>' || line[pos] == ';') { + // comment or empty line restart the term buffer + line.clear(); + continue; } - - ++num_kmers; - } - if (line.size() > k - 1) { - std::copy(line.data() + line.size() - (k - 1), line.data() + line.size(), line.data()); - line.resize(k - 1); - pos = line.size(); + sequence = line; } else { - pos = 0; + std::stringstream ss(line); + char delim = '\t'; + std::getline(ss, sequence, delim); + std::getline(ss, sequence, delim); } - } - std::cout << "checked " << num_kmers << " kmers" << std::endl; - - std::cout << "EVERYTHING OK!" << std::endl; - return true; -} -bool check_correctness_navigational_kmer_query_cfseg(std::istream& is, dictionary const& dict, std::string const& fmt) { - uint64_t k = dict.k(); - std::string line_tab; - uint64_t pos = 0; - uint64_t num_kmers = 0; - - std::cout << "checking correctness of navigational queries for kmers..." << std::endl; - while (!is.eof()) { - std::getline(is, line_tab); - std::stringstream ss(line_tab); - std::string line; - char delim = '\t'; - std::getline(ss, line, delim); - std::getline(ss, line, delim); - for (uint64_t i = 0; i + k <= line.size(); ++i) { - assert(util::is_valid(line.data() + i, k)); + for (uint64_t i = 0; i + k <= sequence.size(); ++i) { + assert(util::is_valid(sequence.data() + i, k)); if (num_kmers != 0 and num_kmers % 5000000 == 0) { std::cout << "checked " << num_kmers << " kmers" << std::endl; } - neighbourhood curr = dict.kmer_neighbours(line.data() + i); - - char next_nuc = line[i + k]; + neighbourhood curr = dict.kmer_neighbours(sequence.data() + i); + char next_nuc = sequence[i + k]; switch (next_nuc) { case 'A': if (curr.forward_A.kmer_id == constants::invalid_uint64) { @@ -485,7 +247,7 @@ bool check_correctness_navigational_kmer_query_cfseg(std::istream& is, dictionar } if (i != 0) { - char prev_nuc = line[i - 1]; + char prev_nuc = sequence[i - 1]; switch (prev_nuc) { case 'A': if (curr.backward_A.kmer_id == constants::invalid_uint64) { @@ -516,10 +278,10 @@ bool check_correctness_navigational_kmer_query_cfseg(std::istream& is, dictionar ++num_kmers; } - if (line.size() > k - 1) { - std::copy(line.data() + line.size() - (k - 1), line.data() + line.size(), line.data()); - line.resize(k - 1); - pos = line.size(); + if (sequence.size() > k - 1) { + std::copy(sequence.data() + sequence.size() - (k - 1), sequence.data() + sequence.size(), sequence.data()); + sequence.resize(k - 1); + pos = sequence.size(); } else { pos = 0; } @@ -530,16 +292,6 @@ bool check_correctness_navigational_kmer_query_cfseg(std::istream& is, dictionar return true; } -bool check_correctness_navigational_kmer_query(std::istream& is, - dictionary const& dict, - std::string const& fmt) { - if(fmt == "fasta") { - return check_correctness_navigational_kmer_query_fasta(is, dict); - } else { - return check_correctness_navigational_kmer_query_cfseg(is, dict); - } -} - bool check_correctness_navigational_contig_query(dictionary const& dict) { std::cout << "checking correctness of navigational queries for contigs..." << std::endl; uint64_t num_contigs = dict.num_contigs(); From a5b662ac8a92e7b7ad21ade9a3a735ffcabbd4bd Mon Sep 17 00:00:00 2001 From: NPSDC Date: Thu, 25 Jul 2024 22:50:13 -0400 Subject: [PATCH 6/6] sshash with checking for cfseg supported --- src/build.cpp | 10 +++++----- src/check_utils.hpp | 20 +++++++++----------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/src/build.cpp b/src/build.cpp index d010f07..48e4bc6 100644 --- a/src/build.cpp +++ b/src/build.cpp @@ -99,11 +99,11 @@ int build(int argc, char** argv) { bool check = parser.get("check"); if (check) { check_correctness_lookup_access(dict, input_filename, fmt); - // check_correctness_navigational_kmer_query(dict, input_filename, fmt); - // check_correctness_navigational_contig_query(dict); - // if (build_config.weighted) check_correctness_weights(dict, input_filename); - // check_correctness_kmer_iterator(dict); - // check_correctness_contig_iterator(dict); + check_correctness_navigational_kmer_query(dict, input_filename, fmt); + check_correctness_navigational_contig_query(dict); + if (build_config.weighted) check_correctness_weights(dict, input_filename); + check_correctness_kmer_iterator(dict); + check_correctness_contig_iterator(dict); } bool bench = parser.get("bench"); if (bench) { diff --git a/src/check_utils.hpp b/src/check_utils.hpp index 3ab29b1..6c8a71f 100644 --- a/src/check_utils.hpp +++ b/src/check_utils.hpp @@ -38,10 +38,10 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& dict, s std::string expected_kmer_str(k, 0); std::cout << "checking correctness of access and positive lookup..." << std::endl; + while (appendline(is, line)) { - // std::getline(is, line); + sequence.clear(); if (fmt == "fasta") { - std::cout << "pos " << line[0] << std::endl; if (line.size() == pos || line[pos] == '>' || line[pos] == ';') { // comment or empty line restart the term buffer line.clear(); @@ -62,7 +62,6 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& dict, s [](char c) { return std::tolower(c); }); } ++num_lines; - std::cout << "seq " << sequence << std::endl; for (uint64_t i = 0; i + k <= sequence.size(); ++i) { assert(util::is_valid(sequence.data() + i, k)); @@ -172,9 +171,9 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& dict, s ++num_kmers; } if (sequence.size() > k - 1) { - std::copy(sequence.data() + sequence.size() - (k - 1), sequence.data() + sequence.size(), sequence.data()); - sequence.resize(k - 1); - pos = sequence.size(); + std::copy(line.data() + line.size() - (k - 1), line.data() + line.size(), line.data()); + line.resize(k - 1); + pos = line.size(); } else { pos = 0; } @@ -195,7 +194,7 @@ bool check_correctness_navigational_kmer_query(std::istream& is, std::string sequence; std::cout << "checking correctness of navigational queries for kmers..." << std::endl; - while (!is.eof()) { + while (appendline(is, line)) { std::getline(is, line); if (fmt == "fasta") { if (line.size() == pos || line[pos] == '>' || line[pos] == ';') { @@ -279,9 +278,9 @@ bool check_correctness_navigational_kmer_query(std::istream& is, ++num_kmers; } if (sequence.size() > k - 1) { - std::copy(sequence.data() + sequence.size() - (k - 1), sequence.data() + sequence.size(), sequence.data()); - sequence.resize(k - 1); - pos = sequence.size(); + std::copy(line.data() + line.size() - (k - 1), line.data() + line.size(), line.data()); + line.resize(k - 1); + pos = line.size(); } else { pos = 0; } @@ -344,7 +343,6 @@ bool check_correctness_weights(std::istream& is, dictionary const& dict) { while (!is.eof()) { std::getline(is, line); // header line - std::cout << "line is " << line << std::endl; if (line.empty()) break;