diff --git a/.github/workflows/ci_utility.yml b/.github/workflows/ci_utility.yml index 1916f68f..00e488fe 100644 --- a/.github/workflows/ci_utility.yml +++ b/.github/workflows/ci_utility.yml @@ -4,6 +4,9 @@ on: push: branches: - 'main' + pull_request: + types: + - unlabeled workflow_dispatch: concurrency: diff --git a/include/raptor/hierarchical_interleaved_bloom_filter.hpp b/include/raptor/hierarchical_interleaved_bloom_filter.hpp index 84d472d7..78bd2765 100644 --- a/include/raptor/hierarchical_interleaved_bloom_filter.hpp +++ b/include/raptor/hierarchical_interleaved_bloom_filter.hpp @@ -333,8 +333,8 @@ class hierarchical_interleaved_bloom_filter::membership_agent if (current_filename_index < 0) // merged bin { - if (sum >= threshold) - bulk_contains_impl(values, hibf_ptr->next_ibf_id[ibf_idx][bin], threshold); + // if (sum >= threshold) + bulk_contains_impl(values, hibf_ptr->next_ibf_id[ibf_idx][bin], threshold); sum = 0u; } else if (bin + 1u == result.size() || // last bin diff --git a/util/CMakeLists.txt b/util/CMakeLists.txt index e9bf44dd..869cc7dc 100644 --- a/util/CMakeLists.txt +++ b/util/CMakeLists.txt @@ -130,4 +130,41 @@ target_link_libraries ("generate_reads_refseq" "common") add_executable ("ibf_fpr" src/applications/ibf_fpr.cpp) target_link_libraries ("ibf_fpr" "common") +add_executable ("check_fastq" src/applications/check_fastq.cpp) +target_link_libraries ("check_fastq" "common") +install (TARGETS "check_fastq" DESTINATION "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}") + +add_executable ("compare_amr_genes" hibf/misc/helper/src/compare_amr_genes.cpp) +target_link_libraries ("compare_amr_genes" "common") +install (TARGETS "compare_amr_genes" DESTINATION "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}") + +add_executable ("create_comparable_output" hibf/misc/helper/src/create_comparable_output.cpp) +target_link_libraries ("create_comparable_output" "common") +install (TARGETS "create_comparable_output" DESTINATION "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}") + +add_executable 
("normalise_bifrost_output" hibf/misc/helper/src/normalise_bifrost_output.cpp) +target_link_libraries ("normalise_bifrost_output" "common") +install (TARGETS "normalise_bifrost_output" DESTINATION "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}") + +add_executable ("normalise_raptor_output" hibf/misc/helper/src/normalise_raptor_output.cpp) +target_link_libraries ("normalise_raptor_output" "common") +install (TARGETS "normalise_raptor_output" DESTINATION "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}") + +add_executable ("normalise_yara_truth_file" hibf/misc/helper/src/normalise_yara_truth_file.cpp) +target_link_libraries ("normalise_yara_truth_file" "common") +install (TARGETS "normalise_yara_truth_file" DESTINATION "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}") + +add_executable ("compare_mantis_raptor_output" hibf/misc/helper/src/compare_mantis_raptor_output.cpp) +target_link_libraries ("compare_mantis_raptor_output" "common") +install (TARGETS "compare_mantis_raptor_output" DESTINATION "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}") + +# formerly create_truth_file +add_executable ("to_be_deleted" hibf/misc/helper/src/to_be_deleted.cpp) +target_link_libraries ("to_be_deleted" "common") +install (TARGETS "to_be_deleted" DESTINATION "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}") + +add_executable ("compare_output" hibf/misc/helper/src/compare_output.cpp) +target_link_libraries ("compare_output" "common") +install (TARGETS "compare_output" DESTINATION "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}") + message (STATUS "${FontBold}You can run `make install` to build the application.${FontReset}") diff --git a/util/hibf/misc/helper/src/CMakeLists.txt b/util/hibf/misc/helper/src/CMakeLists.txt index 84b68050..77a9931e 100644 --- a/util/hibf/misc/helper/src/CMakeLists.txt +++ b/util/hibf/misc/helper/src/CMakeLists.txt @@ -2,3 +2,6 @@ cmake_minimum_required (VERSION 3.8) add_executable ("fasta_to_fastq" fasta_to_fastq.cpp) target_link_libraries ("fasta_to_fastq" PUBLIC seqan3::seqan3 sharg::sharg) + +add_executable ("compare_output" 
// compare_amr_genes: compares a processed raptor result file against an AMR gene truth table.
//
// Usage: compare_amr_genes <raptor_result> <truth_file>
//
// Writes one "<gene>:<bin>" line per disagreement to "raptor.fps" (bin reported by raptor but absent from the
// truth set) and "raptor.fns" (bin in the truth set that raptor did not report) and prints summary counts.
//
// The standard headers are listed here because the block uses <charconv> and <string_view>.

#include <algorithm>
#include <charconv>
#include <cstdint>
#include <cstdlib>
#include <exception>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <string_view>
#include <system_error>
#include <unordered_map>
#include <vector>

namespace
{

using user_bin_map = std::unordered_map<std::string, uint64_t>;
using truth_map = std::unordered_map<std::string, std::vector<uint64_t>>;

// Counters accumulated over all result lines.
struct comparison_stats
{
    uint64_t true_positives{};
    uint64_t false_positives{};
    uint64_t false_negatives{};
    uint64_t raptor_hits{};
    size_t skipped_genes{};
};

// Splits `text` at every `delimiter`. Like std::views::split, an empty input yields no fields and a trailing
// delimiter yields a trailing empty field.
std::vector<std::string_view> split_fields(std::string_view const text, char const delimiter)
{
    std::vector<std::string_view> fields{};
    if (text.empty())
        return fields;

    size_t begin{};
    for (size_t end = text.find(delimiter); end != std::string_view::npos; end = text.find(delimiter, begin))
    {
        fields.push_back(text.substr(begin, end - begin));
        begin = end + 1;
    }
    fields.push_back(text.substr(begin));
    return fields;
}

// Parses a complete token as an unsigned number. Trailing blanks or a carriage return are tolerated.
// Throws std::runtime_error instead of silently producing 0 the way std::atoi does.
uint64_t parse_number(std::string_view token)
{
    while (!token.empty() && (token.back() == ' ' || token.back() == '\r' || token.back() == '\t'))
        token.remove_suffix(1);

    uint64_t value{};
    char const * const last = token.data() + token.size();
    auto const [ptr, ec] = std::from_chars(token.data(), last, value);
    if (ec != std::errc{} || ptr != last)
        throw std::runtime_error{"\"" + std::string{token} + "\" is not a valid non-negative number."};
    return value;
}

// A user bin header line starts with '#' and is not the "#Q..." column header.
bool is_user_bin_header(std::string_view const line)
{
    return line.size() >= 2u && line[0] == '#' && line[1] != 'Q';
}

// Parses "#<id>\t<path>/<name>.<extension...>" and records <name> -> <id>.
void add_user_bin(std::string_view const line, user_bin_map & user_bin_ids)
{
    size_t const tab = line.find('\t');
    if (tab == std::string_view::npos)
        throw std::runtime_error{"Header line \"" + std::string{line} + "\" does not contain a tab."};

    // Skip the leading '#': std::atoi("#<id>") stops at '#' and therefore always returned 0.
    uint64_t const id = parse_number(line.substr(1, tab - 1));

    // The name starts after the last '/' of the path and ends at the first '.' of the file name.
    // Searching the '.' only inside the file name keeps dots in directory names from truncating the key.
    size_t const last_slash = line.find_last_of('/');
    size_t const name_begin =
        (last_slash == std::string_view::npos || last_slash < tab) ? tab + 1 : last_slash + 1;
    size_t const name_end = std::min(line.find('.', name_begin), line.size());

    user_bin_ids.emplace(std::string{line.substr(name_begin, name_end - name_begin)}, id);
}

// Reads the truth table: a tab-separated header naming one gene per column, followed by one row per sample
// whose first column is the sample id and whose remaining columns are "1" where the gene occurs.
truth_map read_truth_set(std::istream & truth_file, user_bin_map const & user_bin_ids)
{
    std::string line{};
    if (!std::getline(truth_file, line))
        throw std::runtime_error{"The truth file does not contain a header line."};

    std::vector<std::string> genes{};
    for (std::string_view const gene : split_fields(line, '\t'))
        genes.emplace_back(gene);

    truth_map truth_set{};
    while (std::getline(truth_file, line))
    {
        std::vector<std::string_view> const fields = split_fields(line, '\t');
        if (fields.empty())
            continue;
        if (fields.size() > genes.size())
            throw std::runtime_error{"Truth line for sample \"" + std::string{fields[0]}
                                     + "\" has more columns than the header."};

        auto const sample = user_bin_ids.find(std::string{fields[0]});
        if (sample == user_bin_ids.end())
        {
            // operator[] would have attributed this sample to user bin 0.
            std::cerr << "Warning: Sample '" << fields[0] << "' is not a user bin of the raptor index.\n";
            continue;
        }

        // Column 0 is the sample id itself, not a gene flag.
        for (size_t column = 1; column < fields.size(); ++column)
            if (fields[column] == "1")
                truth_set[genes[column]].push_back(sample->second);
    }

    // The comparison below is a merge of two ascending sequences; rows arrive in file order, so sort.
    for (auto & entry : truth_set)
    {
        std::vector<uint64_t> & bins = entry.second;
        std::sort(bins.begin(), bins.end());
        bins.erase(std::unique(bins.begin(), bins.end()), bins.end());
    }
    return truth_set;
}

// Merges the ascending truth bins and raptor bins of one gene and records the differences.
void compare_hits(std::string const & gene,
                  std::vector<uint64_t> const & truth_bins,
                  std::vector<uint64_t> const & raptor_bins,
                  std::ostream & false_positives_file,
                  std::ostream & false_negatives_file,
                  comparison_stats & stats)
{
    auto truth_it = truth_bins.begin();
    auto raptor_it = raptor_bins.begin();

    while (truth_it != truth_bins.end() && raptor_it != raptor_bins.end())
    {
        if (*truth_it < *raptor_it)
        {
            false_negatives_file << gene << ':' << *truth_it << '\n';
            ++stats.false_negatives;
            ++truth_it;
        }
        else if (*raptor_it < *truth_it)
        {
            false_positives_file << gene << ':' << *raptor_it << '\n';
            ++stats.false_positives;
            ++stats.raptor_hits;
            ++raptor_it;
        }
        else
        {
            ++stats.true_positives;
            ++stats.raptor_hits;
            ++truth_it;
            ++raptor_it;
        }
    }

    for (; truth_it != truth_bins.end(); ++truth_it) // truth bins raptor never reported
    {
        false_negatives_file << gene << ':' << *truth_it << '\n';
        ++stats.false_negatives;
    }

    for (; raptor_it != raptor_bins.end(); ++raptor_it) // raptor bins without a truth entry
    {
        false_positives_file << gene << ':' << *raptor_it << '\n';
        ++stats.false_positives;
        ++stats.raptor_hits;
    }
}

// Processes one raptor result line "<query name>\t<bin>,<bin>,...". The gene is the sixth '|'-separated
// field of the line.
void compare_line(std::string_view const line,
                  truth_map const & truth_set,
                  std::ostream & false_positives_file,
                  std::ostream & false_negatives_file,
                  comparison_stats & stats)
{
    if (line.empty())
        return;

    std::vector<std::string_view> const name_fields = split_fields(line, '|');
    if (name_fields.size() < 6u)
    {
        ++stats.skipped_genes;
        std::cerr << "Warning: Line '" << line << "' has fewer than six '|'-separated fields.\n";
        return;
    }

    std::string const gene{name_fields[5]};
    auto const truth = truth_set.find(gene);
    if (truth == truth_set.end())
    {
        ++stats.skipped_genes;
        std::cerr << "Warning: Could not find gene '" << gene << "' in truth set.\n";
        return;
    }

    size_t const tab = line.find('\t');
    if (tab == std::string_view::npos)
        throw std::runtime_error{"Result line \"" + std::string{line} + "\" does not contain a tab."};

    std::vector<uint64_t> raptor_bins{};
    for (std::string_view const token : split_fields(line.substr(tab + 1), ','))
        raptor_bins.push_back(parse_number(token));
    std::sort(raptor_bins.begin(), raptor_bins.end()); // the merge requires ascending order

    compare_hits(gene, truth->second, raptor_bins, false_positives_file, false_negatives_file, stats);
}

} // namespace

int main(int argc, char ** argv)
{
    try
    {
        if (argc != 3)
            throw std::runtime_error{"Please provide a processed raptor result file and the truth file"};

        std::ifstream raptor_result{argv[1]};
        std::ifstream truth_file{argv[2]};

        if (!raptor_result.good())
            throw std::runtime_error{"Could not open file " + std::string{argv[1]}};
        if (!truth_file.good())
            throw std::runtime_error{"Could not open file " + std::string{argv[2]}};

        std::cout << "Reading in user bin ids from raptor header in " << argv[1] << "... " << std::flush;
        user_bin_map user_bin_ids{};
        std::string line{};
        bool first_result_pending{false};
        while (std::getline(raptor_result, line))
        {
            if (!is_user_bin_header(line))
            {
                // The "#Q..." column header is dropped. Anything not starting with '#' is already a
                // result line and must not be lost.
                first_result_pending = !line.empty() && line[0] != '#';
                break;
            }
            add_user_bin(line, user_bin_ids);
        }
        std::cout << "Done \n";

        std::cout << "Reading in truth set from file " << argv[2] << "... " << std::flush;
        truth_map const truth_set = read_truth_set(truth_file, user_bin_ids);
        std::cout << "Done - Truth set has size " << truth_set.size() << '\n';

        std::cout << "Processing Results from raptor file " << argv[1] << "... " << std::flush;

        std::ofstream false_positives_file{"raptor.fps"};
        std::ofstream false_negatives_file{"raptor.fns"};
        if (!false_positives_file.good() || !false_negatives_file.good())
            throw std::runtime_error{"Could not open raptor.fps and raptor.fns for writing"};

        comparison_stats stats{};
        if (first_result_pending)
            compare_line(line, truth_set, false_positives_file, false_negatives_file, stats);
        while (std::getline(raptor_result, line))
            compare_line(line, truth_set, false_positives_file, false_negatives_file, stats);

        std::cout << '\n';
        std::cout << "#Skipped genes: " << stats.skipped_genes << '\n';
        std::cout << "Raptor total #hits:" << stats.raptor_hits << '\n';
        std::cout << "#True positives raptor: " << stats.true_positives << '\n';
        std::cout << "#False positives raptor: " << stats.false_positives << '\n';
        std::cout << "#False negatives raptor: " << stats.false_negatives << '\n';
    }
    catch (std::exception const & error)
    {
        // An exception escaping main would end in std::terminate without a readable message.
        std::cerr << "[Error] " << error.what() << '\n';
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}
----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +inline robin_hood::unordered_map +parse_user_bin_ids(std::filesystem::path const & user_bin_ids_file) +{ + std::string line_buffer{}; + uint64_t id_buffer{}; + robin_hood::unordered_map ub_name_to_id; + std::ifstream user_bin_ids_in{user_bin_ids_file}; + + // Contains lines: "some_number reference_name" + while (std::getline(user_bin_ids_in, line_buffer)) + { + auto tab_it{line_buffer.begin() + line_buffer.find('\t')}; + std::string_view const id_value{line_buffer.begin(), tab_it}; + std::string_view const name_key{++tab_it, line_buffer.end()}; + std::from_chars(id_value.data(), id_value.data() + id_value.size(), id_buffer); + ub_name_to_id.emplace(name_key, id_buffer); + } + return ub_name_to_id; +} + +inline void check_output_file(std::filesystem::path const & output_file) +{ + std::filesystem::path const output_directory = output_file.parent_path(); + std::error_code ec{}; + std::filesystem::create_directories(output_directory, ec); + + if (!output_directory.empty() && ec) + throw seqan3::argument_parser_error{ + seqan3::detail::to_string("Failed to create directory\"", output_directory.c_str(), "\": ", ec.message())}; +} + +struct config +{ + std::filesystem::path mantis_result_file{}; + std::filesystem::path raptor_result_file{}; + std::filesystem::path user_bin_ids_file{}; + std::filesystem::path 
output_directory{}; +}; + +auto find_tab(std::string const & str) +{ + auto const pos = str.find('\t'); + if (pos == std::string::npos) + throw std::runtime_error{"Line \"" + str + "\" does not contain a tab."}; + return str.begin() + pos; +} + +/* The input is the outer iterator of a split view. We use a comma as delimiter for multiple hits, i.e. a line + * may look like: + * `GCF_000005825.2_ASM582v2_genomic100 0,283,7288` = `<query_name>\t<hits>`. + * We extract the `<hits>` part and apply a split_view on it. + * + * The split view has two iterators/ranges: outer and inner: + * [0 283 7288] + * ^ + * outer_it = split_view.begin() + * + * [0 283 7288] + * ^ + * ++outer_it; + * + * [ 2 8 3 ] = inner_range = *outer_it + * ^ + * inner_it = inner_range.begin() + * + * In GCC12, one could create a string_view from a split_view (inner_range) directly, since a split_view preserves + * contiguous and other properties. In GCC12, the pre-GCC12 split_view is called lazy_split_view. + * + * Pre-GCC12, the split_view is lazy and does not preserve contiguous_range. 
Therefore, a string_view must be + * constructed in a bit more complicated manner: + * * get address of inner_it; this is the first character + * * get length of inner_range + * * construct string_view from pointer and length + */ +auto extract_hit(auto const & outer_it) +{ + auto const & inner_range = *outer_it; + char const * const first_char = std::addressof(*(inner_range.begin())); + auto const length = std::ranges::distance(inner_range); + + uint64_t result{}; + std::from_chars(first_char, first_char + length, result); + return result; +} + +void compare_results(config const & cfg) +{ + // map[reference_name] = number + robin_hood::unordered_map const ub_name_to_id{parse_user_bin_ids(cfg.user_bin_ids_file)}; + + std::ifstream mantis_result{cfg.mantis_result_file}; + std::ifstream raptor_result{cfg.raptor_result_file}; + + std::ofstream false_positives_file{cfg.output_directory / "comparison.raptor.fps"}; + std::ofstream false_negatives_file{cfg.output_directory / "comparison.raptor.fns"}; + std::ofstream missing_ground_truths_file{cfg.output_directory / "comparison.missing_ground_truth.warn"}; + std::ofstream missing_lines_file{cfg.output_directory / "comparison.missing_lines.warn"}; + std::ofstream stats_file{cfg.output_directory / "comparison.stats.tsv"}; + + bool missing_lines{false}; + + std::string mantis_line{}; + std::string raptor_line{}; + uint64_t false_positives{}; + uint64_t false_negatives{}; + uint64_t line_no{}; + uint64_t mantis_hit_count{}; + uint64_t mantis_miss{}; + uint64_t raptor_hit_count{}; + uint64_t raptor_miss{}; + +#if 1 + std::string query_name_buffer{}; + auto parse_original_bin = [&query_name_buffer, &ub_name_to_id](std::string_view const & line) + { + // E.g., "GCF_000005825.2_ASM582v2_genomic106". 106 is the read number. + // find() returns an iterator to the 'g' of "genomic". `+7` moves the iterator to '1', which is the end + // of the bin name. 
+ query_name_buffer.assign(line.begin(), line.begin() + line.find("genomic") + 7); + return ub_name_to_id.at(query_name_buffer); + }; +#else + std::array read_id_buffer; + std::string query_name_buffer{}; + constexpr std::string_view bin_prefix{"bin_"}; + auto parse_original_bin = + [&read_id_buffer, &query_name_buffer, &ub_name_to_id, &bin_prefix](std::string_view const & line) + { + // E.g., "GCF_000005825.2_ASM582v2_genomic106". 106 is the read number. + // find() returns an iterator to the 'g' of "genomic". `+7` moves the iterator to '1', which is the end + // of the bin name. + uint64_t result{}; + std::from_chars(line.data(), line.data() + line.size(), result); + result &= 0b1111'1111'1111'1111'1111; + result >>= 4u; + auto [ptr, ec] = std::to_chars(read_id_buffer.data(), read_id_buffer.data() + read_id_buffer.size(), result); + (void)ec; + query_name_buffer.assign(bin_prefix); + std::string_view const read_id{read_id_buffer.data(), ptr}; + query_name_buffer.append(std::string(5u - read_id.size(), '0')); + query_name_buffer.append(read_id); + return ub_name_to_id.at(query_name_buffer); + }; +#endif + auto parse_query_name = [&mantis_line, &raptor_line](auto const & mantis_tab_it, auto const & raptor_tab_it) + { + std::string_view const mantis_query_name{mantis_line.begin(), mantis_tab_it}; + std::string_view const raptor_query_name{raptor_line.begin(), raptor_tab_it}; + if (mantis_query_name != raptor_query_name) + throw std::runtime_error{"Query names do not match, something went wrong"}; + return mantis_query_name; + }; + + while (std::getline(mantis_result, mantis_line) && std::getline(raptor_result, raptor_line)) + { + auto const mantis_tab_it{find_tab(mantis_line)}; + auto const raptor_tab_it{find_tab(raptor_line)}; + std::string_view const query_name{parse_query_name(mantis_tab_it, raptor_tab_it)}; + + uint64_t const original_bin{parse_original_bin(query_name)}; + bool mantis_found_correct_bin{false}; + bool raptor_found_correct_bin{false}; + + 
std::ranges::split_view const mantis_fields_view{std::string_view{mantis_tab_it + 1, mantis_line.end()}, ','}; + std::ranges::split_view const raptor_fields_view{std::string_view{raptor_tab_it + 1, raptor_line.end()}, ','}; + auto mantis_it{mantis_fields_view.begin()}; + auto raptor_it{raptor_fields_view.begin()}; + + while (mantis_it != mantis_fields_view.end() && raptor_it != raptor_fields_view.end()) + { + uint64_t const mantis_hit_bin{extract_hit(mantis_it)}; + mantis_found_correct_bin = mantis_found_correct_bin || mantis_hit_bin == original_bin; + + uint64_t const raptor_hit_bin{extract_hit(raptor_it)}; + raptor_found_correct_bin = raptor_found_correct_bin || raptor_hit_bin == original_bin; + + // if (query_name == "GCF_000005845.2_ASM584v2_genomic236") + // std::cout << "" << std::endl; + + false_negatives_file << query_name << ':' << mantis_hit_bin << '\n'; + ++false_negatives; + } + // if (query_name == "GCF_000005845.2_ASM584v2_genomic236") + // std::cout << ",no-FN-because-original_bin>" << std::endl; + ++mantis_hit_count; + ++mantis_it; + } + else + { + if (raptor_hit_bin != original_bin) + { + false_positives_file << query_name << ':' << raptor_hit_bin << '\n'; + ++false_positives; + // if (query_name == "GCF_000005845.2_ASM584v2_genomic236") + // std::cout << ",FP>" << std::endl; + } + // if (query_name == "GCF_000005845.2_ASM584v2_genomic236") + // std::cout << ",no-FP-because-original_bin>" << std::endl; + ++raptor_hit_count; + ++raptor_it; + } + } + else + { + // if (query_name == "GCF_000005845.2_ASM584v2_genomic236") + // std::cout << ">" << std::endl; + ++mantis_hit_count; + ++raptor_hit_count; + ++mantis_it; + ++raptor_it; + } + } + + // process the rest of mantis + while (mantis_it != mantis_fields_view.end()) + { + uint64_t const mantis_hit_bin{extract_hit(mantis_it)}; + mantis_found_correct_bin = mantis_found_correct_bin || mantis_hit_bin == original_bin; + false_negatives_file << query_name << ':' << mantis_hit_bin << '\n'; + 
++false_negatives; + ++mantis_hit_count; + ++mantis_it; + } + + // process the rest of raptor + while (raptor_it != raptor_fields_view.end()) + { + uint64_t const raptor_hit_bin{extract_hit(raptor_it)}; + raptor_found_correct_bin = raptor_found_correct_bin || raptor_hit_bin == original_bin; + false_positives_file << query_name << ':' << raptor_hit_bin << '\n'; + ++false_positives; + ++raptor_hit_count; + ++raptor_it; + } + + if (!mantis_found_correct_bin) + { + ++mantis_miss; + missing_ground_truths_file << "Line " << line_no << ": " + << "Could not find query " << query_name << ' ' << '(' << query_name_buffer + << ':' << original_bin << ") " + << "in its respective genome in mantis.\n"; + } + if (!raptor_found_correct_bin) + { + ++raptor_miss; + missing_ground_truths_file << "Line " << line_no << ": " + << "Could not find query " << query_name << ' ' << '(' << query_name_buffer + << ':' << original_bin << ") " + << "in its respective genome in raptor.\n"; + } + + ++line_no; + } + + while (std::getline(mantis_result, mantis_line)) + { + missing_lines = true; + missing_lines_file << "Missing line of mantis in comparison: " << mantis_line << '\n'; + } + while (std::getline(raptor_result, raptor_line)) + { + missing_lines = true; + missing_lines_file << "Missing line of raptor in comparison: " << raptor_line << '\n'; + } + + stats_file << "Mantis total:\t" << mantis_hit_count << '\n'; + stats_file << "Mantis miss: \t" << mantis_miss << '\n'; + stats_file << "Raptor total:\t" << raptor_hit_count << '\n'; + stats_file << "Raptor miss: \t" << raptor_miss << '\n'; + stats_file << "Raptor FP: \t" << false_positives << '\n'; + stats_file << "Raptor FN: \t" << false_negatives << '\n'; + + if (missing_lines) + std::cout << "[WARNING] Somes lines were missing. 
See " << (cfg.output_directory / "missing_lines.warn") + << '\n'; + + if (mantis_miss || raptor_miss) + std::cout << "[Info] Missing ground truths are listed in " + << (cfg.output_directory / "missing_ground_truth.warn") << '\n'; + + std::cout << "[Info] False positives: " << (cfg.output_directory / "raptor.fps") << '\n'; + + std::cout << "[Info] False negatives: " << (cfg.output_directory / "raptor.fns") << '\n'; + + std::cout << "[Info] Statistics: " << (cfg.output_directory / "stats.tsv") << '\n'; + + std::cout << "[Info] Content of stats.tsv:\n" + << " Mantis total:\t" << mantis_hit_count << '\n' + << " Mantis miss: \t" << mantis_miss << '\n' + << " Raptor total:\t" << raptor_hit_count << '\n' + << " Raptor miss: \t" << raptor_miss << '\n' + << " Raptor FP: \t" << false_positives << '\n' + << " Raptor FN: \t" << false_negatives << '\n'; +} + +void init_parser(seqan3::argument_parser & parser, config & cfg) +{ + parser.add_option(cfg.mantis_result_file, + '\0', + "mantis_results", + "The mantis result file produced by normalise_mantis_output.", + seqan3::option_spec::required, + seqan3::input_file_validator{}); + parser.add_option(cfg.raptor_result_file, + '\0', + "raptor_results", + "The raptor result file, e.g., \"raptor.results\".", + seqan3::option_spec::required, + seqan3::input_file_validator{}); + parser.add_option(cfg.user_bin_ids_file, + '\0', + "user_bin_ids", + "The file containing user bin ids, e.g., \"user_bin.ids\".", + seqan3::option_spec::required, + seqan3::input_file_validator{}); + parser.add_option(cfg.output_directory, + '\0', + "output_directory", + "Provide a path to the output.", + seqan3::option_spec::required); +} + +int main(int argc, char ** argv) +{ + seqan3::argument_parser parser{"compare_mantis_raptor_output", argc, argv, seqan3::update_notifications::off}; + parser.info.author = "Svenja Mehringer, Enrico Seiler"; + parser.info.email = "enrico.seiler@fu-berlin.de"; + parser.info.short_description = "Compares mantis and raptor 
// compare_output: compares normalised mantis and raptor result files line by line.
//
// Usage: compare_output <user_bin.ids> <mantis.ready> <raptor.ready>
//
// Both result files contain lines "<query name>\t<bin>,<bin>,..." with ascending bin ids, in the same query
// order. Mantis is treated as ground truth: bins only mantis reports are written to "raptor.fns", bins only
// raptor reports to "raptor.fps", each as "<query name>:<bin>".
//
// The standard headers are listed here because the block uses <charconv> and <string_view>.

#include <algorithm>
#include <charconv>
#include <cstdint>
#include <cstdlib>
#include <exception>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <string_view>
#include <system_error>
#include <unordered_map>
#include <vector>

namespace
{

// Splits `text` at every `delimiter`. Like std::views::split, an empty input yields no fields.
std::vector<std::string_view> split_fields(std::string_view const text, char const delimiter)
{
    std::vector<std::string_view> fields{};
    if (text.empty())
        return fields;

    size_t begin{};
    for (size_t end = text.find(delimiter); end != std::string_view::npos; end = text.find(delimiter, begin))
    {
        fields.push_back(text.substr(begin, end - begin));
        begin = end + 1;
    }
    fields.push_back(text.substr(begin));
    return fields;
}

// Parses a complete token as an unsigned number. Trailing blanks or a carriage return are tolerated.
// Throws std::runtime_error instead of silently producing 0 the way std::atoi does.
uint64_t parse_number(std::string_view token)
{
    while (!token.empty() && (token.back() == ' ' || token.back() == '\r' || token.back() == '\t'))
        token.remove_suffix(1);

    uint64_t value{};
    char const * const last = token.data() + token.size();
    auto const [ptr, ec] = std::from_chars(token.data(), last, value);
    if (ec != std::errc{} || ptr != last)
        throw std::runtime_error{"\"" + std::string{token} + "\" is not a valid non-negative number."};
    return value;
}

// Returns the position of the first tab in `line`; throws if there is none.
size_t tab_position(std::string_view const line)
{
    size_t const tab = line.find('\t');
    if (tab == std::string_view::npos)
        throw std::runtime_error{"Line \"" + std::string{line} + "\" does not contain a tab."};
    return tab;
}

// Reads lines "<id>\t<reference name>" into a name -> id map.
std::unordered_map<std::string, uint64_t> read_user_bin_ids(std::istream & input)
{
    std::unordered_map<std::string, uint64_t> user_bin_ids{};
    std::string line{};
    while (std::getline(input, line))
    {
        if (line.empty())
            continue;
        std::string_view const view{line};
        size_t const tab = tab_position(view);
        user_bin_ids.emplace(std::string{view.substr(tab + 1)}, parse_number(view.substr(0, tab)));
    }
    return user_bin_ids;
}

// Parses the comma-separated bin ids that follow the tab of a result line.
std::vector<uint64_t> parse_bins(std::string_view const line, size_t const tab)
{
    std::vector<uint64_t> bins{};
    for (std::string_view const token : split_fields(line.substr(tab + 1), ','))
        bins.push_back(parse_number(token));
    return bins;
}

bool contains(std::vector<uint64_t> const & bins, uint64_t const value)
{
    return std::find(bins.begin(), bins.end(), value) != bins.end();
}

} // namespace

int main(int argc, char ** argv)
{
    try
    {
        if (argc != 4)
            throw std::runtime_error{"Please provide user_bin.id mantis.ready and raptor.ready"};

        std::ifstream user_bin_ids_file{argv[1]};
        if (!user_bin_ids_file.good())
            throw std::runtime_error{"Could not open file " + std::string{argv[1]}};

        std::cout << "Reading in " << argv[1] << "... " << std::flush;
        std::unordered_map<std::string, uint64_t> const user_bin_ids = read_user_bin_ids(user_bin_ids_file);
        std::cout << "Done \n";

        std::ifstream mantis_result{argv[2]};
        std::ifstream raptor_result{argv[3]};

        if (!mantis_result.good())
            throw std::runtime_error{"Could not open file " + std::string{argv[2]}};
        if (!raptor_result.good())
            throw std::runtime_error{"Could not open file " + std::string{argv[3]}};

        std::cout << "Processing Results from " << argv[2] << " and " << argv[3] << "... " << std::flush;

        std::ofstream false_positives_file{"raptor.fps"};
        std::ofstream false_negatives_file{"raptor.fns"};
        if (!false_positives_file.good() || !false_negatives_file.good())
            throw std::runtime_error{"Could not open raptor.fps and raptor.fns for writing"};

        std::string mantis_line{};
        std::string raptor_line{};
        uint64_t false_positives{};
        uint64_t false_negatives{};
        uint64_t line_no{};
        uint64_t all_mantis{};
        uint64_t all_raptor{};

        while (std::getline(mantis_result, mantis_line) && std::getline(raptor_result, raptor_line))
        {
            std::string_view const mantis_view{mantis_line};
            std::string_view const raptor_view{raptor_line};
            size_t const mantis_tab = tab_position(mantis_view);
            size_t const raptor_tab = tab_position(raptor_view);

            std::string_view const query_name = mantis_view.substr(0, mantis_tab);
            if (query_name != raptor_view.substr(0, raptor_tab))
                throw std::runtime_error{"Query names do not match, something went wrong"};

            // E.g. "GCF_000005825.2_ASM582v2_genomic106": everything up to and including "genomic" names
            // the bin the read was taken from, the rest is the read number.
            size_t const genomic_pos = query_name.find("genomic");
            if (genomic_pos == std::string_view::npos)
                throw std::runtime_error{"Query name \"" + std::string{query_name}
                                         + "\" does not contain \"genomic\"."};
            std::string const bin_name{query_name.substr(0, genomic_pos + 7)};

            // operator[] would have inserted the unknown bin with id 0.
            auto const bin = user_bin_ids.find(bin_name);
            if (bin == user_bin_ids.end())
                throw std::runtime_error{"Unknown user bin \"" + bin_name + "\"."};
            uint64_t const query_id = bin->second;

            std::vector<uint64_t> const mantis_bins = parse_bins(mantis_view, mantis_tab);
            std::vector<uint64_t> const raptor_bins = parse_bins(raptor_view, raptor_tab);

            auto mantis_it = mantis_bins.begin();
            auto raptor_it = raptor_bins.begin();

            while (mantis_it != mantis_bins.end() && raptor_it != raptor_bins.end())
            {
                if (*mantis_it < *raptor_it)
                {
                    // The bin being classified here is the mantis one, so that is the value to test
                    // against the query's own bin (the original tested the raptor value).
                    if (*mantis_it != query_id)
                    {
                        false_negatives_file << query_name << ':' << *mantis_it << '\n';
                        ++false_negatives;
                    }
                    ++all_mantis;
                    ++mantis_it;
                }
                else if (*raptor_it < *mantis_it)
                {
                    if (*raptor_it != query_id)
                    {
                        false_positives_file << query_name << ':' << *raptor_it << '\n';
                        ++false_positives;
                    }
                    ++all_raptor;
                    ++raptor_it;
                }
                else
                {
                    ++all_mantis;
                    ++all_raptor;
                    ++mantis_it;
                    ++raptor_it;
                }
            }

            // NOTE(review): remaining mantis bins are always counted, even the query's own bin, whereas
            // the raptor remainder skips it. Kept as in the original; confirm which is intended.
            for (; mantis_it != mantis_bins.end(); ++mantis_it)
            {
                // Report the full query name, as the merge loop above does.
                false_negatives_file << query_name << ':' << *mantis_it << '\n';
                ++false_negatives;
                ++all_mantis;
            }

            for (; raptor_it != raptor_bins.end(); ++raptor_it)
            {
                if (*raptor_it != query_id)
                {
                    false_positives_file << query_name << ':' << *raptor_it << '\n';
                    ++false_positives;
                }
                ++all_raptor;
            }

            if (!contains(mantis_bins, query_id))
                std::cerr << "Warning in line " << line_no << ": Could not find query " << query_name << "("
                          << bin_name << ":" << query_id << ") in its respective genome in mantis.\n";
            if (!contains(raptor_bins, query_id))
                std::cerr << "Warning in line " << line_no << ": Could not find query " << query_name << "("
                          << bin_name << ":" << query_id << ") in its respective genome in raptor.\n";

            ++line_no;
        }

        // If the files differ in length, the lockstep loop above has already consumed the current line of
        // the longer one; everything that follows is reported here, one warning per line.
        while (std::getline(mantis_result, mantis_line))
            std::cerr << "WARNING: Missing line of mantis in comparison: " << mantis_line << '\n';
        while (std::getline(raptor_result, raptor_line))
            std::cerr << "WARNING: Missing line of raptor in comparison: " << raptor_line << '\n';

        std::cout << '\n';
        std::cout << "Mantis total #hits:" << all_mantis << '\n';
        std::cout << "Raptor total #hits:" << all_raptor << '\n';
        std::cout << "#False positives raptor: " << false_positives << '\n';
        std::cout << "#False negatives raptor: " << false_negatives << '\n';
    }
    catch (std::exception const & error)
    {
        // An exception escaping main would end in std::terminate without a readable message.
        std::cerr << "[Error] " << error.what() << '\n';
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}
// create_comparable_output: converts a mantis query result into the normalised
// "<query name>\t<bin>,<bin>,..." format and writes it to "mantis.ready".
//
// Usage: create_comparable_output <query.names> <user_bin.ids> <mantis.results>
//
// Mantis results enumerate the queries as seq0..seqX instead of naming them:
//   seq0    219                                              <- k-mers in query
//   [...]/GCF_016028855.1_ASM1602885v1_genomic.squeakr 173   <- shared k-mers per user bin
//   [...]/GCF_016128175.1_ASM1612817v1_genomic.squeakr 205
//   seq1    219
//
// The standard headers are listed here because the block uses <charconv> and <string_view>.

#include <algorithm>
#include <cassert>
#include <charconv>
#include <cstdint>
#include <cstdlib>
#include <exception>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <string_view>
#include <system_error>
#include <unordered_map>
#include <vector>

namespace
{

constexpr int kmer_size{32};
constexpr int number_of_errors{2};
constexpr int destroyed_kmers{kmer_size * number_of_errors};

// Parses a complete token as a number. Trailing blanks or a carriage return are tolerated.
// Unlike std::atoi this neither reads past the end of a string_view nor hides malformed input.
template <typename number_t>
number_t parse_number(std::string_view token)
{
    while (!token.empty() && (token.back() == ' ' || token.back() == '\r' || token.back() == '\t'))
        token.remove_suffix(1);

    number_t value{};
    char const * const last = token.data() + token.size();
    auto const [ptr, ec] = std::from_chars(token.data(), last, value);
    if (ec != std::errc{} || ptr != last)
        throw std::runtime_error{"\"" + std::string{token} + "\" is not a valid number."};
    return value;
}

// Returns the position of the first tab in `line`; throws if there is none.
size_t tab_position(std::string_view const line)
{
    size_t const tab = line.find('\t');
    if (tab == std::string_view::npos)
        throw std::runtime_error{"Line \"" + std::string{line} + "\" does not contain a tab."};
    return tab;
}

// ## Threshold
// Let p = pattern size, k = k-mer size, e = errors, c = k-mer count, t = threshold. Then:
//   c = p - k + 1                  [Lemma A]
//   t = p - (e + 1) * k + 1        [Lemma B]
//   t = c - e * k
// However, for mantis, we need - 1. Maybe this is some edge case in mantis that isn't covered.
// 155 is the threshold, 154 gets reported.
int compute_threshold(int const kmer_count)
{
    assert(kmer_count > 0);
    assert(kmer_count - 1 >= destroyed_kmers);
    return kmer_count - destroyed_kmers - 1;
}

// Strips the directory and the last extension: "[...]/name.squeakr" -> "name".
std::string_view user_bin_name(std::string_view const path)
{
    size_t const last_slash = path.find_last_of('/');
    std::string_view const file_name = (last_slash == std::string_view::npos) ? path : path.substr(last_slash + 1);
    return file_name.substr(0, file_name.find_last_of('.'));
}

// Writes the sorted, comma-separated hits followed by a newline and empties `hits`.
// The newline is written even when there are no hits, so that every query occupies exactly one line.
void write_hits(std::ostream & out, std::vector<uint64_t> & hits)
{
    std::sort(hits.begin(), hits.end());
    for (size_t i = 0; i < hits.size(); ++i)
    {
        if (i != 0u)
            out << ',';
        out << hits[i];
    }
    out << '\n';
    hits.clear();
}

} // namespace

int main(int argc, char ** argv)
{
    try
    {
        // $FILENAME_QUERY_NAMES
        // $FILENAME_USER_BIN_IDS
        // $FILENAME_MANTIS_QUERY_RESULT
        if (argc != 4)
            throw std::runtime_error{"Please provide query.names user_bin.ids and mantis.results"};

        std::string line_buffer{}; // Buffer for file I/O

        // Parse query names. Contains lines: "query_name"
        std::vector<std::string> query_names{};
        {
            std::ifstream query_names_file{argv[1]};
            if (!query_names_file.good())
                throw std::runtime_error{"Could not open file " + std::string{argv[1]}};
            std::cout << "Reading " << argv[1] << "... " << std::flush;
            while (std::getline(query_names_file, line_buffer))
                query_names.push_back(line_buffer);
            std::cout << "Done \n";
        }

        // Parse user bin name-to-id info. Contains lines: "some_number\treference_name"
        std::unordered_map<std::string, uint64_t> ub_name_to_id{};
        {
            std::ifstream user_bin_ids_file{argv[2]};
            if (!user_bin_ids_file.good())
                throw std::runtime_error{"Could not open file " + std::string{argv[2]}};
            std::cout << "Reading in " << argv[2] << "... " << std::flush;
            while (std::getline(user_bin_ids_file, line_buffer))
            {
                if (line_buffer.empty())
                    continue;
                std::string_view const line{line_buffer};
                size_t const tab = tab_position(line);
                ub_name_to_id.emplace(std::string{line.substr(tab + 1)},
                                      parse_number<uint64_t>(line.substr(0, tab)));
            }
            std::cout << "Done \n";
        }

        // Process mantis results
        std::ifstream mantis_result_in{argv[3]};
        if (!mantis_result_in.good())
            throw std::runtime_error{"Could not open file " + std::string{argv[3]}};
        std::ofstream mantis_result_out{"mantis.ready"};
        if (!mantis_result_out.good())
            throw std::runtime_error{"Could not open mantis.ready for writing"};

        std::cout << "Processing " << argv[3] << "... " << std::flush;

        int mantis_threshold{};         // Set for each query.
        size_t current_query_number{};  // Index into query_names of the next "seqX" line.
        bool query_open{false};         // True once a query name has been written without its newline.
        std::vector<uint64_t> results{};

        while (std::getline(mantis_result_in, line_buffer))
        {
            if (line_buffer.empty())
                continue;

            std::string_view const line{line_buffer};
            size_t const tab = tab_position(line);
            int const kmer_count = parse_number<int>(line.substr(tab + 1));

            if (line.substr(0, 3) == "seq") // new query
            {
                // Finish the previous query. Guarding this with `!results.empty()` skipped the newline for
                // queries without hits and glued the next query name onto the same line.
                if (query_open)
                    write_hits(mantis_result_out, results);

                if (current_query_number >= query_names.size())
                    throw std::runtime_error{"The mantis result contains more queries than " + std::string{argv[1]}
                                             + " has names."};

                mantis_result_out << query_names[current_query_number] << '\t';
                mantis_threshold = compute_threshold(kmer_count);
                query_open = true;
                ++current_query_number;
            }
            else
            {
                if (!query_open)
                    throw std::runtime_error{"Found a user bin line before the first query: " + line_buffer};

                if (kmer_count >= mantis_threshold)
                {
                    std::string const name{user_bin_name(line.substr(0, tab))};
                    // operator[] would have inserted the unknown bin with id 0.
                    auto const bin = ub_name_to_id.find(name);
                    if (bin == ub_name_to_id.end())
                        throw std::runtime_error{"Unknown user bin \"" + name + "\"."};
                    results.push_back(bin->second);
                }
            }
        }

        // Write out the last result.
        if (query_open)
            write_hits(mantis_result_out, results);

        std::cout << "Done \n";
    }
    catch (std::exception const & error)
    {
        // An exception escaping main would end in std::terminate without a readable message.
        std::cerr << "[Error] " << error.what() << '\n';
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}
----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +#include +#include +#include +#include +#include +#include + +#include + +#include + +inline robin_hood::unordered_map +parse_user_bin_ids(std::filesystem::path const & user_bin_ids_file) +{ + std::string line_buffer{}; + robin_hood::unordered_map ub_name_to_id; + std::ifstream user_bin_ids_in{user_bin_ids_file}; + + // Contains lines: "some_number reference_name" + while (std::getline(user_bin_ids_in, line_buffer)) + { + auto tab_it{line_buffer.begin() + line_buffer.find('\t')}; + std::string_view const id_value{line_buffer.begin(), tab_it}; + std::string_view const name_key{++tab_it, line_buffer.end()}; + ub_name_to_id.emplace(name_key, id_value); + } + return ub_name_to_id; +} + +inline void check_output_file(std::filesystem::path const & output_file) +{ + std::filesystem::path const output_directory = output_file.parent_path(); + std::error_code ec{}; + std::filesystem::create_directories(output_directory, ec); + + if (!output_directory.empty() && ec) + throw seqan3::argument_parser_error{ + seqan3::detail::to_string("Failed to create directory\"", output_directory.c_str(), "\": ", ec.message())}; +} + +struct config +{ + std::filesystem::path query_names_file{}; + std::filesystem::path user_bin_ids_file{}; + std::filesystem::path bifrost_result_file{}; + std::filesystem::path output_file{}; +}; + +std::vector parse_query_names(std::filesystem::path const & query_names_file) +{ + std::string line_buffer{}; + 
std::vector query_names; + std::ifstream query_names_in{query_names_file}; + + std::cerr << "Reading " << query_names_file << " ... " << std::flush; + // Contains lines: "query_name" + while (std::getline(query_names_in, line_buffer)) + query_names.push_back(line_buffer); + std::cerr << "Done" << std::endl; + return query_names; +} + +void normalise_output(config const & cfg) +{ + // All query names + std::vector const query_names{parse_query_names(cfg.query_names_file)}; + // map[reference_name] = number + std::cerr << "Reading " << cfg.user_bin_ids_file << " ... " << std::flush; + robin_hood::unordered_map const ub_name_to_id{parse_user_bin_ids(cfg.user_bin_ids_file)}; + std::cerr << "Done" << std::endl; + + // Process bifrost results + std::ifstream bifrost_result_in{cfg.bifrost_result_file}; + std::ofstream bifrost_result_out{cfg.output_file}; + size_t current_query_number{}; + std::vector results; + + // Buffers for file I/O + std::string ub_name_buffer{}; + std::string result_buffer{}; + std::string line_buffer{}; + + std::vector bifrost_user_bins{}; + + std::string normalised_bifrost_line{}; + + // ## Bifrost results ## + // Bifrost outputs a matrix. Column names = user bin id. 
Row names = query names + auto split_line_by_tab_and = [](std::string_view bifrost_line, auto do_me) + { + std::string_view::size_type current_pos = 0; + std::string_view::size_type tab_pos{bifrost_line.find('\t')}; + size_t column_idx{}; + + while (tab_pos != std::string_view::npos) + { + auto current = std::string(&bifrost_line[current_pos], tab_pos - current_pos); + do_me(current, column_idx); + current_pos = tab_pos + 1; + tab_pos = bifrost_line.find('\t', current_pos); + ++column_idx; + } + // process last ub + auto last = std::string(&bifrost_line[current_pos], bifrost_line.size() - current_pos); + do_me(last, column_idx); + }; + + auto parse_header_user_bin_id = [&bifrost_user_bins, &ub_name_to_id](std::string const & sv, size_t idx) + { + if (idx != 0) + { + auto filename_start = sv.find_last_of('/') + 1; + auto user_bin_id = sv.substr(filename_start, sv.size() - filename_start - 7 /* |".fna.gz"| */); + + try + { + bifrost_user_bins.push_back(ub_name_to_id.at(user_bin_id)); + } + catch (std::exception const & e) + { + std::cerr << "Could not find id: " << user_bin_id << std::endl; + throw e; + } + } + else + { + bifrost_user_bins.push_back("0"); // don't mess up the indices + } + }; + + auto insert_if_one = [&normalised_bifrost_line, &bifrost_user_bins](std::string_view sv, size_t idx) + { + if (sv == std::string_view{"1"}) // excludes 0 and the first column which is alywas the query name + { + normalised_bifrost_line.insert(normalised_bifrost_line.end(), + bifrost_user_bins[idx].begin(), + bifrost_user_bins[idx].end()); + normalised_bifrost_line.push_back(','); + } + }; + + std::cerr << "Processing " << cfg.bifrost_result_file << " ... " << std::endl; + + // Parse header line + if (std::getline(bifrost_result_in, line_buffer)) + { + assert(line_buffer.starts_with("query_name")); + split_line_by_tab_and(line_buffer, parse_header_user_bin_id); + } + + std::cerr << "Successfully parsed Header line ... 
" << std::endl; + assert(ub_name_to_id.size() == bifrost_user_bins.size()); + + while (std::getline(bifrost_result_in, line_buffer)) + { + assert(query_names[current_query_number] == std::string(&line_buffer[0], line_buffer.find('\t'))); + normalised_bifrost_line = query_names[current_query_number]; + normalised_bifrost_line.push_back('\t'); + split_line_by_tab_and(line_buffer, insert_if_one); + if (normalised_bifrost_line.back() == ',') + normalised_bifrost_line.pop_back(); + bifrost_result_out << normalised_bifrost_line << '\n'; + ++current_query_number; + normalised_bifrost_line.clear(); + } + + std::cerr << "Done" << std::endl; +} + +void init_parser(seqan3::argument_parser & parser, config & cfg) +{ + parser.add_option(cfg.query_names_file, + '\0', + "query_names", + "The file containing query names, e.g., \"query.names\".", + seqan3::option_spec::required, + seqan3::input_file_validator{}); + parser.add_option(cfg.user_bin_ids_file, + '\0', + "user_bin_ids", + "The file containing user bin ids, e.g., \"user_bin.ids\".", + seqan3::option_spec::required, + seqan3::input_file_validator{}); + parser.add_option(cfg.bifrost_result_file, + '\0', + "bifrost_results", + "The bifrost result file, e.g., \"bifrost.results\".", + seqan3::option_spec::required, + seqan3::input_file_validator{}); + parser.add_option(cfg.output_file, + '\0', + "output_file", + "Provide a path to the output.", + seqan3::option_spec::required); +} + +int main(int argc, char ** argv) +{ + seqan3::argument_parser parser{"normalise_bifrost_output", argc, argv, seqan3::update_notifications::off}; + parser.info.author = "Svenja Mehringer, Enrico Seiler"; + parser.info.email = "enrico.seiler@fu-berlin.de"; + parser.info.short_description = "Converts bifrost results into raptor-like results."; + parser.info.version = "0.0.1"; + + config cfg{}; + init_parser(parser, cfg); + + try + { + parser.parse(); + check_output_file(cfg.output_file); + } + catch (seqan3::argument_parser_error const & ext) + { + 
std::cerr << "[Error] " << ext.what() << '\n'; + std::exit(-1); + } + + normalise_output(cfg); +} diff --git a/util/hibf/misc/helper/src/normalise_raptor_output.cpp b/util/hibf/misc/helper/src/normalise_raptor_output.cpp new file mode 100644 index 00000000..8f9f5828 --- /dev/null +++ b/util/hibf/misc/helper/src/normalise_raptor_output.cpp @@ -0,0 +1,194 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +#include +#include +#include +#include +#include +#include + +#include + +#include + +inline robin_hood::unordered_map +parse_user_bin_ids(std::filesystem::path const & user_bin_ids_file) +{ + std::string line_buffer{}; + uint64_t id_buffer{}; + robin_hood::unordered_map ub_name_to_id; + std::ifstream user_bin_ids_in{user_bin_ids_file}; + + // Contains lines: "some_number reference_name" + while (std::getline(user_bin_ids_in, line_buffer)) + { + auto tab_it{line_buffer.begin() + line_buffer.find('\t')}; + std::string_view const id_value{line_buffer.begin(), tab_it}; + std::string_view const name_key{++tab_it, line_buffer.end()}; + std::from_chars(id_value.data(), id_value.data() + id_value.size(), id_buffer); + ub_name_to_id.emplace(name_key, id_buffer); + } + return ub_name_to_id; +} + +inline void check_output_file(std::filesystem::path const & output_file) +{ + std::filesystem::path const output_directory = output_file.parent_path(); + std::error_code ec{}; + std::filesystem::create_directories(output_directory, ec); + + if (!output_directory.empty() && ec) + 
throw seqan3::argument_parser_error{ + seqan3::detail::to_string("Failed to create directory\"", output_directory.c_str(), "\": ", ec.message())}; +} + +struct config +{ + std::filesystem::path truth_user_bin_ids_file{}; + std::filesystem::path raptor_result_file{}; + std::filesystem::path output_file{}; +}; + +robin_hood::unordered_map +create_ub_to_ub_mapping_from_header(std::ifstream & raptor_result_in, std::string & line_buffer, config const & cfg) +{ + // map[reference_name] = number + std::cerr << "Reading " << cfg.truth_user_bin_ids_file << " ... " << std::flush; + robin_hood::unordered_map const truth_ub_name_to_id{ + parse_user_bin_ids(cfg.truth_user_bin_ids_file)}; + std::cerr << "Done" << std::endl; + + robin_hood::unordered_map ub_to_ub; + + std::cerr << "Create ub_to_ub mapping ... " << std::flush; + while (std::getline(raptor_result_in, line_buffer) && line_buffer.starts_with('#') && line_buffer[1] != ('Q')) + { + auto tab_it{line_buffer.begin() + line_buffer.find('\t')}; + std::string_view const idx{line_buffer.begin() + 1 /* skip '#' */, + line_buffer.begin() + line_buffer.find('\t')}; + std::string_view const name_key{line_buffer.begin() + line_buffer.find_last_of('/') + 1, + line_buffer.begin() + line_buffer.find(".fna.gz")}; + + // std::cerr << "SEARCH FOR" << name_key << std::endl; + + ub_to_ub.emplace(idx, truth_ub_name_to_id.at(std::string{name_key})); + } + std::cerr << "Done" << std::endl; + + return ub_to_ub; +} + +void normalise_output(config const & cfg) +{ + // Process raptor results + std::ifstream raptor_result_in{cfg.raptor_result_file}; + std::ofstream raptor_result_out{cfg.output_file}; + + // Buffers for file I/O + std::vector result_user_bins{}; + std::string line_buffer{}; + + // ## Raptor results ## + // The header stores the user bin ID. 
+ // The ids of the file to normalize have to be adapted to the truth sets user bin ids + auto const ub_to_ub = create_ub_to_ub_mapping_from_header(raptor_result_in, line_buffer, cfg); + + std::cerr << "Processsssssing " << cfg.raptor_result_file << " ... " << std::flush; + + while (std::getline(raptor_result_in, line_buffer)) + { + result_user_bins.clear(); + + auto tab_it{line_buffer.begin() + line_buffer.find('\t')}; + std::string_view const id{line_buffer.begin(), tab_it}; + std::string_view const bins{++tab_it, line_buffer.end()}; + + if (bins.empty()) + { + raptor_result_out << id << '\t' << '\n'; + continue; + } + + std::string_view::size_type current_pos = 0; + std::string_view::size_type comma_pos{bins.find(',')}; + + while (comma_pos != std::string_view::npos) + { + auto ub_it = ub_to_ub.find(std::string(&bins[current_pos], comma_pos - current_pos)); + if (ub_it == ub_to_ub.end()) + throw std::runtime_error{"Could not find id " + std::string(&bins[current_pos], comma_pos - current_pos) + + " in ub_to_ub."}; + result_user_bins.push_back(ub_it->second); + current_pos = comma_pos + 1; + comma_pos = bins.find(',', current_pos); + } + // process last ub + auto ub_it = ub_to_ub.find(std::string(&bins[current_pos], bins.size() - current_pos)); + if (ub_it == ub_to_ub.end()) + throw std::runtime_error{"Could not find id " + std::string(&bins[current_pos], bins.size() - current_pos) + + " in ub_to_ub."}; + result_user_bins.push_back(ub_it->second); + std::sort(result_user_bins.begin(), result_user_bins.end()); // compare script afterwards requires sorted UBs + + // Write new normalised raptor line: + auto it = result_user_bins.begin(); + raptor_result_out << id << '\t' << *(it++); + while (it != result_user_bins.end()) + raptor_result_out << ',' << *(it++); + raptor_result_out << '\n'; + } + + std::cerr << "Done" << std::endl; +} + +void init_parser(seqan3::argument_parser & parser, config & cfg) +{ + parser.add_option(cfg.truth_user_bin_ids_file, + '\0', + 
"user_bin_ids", + "The file containing user bin ids from the 'truth'-raptor file, e.g., \"user_bin.ids\".", + seqan3::option_spec::required, + seqan3::input_file_validator{}); + parser.add_option(cfg.raptor_result_file, + '\0', + "raptor_results", + "The raptor result file, e.g., \"raptor.results\".", + seqan3::option_spec::required, + seqan3::input_file_validator{}); + parser.add_option(cfg.output_file, + '\0', + "output_file", + "Provide a path to the output.", + seqan3::option_spec::required); +} + +int main(int argc, char ** argv) +{ + seqan3::argument_parser parser{"normalise_mantis_output", argc, argv, seqan3::update_notifications::off}; + parser.info.author = "Svenja Mehringer, Enrico Seiler"; + parser.info.email = "enrico.seiler@fu-berlin.de"; + parser.info.short_description = "Unifies raptor results by replacing user bin ids from one raptor file with those " + "of a 'truth'-raptor file."; + parser.info.version = "0.0.1"; + + config cfg{}; + init_parser(parser, cfg); + + try + { + parser.parse(); + check_output_file(cfg.output_file); + } + catch (seqan3::argument_parser_error const & ext) + { + std::cerr << "[Error] " << ext.what() << '\n'; + std::exit(-1); + } + + normalise_output(cfg); +} diff --git a/util/hibf/misc/helper/src/normalise_yara_truth_file.cpp b/util/hibf/misc/helper/src/normalise_yara_truth_file.cpp new file mode 100644 index 00000000..ffd16727 --- /dev/null +++ b/util/hibf/misc/helper/src/normalise_yara_truth_file.cpp @@ -0,0 +1,202 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/master/LICENSE.md +// 
----------------------------------------------------------------------------------------------------- + +#include +#include +#include +#include +#include +#include + +#include + +#include + +inline void check_output_file(std::filesystem::path const & output_file) +{ + std::filesystem::path const output_directory = output_file.parent_path(); + std::error_code ec{}; + std::filesystem::create_directories(output_directory, ec); + + if (!output_directory.empty() && ec) + throw seqan3::argument_parser_error{ + seqan3::detail::to_string("Failed to create directory\"", output_directory.c_str(), "\": ", ec.message())}; +} + +struct config +{ + std::filesystem::path yara_result_file{}; + std::filesystem::path output_file{}; + std::filesystem::path query_names_file{}; +}; + +std::vector parse_query_names(std::filesystem::path const & query_names_file) +{ + std::string line_buffer{}; + std::vector query_names; + std::ifstream query_names_in{query_names_file}; + + std::cerr << "Reading " << query_names_file << " ... 
" << std::flush; + // Contains lines: "query_name" + while (std::getline(query_names_in, line_buffer)) + query_names.push_back(line_buffer); + std::cerr << "Done" << std::endl; + return query_names; +} + +void normalise_output(config const & cfg) +{ + std::vector const query_names{parse_query_names(cfg.query_names_file)}; + std::cerr << "Read " << query_names.size() << "query names" << std::endl; + + // Process yara results + std::ifstream yara_result_in{cfg.yara_result_file}; + std::ofstream yara_result_out{cfg.output_file}; + std::vector results; + std::string last_seen_query_name{}; + + // Buffers for file I/O + std::string result_buffer{}; + std::string line_buffer{}; + + auto parse_query_name_and_user_bin = [](std::string const & line) + { + uint64_t idx{}; + std::string const qname{line.begin(), line.begin() + line.find(':')}; + std::string_view const idx_str{line.begin() + qname.size() + 1, line.end()}; + std::from_chars(idx_str.data(), idx_str.data() + idx_str.size(), idx); + + return std::make_pair(qname, idx); + }; + + auto process_results = [&results, &result_buffer, &yara_result_out]() + { + if (!results.empty()) + { + std::sort(results.begin(), results.end()); + for (size_t const ub : results) + result_buffer += std::to_string(ub) + ','; + result_buffer.back() = '\n'; + + yara_result_out << result_buffer; + result_buffer.clear(); + results.clear(); + } + else + { + yara_result_out << '\n'; + } + }; + + auto qname_it = query_names.begin(); + + auto check_qname_against_reference = [&qname_it, &query_names, &last_seen_query_name, &yara_result_out]() + { + if (qname_it == query_names.end()) + throw std::runtime_error{"query_names consumed although processing has not ended. last_seen: " + + last_seen_query_name}; + + while (qname_it != query_names.end() && *qname_it != last_seen_query_name) + { + std::cerr << "Note: " << *qname_it << " not found in validation file." 
<< std::endl; + yara_result_out << *qname_it << '\t' << '\n'; + ++qname_it; + } + + if (qname_it != query_names.end()) + ++qname_it; // function is called when last_seen_query_name is also updated + }; + + std::cerr << "Processing " << cfg.yara_result_file << " ... " << std::flush; + + // First line. + if (std::getline(yara_result_in, line_buffer)) + { + auto && [qname, user_bin_idx] = parse_query_name_and_user_bin(line_buffer); + last_seen_query_name = qname; + results.push_back(user_bin_idx); + } + + while (std::getline(yara_result_in, line_buffer)) + { + auto && [qname, user_bin_idx] = parse_query_name_and_user_bin(line_buffer); + + if (qname != last_seen_query_name) // new query + { + check_qname_against_reference(); + + yara_result_out << last_seen_query_name << '\t'; + + process_results(); + + results.clear(); + last_seen_query_name = qname; + results.push_back(user_bin_idx); + } + else + { + results.push_back(user_bin_idx); + } + } + + // Write last results. + yara_result_out << last_seen_query_name << '\t'; + process_results(); + check_qname_against_reference(); + + if (qname_it != query_names.end()) + throw std::runtime_error{"query_names not fully consumed although processing has ended. 
last qname: " + + (*qname_it)}; + + std::cerr << "Done" << std::endl; +} + +void init_parser(seqan3::argument_parser & parser, config & cfg) +{ + parser.add_option(cfg.yara_result_file, + '\0', + "yara_results", + "The yara result file, e.g., \"yara.results\".", + seqan3::option_spec::required, + seqan3::input_file_validator{}); + parser.add_option(cfg.query_names_file, + '\0', + "query_names", + "The file containing query names, e.g., \"query.names\".", + seqan3::option_spec::required, + seqan3::input_file_validator{}); + parser.add_option(cfg.output_file, + '\0', + "output_file", + "Provide a path to the output.", + seqan3::option_spec::required); +} + +int main(int argc, char ** argv) +{ + seqan3::argument_parser parser{"normalise_yara_output", argc, argv, seqan3::update_notifications::off}; + parser.info.author = "Svenja Mehringer, Enrico Seiler"; + parser.info.email = "enrico.seiler@fu-berlin.de"; + parser.info.short_description = "Converts yara results into raptor-like results."; + parser.info.version = "0.0.1"; + + config cfg{}; + init_parser(parser, cfg); + + try + { + parser.parse(); + check_output_file(cfg.output_file); + } + catch (seqan3::argument_parser_error const & ext) + { + std::cerr << "[Error] " << ext.what() << '\n'; + std::exit(-1); + } + + normalise_output(cfg); +} diff --git a/util/hibf/misc/helper/src/to_be_deleted.cpp b/util/hibf/misc/helper/src/to_be_deleted.cpp new file mode 100644 index 00000000..71e114ce --- /dev/null +++ b/util/hibf/misc/helper/src/to_be_deleted.cpp @@ -0,0 +1,215 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/master/LICENSE.md +// 
----------------------------------------------------------------------------------------------------- + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +inline void check_output_file(std::filesystem::path const & output_file) +{ + std::filesystem::path const output_directory = output_file.parent_path(); + std::error_code ec{}; + std::filesystem::create_directories(output_directory, ec); + + if (!output_directory.empty() && ec) + throw seqan3::argument_parser_error{ + seqan3::detail::to_string("Failed to create directory\"", output_directory.c_str(), "\": ", ec.message())}; +} + +enum class validation +{ + fps, + fns +}; + +inline robin_hood::unordered_map> +parse_validation_for(std::filesystem::path const & validation_file, validation which) +{ + uint64_t user_bin_id_buffer{}; + uint64_t count_buffer{}; + robin_hood::unordered_map> truths_per_query; + + std::ifstream validation_in{validation_file}; + std::string line_buffer{}; + + // Contains lines: "squery_name:user_in_id:count" + while (std::getline(validation_in, line_buffer)) + { + auto start{line_buffer.begin()}; + auto colon{start + line_buffer.find(':')}; + std::string const query_name{line_buffer.begin(), colon}; + start = ++colon; + colon = line_buffer.begin() + line_buffer.find(':', colon - line_buffer.begin()); + std::string_view const user_in_id{start, colon}; + std::string_view const count_str{++colon, line_buffer.end()}; + + std::from_chars(user_in_id.data(), user_in_id.data() + user_in_id.size(), user_bin_id_buffer); + std::from_chars(count_str.data(), count_str.data() + count_str.size(), count_buffer); + + if ((which == validation::fps && count_buffer < 155) || (which == validation::fns && count_buffer >= 155)) + truths_per_query[query_name].push_back(user_bin_id_buffer); + } + + return truths_per_query; +} + +struct config +{ + std::filesystem::path raptor_result_file{}; + std::filesystem::path FN_file{}; + std::filesystem::path FP_file{}; + 
std::filesystem::path output_file{}; +}; + +void correct_truth_file(config const & cfg) +{ + // Process raptor results + std::ifstream raptor_result_in{cfg.raptor_result_file}; + std::ofstream raptor_result_out{cfg.output_file}; + + // Buffers for file I/O + std::vector result_user_bins{}; + std::string line_buffer{}; + uint64_t user_bin_id_buffer{}; + + auto fns_truths_per_query = parse_validation_for(cfg.FN_file, validation::fns); + auto fps_truths_per_query = parse_validation_for(cfg.FP_file, validation::fps); + + std::cerr << "Processsssssing " << cfg.raptor_result_file << " ... " << std::flush; + + // rewrite header + while (std::getline(raptor_result_in, line_buffer) && line_buffer[0] == '#') + raptor_result_out << line_buffer << '\n'; + + do + { + result_user_bins.clear(); + + auto tab_it{line_buffer.begin() + line_buffer.find('\t')}; + std::string const id{line_buffer.begin(), tab_it}; + std::string_view const bins{++tab_it, line_buffer.end()}; + + if (bins.empty()) + { + raptor_result_out << id << '\t' << '\n'; + continue; + } + + std::string_view::size_type current_pos = 0; + std::string_view::size_type comma_pos{bins.find(',')}; + + // get all user bins + while (comma_pos != std::string_view::npos) + { + auto user_bin_id = std::string(&bins[current_pos], comma_pos - current_pos); + std::from_chars(user_bin_id.data(), user_bin_id.data() + user_bin_id.size(), user_bin_id_buffer); + result_user_bins.push_back(user_bin_id_buffer); + current_pos = comma_pos + 1; + comma_pos = bins.find(',', current_pos); + } + // process last ub + auto user_bin_id = std::string(&bins[current_pos], bins.size() - current_pos); + std::from_chars(user_bin_id.data(), user_bin_id.data() + user_bin_id.size(), user_bin_id_buffer); + result_user_bins.push_back(user_bin_id_buffer); + + // remove true false posiives + auto find_fps = [&](auto s) + { + auto & l = fps_truths_per_query[id]; + return std::find(l.begin(), l.end(), s) != l.end(); + }; + 
result_user_bins.erase(std::remove_if(result_user_bins.begin(), result_user_bins.end(), find_fps), + result_user_bins.end()); + + // insert false negatives + // currently we don't have false negatives + if (!fns_truths_per_query[id].empty()) + std::cerr << "warning: there is a FN I want to insert.\n"; + // auto & list = fns_truths_per_query[id]; + // result_user_bins.insert(result_user_bins.end(), list.begin(), list.end()); + // std::sort(result_user_bins.begin(), result_user_bins.end()); // sort again. + + if (result_user_bins.empty()) + { + raptor_result_out << id << '\t' << '\n'; + continue; + } + + // Write new normalised raptor line: + auto it = result_user_bins.begin(); + raptor_result_out << id << '\t' << *(it++); + while (it != result_user_bins.end()) + raptor_result_out << ',' << *(it++); + raptor_result_out << '\n'; + } + while (std::getline(raptor_result_in, line_buffer)); + + std::cerr << "Done" << std::endl; +} + +void init_parser(seqan3::argument_parser & parser, config & cfg) +{ + parser.add_option(cfg.raptor_result_file, + '\0', + "raptor_results", + "The raptor result file, e.g., \"raptor.results\".", + seqan3::option_spec::required, + seqan3::input_file_validator{}); + + parser.add_option(cfg.FN_file, + '\0', + "fns", + "The true false negatives to incooporate.\".", + seqan3::option_spec::required, + seqan3::input_file_validator{}); + + parser.add_option(cfg.FP_file, + '\0', + "fps", + "The true false negatives to incooporate.\".", + seqan3::option_spec::required, + seqan3::input_file_validator{}); + + parser.add_option(cfg.output_file, + '\0', + "output_file", + "Provide a path to the output.", + seqan3::option_spec::required); +} + +int main(int argc, char ** argv) +{ + seqan3::argument_parser parser{"normalise_mantis_output", argc, argv, seqan3::update_notifications::off}; + parser.info.author = "Svenja Mehringer, Enrico Seiler"; + parser.info.email = "enrico.seiler@fu-berlin.de"; + parser.info.short_description = "Corrects the raptor file with 
validated FPs and FNs."; + parser.info.version = "0.0.1"; + + config cfg{}; + init_parser(parser, cfg); + + try + { + parser.parse(); + check_output_file(cfg.output_file); + } + catch (seqan3::argument_parser_error const & ext) + { + std::cerr << "[Error] " << ext.what() << '\n'; + std::exit(-1); + } + + correct_truth_file(cfg); +} diff --git a/util/new/CMakeLists.txt b/util/new/CMakeLists.txt index ec08150b..18b43f14 100644 --- a/util/new/CMakeLists.txt +++ b/util/new/CMakeLists.txt @@ -70,6 +70,10 @@ target_compile_options ("new_common" INTERFACE "-pedantic" "-Wall" "-Wextra") add_executable ("normalise_mantis_output" applications/normalise_mantis_output.cpp) target_link_libraries ("normalise_mantis_output" "new_common") +add_executable ("normalise_raptor_output" applications/normalise_raptor_output.cpp) +target_link_libraries ("normalise_raptor_output" "common") +install (TARGETS "normalise_raptor_output" DESTINATION "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}") + add_executable ("compare_mantis_raptor_output" applications/compare_mantis_raptor_output.cpp) target_link_libraries ("compare_mantis_raptor_output" "new_common") diff --git a/util/src/applications/check_fastq.cpp b/util/src/applications/check_fastq.cpp new file mode 100644 index 00000000..33f00908 --- /dev/null +++ b/util/src/applications/check_fastq.cpp @@ -0,0 +1,12 @@ +#include + +int main(int argc, char ** argv) +{ + if (argc != 2) + throw std::runtime_error{"provide a fastq file!"}; + + seqan3::sequence_file_input fin{argv[1]}; + + for (auto & rec : fin) + std::cout << "parsed record:" << rec.id() << std::endl; +} diff --git a/util/src/applications/generate_reads_refseq.cpp b/util/src/applications/generate_reads_refseq.cpp index 9bdb2541..00d126d4 100644 --- a/util/src/applications/generate_reads_refseq.cpp +++ b/util/src/applications/generate_reads_refseq.cpp @@ -18,6 +18,7 @@ struct cmd_arguments { std::filesystem::path bin_file_path{}; std::vector bin_path{}; + std::vector number_of_reads_per_bin{}; 
std::filesystem::path output_directory{}; uint8_t errors{2u}; uint32_t read_length{100u}; @@ -77,16 +78,15 @@ void run_program(cmd_arguments const & arguments) std::uniform_int_distribution dna4_rank_dis(0, 3); size_t const number_of_bins{arguments.bin_path.size()}; - uint32_t const reads_per_bin = arguments.number_of_reads / number_of_bins; + // uint32_t const reads_per_bin = arguments.number_of_reads / number_of_bins; std::vector const quality(arguments.read_length, seqan3::assign_rank_to(40u, seqan3::phred42{})); auto worker = [&](auto && zipped_view, auto &&) { - for (auto && [bin_file, bin_number] : zipped_view) + for (auto && [bin_file, reads_per_bin, bin_number] : zipped_view) { std::mt19937_64 rng(bin_number); - uint32_t read_counter{bin_number * reads_per_bin}; // Immediately invoked initialising lambda expession (IIILE). std::filesystem::path const out_file = [&]() { @@ -111,7 +111,9 @@ void run_program(cmd_arguments const & arguments) for (auto const & [seq] : fin) { uint64_t const reference_length = std::ranges::size(seq); - std::uniform_int_distribution read_start_dis(0, reference_length - arguments.read_length); + uint64_t const dis_range_end = + reference_length - std::min(reference_length, arguments.read_length); + std::uniform_int_distribution read_start_dis(0, dis_range_end); for (uint32_t current_read_number = 0; current_read_number < reads_per_record && bin_read_counter < reads_per_bin; ++current_read_number, ++read_counter, ++bin_read_counter) @@ -123,7 +125,7 @@ void run_program(cmd_arguments const & arguments) for (uint8_t error_count = 0; error_count < arguments.errors; ++error_count) { - uint32_t const error_pos = read_error_position_dis(rng); + uint32_t const error_pos = std::min(read_error_position_dis(rng), read.size()); seqan3::dna4 const current_base = read[error_pos]; seqan3::dna4 new_base = current_base; while (new_base == current_base) @@ -131,14 +133,20 @@ void run_program(cmd_arguments const & arguments) read[error_pos] = 
new_base; } - fout.emplace_back(read, std::to_string(read_counter), quality); + std::vector correct_quality{quality.begin(), quality.begin() + read.size()}; + fout.emplace_back(read, + out_file.stem().string() + std::to_string(bin_read_counter), + correct_quality); + // no clue why std::views::take does not work + // fout.emplace_back(read, out_file.stem().string() + std::to_string(bin_read_counter), (quality | std::views::take(reference_length))); } } } }; size_t const chunk_size = std::bit_ceil(number_of_bins / arguments.threads); - auto chunked_view = seqan3::views::zip(arguments.bin_path, std::views::iota(0u)) | seqan3::views::chunk(chunk_size); + auto chunked_view = seqan3::views::zip(arguments.bin_path, arguments.number_of_reads_per_bin, std::views::iota(0u)) + | seqan3::views::chunk(chunk_size); seqan3::detail::execution_handler_parallel executioner{arguments.threads}; executioner.bulk_execute(std::move(worker), std::move(chunked_view), []() {}); } @@ -201,13 +209,27 @@ int main(int argc, char ** argv) std::string line; sharg::input_file_validator validator{}; + size_t sum_of_weights{}; while (std::getline(istrm, line)) { if (!line.empty()) { - std::filesystem::path bin_path{line}; + auto tab = std::find(line.begin(), line.end(), '\t'); + + // parse file path + std::filesystem::path bin_path{line.begin(), tab}; validator(bin_path); arguments.bin_path.push_back(std::move(bin_path)); + + // parse weight if given + if (tab != line.end()) + { + ++tab; + size_t tmp{}; + std::from_chars(&(*tab), &line[line.size() - 1], tmp); + sum_of_weights += tmp; + arguments.number_of_reads_per_bin.push_back(tmp); // initialise with weight + } } } @@ -219,8 +241,13 @@ int main(int argc, char ** argv) if (number_of_bins > arguments.number_of_reads) throw sharg::parser_error{"Must simulate at least one read per bin."}; - if (arguments.number_of_reads % number_of_bins) - throw sharg::parser_error{"The number of reads must distribute evenly over the bins."}; + if (number_of_bins != 
arguments.number_of_reads_per_bin.size()) + throw seqan3::argument_parser_error{"number_of_bins (" + std::to_string(number_of_bins) + + " != arguments.number_of_reads_per_bin.size()" + + std::to_string(arguments.number_of_reads_per_bin.size()) + ")"}; + + for (size_t & weight : arguments.number_of_reads_per_bin) // was initialised with the weights of the bins + weight = std::ceil((static_cast(weight) / sum_of_weights) * arguments.number_of_reads); std::filesystem::create_directory(arguments.output_directory); run_program(arguments);