From ad5ef6380f51aaec7c6847af47636723a15a41b7 Mon Sep 17 00:00:00 2001 From: Lydia Buntrock Date: Wed, 28 Jul 2021 13:17:59 +0200 Subject: [PATCH] [FEATURE] Get Tandem Duplications from CIGAR Signed-off-by: Lydia Buntrock --- include/iGenVar.hpp | 1 + .../analyze_cigar_method.hpp | 90 ++++++++ .../hierarchical_clustering_method.cpp | 1 - .../analyze_cigar_method.cpp | 194 ++++++++++++++++-- test/cli/iGenVar_cli_test.cpp | 22 +- test/data/datasources.cmake | 4 +- test/data/mini_example/output_err.txt | 11 +- test/data/mini_example/output_res.txt | 1 - 8 files changed, 292 insertions(+), 32 deletions(-) diff --git a/include/iGenVar.hpp b/include/iGenVar.hpp index 07cb5bff..fb8b9e51 100644 --- a/include/iGenVar.hpp +++ b/include/iGenVar.hpp @@ -26,6 +26,7 @@ struct cmd_arguments /* -c */ clustering_methods clustering_method{hierarchical_clustering}; // default: hierarchical clustering /* -r */ refinement_methods refinement_method{no_refinement}; // default: no refinement // SV specifications: + /* -e TODO (irallia 19.08.21): duplication_errors */ /* -k */ uint64_t min_var_length = 30; /* -l */ uint64_t max_var_length = 100000; /* -m */ uint64_t max_tol_inserted_length = 50; diff --git a/include/modules/sv_detection_methods/analyze_cigar_method.hpp b/include/modules/sv_detection_methods/analyze_cigar_method.hpp index 7ab08958..79ffb00c 100644 --- a/include/modules/sv_detection_methods/analyze_cigar_method.hpp +++ b/include/modules/sv_detection_methods/analyze_cigar_method.hpp @@ -2,6 +2,96 @@ #include "structures/junction.hpp" // for class Junction +/*! \brief This function checks if the inserted bases are tandem duplicated. 
+ * + * \param[in] config - configuration for a semi-global alignment + * \param[in] min_length - minimum length of variants to detect (default 30 bp) + * \param[in] sequence - suffix or prefix sequence + * \param[in] inserted_bases - the inserted bases of the possible duplication + * \param[in] is_suffix - true: suffix case, false: prefix case + * param[out] match_score - if we found a matching duplication, this value represents the maximal + * amount of matches with the reference and thus the length of the existing + * duplicated part on the reference. If there is no duplication, this value + * is 0. + * param[out] length_of_single_dupl_sequence - greatest common divisor of length of inserted sequence and length of + * matching part -> length of a single duplicated sequence + * \returns std::tie(match_score, length_of_single_dupl_sequence) - a tuple of the resulting values + * + * \details In this function the inserted bases are recursively aligned segment by segment until it has been proven that + * it is a real duplication. + * For simplicity, we only consider the suffix case here, since the prefix case works the same way: + * + * suffix_sequence AAAACCGCGTAGCGGGGCGGG + * |||||||||| + * inserted_bases GCGGGGCGGGGCGGG -> unmatched_inserted_bases: GCGGG + * -> match_score = 10 + * -> length_of_single_dupl_sequence = gcd(15, 10) = 5 + * + * suffix_sequence AAAACCGCGTAGCGGGGCGGG + * ||||| + * unmatched_inserted_bases GCGGG + * + * -> match_score = 5, length_of_single_dupl_sequence = gcd(15, 5) = 5 + */ +std::tuple align_suffix_or_prefix(auto const & config, + int32_t const min_length, + std::span & sequence, + std::span & inserted_bases, + bool is_suffix); + +/*! \brief This function checks if the inserted bases are tandem duplicated.
+ * + * \param[in] query_sequence - SEQ field of the SAM/BAM file + * \param[in] length - length of inserted part, given by the CIGAR + * \param[in] pos_ref - position of the inserted part in the ref (current position) + * \param[in] pos_read - position of the inserted part in the read (current position) + * \param[in] inserted_bases - the inserted bases of the possible duplication + * \param[in] min_length - minimum length of variants to detect (default 30 bp, + * expected to be non-negative) + * \param[out] pos_start_dup_seq - start position of the duplicated seq (excluding itself) + * \param[out] pos_end_dup_seq - end position of the duplicated seq (including itself) + * \param[out] tandem_dup_count - the number of tandem copies of the inserted sequence + * \returns duplicated_bases - the duplicated bases of the duplication + * + * \details If the inserted bases include one or more copies of a duplicated sequence, which is suffix or prefix of + * another copy, we have a Tandem Duplication. To check this, we have to do a semi-global alignment. + * + * Case 1: The duplication (insertion) comes after the matched sequence. Thus we need to check if the inserted + * sequence matches (partly) the suffix sequence. The length of the matching part yields the number + * of copies, thus we can calculate the tandem_dup_count and the duplicated_bases. + * + * ref AAAACCGCGTAGCGGG----------TACGTAACGGTACG + * |||||||||||||| |||||||| -> inserted sequence: GCGGGGCGGG + * read AACCGCGTAGCGGGGCGGGGCGGGTACGTAAC + * + * suffix_sequence AAAACCGCGTAGCGGG -> free_end_gaps_sequence1_leading{true}, + * ||||| free_end_gaps_sequence1_trailing{false} + * inserted_bases GCGGGGCGGG -> free_end_gaps_sequence2_leading{false}, + * free_end_gaps_sequence2_trailing{true} + * -> tandem_dup_count = 3, duplicated_bases = GCGGG + * + * Case 2: The duplication (insertion) comes before the matched sequence.
+ * ref AAAACCGCGTA----------GCGGGTACGTAACGGTACG + * ||||||||| ||||||||||||| -> inserted sequence: GCGGGGCGGG + * read AACCGCGTAGCGGGGCGGGGCGGGTACGTAAC + * + * prefix_sequence GCGGGTACGTAACGGTACG -> free_end_gaps_sequence1_leading{false}, + * ||||| free_end_gaps_sequence1_trailing{true} + * inserted_bases GCGGGGCGGG -> free_end_gaps_sequence2_leading{true}, + * free_end_gaps_sequence2_trailing{false} + * -> tandem_dup_count = 3, duplicated_bases = GCGGG + * \see For some complex examples see detection_test.cpp. + */ +std::span detect_tandem_duplication(seqan3::dna5_vector const & query_sequence, + int32_t length, + int32_t pos_ref, + int32_t pos_read, + std::span & inserted_bases, + int32_t const min_length, + int32_t & pos_start_dup_seq, + int32_t & pos_end_dup_seq, + size_t & tandem_dup_count); + /*! \brief This function steps through the CIGAR string and stores junctions with their position in reference and read. * * \param[in] read_name - QNAME field of the SAM/BAM file diff --git a/src/modules/clustering/hierarchical_clustering_method.cpp b/src/modules/clustering/hierarchical_clustering_method.cpp index 5437a62e..748c6bd7 100644 --- a/src/modules/clustering/hierarchical_clustering_method.cpp +++ b/src/modules/clustering/hierarchical_clustering_method.cpp @@ -1,7 +1,6 @@ #include "iGenVar.hpp" // for global variable gVerbose #include "modules/clustering/hierarchical_clustering_method.hpp" -#include // for infinity #include // for random_device #include diff --git a/src/modules/sv_detection_methods/analyze_cigar_method.cpp b/src/modules/sv_detection_methods/analyze_cigar_method.cpp index 368e2384..a115df1f 100644 --- a/src/modules/sv_detection_methods/analyze_cigar_method.cpp +++ b/src/modules/sv_detection_methods/analyze_cigar_method.cpp @@ -1,3 +1,11 @@ +#include // for std::numeric_limits +#include // for std::gcd (greatest common divisor) + +// #include // for a nicer alignment view +#include +#include +#include +#include #include #include #include @@ -10,6 
+18,119 @@ using seqan3::operator""_cigar_operation; using seqan3::operator""_dna5; +std::tuple align_suffix_or_prefix(auto const & config, + int32_t const min_length, + std::span & sequence, + std::span & inserted_bases, + bool is_suffix) +{ + // length of matching part of suffix or prefix sequence + size_t match_score{}; + // Length of a single duplicated sequence; stays at the maximum value if no duplication is found. + size_t length_of_single_dupl_sequence = std::numeric_limits::max(); + + auto results = seqan3::align_pairwise(std::tie(sequence, inserted_bases), config); + auto & res = *results.begin(); + // TODO (irallia 17.8.21): The mismatches should give us the opportunity to allow a given amount of errors in the + // duplication. NOTE(review): with match = 1 / mismatch = -100, `score % 100` seems to recover the match count only while 0 <= score < 100 - confirm behaviour for longer matches. + size_t matches = res.score() % 100; + size_t mismatches = (res.score() - matches) * (-1); + // For the beginning we do not allow mistakes, we should change this later, see todo above. + if (mismatches > 0) + return std::tie(match_score, length_of_single_dupl_sequence); + // found duplicated sequence in front of inserted sequence + if (matches >= min_length) + { + // seqan3::debug_stream << "Resulting alignment:\n" << res.alignment() << '\n'; + match_score = matches; + // The possible length of the single duplicated: greatest common divisor of length of inserted part and length + // of matching part + length_of_single_dupl_sequence = std::gcd(inserted_bases.size(), matches); + if(matches != inserted_bases.size()) + { + std::span unmatched_inserted_bases{}; + if (is_suffix) + unmatched_inserted_bases = inserted_bases | seqan3::views::slice(match_score, inserted_bases.size()); + else + unmatched_inserted_bases = inserted_bases | seqan3::views::slice(0, inserted_bases.size() - match_score); + // The first length_of_single_dupl_sequence is already the greatest common divisor and therefore the next + // recursively calculated one can be ignored.
+ auto [ next_match_score, + next_length_of_single_dupl_sequence ] = align_suffix_or_prefix(config, + min_length, + sequence, + unmatched_inserted_bases, + is_suffix); + // If the substring does not match, there is no real duplication. + if (next_match_score == 0) { + length_of_single_dupl_sequence = std::numeric_limits::max(); + return std::tie(next_match_score, length_of_single_dupl_sequence); + } + } + } + // The first match_score calculated is automatically the maximum, so the recursively next one can be ignored. + return std::tie(match_score, length_of_single_dupl_sequence); +} + +std::span detect_tandem_duplication(seqan3::dna5_vector const & query_sequence, + int32_t length, + int32_t pos_ref, + int32_t pos_read, + std::span & inserted_bases, + int32_t const min_length, + int32_t & pos_start_dup_seq, + int32_t & pos_end_dup_seq, + size_t & tandem_dup_count) +{ + auto scoring_scheme = seqan3::align_cfg::scoring_scheme{ + seqan3::nucleotide_scoring_scheme{seqan3::match_score{1}, + seqan3::mismatch_score{-100}}}; + + auto gap_scheme = seqan3::align_cfg::gap_cost_affine{seqan3::align_cfg::open_score{0}, + seqan3::align_cfg::extension_score{-100}}; + // Suffix Case: align the part of the read in front of the insertion (query_sequence[0, pos_read)) with the inserted bases. + auto suffix_config = seqan3::align_cfg::method_global{seqan3::align_cfg::free_end_gaps_sequence1_leading{true}, + seqan3::align_cfg::free_end_gaps_sequence2_leading{false}, + seqan3::align_cfg::free_end_gaps_sequence1_trailing{false}, + seqan3::align_cfg::free_end_gaps_sequence2_trailing{true}} | + scoring_scheme | gap_scheme; + + std::span suffix_sequence = query_sequence | seqan3::views::slice(0, pos_read); + auto [ suffix_sequence_match_score, length_of_single_dupl_sequence_1 ] = align_suffix_or_prefix(suffix_config, + min_length, + suffix_sequence, + inserted_bases, + true); + // Prefix Case: align the part of the read behind the insertion (query_sequence[pos_read + length, end)) with the inserted bases. + auto prefix_config = seqan3::align_cfg::method_global{seqan3::align_cfg::free_end_gaps_sequence1_leading{false}, + seqan3::align_cfg::free_end_gaps_sequence2_leading{true},
+ seqan3::align_cfg::free_end_gaps_sequence1_trailing{true}, + seqan3::align_cfg::free_end_gaps_sequence2_trailing{false}} | + scoring_scheme | gap_scheme; + + std::span prefix_sequence = query_sequence | seqan3::views::slice(pos_read + length, + query_sequence.size()); + auto [ prefix_sequence_match_score, length_of_single_dupl_sequence_2 ] = align_suffix_or_prefix(prefix_config, + min_length, + prefix_sequence, + inserted_bases, + false); + + pos_start_dup_seq = pos_ref - 1 - suffix_sequence_match_score; + pos_end_dup_seq = pos_ref - 1 + prefix_sequence_match_score; + + int16_t length_of_single_dupl_sequence = std::min(length_of_single_dupl_sequence_1, + length_of_single_dupl_sequence_2); + tandem_dup_count = suffix_sequence_match_score / length_of_single_dupl_sequence // duplicated part on ref + + inserted_bases.size() / length_of_single_dupl_sequence // inserted duplications + + prefix_sequence_match_score / length_of_single_dupl_sequence; // duplicated part on ref + // If it is a duplication instead of an insertion, we save the (possibly multiple times) duplicated part as duplicated_bases. NOTE(review): length_of_single_dupl_sequence is int16_t while both inputs are size_t - confirm the narrowing is intended. + std::span duplicated_bases{}; + if (tandem_dup_count > 0) + duplicated_bases = (inserted_bases | seqan3::views::slice(0, length_of_single_dupl_sequence)); + return duplicated_bases; +} + void analyze_cigar(std::string const & read_name, std::string const & chromosome, int32_t const query_start_pos, @@ -33,38 +154,75 @@ void analyze_cigar(std::string const & read_name, pos_ref += length; pos_read += length; } - else if (operation == 'I'_cigar_operation) // I: Insertion (gap in the reference sequence) - { - if (length >= min_length) - { - // Insertions cause one junction from the insertion location to the next base - auto inserted_bases = query_sequence | seqan3::views::slice(pos_read, pos_read + length); - Junction new_junction{Breakend{chromosome, pos_ref - 1, strand::forward}, - Breakend{chromosome, pos_ref, strand::forward}, - inserted_bases, - tandem_dup_count, - read_name};
- if (gVerbose) - seqan3::debug_stream << "INS: " << new_junction << "\n"; - junctions.push_back(std::move(new_junction)); - } - pos_read += length; - } else if (operation == 'D'_cigar_operation) { +// ---------------- DEL ---------------- if (length >= min_length) { // Deletions cause one junction from its start to its end Junction new_junction{Breakend{chromosome, pos_ref - 1, strand::forward}, Breakend{chromosome, pos_ref + length, strand::forward}, ""_dna5, - tandem_dup_count, + 0, read_name}; if (gVerbose) seqan3::debug_stream << "DEL: " << new_junction << "\n"; junctions.push_back(std::move(new_junction)); } pos_ref += length; +// ------------------------------------- + } + else if (operation == 'I'_cigar_operation) // I: Insertion (gap in the reference sequence) + { + if (length >= min_length) + { + // Insertions cause one junction from the insertion location to the next base + std::span inserted_bases = query_sequence | seqan3::views::slice(pos_read, + pos_read + length); + +// ---------------- DUP ---------------- + + // Case - Duplication: The inserted bases include one or more copies of a duplicated sequence, with an + // origin somewhere else. -> global alignment + // ##ALT= + // TODO (23.7.21, irallia): This is Part of the Epic #144. Do we need a reference sequence for this? 
+// ---------------- DUP:TANDEM ---------------- + int32_t pos_start_dup_seq{}; + int32_t pos_end_dup_seq{}; + std::span duplicated_bases = detect_tandem_duplication(query_sequence, + length, + pos_ref, + pos_read, + inserted_bases, + min_length, + pos_start_dup_seq, + pos_end_dup_seq, + tandem_dup_count); + + if (tandem_dup_count != 0) + { + Junction new_junction{Breakend{chromosome, pos_start_dup_seq, strand::forward}, + Breakend{chromosome, pos_end_dup_seq, strand::forward}, + duplicated_bases, + tandem_dup_count, + read_name}; + seqan3::debug_stream << "DUP:TANDEM: " << new_junction << "\n"; + seqan3::debug_stream << "\t\t\tduplicated sequence: " << duplicated_bases + << " with " << tandem_dup_count << " duplications\n"; + junctions.push_back(std::move(new_junction)); + } else { +// ---------------- INS ---------------- + Junction new_junction{Breakend{chromosome, pos_ref - 1, strand::forward}, + Breakend{chromosome, pos_ref, strand::forward}, + inserted_bases, + 0, + read_name}; + seqan3::debug_stream << "INS: " << new_junction << "\n"; + seqan3::debug_stream << "\t\t\tinserted sequence: " << inserted_bases << "\n"; + junctions.push_back(std::move(new_junction)); + } + } + pos_read += length; } else if (operation == 'S'_cigar_operation) { diff --git a/test/cli/iGenVar_cli_test.cpp b/test/cli/iGenVar_cli_test.cpp index c1128ccd..c8b31c53 100644 --- a/test/cli/iGenVar_cli_test.cpp +++ b/test/cli/iGenVar_cli_test.cpp @@ -172,9 +172,15 @@ std::string expected_res_empty "##contig=\n" }; -std::string expected_err_default_no_err +std::string expected_err_default_no_err_1 { "Detect junctions in long reads...\n" + "INS: chr21\t41972615\tForward\tchr21\t41972616\tForward\t1681\t0\tm2257/8161/CCS\n" + "\t\t\tinserted sequence: 
GAGTGGACCTCAGCAAACTCCCAGTAGAGCTGCAGCAGAGGGGGTCTGACTGTTAGGATGGAAAACTAACAAACAGAAGGCAATAGCATCAACAACAACAAAAAAAAACTGCCACACAAAAACCGCTATCCGAAGATCACCAACATCAAAGATCGAACAGGTAGACAAATGACGAAGAGAGGAAAAAACAGTGCAAAAAAGGCTGAAAAGCCCAGAACCACCTCTTCCCTCCAGAGGATCACAACTCCTCACCAGGAAAGGGAACAAAACTGCACAGAGAAGAGTTTGACCAATGACAGAAGTAGGCTTCAGCAGAATGGGTAATAACTCCTCTGAGCTAAAGGAGCATGTTCCTACCCTAATGCAAGGAAGCTAAAGATACTTGATAAAAGGTTACAGGAACTGCTAACTAAATAAACCAGTTCAGAGAAGAACATAAATGACCTAAATGGAGCTGAAAAACACAGCATGAGAACTTCATGAAGGAATACACAAGGTATCAACAGACAAGTCCATCAGGCAGAAGAAAGGGATATCAGAGATTGAAGATCAACTTAATGAAATAAAGCATGCAAGACGAGATTAGTGAGAAAAAAGAATTAAAAGAAATGAGCAAAGGCCTCAAGGAAATATGGGACTATGTGTAAAAGACCAAGCATACGGTTTGATTGGTGTATGTGAAAATGACAGGGAAAATGGAACCAAGTTGGAAAACACTCTTCAGGATATCATGCAGGAGAACCTCCCAACCTAGCAAGAGAAGCCAACATTCACATTCAGGAAATACAAGAGAACACCACCAAGATACTCCTTGAGAAGTAGCAAACCCCCAAGACACATAATTGTTCAGATTCAGGCAAGGGTGAAAATGAAGGAAAAAATGCTAAGAGAGCCAGAGAGAAAGGTATGGGTTATCCACAAAAGGGCCAGCCATCAGACTAAGAGCAATTCTCTGCAGAAACCCTACAACCAGAAGAGAGAAGGGGCCAATATTCAACATTCTTAAAGAAAAGAATTTTCAACCCAGAATTTCATATCCAGCCAAAACAAAGCTTCGTAAGTGAAGGAGAAATAAATTTCTTTACAGACAAGCAAATGACTGAAGAAGATTTTTGTCACCACCATGCCTGCCTTACAAGATCTCCTGAAGGAAGCACTAAGACATGGGAAGGAAAAAATCCAGTACCAAGCCACTGCTAAACCATACCAAAATGTAGAGACTCAATGCTTAGGATAGGAAACTGCATCAACTAGCAGTCAAAATAACCAGCTAGCATTCATAATGACAGGATCAAATTCAGACCACATACAATTATTAACCTTAAATGTAAATGGGCTAAATGCCGCAATTAAAAGACACATCACTGGCAAATTGGATAAAGAGTCAAGCCCAATCGGTGTGCTGTTATTCAAGGAGACACCACTCTCACGTGCAAGAGACACAGATAGGCTCGAAAATGAATAGGGATGAAGGAAGATTACCAAGCAAATGGAAAGCAAAAAAAAAAAAAGCAGGGGTTGCAAATCCTAGTCTCTGTAAAACACTACTTTAAACCAAGAAAGATCAAAAGAGACAAAGAGGGCTTTAATAATGGTAATAGGGGGATTAATTCAACAAGAAGAGTTAACTATCCTAAATATATATGCTGCCTAATACAGGCACACCCAGATTCATAAAGCA\n" +}; + +std::string expected_err_default_no_err_2 +{ "The read depth method for long reads is not yet implemented.\n" "The read depth method for long reads is not yet implemented.\n" "The read depth method for long reads is not yet implemented.\n" @@ -182,6 +188,8 @@ std::string expected_err_default_no_err "Start 
clustering...\n" }; +std::string expected_err_default_no_err = expected_err_default_no_err_1 + expected_err_default_no_err_2; + TEST_F(iGenVar_cli_test, no_options) { cli_test_result result = execute_app("iGenVar"); @@ -202,8 +210,6 @@ TEST_F(iGenVar_cli_test, test_verbose_option) cli_test_result result = execute_app("iGenVar", "-j", data(default_alignment_long_reads_file_path), "--verbose"); std::string expected_err { - "Detect junctions in long reads...\n" - "INS: chr21\t41972615\tForward\tchr21\t41972616\tForward\t1681\t0\tm2257/8161/CCS\n" "The read depth method for long reads is not yet implemented.\n" "BND: chr21\t41972615\tReverse\tchr22\t17458415\tReverse\t0\t0\tm41327/11677/CCS\n" "The read depth method for long reads is not yet implemented.\n" @@ -216,7 +222,7 @@ TEST_F(iGenVar_cli_test, test_verbose_option) }; EXPECT_EQ(result.exit_code, 0); EXPECT_EQ(result.out, expected_res_default); - EXPECT_EQ(result.err, expected_err); + EXPECT_EQ(result.err, expected_err_default_no_err_1 + expected_err); } // Help page: @@ -398,14 +404,13 @@ TEST_F(iGenVar_cli_test, with_detection_method_arguments) "--method cigar_string --method split_read"); std::string expected_err { - "Detect junctions in long reads...\n" "Start clustering...\n" "Done with clustering. Found 2 junction clusters.\n" "No refinement was selected.\n" }; EXPECT_EQ(result.exit_code, 0); EXPECT_EQ(result.out, expected_res_default); - EXPECT_EQ(result.err, expected_err); + EXPECT_EQ(result.err, expected_err_default_no_err_1 + expected_err); } TEST_F(iGenVar_cli_test, with_detection_method_duplicate_arguments) @@ -599,16 +604,15 @@ TEST_F(iGenVar_cli_test, test_direct_methods_input) "-j", data(default_alignment_long_reads_file_path), "--method cigar_string --method split_read " "--clustering_method 0 --refinement_method 0"); - std::string expected_err + std::string expected_err_clustering { - "Detect junctions in long reads...\n" "Start clustering...\n" "Done with clustering. 
Found 3 junction clusters.\n" "No refinement was selected.\n" }; EXPECT_EQ(result.exit_code, 0); EXPECT_EQ(result.out, expected_res_default); - EXPECT_EQ(result.err, expected_err); + EXPECT_EQ(result.err, expected_err_default_no_err_1 + expected_err_clustering); } TEST_F(iGenVar_cli_test, test_unknown_argument) diff --git a/test/data/datasources.cmake b/test/data/datasources.cmake index 344379b9..76279488 100644 --- a/test/data/datasources.cmake +++ b/test/data/datasources.cmake @@ -29,9 +29,9 @@ declare_datasource (FILE single_end_mini_example.sam # copies file to /data/output_err.txt declare_datasource (FILE output_err.txt URL ${CMAKE_SOURCE_DIR}/test/data/mini_example/output_err.txt - URL_HASH SHA256=606826366c63cf8ed09c0efddbc6a010dd3f9e670946f690cb1bc54d246e7fcd) + URL_HASH SHA256=b4c832bbf50cf3b9893191caa4dcc811299ec9de0270f2470bbb22a509826d6a) # copies file to /data/output_res.txt declare_datasource (FILE output_res.txt URL ${CMAKE_SOURCE_DIR}/test/data/mini_example/output_res.txt - URL_HASH SHA256=f12ee6622785660a637c8c8ae894c673cc29e4b09d729f6fd8e8911c33fe6ae6) + URL_HASH SHA256=c60e8f3d85ee0f0282a8f886750b6f95d096b590c8d6181c8e6c708c8bad3217) diff --git a/test/data/mini_example/output_err.txt b/test/data/mini_example/output_err.txt index 197e86d8..df940637 100644 --- a/test/data/mini_example/output_err.txt +++ b/test/data/mini_example/output_err.txt @@ -11,14 +11,18 @@ DEL: chr1 56 Forward chr1 70 Forward 0 0 read018 DUP:TANDEM: chr1 109 Forward chr1 124 Forward 0 2 read021 2 segments describe this tandem duplication. 
Its length on the read is 34 and a single duplicated part has a length of 16 => tandem_dup_count = 2 INS: chr1 124 Forward chr1 125 Forward 15 0 read023 + inserted sequence: CCCCGGGGCCAATTT INS: chr1 124 Forward chr1 125 Forward 15 0 read024 + inserted sequence: CCCCGGGGCCAATTT INS: chr1 124 Forward chr1 125 Forward 15 0 read025 + inserted sequence: CCCCGGGGCCAATTT BND: chr1 96 Forward chr1 125 Forward 0 0 read027 DUP:TANDEM: chr1 180 Forward chr1 187 Forward 0 2 read029 2 segments describe this tandem duplication. Its length on the read is 16 and a single duplicated part has a length of 8 => tandem_dup_count = 2 DUP:TANDEM: chr1 180 Forward chr1 187 Forward 0 2 read030 2 segments describe this tandem duplication. Its length on the read is 16 and a single duplicated part has a length of 8 => tandem_dup_count = 2 -INS: chr1 179 Forward chr1 180 Forward 8 0 read031 +DUP:TANDEM: chr1 179 Forward chr1 187 Forward 8 2 read031 + duplicated sequence: ATATATTT with 2 duplications DUP:TANDEM: chr1 180 Forward chr1 187 Forward 0 2 read033 2 segments describe this tandem duplication. 
Its length on the read is 16 and a single duplicated part has a length of 8 => tandem_dup_count = 2 BND: chr1 180 Reverse chr1 187 Reverse 0 0 read034 @@ -38,10 +42,15 @@ DEL: chr1 335 Forward chr1 350 Forward 0 0 read043 DEL: chr1 335 Forward chr1 350 Forward 0 0 read044 DEL: chr1 335 Forward chr1 350 Forward 0 0 read045 INS: chr1 367 Forward chr1 368 Forward 11 0 read046 + inserted sequence: GGTAACGTGTA INS: chr1 367 Forward chr1 368 Forward 11 0 read047 + inserted sequence: GGTAACGTGTA INS: chr1 367 Forward chr1 368 Forward 11 0 read048 + inserted sequence: GGTAACGTGTA INS: chr1 367 Forward chr1 368 Forward 11 0 read049 + inserted sequence: GGTAACGTGTA INS: chr1 367 Forward chr1 368 Forward 11 0 read050 + inserted sequence: GGTAACGTGTA DEL: chr1 383 Forward chr1 395 Forward 0 0 read050 BND: chr1 10 Reverse chr1 470 Reverse 0 0 read051 BND: chr1 10 Reverse chr1 470 Reverse 0 0 read052 diff --git a/test/data/mini_example/output_res.txt b/test/data/mini_example/output_res.txt index 56deb877..c0d39633 100644 --- a/test/data/mini_example/output_res.txt +++ b/test/data/mini_example/output_res.txt @@ -13,7 +13,6 @@ chr1 57 . N 9 PASS END=70;SVLEN=-13;SVTYPE=DEL GT ./. chr1 97 . N 1 PASS END=125;SVLEN=-28;SVTYPE=DEL GT ./. chr1 110 . N 1 PASS END=125;SVLEN=14;SVTYPE=DUP GT ./. chr1 125 . N 3 PASS END=125;SVLEN=15;SVTYPE=INS GT ./. -chr1 180 . N 1 PASS END=180;SVLEN=8;SVTYPE=INS GT ./. chr1 266 . N 4 PASS END=286;SVLEN=-20;SVTYPE=DEL GT ./. chr1 282 . N 1 PASS END=299;SVLEN=-17;SVTYPE=DEL GT ./. chr1 336 . N 4 PASS END=350;SVLEN=-14;SVTYPE=DEL GT ./.