From ad5ef6380f51aaec7c6847af47636723a15a41b7 Mon Sep 17 00:00:00 2001
From: Lydia Buntrock <lydia.buntrock@fu-berlin.de>
Date: Wed, 28 Jul 2021 13:17:59 +0200
Subject: [PATCH] [FEATURE] Get Tandem Duplications from CIGAR

Signed-off-by: Lydia Buntrock <lydia.buntrock@fu-berlin.de>
---
 include/iGenVar.hpp                           |   1 +
 .../analyze_cigar_method.hpp                  |  90 ++++++++
 .../hierarchical_clustering_method.cpp        |   1 -
 .../analyze_cigar_method.cpp                  | 194 ++++++++++++++++--
 test/cli/iGenVar_cli_test.cpp                 |  22 +-
 test/data/datasources.cmake                   |   4 +-
 test/data/mini_example/output_err.txt         |  11 +-
 test/data/mini_example/output_res.txt         |   1 -
 8 files changed, 292 insertions(+), 32 deletions(-)

diff --git a/include/iGenVar.hpp b/include/iGenVar.hpp
index 07cb5bff..fb8b9e51 100644
--- a/include/iGenVar.hpp
+++ b/include/iGenVar.hpp
@@ -26,6 +26,7 @@ struct cmd_arguments
     /* -c */ clustering_methods clustering_method{hierarchical_clustering};          // default: hierarchical clustering
     /* -r */ refinement_methods refinement_method{no_refinement};                    // default: no refinement
 // SV specifications:
+    /* -e TODO (irallia 19.08.21): duplication_errors */
     /* -k */ uint64_t min_var_length = 30;
     /* -l */ uint64_t max_var_length = 100000;
     /* -m */ uint64_t max_tol_inserted_length = 50;
diff --git a/include/modules/sv_detection_methods/analyze_cigar_method.hpp b/include/modules/sv_detection_methods/analyze_cigar_method.hpp
index 7ab08958..79ffb00c 100644
--- a/include/modules/sv_detection_methods/analyze_cigar_method.hpp
+++ b/include/modules/sv_detection_methods/analyze_cigar_method.hpp
@@ -2,6 +2,96 @@
 
 #include "structures/junction.hpp"  // for class Junction
 
+/*! \brief This function aligns the inserted bases to a suffix or prefix sequence to check for a tandem duplication.
+ *
+ * \param[in] config                         - configuration for a semi-global alignment
+ * \param[in] min_length                     - minimum length of variants to detect (default 30 bp)
+ * \param[in] sequence                       - suffix or prefix sequence
+ * \param[in] inserted_bases                 - the inserted bases of the possible duplication
+ * \param[in] is_suffix                      - true: suffix case, false: prefix case
+ * param[out] match_score                    - if we found a matching duplication, this value represents the maximal
+ *                                             amount of matches with the reference and thus the length of the existing
+ *                                             duplicated part on the reference. If there is no duplication, this value
+ *                                             is 0.
+ * param[out] length_of_single_dupl_sequence - greatest common divisor of length of inserted sequence and length of
+ *                                             matching part -> length of a single duplicated sequence
+ * \returns std::tie(match_score, length_of_single_dupl_sequence) - a tuple of the resulting values
+ *
+ * \details In this function the inserted bases are recursively aligned segment by segment until it has been proven that
+ *          it is a real duplication.
+ *          For simplicity, we only consider the suffix case here, since the prefix case works the same way:
+ *
+ *          suffix_sequence AAAACCGCGTAGCGGGGCGGG
+ *                                     ||||||||||
+ *          inserted_bases             GCGGGGCGGGGCGGG  -> unmatched_inserted_bases: GCGGG
+ *                                                      -> match_score = 10
+ *                                                      -> length_of_single_dupl_sequence = gcd(15, 10) = 5
+ *
+ *          suffix_sequence AAAACCGCGTAGCGGGGCGGG
+ *                                          |||||
+ *          unmatched_inserted_bases        GCGGG
+ *
+ *          -> match_score = 5, length_of_single_dupl_sequence = gcd(15, 5) = 5
+ */
+std::tuple<size_t, size_t> align_suffix_or_prefix(auto const & config,
+                                                  int32_t const min_length,
+                                                  std::span<const seqan3::dna5> & sequence,
+                                                  std::span<const seqan3::dna5> & inserted_bases,
+                                                  bool is_suffix);
+
+/*! \brief This function checks if the inserted bases are tandem duplicated.
+ *
+ * \param[in]       query_sequence      - SEQ field of the SAM/BAM file
+ * \param[in]       length              - length of inserted part, given by the CIGAR
+ * \param[in]       pos_ref             - position of the inserted part in the ref (current position)
+ * \param[in]       pos_read            - position of the inserted part in the read (current position)
+ * \param[in]       inserted_bases      - the inserted bases of the possible duplication
+ * \param[in]       min_length          - minimum length of variants to detect (default 30 bp,
+ *                                                                              expected to be non-negative)
+ * \param[out]      pos_start_dup_seq   - start position of the duplicated seq (excluding itself)
+ * \param[out]      pos_end_dup_seq     - end position of the duplicated seq (including itself)
+ * \param[out]      tandem_dup_count    - the number of tandem copies of the inserted sequence
+ * \returns         duplicated_bases    - a single copy of the duplicated bases (empty if it is no duplication)
+ *
+ * \details If the inserted bases include one or more copies of a duplicated sequence, which is suffix or prefix of
+ *          another copy, we have a Tandem Duplication. To check this, we have to do a semi-global alignment.
+ *
+ *          Case 1: The duplication (insertion) comes after the matched sequence. Thus we need to check if the inserted
+ *                  sequence matches (partly) the suffix sequence. The length of the matching part yields the number
+ *                  of copies, thus we can calculate the tandem_dup_count and the duplicated_bases.
+ *
+ *                  ref AAAACCGCGTAGCGGG----------TACGTAACGGTACG
+ *                        ||||||||||||||          |||||||| -> inserted sequence: GCGGGGCGGG
+ *                  read  AACCGCGTAGCGGGGCGGGGCGGGTACGTAAC
+ *
+ *                  suffix_sequence AAAACCGCGTAGCGGG       -> free_end_gaps_sequence1_leading{true},
+ *                                             |||||          free_end_gaps_sequence1_trailing{false}
+ *                  inserted_bases             GCGGGGCGGG  -> free_end_gaps_sequence2_leading{false},
+ *                                                            free_end_gaps_sequence2_trailing{true}
+ *                  -> tandem_dup_count = 3, duplicated_bases = GCGGG
+ *
+ *          Case 2: The duplication (insertion) comes before the matched sequence.
+ *                  ref AAAACCGCGTA----------GCGGGTACGTAACGGTACG
+ *                        |||||||||          |||||||||||||  -> inserted sequence: GCGGGGCGGG
+ *                  read  AACCGCGTAGCGGGGCGGGGCGGGTACGTAAC
+ *
+ *                  prefix_sequence     GCGGGTACGTAACGGTACG -> free_end_gaps_sequence1_leading{false},
+ *                                      |||||                  free_end_gaps_sequence1_trailing{true}
+ *                  inserted_bases GCGGGGCGGG               -> free_end_gaps_sequence2_leading{true},
+ *                                                             free_end_gaps_sequence2_trailing{false}
+ *                  -> tandem_dup_count = 3, duplicated_bases = GCGGG
+ * \see For some complex examples see detection_test.cpp.
+ */
+std::span<seqan3::dna5 const> detect_tandem_duplication(seqan3::dna5_vector const & query_sequence,
+                                                        int32_t length,
+                                                        int32_t pos_ref,
+                                                        int32_t pos_read,
+                                                        std::span<seqan3::dna5 const> & inserted_bases,
+                                                        int32_t const min_length,
+                                                        int32_t & pos_start_dup_seq,
+                                                        int32_t & pos_end_dup_seq,
+                                                        size_t & tandem_dup_count);
+
 /*! \brief This function steps through the CIGAR string and stores junctions with their position in reference and read.
  *
  * \param[in]       read_name       - QNAME field of the SAM/BAM file
diff --git a/src/modules/clustering/hierarchical_clustering_method.cpp b/src/modules/clustering/hierarchical_clustering_method.cpp
index 5437a62e..748c6bd7 100644
--- a/src/modules/clustering/hierarchical_clustering_method.cpp
+++ b/src/modules/clustering/hierarchical_clustering_method.cpp
@@ -1,7 +1,6 @@
 #include "iGenVar.hpp"                                            // for global variable gVerbose
 #include "modules/clustering/hierarchical_clustering_method.hpp"
 
-#include <limits>                                                 // for infinity
 #include <random>                                                 // for random_device
 
 #include <seqan3/core/debug_stream.hpp>
diff --git a/src/modules/sv_detection_methods/analyze_cigar_method.cpp b/src/modules/sv_detection_methods/analyze_cigar_method.cpp
index 368e2384..a115df1f 100644
--- a/src/modules/sv_detection_methods/analyze_cigar_method.cpp
+++ b/src/modules/sv_detection_methods/analyze_cigar_method.cpp
@@ -1,3 +1,11 @@
+#include <limits>   // for std::numeric_limits
+#include <numeric>  // for std::gcd (greatest common divisor)
+
+// #include <seqan3/alignment/aligned_sequence/debug_stream_alignment.hpp> // for a nicer alignment view
+#include <seqan3/alignment/configuration/align_config_method.hpp>
+#include <seqan3/alignment/configuration/align_config_scoring_scheme.hpp>
+#include <seqan3/alignment/pairwise/align_pairwise.hpp>
+#include <seqan3/alignment/scoring/nucleotide_scoring_scheme.hpp>
 #include <seqan3/alphabet/cigar/cigar.hpp>
 #include <seqan3/alphabet/nucleotide/dna5.hpp>
 #include <seqan3/core/debug_stream.hpp>
@@ -10,6 +18,119 @@
 using seqan3::operator""_cigar_operation;
 using seqan3::operator""_dna5;
 
+std::tuple<size_t, size_t> align_suffix_or_prefix(auto const & config,
+                                                  int32_t const min_length,
+                                                  std::span<seqan3::dna5 const> & sequence,
+                                                  std::span<seqan3::dna5 const> & inserted_bases,
+                                                  bool is_suffix)
+{
+    // Result if no error-free duplication was found: no matching bases and no valid length of a single copy.
+    std::tuple<size_t, size_t> const no_duplication{0, std::numeric_limits<size_t>::max()};
+
+    auto results = seqan3::align_pairwise(std::tie(sequence, inserted_bases), config);
+    auto & res = *results.begin();
+    // TODO (irallia 17.8.21): The mismatches should give us the opportunity to allow a given amount of errors in the
+    // duplication.
+    // For the beginning we do not allow mistakes, we should change this later, see todo above.
+    // The caller scores a match with 1 and every mismatch or gap with -100, so an error-free overlap of n bases
+    // scores exactly n. The score must not be decoded via `score % 100`: this drops all matches above 99 and cannot
+    // tell e.g. 30 clean matches from 130 matches with one error. Instead, the overlap of this length is verified
+    // base by base below.
+    auto const score = res.score();
+    // A non-positive score means no overlap at all; it would also lead to an endless recursion below.
+    if (score <= 0 || score < min_length)
+        return no_duplication;
+    // length of matching part of suffix or prefix sequence
+    size_t const match_score = static_cast<size_t>(score);
+    if (match_score > sequence.size() || match_score > inserted_bases.size())
+        return no_duplication;
+    // Suffix case: the end of the sequence overlaps the start of the inserted bases. Prefix case: vice versa.
+    std::span<seqan3::dna5 const> const flank = is_suffix ? sequence.last(match_score) : sequence.first(match_score);
+    std::span<seqan3::dna5 const> const overlap = is_suffix ? inserted_bases.first(match_score)
+                                                            : inserted_bases.last(match_score);
+    for (size_t i = 0; i < match_score; ++i)
+        if (flank[i] != overlap[i])
+            return no_duplication;
+
+    // seqan3::debug_stream << "Resulting alignment:\n" << res.alignment() << '\n';
+    // The possible length of a single duplicated sequence: greatest common divisor of the length of the inserted
+    // part and the length of the matching part.
+    size_t const length_of_single_dupl_sequence = std::gcd(inserted_bases.size(), match_score);
+    if (match_score != inserted_bases.size())
+    {
+        size_t const unmatched_length = inserted_bases.size() - match_score;
+        std::span<seqan3::dna5 const> unmatched_inserted_bases = is_suffix ? inserted_bases.last(unmatched_length)
+                                                                           : inserted_bases.first(unmatched_length);
+        // Only the match score of the recursion is needed: if the remaining bases do not match as well, there is no
+        // real duplication. The recursively calculated length of a single copy is not used.
+        auto const next_match_score = std::get<0>(align_suffix_or_prefix(config, min_length, sequence,
+                                                                         unmatched_inserted_bases, is_suffix));
+        if (next_match_score == 0)
+            return no_duplication;
+    }
+    // The first match_score calculated is automatically the maximum, so the recursively next one can be ignored.
+    return {match_score, length_of_single_dupl_sequence};
+}
+
+std::span<seqan3::dna5 const> detect_tandem_duplication(seqan3::dna5_vector const & query_sequence,
+                                                        int32_t length,
+                                                        int32_t pos_ref,
+                                                        int32_t pos_read,
+                                                        std::span<seqan3::dna5 const> & inserted_bases,
+                                                        int32_t const min_length,
+                                                        int32_t & pos_start_dup_seq,
+                                                        int32_t & pos_end_dup_seq,
+                                                        size_t & tandem_dup_count)
+{
+    // A match scores 1 and a mismatch -100, thus an error-free overlap of n bases has a score of n.
+    auto scoring_scheme = seqan3::align_cfg::scoring_scheme{
+                              seqan3::nucleotide_scoring_scheme{seqan3::match_score{1},
+                                                                seqan3::mismatch_score{-100}}};
+    auto gap_scheme = seqan3::align_cfg::gap_cost_affine{seqan3::align_cfg::open_score{0},
+                                                         seqan3::align_cfg::extension_score{-100}};
+    // Suffix Case: align the part of the read in front of the insertion to the inserted bases.
+    auto suffix_config = seqan3::align_cfg::method_global{seqan3::align_cfg::free_end_gaps_sequence1_leading{true},
+                                                          seqan3::align_cfg::free_end_gaps_sequence2_leading{false},
+                                                          seqan3::align_cfg::free_end_gaps_sequence1_trailing{false},
+                                                          seqan3::align_cfg::free_end_gaps_sequence2_trailing{true}} |
+                         scoring_scheme | gap_scheme;
+
+    std::span<seqan3::dna5 const> suffix_sequence = query_sequence | seqan3::views::slice(0, pos_read);
+    auto [ suffix_sequence_match_score, length_of_single_dupl_sequence_1 ] = align_suffix_or_prefix(suffix_config,
+                                                                                                    min_length,
+                                                                                                    suffix_sequence,
+                                                                                                    inserted_bases,
+                                                                                                    true);
+    // Prefix Case: align the part of the read behind the insertion to the inserted bases.
+    auto prefix_config = seqan3::align_cfg::method_global{seqan3::align_cfg::free_end_gaps_sequence1_leading{false},
+                                                          seqan3::align_cfg::free_end_gaps_sequence2_leading{true},
+                                                          seqan3::align_cfg::free_end_gaps_sequence1_trailing{true},
+                                                          seqan3::align_cfg::free_end_gaps_sequence2_trailing{false}} |
+                         scoring_scheme | gap_scheme;
+
+    std::span<seqan3::dna5 const> prefix_sequence = query_sequence | seqan3::views::slice(pos_read + length,
+                                                                                          query_sequence.size());
+    auto [ prefix_sequence_match_score, length_of_single_dupl_sequence_2 ] = align_suffix_or_prefix(prefix_config,
+                                                                                                    min_length,
+                                                                                                    prefix_sequence,
+                                                                                                    inserted_bases,
+                                                                                                    false);
+    // The match scores are unsigned; convert them explicitly before mixing them with the signed positions.
+    pos_start_dup_seq = pos_ref - 1 - static_cast<int32_t>(suffix_sequence_match_score);
+    pos_end_dup_seq = pos_ref - 1 + static_cast<int32_t>(prefix_sequence_match_score);
+    // Keep the full size_t width: a narrower type truncates long copies (possibly to 0 -> division by zero). If
+    // nothing matched, both lengths are the maximum of size_t and all divisions below yield 0.
+    size_t const length_of_single_dupl_sequence = std::min(length_of_single_dupl_sequence_1,
+                                                           length_of_single_dupl_sequence_2);
+    tandem_dup_count = suffix_sequence_match_score / length_of_single_dupl_sequence     // duplicated part on ref
+                     + inserted_bases.size() / length_of_single_dupl_sequence           // inserted duplications
+                     + prefix_sequence_match_score / length_of_single_dupl_sequence;    // duplicated part on ref
+    // If it is a duplication instead of an insertion, we return a single copy of the duplicated part.
+    std::span<seqan3::dna5 const> duplicated_bases{};
+    if (tandem_dup_count > 0)
+        duplicated_bases = (inserted_bases | seqan3::views::slice(0, length_of_single_dupl_sequence));
+    return duplicated_bases;
+}
+
 void analyze_cigar(std::string const & read_name,
                    std::string const & chromosome,
                    int32_t const query_start_pos,
@@ -33,38 +154,75 @@ void analyze_cigar(std::string const & read_name,
             pos_ref += length;
             pos_read += length;
         }
-        else if (operation == 'I'_cigar_operation) // I: Insertion (gap in the reference sequence)
-        {
-            if (length >= min_length)
-            {
-                // Insertions cause one junction from the insertion location to the next base
-                auto inserted_bases = query_sequence | seqan3::views::slice(pos_read, pos_read + length);
-                Junction new_junction{Breakend{chromosome, pos_ref - 1, strand::forward},
-                                      Breakend{chromosome, pos_ref, strand::forward},
-                                      inserted_bases,
-                                      tandem_dup_count,
-                                      read_name};
-                if (gVerbose)
-                    seqan3::debug_stream << "INS: " << new_junction << "\n";
-                junctions.push_back(std::move(new_junction));
-            }
-            pos_read += length;
-        }
         else if (operation == 'D'_cigar_operation)
         {
+// ---------------- DEL ----------------
             if (length >= min_length)
             {
                 // Deletions cause one junction from its start to its end
                 Junction new_junction{Breakend{chromosome, pos_ref - 1, strand::forward},
                                       Breakend{chromosome, pos_ref + length, strand::forward},
                                       ""_dna5,
-                                      tandem_dup_count,
+                                      0,
                                       read_name};
                 if (gVerbose)
                     seqan3::debug_stream << "DEL: " << new_junction << "\n";
                 junctions.push_back(std::move(new_junction));
             }
             pos_ref += length;
+// -------------------------------------
+        }
+        else if (operation == 'I'_cigar_operation) // I: Insertion (gap in the reference sequence)
+        {
+            if (length >= min_length)
+            {
+                // Insertions cause one junction from the insertion location to the next base
+                std::span<seqan3::dna5 const> inserted_bases = query_sequence | seqan3::views::slice(pos_read,
+                                                                                                     pos_read + length);
+
+// ---------------- DUP ----------------
+
+                // Case - Duplication: The inserted bases include one or more copies of a duplicated sequence, with an
+                //                     origin somewhere else. -> global alignment
+                // ##ALT=<ID=DUP,Description="Duplication">
+                // TODO (23.7.21, irallia): This is Part of the Epic #144. Do we need a reference sequence for this?
+// ---------------- DUP:TANDEM ----------------
+                int32_t pos_start_dup_seq{};
+                int32_t pos_end_dup_seq{};
+                std::span<seqan3::dna5 const> duplicated_bases = detect_tandem_duplication(query_sequence,
+                                                                                           length,
+                                                                                           pos_ref,
+                                                                                           pos_read,
+                                                                                           inserted_bases,
+                                                                                           min_length,
+                                                                                           pos_start_dup_seq,
+                                                                                           pos_end_dup_seq,
+                                                                                           tandem_dup_count);
+
+                if (tandem_dup_count != 0)
+                {
+                    Junction new_junction{Breakend{chromosome, pos_start_dup_seq, strand::forward},
+                                          Breakend{chromosome, pos_end_dup_seq, strand::forward},
+                                          duplicated_bases,
+                                          tandem_dup_count,
+                                          read_name};
+                    seqan3::debug_stream << "DUP:TANDEM: " << new_junction << "\n";
+                    seqan3::debug_stream << "\t\t\tduplicated sequence: " << duplicated_bases
+                                         << " with " << tandem_dup_count << " duplications\n";
+                    junctions.push_back(std::move(new_junction));
+                } else {
+// ---------------- INS ----------------
+                    Junction new_junction{Breakend{chromosome, pos_ref - 1, strand::forward},
+                                          Breakend{chromosome, pos_ref, strand::forward},
+                                          inserted_bases,
+                                          0,
+                                          read_name};
+                    seqan3::debug_stream << "INS: " << new_junction << "\n";
+                    seqan3::debug_stream << "\t\t\tinserted sequence: " << inserted_bases << "\n";
+                    junctions.push_back(std::move(new_junction));
+                }
+            }
+            pos_read += length;
         }
         else if (operation == 'S'_cigar_operation)
         {
diff --git a/test/cli/iGenVar_cli_test.cpp b/test/cli/iGenVar_cli_test.cpp
index c1128ccd..c8b31c53 100644
--- a/test/cli/iGenVar_cli_test.cpp
+++ b/test/cli/iGenVar_cli_test.cpp
@@ -172,9 +172,15 @@ std::string expected_res_empty
     "##contig=<ID=chr1,length=368>\n"
 };
 
-std::string expected_err_default_no_err
+std::string expected_err_default_no_err_1
 {
     "Detect junctions in long reads...\n"
+    "INS: chr21\t41972615\tForward\tchr21\t41972616\tForward\t1681\t0\tm2257/8161/CCS\n"
+    "\t\t\tinserted sequence: GAGTGGACCTCAGCAAACTCCCAGTAGAGCTGCAGCAGAGGGGGTCTGACTGTTAGGATGGAAAACTAACAAACAGAAGGCAATAGCATCAACAACAACAAAAAAAAACTGCCACACAAAAACCGCTATCCGAAGATCACCAACATCAAAGATCGAACAGGTAGACAAATGACGAAGAGAGGAAAAAACAGTGCAAAAAAGGCTGAAAAGCCCAGAACCACCTCTTCCCTCCAGAGGATCACAACTCCTCACCAGGAAAGGGAACAAAACTGCACAGAGAAGAGTTTGACCAATGACAGAAGTAGGCTTCAGCAGAATGGGTAATAACTCCTCTGAGCTAAAGGAGCATGTTCCTACCCTAATGCAAGGAAGCTAAAGATACTTGATAAAAGGTTACAGGAACTGCTAACTAAATAAACCAGTTCAGAGAAGAACATAAATGACCTAAATGGAGCTGAAAAACACAGCATGAGAACTTCATGAAGGAATACACAAGGTATCAACAGACAAGTCCATCAGGCAGAAGAAAGGGATATCAGAGATTGAAGATCAACTTAATGAAATAAAGCATGCAAGACGAGATTAGTGAGAAAAAAGAATTAAAAGAAATGAGCAAAGGCCTCAAGGAAATATGGGACTATGTGTAAAAGACCAAGCATACGGTTTGATTGGTGTATGTGAAAATGACAGGGAAAATGGAACCAAGTTGGAAAACACTCTTCAGGATATCATGCAGGAGAACCTCCCAACCTAGCAAGAGAAGCCAACATTCACATTCAGGAAATACAAGAGAACACCACCAAGATACTCCTTGAGAAGTAGCAAACCCCCAAGACACATAATTGTTCAGATTCAGGCAAGGGTGAAAATGAAGGAAAAAATGCTAAGAGAGCCAGAGAGAAAGGTATGGGTTATCCACAAAAGGGCCAGCCATCAGACTAAGAGCAATTCTCTGCAGAAACCCTACAACCAGAAGAGAGAAGGGGCCAATATTCAACATTCTTAAAGAAAAGAATTTTCAACCCAGAATTTCATATCCAGCCAAAACAAAGCTTCGTAAGTGAAGGAGAAATAAATTTCTTTACAGACAAGCAAATGACTGAAGAAGATTTTTGTCACCACCATGCCTGCCTTACAAGATCTCCTGAAGGAAGCACTAAGACATGGGAAGGAAAAAATCCAGTACCAAGCCACTGCTAAACCATACCAAAATGTAGAGACTCAATGCTTAGGATAGGAAACTGCATCAACTAGCAGTCAAAATAACCAGCTAGCATTCATAATGACAGGATCAAATTCAGACCACATACAATTATTAACCTTAAATGTAAATGGGCTAAATGCCGCAATTAAAAGACACATCACTGGCAAATTGGATAAAGAGTCAAGCCCAATCGGTGTGCTGTTATTCAAGGAGACACCACTCTCACGTGCAAGAGACACAGATAGGCTCGAAAATGAATAGGGATGAAGGAAGATTACCAAGCAAATGGAAAGCAAAAAAAAAAAAAGCAGGGGTTGCAAATCCTAGTCTCTGTAAAACACTACTTTAAACCAAGAAAGATCAAAAGAGACAAAGAGGGCTTTAATAATGGTAATAGGGGGATTAATTCAACAAGAAGAGTTAACTATCCTAAATATATATGCTGCCTAATACAGGCACACCCAGATTCATAAAGCA\n"
+};
+
+std::string expected_err_default_no_err_2
+{
     "The read depth method for long reads is not yet implemented.\n"
     "The read depth method for long reads is not yet implemented.\n"
     "The read depth method for long reads is not yet implemented.\n"
@@ -182,6 +188,8 @@ std::string expected_err_default_no_err
     "Start clustering...\n"
 };
 
+std::string expected_err_default_no_err = expected_err_default_no_err_1 + expected_err_default_no_err_2;
+
 TEST_F(iGenVar_cli_test, no_options)
 {
     cli_test_result result = execute_app("iGenVar");
@@ -202,8 +210,6 @@ TEST_F(iGenVar_cli_test, test_verbose_option)
     cli_test_result result = execute_app("iGenVar", "-j", data(default_alignment_long_reads_file_path), "--verbose");
     std::string expected_err
     {
-        "Detect junctions in long reads...\n"
-        "INS: chr21\t41972615\tForward\tchr21\t41972616\tForward\t1681\t0\tm2257/8161/CCS\n"
         "The read depth method for long reads is not yet implemented.\n"
         "BND: chr21\t41972615\tReverse\tchr22\t17458415\tReverse\t0\t0\tm41327/11677/CCS\n"
         "The read depth method for long reads is not yet implemented.\n"
@@ -216,7 +222,7 @@ TEST_F(iGenVar_cli_test, test_verbose_option)
     };
     EXPECT_EQ(result.exit_code, 0);
     EXPECT_EQ(result.out, expected_res_default);
-    EXPECT_EQ(result.err, expected_err);
+    EXPECT_EQ(result.err, expected_err_default_no_err_1 + expected_err);
 }
 
 // Help page:
@@ -398,14 +404,13 @@ TEST_F(iGenVar_cli_test, with_detection_method_arguments)
                                          "--method cigar_string --method split_read");
     std::string expected_err
     {
-        "Detect junctions in long reads...\n"
         "Start clustering...\n"
         "Done with clustering. Found 2 junction clusters.\n"
         "No refinement was selected.\n"
     };
     EXPECT_EQ(result.exit_code, 0);
     EXPECT_EQ(result.out, expected_res_default);
-    EXPECT_EQ(result.err, expected_err);
+    EXPECT_EQ(result.err, expected_err_default_no_err_1 + expected_err);
 }
 
 TEST_F(iGenVar_cli_test, with_detection_method_duplicate_arguments)
@@ -599,16 +604,15 @@ TEST_F(iGenVar_cli_test, test_direct_methods_input)
                                          "-j", data(default_alignment_long_reads_file_path),
                                          "--method cigar_string --method split_read "
                                          "--clustering_method 0 --refinement_method 0");
-    std::string expected_err
+    std::string expected_err_clustering
     {
-        "Detect junctions in long reads...\n"
         "Start clustering...\n"
         "Done with clustering. Found 3 junction clusters.\n"
         "No refinement was selected.\n"
     };
     EXPECT_EQ(result.exit_code, 0);
     EXPECT_EQ(result.out, expected_res_default);
-    EXPECT_EQ(result.err, expected_err);
+    EXPECT_EQ(result.err, expected_err_default_no_err_1 + expected_err_clustering);
 }
 
 TEST_F(iGenVar_cli_test, test_unknown_argument)
diff --git a/test/data/datasources.cmake b/test/data/datasources.cmake
index 344379b9..76279488 100644
--- a/test/data/datasources.cmake
+++ b/test/data/datasources.cmake
@@ -29,9 +29,9 @@ declare_datasource (FILE single_end_mini_example.sam
 # copies file to <build>/data/output_err.txt
 declare_datasource (FILE output_err.txt
                     URL ${CMAKE_SOURCE_DIR}/test/data/mini_example/output_err.txt
-                    URL_HASH SHA256=606826366c63cf8ed09c0efddbc6a010dd3f9e670946f690cb1bc54d246e7fcd)
+                    URL_HASH SHA256=b4c832bbf50cf3b9893191caa4dcc811299ec9de0270f2470bbb22a509826d6a)
 
 # copies file to <build>/data/output_res.txt
 declare_datasource (FILE output_res.txt
                     URL ${CMAKE_SOURCE_DIR}/test/data/mini_example/output_res.txt
-                    URL_HASH SHA256=f12ee6622785660a637c8c8ae894c673cc29e4b09d729f6fd8e8911c33fe6ae6)
+                    URL_HASH SHA256=c60e8f3d85ee0f0282a8f886750b6f95d096b590c8d6181c8e6c708c8bad3217)
diff --git a/test/data/mini_example/output_err.txt b/test/data/mini_example/output_err.txt
index 197e86d8..df940637 100644
--- a/test/data/mini_example/output_err.txt
+++ b/test/data/mini_example/output_err.txt
@@ -11,14 +11,18 @@ DEL: chr1	56	Forward	chr1	70	Forward	0	0	read018
 DUP:TANDEM: chr1	109	Forward	chr1	124	Forward	0	2	read021
 2 segments describe this tandem duplication. Its length on the read is 34 and a single duplicated part has a length of 16 => tandem_dup_count = 2
 INS: chr1	124	Forward	chr1	125	Forward	15	0	read023
+			inserted sequence: CCCCGGGGCCAATTT
 INS: chr1	124	Forward	chr1	125	Forward	15	0	read024
+			inserted sequence: CCCCGGGGCCAATTT
 INS: chr1	124	Forward	chr1	125	Forward	15	0	read025
+			inserted sequence: CCCCGGGGCCAATTT
 BND: chr1	96	Forward	chr1	125	Forward	0	0	read027
 DUP:TANDEM: chr1	180	Forward	chr1	187	Forward	0	2	read029
 2 segments describe this tandem duplication. Its length on the read is 16 and a single duplicated part has a length of 8 => tandem_dup_count = 2
 DUP:TANDEM: chr1	180	Forward	chr1	187	Forward	0	2	read030
 2 segments describe this tandem duplication. Its length on the read is 16 and a single duplicated part has a length of 8 => tandem_dup_count = 2
-INS: chr1	179	Forward	chr1	180	Forward	8	0	read031
+DUP:TANDEM: chr1	179	Forward	chr1	187	Forward	8	2	read031
+			duplicated sequence: ATATATTT with 2 duplications
 DUP:TANDEM: chr1	180	Forward	chr1	187	Forward	0	2	read033
 2 segments describe this tandem duplication. Its length on the read is 16 and a single duplicated part has a length of 8 => tandem_dup_count = 2
 BND: chr1	180	Reverse	chr1	187	Reverse	0	0	read034
@@ -38,10 +42,15 @@ DEL: chr1	335	Forward	chr1	350	Forward	0	0	read043
 DEL: chr1	335	Forward	chr1	350	Forward	0	0	read044
 DEL: chr1	335	Forward	chr1	350	Forward	0	0	read045
 INS: chr1	367	Forward	chr1	368	Forward	11	0	read046
+			inserted sequence: GGTAACGTGTA
 INS: chr1	367	Forward	chr1	368	Forward	11	0	read047
+			inserted sequence: GGTAACGTGTA
 INS: chr1	367	Forward	chr1	368	Forward	11	0	read048
+			inserted sequence: GGTAACGTGTA
 INS: chr1	367	Forward	chr1	368	Forward	11	0	read049
+			inserted sequence: GGTAACGTGTA
 INS: chr1	367	Forward	chr1	368	Forward	11	0	read050
+			inserted sequence: GGTAACGTGTA
 DEL: chr1	383	Forward	chr1	395	Forward	0	0	read050
 BND: chr1	10	Reverse	chr1	470	Reverse	0	0	read051
 BND: chr1	10	Reverse	chr1	470	Reverse	0	0	read052
diff --git a/test/data/mini_example/output_res.txt b/test/data/mini_example/output_res.txt
index 56deb877..c0d39633 100644
--- a/test/data/mini_example/output_res.txt
+++ b/test/data/mini_example/output_res.txt
@@ -13,7 +13,6 @@ chr1	57	.	N	<DEL>	9	PASS	END=70;SVLEN=-13;SVTYPE=DEL	GT	./.
 chr1	97	.	N	<DEL>	1	PASS	END=125;SVLEN=-28;SVTYPE=DEL	GT	./.
 chr1	110	.	N	<DUP:TANDEM>	1	PASS	END=125;SVLEN=14;SVTYPE=DUP	GT	./.
 chr1	125	.	N	<INS>	3	PASS	END=125;SVLEN=15;SVTYPE=INS	GT	./.
-chr1	180	.	N	<INS>	1	PASS	END=180;SVLEN=8;SVTYPE=INS	GT	./.
 chr1	266	.	N	<DEL>	4	PASS	END=286;SVLEN=-20;SVTYPE=DEL	GT	./.
 chr1	282	.	N	<DEL>	1	PASS	END=299;SVLEN=-17;SVTYPE=DEL	GT	./.
 chr1	336	.	N	<DEL>	4	PASS	END=350;SVLEN=-14;SVTYPE=DEL	GT	./.