From 77451c4feda4b5e8bf9e58957b73a3f34b8b3036 Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Tue, 2 Jul 2024 19:29:58 +0000 Subject: [PATCH] feat: added per-read anchor requirement to junction extract - closes #186, reads now only 'support' a junction if they have at least a given minimum anchor length, supplied with the '-A' flag (default 0) --- .../cis_splice_effects_identifier.cc | 10 ++++++++-- .../cis_splice_effects_identifier.h | 6 ++++-- src/junctions/junctions_extractor.cc | 17 +++++++++++++++-- src/junctions/junctions_extractor.h | 10 +++++++--- 4 files changed, 34 insertions(+), 9 deletions(-) diff --git a/src/cis-splice-effects/cis_splice_effects_identifier.cc b/src/cis-splice-effects/cis_splice_effects_identifier.cc index 5bead6d..d2d2119 100644 --- a/src/cis-splice-effects/cis_splice_effects_identifier.cc +++ b/src/cis-splice-effects/cis_splice_effects_identifier.cc @@ -44,6 +44,8 @@ void CisSpliceEffectsIdentifier::usage(ostream& out) { out << "\t\t" << "-t STR\tTag used in bam to label strand. [XS]" << endl; out << "\t\t" << "-a INT\tMinimum anchor length. Junctions which satisfy a minimum \n" << "\t\t\t " << "anchor length on both sides are reported. [8]" << endl; + out << "\t\t" << "-A INT\tMinimum read anchor length. Reads which satisfy a minimum \n" + << "\t\t\t " << "anchor length on both sides 'support' a junction. [0]" << endl; out << "\t\t" << "-m INT\tMinimum intron length. [70]" << endl; out << "\t\t" << "-M INT\tMaximum intron length. [500000]" << endl; out << "\t\t" << "-f INT\tOnly use alignments where all flag bits set here are set. [0]" << endl; @@ -116,7 +118,7 @@ void CisSpliceEffectsIdentifier::parse_options(int argc, char* argv[]) { optind = 1; //Reset before parsing again. stringstream help_ss; char c; - while((c = getopt(argc, argv, "o:w:v:j:e:Ei:ISht:s:a:m:M:f:F:q:b:C")) != -1) { + while((c = getopt(argc, argv, "o:w:v:j:e:Ei:ISht:s:a:A:m:M:f:F:q:b:C")) != -1) { switch(c) { case 'o': output_file_ = string(optarg); @@ -167,6 +169,9 @@ void CisSpliceEffectsIdentifier::parse_options(int argc, char* argv[]) { case 'a': min_anchor_length_ = atoi(optarg); break; + case 'A': + min_read_anchor_length_ = atoi(optarg); + break; case 'm': min_intron_length_ = atoi(optarg); break; @@ -298,7 +303,8 @@ void CisSpliceEffectsIdentifier::identify() { ref_to_pass = "NA"; } JunctionsExtractor je1(bam_, variant_region, strandness_, - strand_tag_, min_anchor_length_, min_intron_length_, max_intron_length_, + strand_tag_, min_anchor_length_, min_read_anchor_length_, + min_intron_length_, max_intron_length_, filter_flags_, require_flags_, min_map_qual_, ref_to_pass); je1.identify_junctions_from_BAM(); vector junctions = je1.get_all_junctions(); diff --git a/src/cis-splice-effects/cis_splice_effects_identifier.h b/src/cis-splice-effects/cis_splice_effects_identifier.h index adc9a67..fc70131 100644 --- a/src/cis-splice-effects/cis_splice_effects_identifier.h +++ b/src/cis-splice-effects/cis_splice_effects_identifier.h @@ -87,9 +87,10 @@ class CisSpliceEffectsIdentifier { //tag used in BAM to denote strand, default "XS" string strand_tag_; //Minimum anchor length for junctions - //Junctions need at least this many bp overlap - // on both ends. + //Junctions need at least this many bp overlap on both ends. uint32_t min_anchor_length_; + //Reads need at least this many bp overlap on both ends to support junction. + uint32_t min_read_anchor_length_; //Minimum length of an intron, i.e min junction width uint32_t min_intron_length_; //Maximum length of an intron, i.e max junction width @@ -118,6 +119,7 @@ class CisSpliceEffectsIdentifier { strandness_(-1), strand_tag_("XS"), min_anchor_length_(8), + min_read_anchor_length_(0), min_intron_length_(70), max_intron_length_(500000), filter_flags_(0), diff --git a/src/junctions/junctions_extractor.cc b/src/junctions/junctions_extractor.cc index 998f611..aed926b 100644 --- a/src/junctions/junctions_extractor.cc +++ b/src/junctions/junctions_extractor.cc @@ -43,7 +43,7 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) { optind = 1; //Reset before parsing again. int c; stringstream help_ss; - while((c = getopt(argc, argv, "ha:m:M:f:F:q:o:r:t:s:b:")) != -1) { + while((c = getopt(argc, argv, "ha:A:m:M:f:F:q:o:r:t:s:b:")) != -1) { switch(c) { case 'h': usage(help_ss); @@ -51,6 +51,9 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) { case 'a': min_anchor_length_ = atoi(optarg); break; + case 'A': + min_read_anchor_length_ = atoi(optarg); + break; case 'm': min_intron_length_ = atoi(optarg); break; @@ -119,6 +122,7 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) { } cerr << "Minimum junction anchor length: " << min_anchor_length_ << endl; + cerr << "Minimum read anchor length: " << min_read_anchor_length_ << endl; cerr << "Minimum intron length: " << min_intron_length_ << endl; cerr << "Maximum intron length: " << max_intron_length_ << endl; cerr << "Require alignment flags: " << require_flags_ << endl; @@ -140,6 +144,8 @@ int JunctionsExtractor::usage(ostream& out) { out << "Options:" << endl; out << "\t\t" << "-a INT\tMinimum anchor length. Junctions which satisfy a minimum \n" << "\t\t\t " << "anchor length on both sides are reported. [8]" << endl; + out << "\t\t" << "-A INT\tMinimum read anchor length. Reads which satisfy a minimum \n" + << "\t\t\t " << "anchor length on both sides 'support' a junction. [0]" << endl; out << "\t\t" << "-m INT\tMinimum intron length. [70]" << endl; out << "\t\t" << "-M INT\tMaximum intron length. [500000]" << endl; out << "\t\t" << "-f INT\tOnly use alignments where all flag bits set here are set. [0]" << endl; @@ -171,12 +177,19 @@ string JunctionsExtractor::get_new_junction_name() { return name_ss.str(); } -//Do some basic qc on the junction +//Update if junction passes QC based on current read alignment bool JunctionsExtractor::junction_qc(Junction &j1) { + // don't add support for junction if intron is wrong size if(j1.end - j1.start < min_intron_length_ || j1.end - j1.start > max_intron_length_) { return false; } + + // don't add support for junction if read isn't sufficiently anchored + if(j1.start - j1.thick_start < min_read_anchor_length_) return false; + if(j1.thick_end - j1.end < min_read_anchor_length_) return false; + + // add support, update if this junction is sufficiently anchored if(j1.start - j1.thick_start >= min_anchor_length_) j1.has_left_min_anchor = true; if(j1.thick_end - j1.end >= min_anchor_length_) diff --git a/src/junctions/junctions_extractor.h b/src/junctions/junctions_extractor.h index 496bb13..c969033 100644 --- a/src/junctions/junctions_extractor.h +++ b/src/junctions/junctions_extractor.h @@ -153,9 +153,10 @@ class JunctionsExtractor { //Reference FASTA file string ref_; //Minimum anchor length for junctions - //Junctions need atleast this many bp overlap - // on both ends. + //Junctions need at least this many bp overlap on both ends. uint32_t min_anchor_length_; + //Reads need at least this many bp overlap to support a junction + uint32_t min_read_anchor_length_; //Minimum length of an intron, i.e min junction width uint32_t min_intron_length_; //Maximum length of an intron, i.e max junction width @@ -190,6 +191,7 @@ class JunctionsExtractor { //Default constructor JunctionsExtractor() { min_anchor_length_ = 8; + min_read_anchor_length_ = 0; min_intron_length_ = 70; max_intron_length_ = 500000; filter_flags_ = 0; @@ -211,6 +213,7 @@ class JunctionsExtractor { int strandness1, string strand_tag1, uint32_t min_anchor_length1, + uint32_t min_read_anchor_length1, uint32_t min_intron_length1, uint32_t max_intron_length1, uint16_t filter_flags, @@ -222,7 +225,8 @@ class JunctionsExtractor { strandness_(strandness1), strand_tag_(strand_tag1), min_anchor_length_(min_anchor_length1), - min_intron_length_(min_anchor_length1), + min_read_anchor_length_(min_read_anchor_length1), + min_intron_length_(min_intron_length1), max_intron_length_(max_intron_length1), filter_flags_(filter_flags), require_flags_(require_flags),