Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: added per-read anchor requirement to junction extract #195

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 22 additions & 2 deletions src/cis-splice-effects/cis_splice_effects_identifier.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,13 @@ void CisSpliceEffectsIdentifier::usage(ostream& out) {
out << "\t\t" << "-t STR\tTag used in bam to label strand. [XS]" << endl;
out << "\t\t" << "-a INT\tMinimum anchor length. Junctions which satisfy a minimum \n"
<< "\t\t\t " << "anchor length on both sides are reported. [8]" << endl;
out << "\t\t" << "-A INT\tMinimum read anchor length. Reads which satisfy a minimum \n"
<< "\t\t\t " << "anchor length on both sides 'support' a junction. [0]" << endl;
out << "\t\t" << "-m INT\tMinimum intron length. [70]" << endl;
out << "\t\t" << "-M INT\tMaximum intron length. [500000]" << endl;
out << "\t\t" << "-f INT\tOnly use alignments where all flag bits set here are set. [0]" << endl;
out << "\t\t" << "-F INT\tOnly use alignments where no flag bits set here are set. [0]" << endl;
out << "\t\t" << "-q INT\tOnly use alignments with this mapping quality or above. [0]" << endl;
out << "\t\t" << "-w INT\tWindow size in b.p to identify splicing events in.\n"
<< "\t\t\t " << "The tool identifies events in variant.start +/- w basepairs.\n"
<< "\t\t\t " << "Default behaviour is to look at the window between previous and next exons." << endl;
Expand Down Expand Up @@ -113,7 +118,7 @@ void CisSpliceEffectsIdentifier::parse_options(int argc, char* argv[]) {
optind = 1; //Reset before parsing again.
stringstream help_ss;
char c;
while((c = getopt(argc, argv, "o:w:v:j:e:Ei:ISht:s:a:m:M:b:C")) != -1) {
while((c = getopt(argc, argv, "o:w:v:j:e:Ei:ISht:s:a:A:m:M:f:F:q:b:C")) != -1) {
switch(c) {
case 'o':
output_file_ = string(optarg);
Expand Down Expand Up @@ -164,12 +169,24 @@ void CisSpliceEffectsIdentifier::parse_options(int argc, char* argv[]) {
case 'a':
min_anchor_length_ = atoi(optarg);
break;
case 'A':
min_read_anchor_length_ = atoi(optarg);
break;
case 'm':
min_intron_length_ = atoi(optarg);
break;
case 'M':
max_intron_length_ = atoi(optarg);
break;
case 'f':
require_flags_ = atoi(optarg);
break;
case 'F':
filter_flags_ = atoi(optarg);
break;
case 'q':
min_map_qual_ = atoi(optarg);
break;
case 'b':
output_barcodes_file_ = string(optarg);
break;
Expand Down Expand Up @@ -285,7 +302,10 @@ void CisSpliceEffectsIdentifier::identify() {
} else {
ref_to_pass = "NA";
}
JunctionsExtractor je1(bam_, variant_region, strandness_, strand_tag_, min_anchor_length_, min_intron_length_, max_intron_length_, ref_to_pass);
JunctionsExtractor je1(bam_, variant_region, strandness_,
strand_tag_, min_anchor_length_, min_read_anchor_length_,
min_intron_length_, max_intron_length_,
filter_flags_, require_flags_, min_map_qual_, ref_to_pass);
je1.identify_junctions_from_BAM();
vector<Junction> junctions = je1.get_all_junctions();
//Add all the junctions to the unique set
Expand Down
15 changes: 13 additions & 2 deletions src/cis-splice-effects/cis_splice_effects_identifier.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,20 @@ class CisSpliceEffectsIdentifier {
//tag used in BAM to denote strand, default "XS"
string strand_tag_;
//Minimum anchor length for junctions
//Junctions need atleast this many bp overlap
// on both ends.
//Junctions need at least this many bp overlap on both ends.
uint32_t min_anchor_length_;
//Reads need at least this many bp overlap on both ends to support junction.
uint32_t min_read_anchor_length_;
//Minimum length of an intron, i.e min junction width
uint32_t min_intron_length_;
//Maximum length of an intron, i.e max junction width
uint32_t max_intron_length_;
//filter reads containing any of these flags
uint16_t filter_flags_;
// filter reads not containing all of these flags
uint16_t require_flags_;
// filter reads below the minimum mapping quality
uint8_t min_map_qual_;
//whether to override strand of extracted junctions using intron-motif method
bool override_strand_with_canonical_intron_motif_;
public:
Expand All @@ -112,8 +119,12 @@ class CisSpliceEffectsIdentifier {
strandness_(-1),
strand_tag_("XS"),
min_anchor_length_(8),
min_read_anchor_length_(0),
min_intron_length_(70),
max_intron_length_(500000),
filter_flags_(0),
require_flags_(0),
min_map_qual_(0),
override_strand_with_canonical_intron_motif_(false) {}
//Destructor
~CisSpliceEffectsIdentifier() {
Expand Down
47 changes: 44 additions & 3 deletions src/junctions/junctions_extractor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,20 +43,32 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) {
optind = 1; //Reset before parsing again.
int c;
stringstream help_ss;
while((c = getopt(argc, argv, "ha:m:M:o:r:t:s:b:")) != -1) {
while((c = getopt(argc, argv, "ha:A:m:M:f:F:q:o:r:t:s:b:")) != -1) {
switch(c) {
case 'h':
usage(help_ss);
throw common::cmdline_help_exception(help_ss.str());
case 'a':
min_anchor_length_ = atoi(optarg);
break;
case 'A':
min_read_anchor_length_ = atoi(optarg);
break;
case 'm':
min_intron_length_ = atoi(optarg);
break;
case 'M':
max_intron_length_ = atoi(optarg);
break;
case 'f':
require_flags_ = atoi(optarg);
break;
case 'F':
filter_flags_ = atoi(optarg);
break;
case 'q':
min_map_qual_ = atoi(optarg);
break;
case 'o':
output_file_ = string(optarg);
break;
Expand Down Expand Up @@ -108,10 +120,18 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) {
throw runtime_error("Strandness mode 'intron-motif' requires a fasta file!\n\n");
}
}
if ( (require_flags_ & filter_flags_) != 0) {
usage();
throw runtime_error("No overlap allowed between '-f' and '-F' options (same flag filtered and required)!\n\n");
}

cerr << "Minimum junction anchor length: " << min_anchor_length_ << endl;
cerr << "Minimum read anchor length: " << min_read_anchor_length_ << endl;
cerr << "Minimum intron length: " << min_intron_length_ << endl;
cerr << "Maximum intron length: " << max_intron_length_ << endl;
cerr << "Require alignment flags: " << require_flags_ << endl;
cerr << "Filter alignment flags: " << filter_flags_ << endl;
cerr << "Minimum alignment mapping quality: " << int(min_map_qual_) << endl;
cerr << "Alignment: " << bam_ << endl;
cerr << "Output file: " << output_file_ << endl;
if (output_barcodes_file_ != "NA"){
Expand All @@ -128,8 +148,13 @@ int JunctionsExtractor::usage(ostream& out) {
out << "Options:" << endl;
out << "\t\t" << "-a INT\tMinimum anchor length. Junctions which satisfy a minimum \n"
<< "\t\t\t " << "anchor length on both sides are reported. [8]" << endl;
out << "\t\t" << "-A INT\tMinimum read anchor length. Reads which satisfy a minimum \n"
<< "\t\t\t " << "anchor length on both sides 'support' a junction. [0]" << endl;
out << "\t\t" << "-m INT\tMinimum intron length. [70]" << endl;
out << "\t\t" << "-M INT\tMaximum intron length. [500000]" << endl;
out << "\t\t" << "-f INT\tOnly use alignments where all flag bits set here are set. [0]" << endl;
out << "\t\t" << "-F INT\tOnly use alignments where no flag bits set here are set. [0]" << endl;
out << "\t\t" << "-q INT\tOnly use alignments with this mapping quality or above. [0]" << endl;
out << "\t\t" << "-o FILE\tThe file to write output to. [STDOUT]" << endl;
out << "\t\t" << "-r STR\tThe region to identify junctions \n"
<< "\t\t\t " << "in \"chr:start-end\" format. Entire BAM by default." << endl;
Expand All @@ -156,12 +181,19 @@ string JunctionsExtractor::get_new_junction_name() {
return name_ss.str();
}

//Do some basic qc on the junction
//Update if junction passes QC based on current read alignment
bool JunctionsExtractor::junction_qc(Junction &j1) {
// don't add support for junction if intron is wrong size
if(j1.end - j1.start < min_intron_length_ ||
j1.end - j1.start > max_intron_length_) {
return false;
}

// don't add support for junction if read isn't sufficiently anchored
if(j1.start - j1.thick_start < min_read_anchor_length_) return false;
if(j1.thick_end - j1.end < min_read_anchor_length_) return false;

// add support, update if this junction is sufficiently anchored
if(j1.start - j1.thick_start >= min_anchor_length_)
j1.has_left_min_anchor = true;
if(j1.thick_end - j1.end >= min_anchor_length_)
Expand Down Expand Up @@ -358,7 +390,7 @@ void JunctionsExtractor::set_junction_strand(bam1_t *aln, Junction& j1, string i
return;
}

//Get the the barcode
//Get the barcode
void JunctionsExtractor::set_junction_barcode(bam1_t *aln, Junction& j1) {
uint8_t *p = bam_aux_get(aln, barcode_tag_.c_str());
if(p != NULL) {
Expand All @@ -373,6 +405,14 @@ void JunctionsExtractor::set_junction_barcode(bam1_t *aln, Junction& j1) {
}
}

//Filters alignments based on filtering flags and mapping quality
bool JunctionsExtractor::filter_alignment(bam_hdr_t *header, bam1_t *aln) {
if ((aln->core.flag & filter_flags_) != 0) return true;
if ((aln->core.flag | require_flags_) != aln->core.flag) return true;
if (aln->core.qual < min_map_qual_) return true;
return false;
}

//Parse junctions from the read and store in junction map
int JunctionsExtractor::parse_alignment_into_junctions(bam_hdr_t *header, bam1_t *aln) {
int n_cigar = aln->core.n_cigar;
Expand Down Expand Up @@ -523,6 +563,7 @@ int JunctionsExtractor::identify_junctions_from_BAM() {
//Initiate the alignment record
bam1_t *aln = bam_init1();
while(sam_itr_next(in, iter, aln) >= 0) {
if (filter_alignment(header, aln)) continue;
parse_alignment_into_junctions(header, aln);
}
hts_itr_destroy(iter);
Expand Down
44 changes: 40 additions & 4 deletions src/junctions/junctions_extractor.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,10 @@ class JunctionsExtractor {
//Reference FASTA file
string ref_;
//Minimum anchor length for junctions
//Junctions need atleast this many bp overlap
// on both ends.
//Junctions need at least this many bp overlap on both ends.
uint32_t min_anchor_length_;
//Reads need at least this many bp overlap to support a junction
uint32_t min_read_anchor_length_;
//Minimum length of an intron, i.e min junction width
uint32_t min_intron_length_;
//Maximum length of an intron, i.e max junction width
Expand All @@ -180,12 +181,22 @@ class JunctionsExtractor {
string strand_tag_;
//tag used in BAM to denote single cell barcode
string barcode_tag_;
//filter reads containing any of these flags
uint16_t filter_flags_;
// filter reads not containing all of these flags
uint16_t require_flags_;
// filter reads below the minimum mapping quality
uint8_t min_map_qual_;
public:
//Default constructor
JunctionsExtractor() {
min_anchor_length_ = 8;
min_read_anchor_length_ = 0;
min_intron_length_ = 70;
max_intron_length_ = 500000;
filter_flags_ = 0;
require_flags_ = 0;
min_map_qual_ = 0;
junctions_sorted_ = false;
strandness_ = -1;
strand_tag_ = "XS";
Expand All @@ -196,8 +207,31 @@ class JunctionsExtractor {
region_ = ".";
ref_ = "NA";
}
JunctionsExtractor(string bam1, string region1, int strandness1, string strand_tag1, uint32_t min_anchor_length1, uint32_t min_intron_length1, uint32_t max_intron_length1, string ref1) :
bam_(bam1), region_(region1), strandness_(strandness1), strand_tag_(strand_tag1), min_anchor_length_(min_anchor_length1), min_intron_length_(min_anchor_length1), max_intron_length_(max_intron_length1), ref_(ref1){
JunctionsExtractor(
string bam1,
string region1,
int strandness1,
string strand_tag1,
uint32_t min_anchor_length1,
uint32_t min_read_anchor_length1,
uint32_t min_intron_length1,
uint32_t max_intron_length1,
uint16_t filter_flags,
uint16_t require_flags,
uint8_t min_map_qual,
string ref1) :
bam_(bam1),
region_(region1),
strandness_(strandness1),
strand_tag_(strand_tag1),
min_anchor_length_(min_anchor_length1),
min_read_anchor_length_(min_read_anchor_length1),
min_intron_length_(min_intron_length1),
max_intron_length_(max_intron_length1),
filter_flags_(filter_flags),
require_flags_(require_flags),
min_map_qual_(min_map_qual),
ref_(ref1) {
junctions_sorted_ = false;
output_file_ = "NA";
output_barcodes_file_ = "NA";
Expand Down Expand Up @@ -226,6 +260,8 @@ class JunctionsExtractor {
void create_junctions_vector();
//Pull out the cigar string from the read
int parse_read(bam_hdr_t *header, bam1_t *aln);
//Returns whether alignment should be filtered from junction analysis
bool filter_alignment(bam_hdr_t *header, bam1_t *aln);
//Parse junctions from the read and store in junction map
int parse_cigar_into_junctions(string chr, int read_pos,
uint32_t *cigar, int n_cigar);
Expand Down
2 changes: 2 additions & 0 deletions src/utils/bedtools/gzstream/README
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@
===========================================================================

See index.html for documentation and installation instructions.

Version 1.5 (08 Jan 2003)
1 change: 0 additions & 1 deletion src/utils/bedtools/gzstream/version

This file was deleted.