Skip to content

Commit

Permalink
discard partially mapped genes at the ends of chrs (query2ref)
Browse files Browse the repository at this point in the history
  • Loading branch information
brunocontrerasmoreira committed Apr 3, 2023
1 parent d45bc08 commit d6a187e
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 17 deletions.
2 changes: 1 addition & 1 deletion pangenes/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,4 @@
09032023: _collinear_genes.pl now maps genes in WGAs in both strands, added optional -n
17032023: _cluster_analysis.pl now merges disjoint clusters (diff species, 75% supporting edges), often caused by split gene models
30032023: check_evidence.pl -P prints python code to plot genomic context of pangene cluster, requires pyGenomeViz
02042023: BED start coord is >= 0 in query2ref
03042023: discard partially mapped genes at the ends of chrs (query2ref) in _collinear_genes.pl
50 changes: 34 additions & 16 deletions pangenes/_collinear_genes.pl
Original file line number Diff line number Diff line change
Expand Up @@ -711,7 +711,7 @@ sub help_message {
my $geneBEDfile1mapped = $tmpdir . "_$sp1.$sp2.$alg.gene.mapped.rev.bed";

my ( $ref_matched, $ref_unmatched, $perc_blocks_3genes ) =
query2ref_coords( $sp2wgaBEDfile_sorted, $geneBEDfile2mapped,
query2ref_coords( "$fasta1.fai", $sp2wgaBEDfile_sorted, $geneBEDfile2mapped,
$qual, $MINALNLEN, $no_inversions, $VERBOSE );

printf( "# %d genes mapped (%1.1f%% in 3+blocks) in %s (%d unmapped)\n\n",
Expand All @@ -730,7 +730,7 @@ sub help_message {
# now with reversed WGA alignment, to find matching sp2 segments for unpaired sp1 genes

my ( $ref_matched1, $ref_unmatched1, $perc_blocks_3genes1 ) =
query2ref_coords( $sp1wgaBEDfile_sorted, $geneBEDfile1mapped,
query2ref_coords( "$fasta2.fai", $sp1wgaBEDfile_sorted, $geneBEDfile1mapped,
$qual, $MINALNLEN, $no_inversions, $VERBOSE );

printf( "# %d genes mapped (%1.1f%% in 3+blocks) in %s (reverse, %d unmapped)\n\n",
Expand Down Expand Up @@ -1207,22 +1207,27 @@ sub mask_intergenic_regions {
}

# Takes
# i) input BED intersect filename (string)
# ii) output BED filename (string)
# iii) min quality score (real)
# iv) min alignment length (natural)
# v) same strand only (boolean)
# vi) verbose, optional (boolean)
# i) reference FAI filename (string)
# ii) input BED intersect filename (string)
# iii) output BED filename (string)
# iv) min quality score (real)
# v) min alignment length (natural)
# vi) same strand only (boolean)
# vii) verbose, optional (boolean)
#
# Parses sorted BED intersect -wo output and writes to BED file
# features (cDNA/transcripts) mapped on reference genome. Note:
# features might be unsorted.
#
# Returns
# i) ref to list of BED-like lines of matched genes
# ii) ref to list of BED-like lines of unmatched genes
# iii) % genes in WGA blocks of at least 3 genes (float)
#
# Note: able to parse cs:Z: (minimap2) and cg:Z: (wfmash) strings
# Note: takes first match of each cDNA/gene only
# Note: discards partially mapped genes at the ends of chrs
#
# example input:
# 1 4848 20752 ONIVA01G00010 9999 + 1 3331 33993 + 6 26020714 26051403 29819 60 cs:Z::303*ag:30*ga... 15904
# 1 104921 116326 ONIVA01G00100 9999 + 1 103118 152580 + 1 1132 47408 45875 60 cs:Z::70*tc:... 11405
Expand All @@ -1231,17 +1236,25 @@ sub mask_intergenic_regions {
# <-- (c)DNA/gene --> <- (q)uery genome -> <-- (r)eference genome --> ovlp
sub query2ref_coords {

my ( $infile, $outfile, $minqual, $minalnlen, $samestrand, $verbose ) = @_;
my ( $refaifile, $infile, $outfile, $minqual, $minalnlen, $samestrand, $verbose ) = @_;

my ( $cchr, $cstart, $cend, $cname, $cmatch, $cstrand );
my ( $qchr, $qstart, $qend, $cigartype, $bedline );
my ( $WGAstrand, $rchr, $rstart, $rend );
my ( $rmatch, $rmapqual, $SAMPAFtag, $overlap, $done, $strand );
my ( $SAMqcoord, $SAMrcoord, $feat, $coordr );
my ( $deltaq, $deltar, $start_deltar, $end_deltar );
my ( %ref_coords, %genes_per_block, %matched_gene, %unmatched);
my ( %ref_coords, %genes_per_block, %matched_gene, %unmatched, %ref_max_length);
my ( @matched, @filt_unmatched, @segments );

# parse reference chr lengths
my $ref_chr_bed = read_FAI_regex2hash($refaifile);
foreach $rchr (keys(%$ref_chr_bed)){
chomp($ref_chr_bed->{$rchr});
$ref_max_length{$rchr} = (split(/\t/,$ref_chr_bed->{$rchr}))[2];
}

# parse input file
open( BED, "<", $infile )
|| die "# ERROR(query2ref_coords): cannot read $infile\n";
while (<BED>) {
Expand Down Expand Up @@ -1425,11 +1438,7 @@ sub query2ref_coords {
}
}

# print coords in ref frame only if segment is long enough
if($ref_coords{$cname}{'start'} < 0) {
$ref_coords{$cname}{'start'} = 0
}
$overlap = 1 + $ref_coords{$cname}{'end'} - $ref_coords{$cname}{'start'};
# put together BED line
$bedline = sprintf("%s\t%d\t%d\t%s\t%d\t%s\n",
$ref_coords{$cname}{'chr'},
$ref_coords{$cname}{'start'},
Expand All @@ -1438,7 +1447,16 @@ sub query2ref_coords {
$overlap,
$strand);

if ( $overlap >= $minalnlen ) {
# actually compute overlap between cDNA/CDS and ref genome
$overlap = 1 + $ref_coords{$cname}{'end'} - $ref_coords{$cname}{'start'};

# check match is within reference chr bounds
if($ref_coords{$cname}{'start'} < 0 ||
(defined($ref_max_length{$ref_coords{$cname}{'chr'}}) &&
$ref_coords{$cname}{'end'} > $ref_max_length{$ref_coords{$cname}{'chr'}})) {
$unmatched{$cname} = "[overlap outside chr] $bedline";

} elsif ( $overlap >= $minalnlen ) { # if overlapping segment is long enough
push(@matched, $bedline);
$matched_gene{ $cname } = 1;
print "$bedline\n" if($verbose > 1)
Expand Down

0 comments on commit d6a187e

Please sign in to comment.