forked from The-Sequence-Ontology/GAL
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathannotation_summary
executable file
·120 lines (98 loc) · 2.79 KB
/
annotation_summary
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
use FindBin;
use lib "$FindBin::RealBin/../lib/";
use GAL::Annotation;
use GAL::List;
#-----------------------------------------------------------------------------
#----------------------------------- MAIN ------------------------------------
#-----------------------------------------------------------------------------
# Usage text printed for --help or when no positional arguments are given.
my $usage = "
Synopsis:
annotation_summary feature.gff3 genome.fasta
Description:
This script will summarize the annotations in GFF3 file.
";

my ($help);
my $opt_success = GetOptions('help|h' => \$help);

# An unparseable command line is fatal: report it on STDERR (with a
# terminating newline) and stop with a non-zero status rather than
# continuing with whatever happens to be left in @ARGV.
if (! $opt_success) {
    print STDERR join(' : ', 'FATAL',
                      'command_line_parse_error',
                      'Use annotation_summary --help to see correct usage'),
                 "\n";
    exit(1);
}

if ($help || !@ARGV) {
    print $usage;
    exit(0);
}

# Positional arguments, in the order given in the synopsis: the GFF3
# feature file followed by the genome FASTA file.
my ($feature_file, $fasta_file) = @ARGV;
my $annotation = GAL::Annotation->new($feature_file,
                                      $fasta_file);

# Feature collection that every report below queries.
my $features = $annotation->features;
#for my $column (qw(seqids sources types strands phases)) {
# print $features->$column->count_table($column, qw(count));
#}
#
# Unique values taken from the feature collection's list of types.
my @types = $features->types->uniq;
#
#for my $method (qw(length gc_content)) {
# for my $type (@types) {
# my $type_rs = $features->search({type => $type});
# my $values = $type_rs->method_list($method);
# print join "\t", $type, $values->mean;
# print "\n";
# print '';
# }
# print '';
#}

# For each feature type, tally the types of the children returned by
# every feature of that type, then print one section per parent type:
# the parent type on its own line followed by "child_type<TAB>count" rows.
for my $type (@types) {
    my $type_rs = $features->search({type => $type});
    my %counts;
    while (my $feature = $type_rs->next) {
        my $children = $feature->children;
        while (my $child = $children->next) {
            $counts{$child->type}++;
        }
    }

    # Parent types whose features have no children produce no section.
    next unless %counts;

    print "$type\n";
    # Sort the child types so the report is identical from run to run;
    # plain hash key order differs between invocations of perl.
    for my $child_type (sort keys %counts) {
        print "$child_type\t$counts{$child_type}\n";
    }
}
# Do the search and get an iterator for all matching features
my $mrnas = $features->search({type => 'mRNA'});

# Iterate over the features. Each mRNA produces two output lines:
#   exon line:   mean exon length (truncated to an integer) <TAB> mean of
#                the per-exon gc_content values
#   intron line: mean intron length (truncated) <TAB> (empty column) <TAB>
#                mean of the per-intron gc_content values
# A line whose feature count is zero is printed with empty columns.
while (my $mrna = $mrnas->next) {

    # Get all the exons for this mRNA
    my $exons   = $mrna->exons;
    my $e_count = $exons->count;
    my ($e_length, $e_gc) = (0, 0);

    # Iterate over each exon
    while (my $exon = $exons->next) {
        $e_length += $exon->length;
        $e_gc     += $exon->gc_content;
    }

    # Guard the division: an mRNA without exons would otherwise abort the
    # whole report with "Illegal division by zero".
    print $e_count
        ? join("\t", int($e_length / $e_count), $e_gc / $e_count)
        : "\t";
    print "\n";

    # Introns don't exist in the dataset, so GAL
    # will infer them on the fly.
    my $introns = $mrna->introns;
    my $i_count = $introns->count;
    my ($i_length, $i_gc) = (0, 0);

    while (my $intron = $introns->next) {
        $i_length += $intron->length;
        $i_gc     += $intron->gc_content;
    }

    # Single-exon mRNAs have no introns; test the count that is used as
    # the divisor and emit the three empty columns in that case.
    print $i_count
        ? join("\t", int($i_length / $i_count), '', $i_gc / $i_count)
        : "\t\t";
    print "\n";
}