-
Notifications
You must be signed in to change notification settings - Fork 0
/
convert_gmt.pl
executable file
·54 lines (44 loc) · 1.18 KB
/
convert_gmt.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/perl
use strict;
use warnings;
## Convert an AraCyc GMT file into some resemblance of systematic gene naming.
##
## Usage: convert_gmt.pl <gmt_file> <annotation_file>
##
## Annotation file: tab-separated; column 1 is the ID written to the output and
##   column 2 is the symbol it replaces. Any further columns are ignored.
## GMT file: tab-separated; the first two fields of every line are copied
##   through unchanged and each remaining field is treated as a gene.
##
## Both files are split on runs of tabs, so consecutive tabs count as a single
## separator (empty columns in the middle of a line are not preserved).
##
## The converted GMT is written to STDOUT. One statistics line per GMT record
## and a final summary are written to STDERR.

# Check the arguments up front so that a missing one produces a usage message
# instead of an undefined file name being handed to open().
die "Usage: $0 <gmt_file> <annotation_file>\n" unless @ARGV >= 2;
my $gmt = shift @ARGV;
my $ann = shift @ARGV;

open my $gmt_fh, "<", $gmt or die "Cannot open GMT file '$gmt': $!\n";
open my $ann_fh, "<", $ann or die "Cannot open annotation file '$ann': $!\n";

# Symbol (annotation column 2) => ID (annotation column 1).
# If a symbol occurs more than once, the last row wins.
my %id_for_symbol;
while (my $line = <$ann_fh>) {
    $line =~ s/[\r\n]+\z//;    # strip the line ending, including CRLF
    my @cols = split /\t+/, $line;
    next if @cols < 2;         # blank line or row without a symbol column
    $id_for_symbol{$cols[1]} = $cols[0];
}
close $ann_fh;

# Every ID written to the output; only the number of keys is used.
my %seen_id;
while (my $line = <$gmt_fh>) {
    $line =~ s/[\r\n]+\z//;
    my @fields = split /\t+/, $line;
    next unless @fields;       # blank line: nothing to convert

    my $pid  = shift @fields;
    my $name = @fields ? shift @fields : "";

    my @converted;             # IDs to emit, in input order
    my @unclaimed;             # genes that could not be resolved (dropped)
    my ($n_locus, $n_symbol) = (0, 0);

    foreach my $gene (@fields) {
        if (my ($locus) = $gene =~ /(AT\wG\d{5})/) {
            # The field contains a locus ID; keep only the matched part.
            push @converted, $locus;
            $seen_id{$locus} = 1;
            $n_locus++;
        } elsif (defined $id_for_symbol{$gene}) {
            # The whole field is a symbol known to the annotation file.
            push @converted, $id_for_symbol{$gene};
            $seen_id{ $id_for_symbol{$gene} } = 1;
            $n_symbol++;
        } else {
            push @unclaimed, $gene;
        }
    }

    my $n_unclaimed = scalar @unclaimed;
    print STDERR "Found $n_locus LOCUS IDs, $n_symbol defined SYMBOLS, $n_unclaimed undef SYMBOLS: @unclaimed\n";
    print join("\t", $pid, $name, @converted), "\n";
}
close $gmt_fh;

print STDERR "==========================================================================================\n";
printf STDERR "The processed GMT file has %d unique gene IDs.\n", scalar keys %seen_id;