-
Notifications
You must be signed in to change notification settings - Fork 0
/
pick_pan_set.pl
executable file
·67 lines (64 loc) · 2.27 KB
/
pick_pan_set.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env perl
use strict;
use warnings;
use autodie;
# input to this script comes from an API call
# try this: curl "https://data.sorghumbase.org/sorghum_v3/search?q=taxonomy__ancestors:4557&fq=biotype:protein_coding&fl=id,taxon_id,homology__all_orthologs&wt=csv&rows=1000000" | awk -F "," '$2!=45770000' > sorghum_v3.orthologs.csv
# also create a blacklist of genes to avoid when choosing representatives
# echo "select g1.stable_id, g2.stable_id from gene_member g1, gene_member g2, homology_member hm, homology_member hm2, homology h, gene_tree_node_attr gtna where gtna.node_type=\"gene_split\" and gtna.node_id = h.gene_tree_node_id and h.homology_id = hm.homology_id and hm.gene_member_id = g1.gene_member_id and hm2.homology_id = h.homology_id and hm2.gene_member_id = g2.gene_member_id and hm.gene_member_id > hm2.gene_member_id" | mysql -h HOSTNAME -u USERNAME -pPASSWORD ensembl_compara_3_87_sorghum1021 | awk 'NR>1{print $1"\n"$2}' |sort | uniq > sorghum_v3.split_genes.txt
my %blacklist;
my %pan; # $pan{id} = rep
my %hasTax;
my %taxa;
my %isRep;
my $bl_file = shift @ARGV;
open(my $fh, "<", $bl_file);
while (<$fh>) {
chomp;
$blacklist{$_} = 1;
}
close $fh;
while (<>) {
chomp;
s/\"//g;
my ($id, $taxon_id, @orthologs) = split /,/, $_;
next if ($id eq "id");
$taxa{$taxon_id} = 1;
$hasTax{$id}=$taxon_id;
next if $pan{$id};
if ($blacklist{$id}) {
print STDERR "$id has been blacklisted\n";
next;
}
$isRep{$id}=\@orthologs;
for my $o (@orthologs) {
$pan{$o} = $id;
}
}
print join ("\t", 'rep_id','taxon_id','set_size','members','n_taxa', join(',',sort keys %taxa)),"\n";
for my $rep (keys %isRep) {
my @orthologs = grep {$hasTax{$_} and $_ ne $rep} @{$isRep{$rep}};
my %taxTally;
for my $tid (keys %taxa) {
$taxTally{$tid} = 0;
}
for my $id (@{$isRep{$rep}}) {
next unless $hasTax{$id};
$taxTally{$hasTax{$id}}++;
}
if (@orthologs == 0) {
$taxTally{$hasTax{$rep}}++;
}
my $n_taxa=0;
for my $tid (keys %taxTally) {
$n_taxa++ if ($taxTally{$tid} > 0);
}
print join("\t"
, $rep
, $hasTax{$rep}
, scalar @orthologs + 1
, join(',',@orthologs)
, $n_taxa
, join(',',map {$taxTally{$_}} sort keys %taxa)
),"\n";
}