-
Notifications
You must be signed in to change notification settings - Fork 9
/
taxcollector.py
executable file
·74 lines (64 loc) · 2.09 KB
/
taxcollector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/env python3
from taxcollector import *
import sys
skip_if = [
'eukaryota',
'eukarya',
'clone',
'plasmid',
'vector',
'uncultured'
]
required = (1, 2, 3, 4, 5)
def main():
# parse arguments
try:
names = sys.argv[1]
nodes = sys.argv[2]
fasta = sys.argv[3]
except IndexError:
print(f'USAGE: {sys.argv[0]} names nodes fasta out', file=sys.stderr)
quit(-1)
# load names database.
with open(names) as handle:
names = Names(handle)
# load nodes database
with open(nodes) as handle:
nodes = Nodes(handle)
# create a function to collect taxonomic description given
# names and nodes databases from NCBI
collect_taxes = tax_collector(names, nodes)
# collect RDP database
with open(fasta) as handle:
skipped = 0
records = Fasta(handle)
for i, record in enumerate(records):
h = record.header
name = h[h.find(' ')+1:h.find(';')].replace('(T)', '').strip()
taxes = collect_taxes(name)
phylogeny = format_name(taxes)
record.header = "%s;[8]%s|%s|%s" % (phylogeny.replace(' ', '_'),
record.orig_name.replace(' ', '_'),
record.accession, i)
# p = print?
p = True
# make sure taxonomic description is complete enough
for r in required:
if '[%s]' % r not in phylogeny:
skipped += 1
p = False
break
if p:
# don't print record if certain words are in name
# these records are in skip_if
for word in skip_if:
if word.upper() in record.header.upper():
p = False
break
# yay our record is good, print it!
if p:
print(record)
# tell the user how bad we did
print(f"{skipped} names not found in NCBI database", file=sys.stderr)
if __name__ == '__main__':
main()