-
Notifications
You must be signed in to change notification settings - Fork 0
/
taxid2taxonomy.py
250 lines (193 loc) · 8.33 KB
/
taxid2taxonomy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
'''
taxid2taxonomy.py - fetch full taxonomy for ncbi taxids
Purpose
_______
This scripts contains methods for retrieving full taxonomic information for one
or more NCBI taxids.
Consider using Entez Direct for this:
https://www.ncbi.nlm.nih.gov/books/NBK179288/
Using names.dmp and nodes.dmp
+++++++++++++++++++++++++++++
The NCBI taxonomy files names.dmp and nodes.dmp can be downloaded from
ftp.ncbi.nlm.nih.gov/pub/taxonomy/. If provided, then the requested taxonomic
levels will be retrieved from these files.
Please note that there are 44 unique taxonomic ranks specified in nodes.dmp,
only generic ranks (see --levels defaults) can be returned using this script.
If you get 'missing' kingdom, then consider extending the maximum number of
recursive lookups allowed (you will have to edit the script for this).
If you request taxonomic information for levels below the taxid provided,
they will be returned as 'unclassified'
Accessing a BioSQL db
+++++++++++++++++++++
This script currently expects a MySQL version of BioSQL.
NOT YET IMPLEMENTED
Usage:
cat taxids | taxid2taxonomy --names-dmp=/path/to/names.dmp --nodes.dmp=/path/to/nodes.dmp
'''
import os,sys,re
import collections
import cgatcore.experiment as E
import cgatcore.iotools as iotools
limit = 0
upper_limit=30
def fetchTaxonomy(out_dict, taxid, taxid2scname, taxid2parent, taxid2rank):
'''Receive a taxid number and return k,p,c,o,g,s for that taxid'''
for k,v in out_dict.items():
out_dict[k] = 'unclassified'
# HACK: To stop infinate recursion a finite limit is set
global limit
limit = 0
def _fetchTax(taxid):
global limit
try:
if taxid2rank[taxid].endswith('kingdom') or limit > upper_limit:
if taxid2rank[taxid].endswith('kingdom'):
out_dict['kingdom'] = taxid2scname[taxid]
return out_dict
elif taxid2rank[taxid] in out_dict.keys():
limit += 1
out_dict[taxid2rank[taxid]] = taxid2scname[taxid]
return _fetchTax(taxid2parent[taxid])
else:
limit += 1
return _fetchTax(taxid2parent[taxid])
except KeyError:
# If value is missing, assume it must be first value
for k in out_dict.keys():
out_dict[k] = 'missing'
return out_dict
return _fetchTax(taxid)
def parseNCBINodes(names_dmp, nodes_dmp):
'''
Iterate over NCBI names and nodes files to create mapping dictionaries
for recursive look up of taxids.
'''
# parse names.dmp (taxid | name | unique name | type of name)
taxid2scname = {}
for line in iotools.open_file(names_dmp):
line = line.strip('\t|\n').split('\t|\t')
if line[3] == 'scientific name':
taxid2scname[line[0]] = line[1]
else:
continue
E.info('taxid2scname is %fMb, with %i keys' \
% (sys.getsizeof(taxid2scname)/1000000.0, len(taxid2scname.keys())))
# parse nodes.dmp (taxid | parent taxid | rank |...)
taxid2parent = {}
taxid2rank = {}
for line in iotools.open_file(nodes_dmp):
line = line.strip('\t|\n').split('\t|\t')
taxid2parent[line[0]] = line[1]
taxid2rank[line[0]] = line[2]
E.info('taxid2parent is %fMb, with %i keys' \
% (sys.getsizeof(taxid2parent)/1000000.0, len(taxid2parent.keys())))
E.info('taxid2rank is %fMb with %i keys' \
% (sys.getsizeof(taxid2rank)/1000000.0, len(taxid2rank.keys())))
return taxid2scname, taxid2parent, taxid2rank
def fetchBioSQLTaxonomy(ncbi_tax_id,
host='',
user='guest',
passwd='',
db='biosql'):
'''NOT YET IMPLEMENTED
Fetch the full parent taxonomy (to Kingdom) for
a given NCBI tax id number'''
phylogeny = collections.OrderedDict()
# connect to BioSQL database
db = MySQLdb.connect(host=host, user=user, passwd=passwd, db=db)
cur = db.cursor()
# break if more than 50 iterations...
safety = 50
first = True
node_rank = None
while node_rank != 'superkingdom':
safety -= 1
if safety < 0:
break
# internal BioSQL taxon_ids don't correspond to ncbi taxon IDs
# WARNING: There is overlap between the two.
field = 'taxon_id'
if first:
tax_id = ncbi_tax_id
field = 'ncbi_taxon_id'
first = False
# fetch tax_id, parent_tax_id node_rank, scientific_name
statement = ("SELECT "
" taxon.taxon_id,"
" taxon.parent_taxon_id,"
" taxon.node_rank,"
" taxon_name.name"
" FROM taxon JOIN taxon_name"
" ON taxon.taxon_id = taxon_name.taxon_id"
" WHERE taxon.{} = {} "
"AND name_class = 'scientific name'".format(field, tax_id))
cur.execute(statement)
taxonomy = cur.fetchall()
if len(taxonomy) == 1:
old_tax_id, tax_id, node_rank, scientific_name = taxonomy[0]
phylogeny[node_rank] = scientific_name
elif len(taxonomy) == 0:
E.warn('No taxonomy information for taxon ID %s beyond %s' \
% (ncbi_tax_id, node_rank))
phylogeny['genus'] = 'unavailable'
phylogeny['species'] = 'unavailable'
return phylogeny
else:
raise ValueError('multiple entries for tax id %s: %s' \
% (ncbi_tax_id, str(taxonomy)))
return phylogeny
def main(argv=None):
if not argv:
argv = sys.argv
parser = E.ArgumentParser(description=__doc__)
parser.add_argument("--version", action="version", version="1.0")
parser.add_argument("--names-dmp", dest="names_dmp", type=str,
help="Path to ncbi names.dmp file")
parser.add_argument("--nodes-dmp", dest="nodes_dmp", type=str,
help="Path to ncbi nodes.dmp file")
parser.add_argument("--header", action="store_true", dest="header",
help="Does input contain a header?")
parser.add_argument("--missing", dest="missing", type=str,
help="Text to interpret as a missing taxid")
parser.add_argument("--levels", dest="levels", type=str,
help="A comma-separated list of the taxonomic"\
" levels to be returned")
parser.add_argument("--header-out", dest="header_out", action="store_true",
help="Include a header in output")
parser.set_defaults(names_dmp=None,
nodes_dmp=None,
header=False,
header_out=False,
missing="missing",
levels="kingdom,phylum,class,order,family,genus,species,strain")
(args) = E.start(parser, argv=argv)
# Set dictionary for taxonomic levels to be returned
t = collections.OrderedDict()
for i in args.levels.split(','):
t[i] = 'missing'
# Output header if requested
if args.header_out:
args.stdout.write('\t'.join([x for x in args.levels.split(',')]) + '\n')
# Recursive look up of names.dmp and nodes.dmp files
if args.names_dmp or args.nodes_dmp:
assert args.names_dmp and args.nodes_dmp, \
"Please supply path to both names.dmp and nodes.dmp files"
E.info("Generating dictionaries for recursive taxonomic look up:")
taxid2scname, taxid2parent, taxid2rank = parseNCBINodes(args.names_dmp,
args.nodes_dmp)
header = args.header
for tax_id in args.stdin:
tax_id = tax_id.strip()
if header:
header = False
continue
if not tax_id == args.missing:
t = fetchTaxonomy(t,
tax_id,
taxid2scname,
taxid2parent,
taxid2rank)
args.stdout.write('\t'.join([t[x] for x in args.levels.split(',')])\
+ '\n')
if __name__=="__main__":
sys.exit(main(sys.argv))