-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathKrakenGrafter.py
270 lines (227 loc) · 10.8 KB
/
KrakenGrafter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
## the purpose of this script is to graft on the contents of a .fasta file to the nodes.dmp and names.dmp used by Kraken2
## in effect "grafting" on a new branch to the tree of life inside the nodes.dmp and names.dmp files
## this allows the user to then use the "kraken2-build --add-to-library" functionality during database building
## thereby making sure their own custom sequences are included
## the script takes in three input files:
## nodes.dmp to modify
## names.dmp to modify
## .fasta of sequences to graft
## NOTE: the seqIDs need to contain no whitespace and no pipe characters ("|")
## the entire seqID will be grafted into the nodes.dmp and names.dmp files so maybe keep it concise
## the script then generates three output files:
## new_nodes.dmp (or specified name) of the modified nodes.dmp
## new_names.dmp (or specified name) of the modified names.dmp
## K2.fasta (or specified name) of the modified .fasta file
## the script can be used in two different ways -
## in both cases, brand new taxon IDs are generated that have not been seen in the nodes.dmp and names.dmp files:
## 1) a given .fasta file's sequence(s) is/are inserted directly beneath (taxonomically speaking) a specified "root" node
##
## example:
##
## KrakenGrafter.py -i_nodes nodes.dmp -i_names names.dmp -i_fasta input.fasta -root 32630
##
## 2) the user declares a brand new "parent" node that is inserted beneath the "root" node (at a user-specified taxonomic depth)
## and then the given .fasta sequence(s) is/are inserted directly beneath this new "parent" node
## this functionality is enabled by declaring both -parent_taxon and -parent_rank
##
## example:
##
## KrakenGrafter.py -i_nodes nodes.dmp -i_names names.dmp -i_fasta input.fasta -root 32630 -parent_taxon new_genus -parent_rank genus
##
## NOTE: i_nodes, i_names, root, o_nodes, o_names, o_fasta, and debug are all optional variables to declare
## the default "root" node is 32630 - which is the NCBI's "synthetic construct" node - a safe harbour for new sequences
## otherwise, one can either go on the NCBI taxonomy browser or use grep 'taxon' names.dmp to try and work out what taxon ID to use for "root"
## written by INZ - 04/21/22
## Stanford University
## provided with no acceptance of liability or promise of functionality
## version 0.1.0
## ===============================================================================================================================================
## import libraries:
import copy
import argparse
## templates for one appended nodes.dmp / names.dmp line (fields separated by "\t|\t", line terminated by "\t|"):
_NODE_LINE_TEMPLATE = '{taxid}\t|\t{parent}\t|\t{rank}\t|\t\t|\t0\t|\t0\t|\t11\t|\t0\t|\t0\t|\t0\t|\t0\t|\t0\t|\t\t|'
_NAME_LINE_TEMPLATE = '{taxid}\t|\t{name}\t|\t\t|\tscientific name\t|'
## ranks a grafted node may take, ordered from highest to lowest:
_LINNEAN_RANKS = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'subspecies']


def _fail(*messages):
    """Print each error message followed by "quitting", then exit with status 1."""
    for message in messages:
        print(message)
    print("quitting")
    ## SystemExit is raised directly: quit() is only injected by the site module
    raise SystemExit(1)


def _read_lines(filename):
    """Return the lines of *filename* without their line endings."""
    with open(filename, mode='r') as handle:
        return handle.read().splitlines()


def _write_lines(filename, lines):
    """Write *lines* to *filename*, one per line."""
    with open(filename, "w") as handle:
        handle.writelines(line + "\n" for line in lines)


def _parse_fasta(fasta_lines):
    """Return a list of (header_line, single_line_sequence) tuples.

    Wrapped sequence lines are stripped and joined; any lines that appear
    before the first '>' header are ignored.
    """
    records = []
    header = None
    chunks = []
    for line in fasta_lines:
        if line.startswith('>'):
            if header is not None:
                records.append((header, ''.join(chunks)))
            header = line
            chunks = []
        elif header is not None:
            chunks.append(line.strip())
    ##
    if header is not None:
        records.append((header, ''.join(chunks)))
    ##
    return records


def _find_rank(node_lines, taxID):
    """Return the rank field of the node whose taxID matches, or None if absent."""
    for line in node_lines:
        fields = line.split('\t')
        ## splitting on tabs leaves the "|" separators in the odd slots, so the rank is field 4
        if fields[0] == taxID and len(fields) > 4:
            return fields[4]
    ##
    return None


def _next_free_taxID(*line_lists):
    """Return one more than the largest numeric taxID found in any of the given .dmp line lists."""
    largest = 0
    for lines in line_lists:
        for line in lines:
            first_field = line.split('\t')[0].strip()
            ## blank or malformed lines are skipped rather than trusted
            if first_field.isdigit():
                largest = max(largest, int(first_field))
    ##
    return largest + 1


def main(input_nodes_name, input_names_name, input_fasta_name, root_taxID, new_parent_taxon, new_parent_rank, output_nodes_name, output_names_name, output_fasta_name, debug):
    """Graft the sequences of a .fasta file onto nodes.dmp / names.dmp and write the three output files.

    input_nodes_name  -- path of the nodes.dmp to read
    input_names_name  -- path of the names.dmp to read
    input_fasta_name  -- path of the .fasta whose sequences are grafted
    root_taxID        -- taxID (string) of the existing node to graft beneath
    new_parent_taxon  -- name of an optional new node inserted between root and sequences ('' for none)
    new_parent_rank   -- rank of that new node; required when new_parent_taxon is given
    output_nodes_name -- path the extended nodes list is written to
    output_names_name -- path the extended names list is written to
    output_fasta_name -- path the renamed, single-line .fasta is written to
    debug             -- truthy to print progress and summary statistics

    Each sequence gets its own new taxID and its header is rewritten as
    ">seqID|kraken:taxid|<taxID> seqID". New taxIDs start above the largest
    taxID present in either input .dmp file.

    On invalid input an error is printed and SystemExit(1) is raised.
    """
    ## input variables:
    ##input_nodes_name = 'nodes.dmp'
    ##input_names_name = 'names.dmp'
    ##input_fasta_name = 'HSVd.fasta'
    ##root_taxID = '147262'
    ##new_parent_taxon = ''
    ##new_parent_rank = ''
    ##output_nodes_name = 'new_nodes.dmp'
    ##output_names_name = 'new_names.dmp'
    ##output_fasta_name = 'HSVd_K2.fasta'
    ##debug = 1
    version = '0.1.0'
    last_linnean_rank = _LINNEAN_RANKS[-1]
    ## taxIDs are compared as strings against the first field of each node line
    root_taxID = str(root_taxID)
    ## check that the ranks make sense
    if new_parent_taxon:
        if not new_parent_rank:
            _fail("error: because a new parent taxon has been specified, a taxonomic rank for it needs to be chosen",
                  "please choose a rank above " + last_linnean_rank)
        ##
        if new_parent_rank == last_linnean_rank:
            _fail("error: the new parent taxonomic rank cannot be " + last_linnean_rank,
                  "please choose a higher rank")
        ##
        if new_parent_rank not in _LINNEAN_RANKS:
            _fail("error: unrecognised parent taxonomic rank: " + new_parent_rank,
                  "please choose one of: " + ", ".join(_LINNEAN_RANKS[:-1]))
    ##
    ## open files:
    input_nodes_list = _read_lines(input_nodes_name)
    input_names_list = _read_lines(input_names_name)
    ## convert fasta to singleline records:
    fasta_records = _parse_fasta(_read_lines(input_fasta_name))
    empty_seqIDs = [header[1:] for header, sequence in fasta_records if not sequence]
    if empty_seqIDs:
        _fail("error: the following seqIDs have no sequence: " + ", ".join(empty_seqIDs))
    ##
    ## the root node must exist, otherwise the grafted branch would have a dangling parent
    root_rank = _find_rank(input_nodes_list, root_taxID)
    if root_rank is None:
        _fail("error: selected root node was not found in " + input_nodes_name,
              "root taxID: " + root_taxID)
    ##
    ## determine the rank of the sequences to be inserted:
    ## one lower than either the root_taxID or the new_parent_rank
    if new_parent_taxon:
        rank_above_seqs = new_parent_rank
    else:
        if root_rank == last_linnean_rank:
            _fail("error: selected root node is too low in taxonomic rank (no available ranks beneath)",
                  "root rank: " + root_rank)
        ##
        if root_rank not in _LINNEAN_RANKS:
            _fail("error: the rank of the selected root node is not one of: " + ", ".join(_LINNEAN_RANKS),
                  "root rank: " + root_rank,
                  "please declare -parent_taxon and -parent_rank to graft beneath this node")
        ##
        rank_above_seqs = root_rank
    ##
    input_seq_rank = _LINNEAN_RANKS[_LINNEAN_RANKS.index(rank_above_seqs) + 1]
    if debug:
        print("new sequences will be inserted at the rank: " + input_seq_rank)
    ##
    ## rename the input .fasta seqIDs to have the kraken2 appropriate names with unique taxIDs
    ## make sure to reserve the first unique taxID for a new parent taxon if specified
    next_taxID = _next_free_taxID(input_nodes_list, input_names_list)
    if new_parent_taxon:
        new_parent_taxID = next_taxID
        first_child_taxID = new_parent_taxID + 1
        seq_parent_taxID = new_parent_taxID
    else:
        first_child_taxID = next_taxID
        seq_parent_taxID = root_taxID
    ##
    renamed_fasta_list = []
    appended_taxID_list = []
    appended_seqID_list = []
    for offset, (header, sequence) in enumerate(fasta_records):
        child_taxID = first_child_taxID + offset
        seqID = header[1:]
        ## built by concatenation so that a seqID containing any particular word cannot be mangled
        renamed_fasta_list.append(header + '|kraken:taxid|' + str(child_taxID) + ' ' + seqID)
        renamed_fasta_list.append(sequence)
        appended_taxID_list.append(child_taxID)
        appended_seqID_list.append(seqID)
    ##
    ## graft on the nodes and names (new parent first, then one line per sequence)
    new_node_lines = []
    new_name_lines = []
    if new_parent_taxon:
        new_node_lines.append(_NODE_LINE_TEMPLATE.format(taxid=new_parent_taxID, parent=root_taxID, rank=new_parent_rank))
        new_name_lines.append(_NAME_LINE_TEMPLATE.format(taxid=new_parent_taxID, name=new_parent_taxon))
    ##
    for child_taxID, seqID in zip(appended_taxID_list, appended_seqID_list):
        new_node_lines.append(_NODE_LINE_TEMPLATE.format(taxid=child_taxID, parent=seq_parent_taxID, rank=input_seq_rank))
        new_name_lines.append(_NAME_LINE_TEMPLATE.format(taxid=child_taxID, name=seqID))
    ##
    output_nodes_list = input_nodes_list + new_node_lines
    output_names_list = input_names_list + new_name_lines
    ## report some stats if debug is enabled
    if debug:
        print("KrakenGrafter.py version: " + version)
        print("added " + input_fasta_name + " to nodes.dmp / names.dmp")
        if new_parent_taxon:
            print("expecting 1 (new parent taxon) + " + str(len(appended_seqID_list)) + " = " + str(1+len(appended_seqID_list)) + " new lines added")
        else:
            print("expecting: " + str(len(appended_seqID_list)) + " new lines added")
        ##
        print("added: " + str(len(output_nodes_list)-len(input_nodes_list)) + " new nodes")
        print("added: " + str(len(output_names_list)-len(input_names_list)) + " new names")
        print("saving")
    ##
    ## save the new files
    _write_lines(output_nodes_name, output_nodes_list)
    _write_lines(output_names_name, output_names_list)
    _write_lines(output_fasta_name, renamed_fasta_list)
##
if __name__ == '__main__':
    ## command-line entry point: parse the arguments and hand them to main() in positional order
    parser = argparse.ArgumentParser()
    parser.add_argument('-i_nodes', type=str, default = 'nodes.dmp', help='input nodes.dmp file - default = "nodes.dmp"')
    parser.add_argument('-i_names', type=str, default = 'names.dmp', help='input names.dmp file - default = "names.dmp"')
    ## there is no sensible default .fasta, so argparse reports a missing -i_fasta instead of open(None) failing later
    parser.add_argument('-i_fasta', type=str, required=True, help='input .fasta file - no whitespace in seqIDs!')
    parser.add_argument('-root', type=str, default = '32630', help='taxID of the root node under which new sequences will be grafted - default = 32630 ("synthetic construct")')
    parser.add_argument('-parent_taxon', type=str, default = '', help='optionally declare the name of new node to be grafted beneath the root node')
    parser.add_argument('-parent_rank', type=str, default = '', help='if a new node is to be grafted, a taxonomic rank for it must be declared - cannot be "subspecies"!')
    parser.add_argument('-o_nodes', type=str, default = 'new_nodes.dmp', help='output nodes.dmp filename - default = "new_nodes.dmp"')
    parser.add_argument('-o_names', type=str, default = 'new_names.dmp', help='output names.dmp filename - default = "new_names.dmp"')
    parser.add_argument('-o_fasta', type=str, default = 'K2.fasta', help='output .fasta filename - default = "K2.fasta"')
    parser.add_argument('-debug', type=int, default = 1, help='print more detailed results - default = 1')
    args = parser.parse_args()
    main(args.i_nodes, args.i_names, args.i_fasta, args.root, args.parent_taxon, args.parent_rank, args.o_nodes, args.o_names, args.o_fasta, args.debug)
##