-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrecombinator.py
executable file
·435 lines (355 loc) · 16.4 KB
/
recombinator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
#!/usr/bin/python3
# This script takes in a reference GFF file, a genbank formatted
# construct, then searches around for homology to recombine it in
# on one chromosome, applies those changes and returns a GFF file.
import argparse
import copy
from Bio import Seq
from Bio import SeqRecord
from Bio import SeqFeature
from Bio import SeqIO
from Bio import motifs
from Bio import Alphabet
from BCBio import GFF # This BCBio is installable as bcbio-gff in pip
from subprocess import call
def find_recombination_sites(reference, construct,
                             left_homology_slice, right_homology_slice,
                             mismatches_5=0, mismatches_3=0):
    """Collect candidate recombination sites on both strands of a reference.

    Every chromosome in ``reference`` is searched as given ("+") and then
    as its reverse complement ("-") using list_of_slices_for_recombine.

    Args:
        reference: iterable of chromosome records; each must expose ``.id``
            and ``.reverse_complement()``.
        construct: the construct record whose ends are searched for.
        left_homology_slice: ``[start, end]`` of the construct slice used
            as the left seed.
        right_homology_slice: ``[start, end]`` of the construct slice used
            as the right seed.
        mismatches_5: mismatches tolerated for the left seed.
        mismatches_3: mismatches tolerated for the right seed.

    Returns:
        A list of ``[chromosome_number, strand, result]`` entries, where
        ``chromosome_number`` is the position of the chromosome within
        ``reference``, ``strand`` is "+" or "-", and ``result`` is one item
        returned by list_of_slices_for_recombine. Coordinates of "-"
        results refer to the reverse-complemented chromosome.
    """
    found = []
    for chromosome_number, chromosome in enumerate(reference):
        # Note that these are biopython Seqs or SeqRecords now
        print("Looking on chromosome "+chromosome.id)
        for strand in ("+", "-"):
            # The minus pass flips the chromosome and looks from the
            # other end; it is only built once the plus pass is done.
            if strand == "+":
                target = chromosome
            else:
                target = chromosome.reverse_complement()
            hits = list_of_slices_for_recombine(
                target, construct,
                left_homology_slice, right_homology_slice,
                mismatches_5=mismatches_5, mismatches_3=mismatches_3)
            for hit in hits:
                # Keep every hit and report it straight away
                found.append([chromosome_number, strand, hit])
                print("\tI may have found a match ? ")
                print("\t"+strand+" "+str(hit))
    # All hits, from both strands of every chromosome
    return found
def list_of_slices_for_recombine(reference, construct,
                                 left_slice, right_slice,
                                 mismatches_5=0, mismatches_3=0):
    """Pair every left-seed match with every right-seed match after it.

    The left seed (``left_slice`` of the construct) is searched over the
    whole ``reference``. For each left match, the right seed
    (``right_slice``) is searched again starting from the reference
    position where that left match begins.

    Args:
        reference: one chromosome record to search on.
        construct: the construct record providing both seeds.
        left_slice: ``[start, end]`` of the left seed on the construct.
        right_slice: ``[start, end]`` of the right seed on the construct.
        mismatches_5: mismatches tolerated for the left seed.
        mismatches_3: mismatches tolerated for the right seed.

    Returns:
        A list of ``[left_match, right_match]`` pairs, each element being
        one match as returned by extended_matches.
    """
    # The right-hand search is repeated for every left match rather than
    # computed once and indexed; simple, at the cost of redundant work
    # when there are several left matches.
    return [
        [left_hit, right_hit]
        for left_hit in extended_matches(
            reference, 0, construct, left_slice, mismatches_5)
        for right_hit in extended_matches(
            reference, left_hit[0][0], construct, right_slice, mismatches_3)
    ]
def extended_matches(chromosome,offset,construct,seed_slice,mismatches=0):
    """Find seed hits of a construct slice on a chromosome and extend them.

    The seed ``construct[seed_slice[0]:seed_slice[1]]`` is located on
    ``chromosome[offset:]`` with search_for_motif. Each hit is then grown
    one base at a time, first backwards and then forwards, for as long as
    construct and chromosome agree exactly.

    Args:
        chromosome: chromosome record (sliceable, with ``.seq``).
        offset: position on ``chromosome`` at which searching starts;
            everything before it is ignored.
        construct: construct record (sliceable, with ``.seq``).
        seed_slice: ``[start, end]`` of the seed on the construct.
        mismatches: mismatches tolerated when locating the seed; the
            extension itself only ever accepts exact agreement.

    Returns:
        A list of ``[[ref_start, ref_end], [con_start, con_end]]`` entries.
        Reference coordinates are relative to the un-sliced ``chromosome``
        (``offset`` is added back). Start and end are the first and last
        index of the extended match, i.e. the end is inclusive.
    """
    # We've got a query seed
    query_seq = construct[slice(*seed_slice)]
    # Every hit spans exactly the seed, so take the length from the query.
    # The second element of a hit cannot be relied on for this: in fuzzy
    # mode the Biopython docs describe it as a score rather than a
    # sequence, and len() of a score raises TypeError.
    seed_length = len(query_seq)
    # And the chromosome can be partial, especially if we're looking
    # for that second match for recombination
    chromosome = chromosome[offset:]
    # Look for the motif
    searches = search_for_motif(chromosome, query_seq, mismatches=mismatches)
    matches = []
    for seed_pos, _hit in searches:
        # Negative positions denote reverse-strand hits in Biopython's
        # PSSM search. Callers search the reverse complement separately,
        # and the indexing below assumes a forward position, so skip them.
        if seed_pos < 0:
            continue
        match_start = seed_pos
        match_end = seed_pos + seed_length
        # construct index = chromosome index + shift
        shift = seed_slice[0] - seed_pos
        # Walk backwards while construct and genome agree exactly, so the
        # recombination point ends up as far out as perfect homology
        # allows. A SNP outside the altered part of the construct simply
        # stops the walk there.
        try:
            while (construct.seq[match_start + shift] ==
                    chromosome.seq[match_start]):
                # Stop at the first base of the construct, and also at the
                # first base of the (sliced) chromosome: stepping to index
                # -1 would silently wrap around to its last base.
                if match_start + shift <= 0 or match_start <= 0:
                    break
                match_start -= 1
            else:
                # Loop ended on a mismatch: step back onto the last
                # position that still agreed.
                match_start += 1
        except IndexError:
            match_start += 1
        # Walk forwards the same way. Running off the end of either
        # sequence raises IndexError, which ends the walk exactly like a
        # mismatch does (no separate length check is needed for that).
        try:
            while (construct.seq[match_end + shift] ==
                    chromosome.seq[match_end]):
                match_end += 1
            else:
                match_end -= 1
        except IndexError:
            match_end -= 1
        # These are where things match, so let's build it nicely
        reference_match = [match_start + offset, match_end + offset]
        construct_match = [match_start + shift, match_end + shift]
        # Stick it on our list of matches
        matches.append([reference_match, construct_match])
    # And return them all
    return matches
def search_for_motif(target,query,mismatches=0):
    """Search ``target`` for occurrences of ``query`` on the given strand.

    Args:
        target: record to search in; its ``.seq`` is used.
        query: record to search for; its ``.seq`` is used.
        mismatches: number of mismatching positions tolerated. 0 does an
            exact search; a positive value scans a log-odds scoring matrix
            built from the query, which is much slower.

    Returns:
        An iterable of 2-tuples whose first element is the hit position
        on ``target``. The second element is whatever the underlying
        Biopython search yields (per the Biopython motifs documentation:
        the matching instance for the exact search, the score for the
        matrix search).

    Raises:
        ValueError: if ``mismatches`` is negative.
    """
    mismatches = int(mismatches)
    if mismatches < 0:
        # Previously this fell through both branches and failed with an
        # UnboundLocalError at the return statement.
        raise ValueError(
            "mismatches must be 0 or more, got "+str(mismatches))
    # We copy the query sequence we're searching for so that we can
    # make sure the alphabet is Unambiguous
    query = Seq.Seq(str(query.seq),
        alphabet=Alphabet.IUPAC.IUPACUnambiguousDNA())
    # Then we make a motif
    seed_motif = motifs.create([query])
    # We do the same Unambiguous alphabet on the target,
    # the reference genome
    target = Seq.Seq(str(target.seq),
        alphabet=Alphabet.IUPAC.IUPACUnambiguousDNA())
    if mismatches == 0:
        # This is perfect matches only, super faster
        return seed_motif.instances.search(target)
    # Otherwise, we make a position scoring matrix, and convert
    # to log odds so we can scan it along the genome
    pssm = ( seed_motif.counts
        .normalize(pseudocounts=0.1).log_odds() )
    # Average best and worst per-position scores
    pssm_length = len(query)
    pssm_max = pssm.max/pssm_length
    pssm_min = pssm.min/pssm_length
    # Score of a hit with exactly `mismatches` worst-case positions,
    # nudged down slightly so such a hit still passes the threshold
    threshold = (-0.001+(pssm_length-mismatches)*pssm_max+
        mismatches*pssm_min)
    # both=False: scan the given strand only. Callers search the reverse
    # complement themselves and treat positions as forward indices, so
    # the reverse-strand hits that the default both=True adds would be
    # misread.
    # This is sooooo slow. A real tool should have some indexing
    # to quicken the search/alignment. Maybe some sort of bash
    # script using a BWA alignment to seed, then extend and
    # splice using a BioPython script like this.
    return pssm.search(target, threshold=threshold, both=False)
def recombine_at_match(gff,construct,this_match):
    """Splice the construct into one chromosome of ``gff`` at a match.

    Args:
        gff: list of chromosome records; the entry for the matched
            chromosome is replaced in place.
        construct: the construct record to integrate.
        this_match: ``[chromosome_number, strand, [left, right]]`` as
            produced by find_recombination_sites, where ``left`` and
            ``right`` are each ``[[ref_start, ref_end],
            [construct_start, construct_end]]``.

    Returns:
        The same ``gff`` list, with the matched chromosome replaced by
        the recombined record (its ``id`` copied from the original).
    """
    chromosome_index = this_match[0]
    strand = this_match[1]
    left_match = this_match[2][0]
    right_match = this_match[2][1]
    # The chromosome we had a hit on
    chromosome = gff[chromosome_index]

    # On the construct: from the end of the left homology up to the
    # start of the right homology
    integrate_start = left_match[1][1]
    integrate_end = right_match[1][0]
    the_chunk_going_in = construct[integrate_start:integrate_end]

    if strand == "+":
        # On the reference: cut after the left homology, resume at the
        # start of the right homology
        excise_start = left_match[0][1]
        excise_end = right_match[0][0]
    else:
        # Minus-strand coordinates were measured on the reverse
        # complement, so mirror them back onto the forward chromosome
        # and flip the insert to match.
        chromosome_length = len(chromosome)
        excise_end = chromosome_length - left_match[0][1]
        excise_start = chromosome_length - right_match[0][0]
        the_chunk_going_in = the_chunk_going_in.reverse_complement()

    # We report what's up
    print("".join((
        "Recombining reference, splicing on chromosome #",
        str(chromosome_index+1), " after ", str(excise_start),
        ", starting at position ", str(integrate_start),
        " in the construct through to ", str(integrate_end),
        ", then back to ", str(excise_end), " in the reference.",
    )))

    # Reference up to the cut, then the insert, then the reference from
    # the resume point to the end of the chromosome
    recombined = (
        chromosome[:excise_start] +
        the_chunk_going_in +
        chromosome[excise_end:]
    )
    recombined.id = chromosome.id
    gff[chromosome_index] = recombined
    return gff
if __name__ == "__main__":
    # Command-line entry point: read a reference GFF3 (with embedded
    # sequence) and a GenBank construct, find homology for the construct
    # ends, splice the construct in at every match, and write
    # <output_base>.gff, <output_base>.fa and <output_base>.fa.fai.
    parser = argparse.ArgumentParser(description=(
        "recombinator.py Help Update GFF Files (HUGFFF). "
        "It's designed to take a construct in as "
        "a GenBank formatted sequence record (so ApE does that as "
        "an export), and a reference GFF (you can get it from SGD "
        "downloads), search for homology, and splice together a "
        "reference. YMMV, check the work for little indels to make "
        "sure the indexing is right. And re-implement if you want "
        "the fuzzy matching to run in this lifetime. "
        "You can specify the starts and stops for each end of the "
        "construct with the arguments."))
    parser.add_argument("--referenceGFF", required=True)
    parser.add_argument("--constructGenBank", required=True)
    # type=int makes argparse reject non-numeric values up front
    parser.add_argument("--homology_start", type=int, default=0)
    parser.add_argument("--homology_end", type=int, default=20)
    parser.add_argument("--homology_start_5", type=int, default=None)
    parser.add_argument("--homology_end_5", type=int, default=None)
    parser.add_argument("--homology_start_3", type=int, default=None)
    parser.add_argument("--homology_end_3", type=int, default=None)
    parser.add_argument("--mismatches", type=int, default=0)
    parser.add_argument("--mismatches_5", type=int, default=None)
    parser.add_argument("--mismatches_3", type=int, default=None)
    parser.add_argument("--output_base", required=True)
    args=parser.parse_args()

    # Any end-specific option left unset falls back to the shared one
    for end in ("5", "3"):
        for option in ("homology_start", "homology_end", "mismatches"):
            if getattr(args, option+"_"+end) is None:
                setattr(args, option+"_"+end, getattr(args, option))

    print("===")
    print("SETTINGS:")
    for key, value in vars(args).items():
        print("Argument : "+key.ljust(16)+"\tis\t"+str(value))
    # Warn on the values actually used for searching, so setting only
    # --mismatches_5 or --mismatches_3 triggers the warning too
    if args.mismatches_5 > 0 or args.mismatches_3 > 0:
        print()
        print("FYI, you set mismatches > 0. That is slooooooooow. "+
            "If you can get by with using 0 mismatches, please do "+
            "so.")
    print()

    # We want a GFF 3 with the sequence information at the bottom,
    # because that's our source of the chromosome info.
    the_reference = list(GFF.parse(args.referenceGFF))
    for chromosome in the_reference:
        # Because alphabets are hard to inherit I suppose
        chromosome.seq.alphabet = Alphabet.IUPAC.IUPACUnambiguousDNA()
    # You could modify this to take it from a different source, but
    # then you'd want to modify this to make a BioPython SeqRecord
    # with all the annotation in it. The GFF3 is readily downloadable
    # from SGD.

    # Here we read in a GenBank construct
    try:
        constructs = list(SeqIO.parse(args.constructGenBank, "genbank"))
    except Exception as error:
        # The original raised a bare string, which is itself a TypeError
        # in Python 3 and hid this message.
        raise ValueError("Well, I couldn't read that construct in. "+
            "Maybe that's not real genbank format?") from error

    # Here we clean up that construct or constructs. This is primarily
    # for IGV to make the colors pretty, by renaming them as color.
    # Mainly for ApE outputs.
    for each_construct in constructs:
        # Iterate over a copy, since features are replaced as we go
        for i, each_feature in enumerate(list(each_construct.features)):
            qualifiers = dict(each_feature.qualifiers)
            for each_key in list(qualifiers.keys()):
                # Add a copy of each key without the ApEinfo_ prefix, to
                # make it more of a general genbank format (the prefixed
                # key is kept as well)
                qualifiers[each_key.replace("ApEinfo_","")] = \
                    qualifiers[each_key]
            # If color is not defined, fall back to fwdcolor if present
            if "color" not in qualifiers and "fwdcolor" in qualifiers:
                qualifiers["color"] = qualifiers["fwdcolor"]
            # Redefine the feature with the new qualifiers
            each_construct.features[i] = SeqFeature.SeqFeature(
                location=each_feature.location,
                type=each_feature.type,
                strand=each_feature.strand,
                ref=each_feature.ref,
                ref_db=each_feature.ref_db,
                qualifiers=qualifiers
                )
        each_construct.seq.alphabet = Alphabet.IUPAC.IUPACUnambiguousDNA()

        # Go find some matches
        # NOTE(review): with the defaults (0, 20) the 3' slice below is
        # [len-19, len+1], i.e. 19 bases, while the 5' slice is 20 bases.
        # The +1 looks like a possible off-by-one; left unchanged here,
        # confirm the intent before altering it.
        construct_length = len(each_construct.seq)
        some_matches = find_recombination_sites(
            the_reference, each_construct,
            [int(args.homology_start_5), int(args.homology_end_5)],
            [construct_length-int(args.homology_end_3)+1,
                construct_length-int(args.homology_start_3)+1],
            mismatches_5=int(args.mismatches_5),
            mismatches_3=int(args.mismatches_3)
            )
        print()
        print("I found "+str(len(some_matches))+" matches.")
        print()

        # For each match in series, edit the genome. Each construct is
        # applied after the previous one, so multiple constructs in one
        # GenBank file could edit each other's edits.
        # NOTE(review): all matches were located on the unedited
        # reference. If several land on the same chromosome, an earlier
        # splice that changes its length presumably leaves the later
        # coordinates stale - verify the output in that case.
        for each_match in some_matches:
            the_reference = recombine_at_match(the_reference,
                each_construct,each_match)

    # Finally, write out the recombined files, both as a GFF and
    # as a fasta, and index it for you so you can IGV easily.
    with open(args.output_base+".gff", "w") as out_gff, \
            open(args.output_base+".fa", "w") as out_fasta, \
            open(args.output_base+".fa.fai", "w") as out_fasta_index:
        GFF.write(the_reference,out_gff)
        out_gff.write("##FASTA\n")
        index_offset = 0
        for chromosome in the_reference:
            sequence = str(chromosome.seq)
            out_gff.write(">"+chromosome.id+"\n")
            out_gff.write(sequence+"\n")
            out_fasta.write(">"+chromosome.id+"\n")
            out_fasta.write(sequence+"\n")
            # .fai columns: NAME, LENGTH, OFFSET of the first base,
            # LINEBASES, LINEWIDTH. Each sequence is written on a single
            # line, so LINEBASES is the sequence length and LINEWIDTH
            # adds the newline. Offsets count characters, which assumes
            # ASCII ids and "\n" line endings.
            header_width = len(chromosome.id) + 2  # ">" + id + "\n"
            sequence_length = len(sequence)
            out_fasta_index.write("\t".join([
                chromosome.id,
                str(sequence_length),
                str(index_offset + header_width),
                str(sequence_length),
                str(sequence_length + 1),
                ])+"\n")
            index_offset += header_width + sequence_length + 1