-
Notifications
You must be signed in to change notification settings - Fork 0
/
doSameSpeciesLiftOver.nf
461 lines (413 loc) · 15 KB
/
doSameSpeciesLiftOver.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
//Nextflow (DSL1) pipeline: build a UCSC liftOver chain between two assemblies of the
//same species, then lift GFF3 annotations from the old assembly onto the new one.
//Inspired by: https://genome-source.gi.ucsc.edu/gitlist/kent.git/raw/master/src/hg/utils/automation/doSameSpeciesLiftOver.pl
//And this: https://iamphioxus.org/2013/06/25/using-liftover-to-convert-genome-assembly-coordinates/
//Also this: https://github.com/wurmlab/flo
//This is related: https://bioconductor.org/packages/release/bioc/vignettes/CNEr/inst/doc/PairwiseWholeGenomeAlignment.html
//Should preinstall the conda environment, as otherwise the script just spends all its time
//installing dependencies.
//conda activate doSameSpeciesLiftOver
//conda install ucsc-fatotwobit blat ucsc-fasplit ucsc-liftup ucsc-axtchain ucsc-chainmergesort ucsc-chainsplit ucsc-chainsort seqkit ucsc-chainnet ucsc-netchainsubset ucsc-liftover genometools gffutils
//-fastMap in blat: skip the time-consuming stage of merging alignment blocks that have gaps between them
params.oldFasta = "old.fasta" //Assembly the annotations currently live on (blat database / axtChain target).
params.newFasta = "new.fasta" //Assembly the annotations are lifted onto (axtChain query).
params.gff = "example.gff3" //Comma-separated list of GFF3 files to lift.
params.recordSplit = 1 //Split MultiFasta into files with this many fasta records. Default = 1 . Higher for less parallelization.
params.splitSize = 1000000000 //length of a section, in base pairs. 5000 bp is the maximum size allowed for blat -fastmap. Interacts with param.extraBases. Larger to disable.
params.extraBases = 0 //Extra bases for splitFA for overlaps. E.g. if splitSize is 2500, and extraBases is 2500, splitFA will make 5000 sections that overlap 2500 bp. 0 to disable.
params.splitDepth = 1000000000 //number of subrecord chunks (of length splitSize) per blat invocation. 100 for typical invocation (if chainMerge worked properly). Smaller for more parallelization. Larger to disable.
oldGenome = Channel.fromPath(params.oldFasta)
newGenome = Channel.fromPath(params.newFasta)
gffFiles = params.gff.tokenize(",")
gffFiles_ch = Channel.fromPath(gffFiles).flatten()
//DSL1 channels are single-use: duplicate them so each consuming process gets its own copy.
gffFiles_ch.into{ gffFile_1 ; gffFile_2 ; gffFile_3 }
oldGenome.into { oldGenome_1 ; oldGenome_2 ; oldGenome_3 ; oldGenome_4 }
newGenome.into { newGenome_1 ; newGenome_2 ; newGenome_3 }
//Split multi-FASTA file into multiple files with typically one FASTA record per file
newGenome_2.splitFasta(by:params.recordSplit,file:true).set{fastaChunks}
//Convert the old-assembly FASTA to UCSC 2bit format; blat uses it as its database.
process convertFAto2bit_old {
conda "ucsc-fatotwobit"
//Alternatively, can preinstall all the requirements to a particular location:
// conda params.totalCondaEnvPath
tag "$fasta"
input:
file fasta from oldGenome_1
output:
file "${fasta}.2bit" into old_2bit
script:
"""
faToTwoBit ${fasta} ${fasta}.2bit
"""
}
//Duplicate the 2bit channel: one copy feeds blat alignment, the other the ooc builder.
old_2bit.into{ old_2bit_1 ; old_2bit_2 }
//Build the over-represented 11-mer ("ooc") file so blat can skip repetitive tile seeds.
process constructOocFile {
conda "blat"
// conda params.totalCondaEnvPath
tag "$old_2bit"
input:
file old_2bit from old_2bit_2
output:
file "${old_2bit}.ooc" into ooc
script:
"""
##TODO Should follow protocol for repMatch using the "Construct ooc file" instructions from http://genomewiki.ucsc.edu/index.php/DoSameSpeciesLiftOver.pl
blat ${old_2bit} /dev/null /dev/null -stepSize=1 -tileSize=11 -makeOoc=${old_2bit}.ooc -repMatch=4096
"""
}
//Sub-split each per-record FASTA chunk into sections of params.splitSize bp (with an
//optional params.extraBases overlap), writing a liftUp (.lft) file that maps section
//coordinates back to the full record.
process subRecordChunks {
conda "ucsc-fasplit"
//conda params.totalCondaEnvPath
input:
file fastaChunk from fastaChunks
output:
set file("${fastaChunk}"), file("${fastaChunk}.lft"),file("${fastaChunk}.subsplit.fa") into subsplitFasta_liftUp
tag "${fastaChunk}"
script:
"""
faSplit size ${fastaChunk} ${params.splitSize} ${fastaChunk}.subsplit -lift=${fastaChunk}.lft -oneFile -extra=${params.extraBases}
"""
}
//Take the 1-per-scaffold FASTA files, which were subsplit internally into ~3000-4000 bp chunks, and separate them into 1-per-section chunks
//In memory splitting.
//Transpose is really the secret sauce below for getting the files setup properly.
//After transpose, each emitted tuple is (original chunk FASTA, its .lft file, ONE sub-section FASTA).
subsplitFasta_liftUp.map{ values ->
subChunks = values[2].splitFasta(by:params.splitDepth,file:true) //split a subrecord multiFASTA into individual FASTAs
return tuple(values[0],values[1],subChunks)}.transpose().set{ subFastaChunks }
//Pair every sub-chunk with the old-assembly 2bit and the ooc file, forming one blat job per tuple.
subFastaChunks.combine(old_2bit_1).combine(ooc).set{blatCmds}
//Align each new-assembly section against the old assembly with blat, then use liftUp
//to convert the PSL query coordinates from section space back to full-record space.
//-fastMap is only attempted when sections are small enough (params.splitSize < 4000 bp).
process blat_align {
conda "blat ucsc-fasplit ucsc-liftup"
//conda params.totalCondaEnvPath
memory '4 GB'
input:
set file(originalFasta),file(liftupFile),file(fastaSubChunk),file(old_2bit),file(ooc) from blatCmds
output:
file "${fastaSubChunk}.lifted.psl" into axtChainCmds
tag "${fastaSubChunk}"
script:
"""
if [ "${params.splitSize}" -lt "4000" ]; then
blat ${old_2bit} ${fastaSubChunk} -ooc=${ooc} -maxIntron=0 -stepSize=1 -tileSize=11 -minIdentity=98 -noHead -minScore=100 -fastMap -extendThroughN ${fastaSubChunk}.subsplit.psl
else
blat ${old_2bit} ${fastaSubChunk} -ooc=${ooc} -maxIntron=0 -stepSize=1 -tileSize=11 -minIdentity=98 -noHead -minScore=100 -extendThroughN ${fastaSubChunk}.subsplit.psl
fi
liftUp -pslQ ${fastaSubChunk}.lifted.psl ${liftupFile} warn ${fastaSubChunk}.subsplit.psl
"""
}
//Concatenate all lifted PSLs (collectFile) and chain the alignments together.
//Old assembly is the target (-faT), new assembly is the query (-faQ).
process axtChain {
conda "ucsc-axtchain ucsc-fatotwobit"
//conda params.totalCondaEnvPath
input:
file pslFile from axtChainCmds.collectFile(name:"merged.psl",keepHeader:true,skip:5)
file oldFasta from oldGenome_3
file newFasta from newGenome_3
output:
file "${pslFile}.chain" into chains
tag "${pslFile}"
script:
"""
axtChain -linearGap=loose -faQ -faT -psl ${pslFile} ${oldFasta} ${newFasta} ${pslFile}.chain
"""
}
//Score-sort each chain file before the merge step; chainMergeSort expects its
//inputs to already be sorted.
process chainSortFirst {
conda "ucsc-chainsort"
input:
file unsortedChain from chains
output:
file "sorted.${unsortedChain}" into sorted_chains
script:
"""
chainSort ${unsortedChain} sorted.${unsortedChain}
"""
}
//Merge-sort every per-chunk sorted chain file and split the combined stream into
//up to 50 lumped chain files under the chainMerge/ output directory.
process chainMergeSort_chainSplit {
conda "ucsc-chainmergesort ucsc-chainsplit"
//conda params.totalCondaEnvPath
tag "$chainFile"
input:
file chainFile from sorted_chains.collect()
output:
file "chainMerge/*.chain" into sortMergedChains
script:
"""
##chainMerge is the output directory
chainMergeSort ${chainFile} | chainSplit chainMerge stdin -lump=50
"""
}
//Concatenate the lumped chain files (collectFile) into one file and score-sort it;
//chainNet downstream requires sorted input.
process chainSortSecond {
conda "ucsc-chainsort"
//conda params.totalCondaEnvPath
tag "$chainFile"
input:
file chainFile from sortMergedChains.collectFile(name: 'all.chain')
output:
file "all.sorted.chain" into allSortedChain_1, allSortedChain_2
script:
"""
##Reference the staged input through the channel variable instead of hard-coding
##'all.chain', so the command stays correct if the collectFile name ever changes.
chainSort ${chainFile} all.sorted.chain
"""
}
//Build chromInfo tables (sequence name <TAB> length, sorted by decreasing length)
//for both assemblies; chainNet needs these to know chromosome sizes.
process calculateChromInfo {
conda "seqkit"
//conda params.totalCondaEnvPath
tag "$oldGenome and $newGenome"
input:
file oldGenome from oldGenome_2
file newGenome from newGenome_1
output:
set file("${oldGenome}.chromInfo"),file("${newGenome}.chromInfo") into chromInfos
script:
"""
##Equivalent command that can be run on FASTA files:
seqkit fx2tab --only-id -nl ${oldGenome} | tr -s "\t" | sort -k2,2nr > ${oldGenome}.chromInfo
seqkit fx2tab --only-id -nl ${newGenome} | tr -s "\t" | sort -k2,2nr > ${newGenome}.chromInfo
##Old way that used ucsc-twobitinfo from a 2bit file.
##twoBitInfo new.2bit new.2bit.chromInfo
##twoBitInfo old.2bit old.2bit.chromInfo
"""
}
//Derive the single-coverage alignment net from the sorted chains.
//Only the target-side net (all.net) is kept; the query-side net goes to /dev/null.
process chainNet {
conda "ucsc-chainnet"
//conda params.totalCondaEnvPath
tag "$allSortedChain"
input:
file allSortedChain from allSortedChain_1
set file(oldInfo),file(newInfo) from chromInfos
output:
file "all.net" into netFile
script:
"""
chainNet ${allSortedChain} ${oldInfo} ${newInfo} all.net /dev/null
"""
}
//Extract the subset of chains covered by the net: this is the final liftOver
//mapping file, published to ./liftover_output/.
process produceLiftOverFile {
conda "ucsc-netchainsubset"
//conda params.totalCondaEnvPath
publishDir './liftover_output/',mode:'copy',overwrite:true
tag "$netFile & $allSortedChain_2"
input:
file netFile
file allSortedChain_2
output:
file "final.liftOver" into liftOverFile_1, liftOverFile_2
script:
"""
netChainSubset ${netFile} ${allSortedChain_2} final.liftOver
"""
}
//Lift the GFF with CrossMap as an independent cross-check alongside UCSC liftOver.
//NOTE(review): the bioconda "crossmap" package installs its entry point as
//"CrossMap.py" (capitalized); confirm the lowercase "crossmap.py" spelling below
//actually resolves inside this conda environment on a case-sensitive filesystem.
process crossmap_liftover {
conda "crossmap=0.3.7"
input:
file gffFile from gffFile_3
file liftOverFile from liftOverFile_1
output:
file "crossmap-lifted_${gffFile}" into crossmap_lifted_gff
script:
"""
crossmap.py -v
crossmap.py gff ${liftOverFile} ${gffFile} crossmap-lifted_${gffFile}
###Below line is to fix a bug in crossmap where it outputs coordinates as floats rather than integers
##Also bug where certain scores for features were set to null?
##cat lifted_unsorted_unfixed_${gffFile} | sed \$'s/.0\t/\t/g' | sed \$'s/\t\t/\t0\t/g'> crossmap-lifted_${gffFile}
###
"""
}
//Pair each input GFF with the old-assembly FASTA.
gffFile_1.combine(oldGenome_4).set{normalizeCmds}
//Normalize each GFF3 with genometools and split it into features located on
//scaffolds present in the old assembly (target.*) vs. everything else (ignored.*).
//Fix: this process invokes seqkit and gt but previously declared no conda
//environment, unlike every other process in the pipeline, so it failed unless
//those tools happened to be on PATH.
process normalizeGff {
conda "seqkit genometools-genometools"
publishDir './liftover_output/',mode:'copy',overwrite:true
tag "$gff by $fasta"
input:
set file(gff),file(fasta) from normalizeCmds
output:
file "target.${gff}.gff3" optional true into normalizedGff
file "ignored.${gff}.gff3"
script:
"""
##Scaffold IDs from the reference FASTA become the grep patterns.
seqkit fx2tab --only-id -n ${fasta} | tr -s "\t" > target_scaffolds.txt
##Also match the gff-version pragma so it is retained in the target file.
echo "##gff-version 3" >> target_scaffolds.txt
gt gff3 -tidy -sort -retainids -fixregionboundaries ${gff} > normalized.${gff}.gff3
##NOTE(review): plain 'grep -f' does substring matching, so an ID like "chr1"
##also matches "chr10"; consider 'grep -w -f' if IDs are prefix-ambiguous -- TODO confirm.
grep -f target_scaffolds.txt normalized.${gff}.gff3 > target.${gff}.gff3
grep -v -f target_scaffolds.txt normalized.${gff}.gff3 > ignored.${gff}.gff3
if [[ \$(wc -l <target.${gff}.gff3) -le 1 ]]
then
echo "No targets were found. Deleting target gff3 file."
rm -f target.${gff}.gff3
fi
"""
}
//process ucsc_gff3togenepred {
//conda "ucsc-gff3togenepred genometools"
//input:
// file gffFile from gffFile_1
//output:
// file "${gffFile}.gp" into gpFile
//script:
//"""
//
//gt gff3 -tidy -sort -retainids -fixregionboundaries ${gffFile} > tmp.gff3
//gff3ToGenePred tmp.gff3 ${gffFile}.gp
//"""
//}
//Pair each normalized target GFF with the final liftOver file.
normalizedGff.combine(liftOverFile_2).set{ liftoverCmds }
//Run UCSC liftOver on the GFF. Emits the lifted features, the records liftOver
//could not map, and a symlink to the input GFF (staged as original_* so the
//downstream rescue step can re-read the pre-lift records).
process ucsc_liftover {
conda "ucsc-liftover"
//conda params.totalCondaEnvPath
tag "$gffFile & liftOverFile"
input:
set file(gffFile),file(liftOverFile) from liftoverCmds
output:
file "original_${gffFile}" into gffOriginal
file "ucsc-lifted_${gffFile}" into ucsc_lifted_gff, ucsc_lifted_gff_ch2
file "unmapped_${gffFile}" into unmapped_gff, unmapped_gff_ch2
script:
"""
liftOver -gff ${gffFile} ${liftOverFile} ucsc-lifted_${gffFile} unmapped_${gffFile}
ln -s ${gffFile} original_${gffFile}
"""
}
//Rescue parent features (e.g. gene/mRNA) that liftOver failed to map -- typically
//because they span regions that changed, or NNNN stretches blat refuses to align --
//by recomputing their extent from their successfully lifted child features:
//1) Load the unmapped features and record their IDs and extents.
//2) In the original GFF, find the child features whose start/end coincide with the
//   parent's extent (i.e. the parent's extent was defined by its children).
//3) In the lifted GFF, look up those children's new coordinates and patch the
//   parent's start/end (and scaffold) accordingly.
//Fixes over the previous version: the ID/Parent regexes used "(.+)[;\$]", where the
//'\$' inside a character class is a LITERAL dollar sign, so IDs at the end of column 9
//were never matched and greedy ".+" over-captured when several ';' followed; debug
//print statements placed after 'continue' were unreachable; blank lines crashed on
//l[0]; child lookup was an O(n^2) list scan.
process rescue_unlifted_features {
tag "$unmapped_gff_ch2"
input:
file gffOriginal
file ucsc_lifted_gff_ch2
file unmapped_gff_ch2
output:
file "rescued.gff" into rescuedGff
script:
"""
#!/usr/bin/env python
import glob
import re

##A GFF attribute value ends at the next ';' or at the end of column 9.
ID_RE = re.compile("ID=([^;]+)")
PARENT_RE = re.compile("Parent=([^;]+)")

lifted_path = glob.glob("./ucsc-lifted_*")[0]
unmapped_path = glob.glob("./unmapped_*")[0]
original_path = glob.glob("./original_*")[0]

##1) Load unmapped features, keyed by their ID attribute.
unmapped_ids = dict()
with open(unmapped_path, "r") as handle:
    for line in handle:
        line = line.rstrip()
        if not line or line.startswith("#"):
            ##Skip blanks and comments (e.g. liftOver's "#Deleted in new" markers).
            continue
        fields = line.split("\t")
        id_match = ID_RE.search(fields[8])
        if id_match is None:
            continue
        unmapped_ids[id_match.group(1)] = {
            "line": line,
            "type": fields[2],
            "start": int(fields[3]),
            "end": int(fields[4]),
            "left_child": None,
            "right_child": None,
            "childs_scaffold": None,
        }

##2) In the original GFF, find the children that define each unmapped parent's extent.
child_ids = set()
child_to_parent = dict()
with open(original_path, "r") as handle:
    for line in handle:
        line = line.rstrip()
        if not line or line.startswith("#"):
            continue
        fields = line.split("\t")
        parent_match = PARENT_RE.search(fields[8])
        id_match = ID_RE.search(fields[8])
        if parent_match is None or id_match is None:
            continue
        parent_id = parent_match.group(1)
        if parent_id not in unmapped_ids:
            continue
        child_id = id_match.group(1)
        if int(fields[3]) == unmapped_ids[parent_id]["start"]:
            unmapped_ids[parent_id]["left_child"] = child_id
            child_ids.add(child_id)
            child_to_parent[child_id] = parent_id
        if int(fields[4]) == unmapped_ids[parent_id]["end"]:
            unmapped_ids[parent_id]["right_child"] = child_id
            child_ids.add(child_id)
            child_to_parent[child_id] = parent_id

##3) Patch the parents' extents from the lifted coordinates of those children.
with open(lifted_path, "r") as handle:
    for line in handle:
        line = line.rstrip()
        if not line or line.startswith("#"):
            continue
        fields = line.split("\t")
        id_match = ID_RE.search(fields[8])
        if id_match is None:
            continue
        feature_id = id_match.group(1)
        if feature_id not in child_ids:
            continue
        parent = unmapped_ids[child_to_parent[feature_id]]
        parent["childs_scaffold"] = fields[0]
        if parent["left_child"] == feature_id:
            parent["start"] = int(fields[3])
        if parent["right_child"] == feature_id:
            parent["end"] = int(fields[4])

##Write the (possibly) patched parent records.
with open("rescued.gff", "w") as write_handle:
    for info in unmapped_ids.values():
        fields = info["line"].split("\t")
        ##Keep the original scaffold if no lifted child provided a new one.
        scaffold = fields[0] if info["childs_scaffold"] is None else info["childs_scaffold"]
        rescued = "\t".join([scaffold, fields[1], fields[2], str(info["start"]), str(info["end"]), fields[5], fields[6], fields[7], fields[8]])
        write_handle.write(rescued + "\\n")
"""
}
//ucsc_lifted_gff.mix(crossmap_lifted_gff).set{liftedGffUnsorted}
//Merge the ucsc-lifted GFF with the rescued parent records, strip comment lines,
//and tidy/sort the result with genometools. Publishes the sorted GFF and the
//unmapped records to ./liftover_output/.
//Fix: removed the THENAME/NEWNAME shell variables -- they were computed but never
//referenced anywhere in the script (dead code).
process sort_gff {
publishDir './liftover_output/',mode:'copy',overwrite:true
conda "genometools-genometools"
//conda params.totalCondaEnvPath
input:
file gff from ucsc_lifted_gff
file unmapped from unmapped_gff
file rescued from rescuedGff
output:
file "srt_${gff}" into final_gff
file "${unmapped}"
tag "${gff}"
script:
"""
cat ${gff} ${rescued} | grep -v "#" | gt gff3 -tidy -sort -retainids > srt_${gff}
"""
}
//Final step: list the feature types present in the original GFF (written to
//feature_types.txt in the work dir) and print a completion message.
//'echo true' forwards the process stdout to the console.
process compare_gffs {
echo true
tag "$ogff vs. $fgff"
input:
file ogff from gffFile_2
file fgff from final_gff
script:
"""
cat ${ogff} | grep -v "#" | cut -f 3 | sort | uniq > feature_types.txt
echo "Done with pipeline. Check the liftover_output folder"
"""
}