#!/usr/bin/env nextflow
// Enable syntax extension
// See https://www.nextflow.io/docs/latest/dsl2.html
nextflow.enable.dsl=2
/*
* Convert the VCF file to BED format, storing the full VCF line in the "name" column (column 4) and the
* reference allele in column 5.
*/
process convertVCFToBed {
label 'default_time', 'med_mem'
input:
path "source.vcf"
output:
path "variants.bed", emit: variants_bed
'''
# Convert the VCF file to BED format:
# - Remove all header lines
# - Switch to the 0-based coordinate system
# - Add the reference allele so that flankingRegionBed can use it to adjust the position of the right flank
# - Append all VCF fields to the name column, separated by the two-character delimiter pipe-caret (|^), so the
#   original formatting of the VCF line is preserved. The gsub replacing percent protects the % character, which
#   printf would otherwise interpret; the gsub replacing spaces prevents bedtools from treating them as field separators
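# Example (hypothetical VCF data line):
#   chr1  12345  rs1  A  G  .  PASS  .
# becomes the BED line (0-based interval, full VCF line packed into column 4, REF allele in column 5):
#   chr1  12344  12345  chr1|^12345|^rs1|^A|^G|^.|^PASS|^.  A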
awk -F '\\t' '{ if (!/^#/){ \
printf $1"\\t"$2-1"\\t"$2"\\t"$1; \
for (i=2; i<=NF; i++){ gsub(/%/, "%%", $i); gsub(/ /, "£€", $i); printf "|^"$i }; print "\\t"$4}; \
}' source.vcf \
> variants.bed
'''
}
/*
* Based on the variants BED file, generate a BED file for each flank.
*/
process flankingRegionBed {
label 'default_time', 'med_mem'
input:
path "variants.bed"
path "genome.chrom.sizes"
val flankingseq
output:
path "flanking_r1.bed", emit: flanking_r1_bed
path "flanking_r2.bed", emit: flanking_r2_bed
script:
// The flanking sequence will start/end one base up/downstream of the variant,
// so we only need to add (flankingseq - 1) bases to reach the requested flank length.
flankingseq = flankingseq - 1
"""
# Adjust the end position of the flank to be one base upstream of the variant
awk 'BEGIN{OFS="\\t"}{\$2=\$2-1; \$3=\$3-1; print \$0}' variants.bed \
| bedtools slop -g genome.chrom.sizes -l $flankingseq -r 0 > flanking_r1.bed
# Adjust the start position of the flank to be one base downstream of the end of variant (\$5 is the reference allele)
awk 'BEGIN{OFS="\\t"}{ \$2=\$2+length(\$5); \$3=\$3+length(\$5); print \$0}' variants.bed \
| bedtools slop -g genome.chrom.sizes -l 0 -r $flankingseq > flanking_r2.bed
"""
}
/*
* Extract the actual flanking region in fasta format.
*/
process flankingRegionFasta {
label 'default_time', 'med_mem'
input:
path "flanking_r1.bed"
path "flanking_r2.bed"
path "genome.fa"
path "genome.fa.fai"
output:
path "variants_read1.fa", emit: variants_read1
path "variants_read2.fa", emit: variants_read2
'''
# Get the fasta sequences for these intervals
bedtools getfasta -fi genome.fa -bed flanking_r1.bed -fo variants_read1.fa
bedtools getfasta -fi genome.fa -bed flanking_r2.bed -fo variants_read2.fa
'''
}
/*
* Extract information about the original variants and put it in the fasta header
*/
process extractVariantInfoToFastaHeader {
label 'default_time', 'med_mem'
input:
path "flanking_r1.bed"
path "flanking_r2.bed"
path "variants_read1.fa"
path "variants_read2.fa"
output:
path "interleaved.fa", emit: interleaved_fasta
path "variant_read1.out.fa", emit: variant_read1_with_info
path "variant_read2.out.fa", emit: variant_read2_with_info
// Disable Nextflow string interpolation using single quotes
// https://www.nextflow.io/docs/latest/script.html#string-interpolation
'''
# Use each variant's line number in the file as a unique fasta entry name
awk '{print ">" NR }' flanking_r1.bed > position.txt
# Extract the original VCF fields (column 4) and restore the whitespace that convertVCFToBed replaced with '£€'
cut -f 4 flanking_r1.bed | sed 's/£€/ /g' > vcf_fields.txt
# Paste the names, VCF fields, then fasta sequences into a new file:
# a space is inserted between the name and the VCF fields,
# then a newline between the VCF fields and the sequence,
# so the VCF fields become the comment of the read 1 fasta entry.
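# The interleaved fasta therefore looks like this for each variant (hypothetical entry named '1'):
#   >1 <original VCF line>    <- read 1 header, VCF fields kept as the fasta comment
#   <left flank sequence>
#   >1                        <- read 2 header
#   <right flank sequence>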
paste -d ' \\n' position.txt vcf_fields.txt <(grep -v '^>' variants_read1.fa) > variant_read1.out.fa
paste -d '\\n' position.txt <(grep -v '^>' variants_read2.fa) > variant_read2.out.fa
paste variant_read1.out.fa variant_read2.out.fa | paste - - | awk -F "\\t" 'BEGIN {OFS="\\n"} {print $1,$3,$2,$4}' > interleaved.fa
'''
}
/*
* Split fasta entries into multiple chunks
*/
process split_fasta {
label 'short_time', 'small_mem'
input:
path interleaved_fasta
val chunk_size
output:
path("read_chunk-*", emit: read_split)
script:
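// Each variant contributes 4 lines to the interleaved fasta (2 headers + 2 sequences),
// hence chunk_size * 4 lines per chunk so both reads of a pair stay in the same chunk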
if (interleaved_fasta.size() > 0)
"""
split -a 5 -d -l ${chunk_size * 4} ${interleaved_fasta} read_chunk-
"""
else
"""
ln -s ${interleaved_fasta} read_chunk-00001
"""
}
/*
* Align the flanking sequences with minimap2
*/
process alignWithMinimap {
label 'med_time'
// Memory required is 10 times the size of the new genome fasta in bytes, or at least 2 GB
// Overwrite base_memory so that the standard retry strategy is used
ext base_memory: { Math.max(file(params.newgenome).size() * 10, 2000000000) }
input:
// 'reads' contains interleaved pairs (first and second read of each pair in the same file)
each path(reads)
// indexing is done on the fly so get the genome directly
path "genome.fa"
val flanklength
output:
path "reads_aligned.bam", emit: reads_aligned_bam
script:
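// minimap2's default index batch size is ~4 Gbp; for larger genomes, -I is raised to ~1.1x the genome
// size, presumably so the whole genome is indexed in a single batch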
index_size = ""
if (file(params.newgenome).size() > 4000000000){
index_size = " -I " + file(params.newgenome).size() * 1.1
}
if (flanklength < 500)
"""
# Options used by the 'sr' preset with some modifications:
# -O6,16 instead of -O12,32 --> reduce indel cost
# -B5 instead of -B10 --> reduce mismatch cost
# --end-bonus 20 --> bonus score when the end of the read aligns to mimic global alignment.
# --secondary=yes -N 2 --> allow up to 2 secondary alignments
# -y carries the comment of each fasta entry through to the SAM output
# the awk script then converts that comment into a valid SAM tag (vr:Z:)
minimap2 $index_size -k21 -w11 --sr --frag=yes -A2 -B5 -O6,16 --end-bonus 20 -E2,1 -r50 -p.5 -z 800,200 \
-f1000,5000 -n2 -m20 -s40 -g200 -2K50m --heap-sort=yes --secondary=yes -N 2 -y \
-a genome.fa ${reads} | \
awk -F '\\t' 'BEGIN{OFS="\\t"}{if(!/^@/){\$NF="vr:Z:"\$NF}; print \$0;}' | \
samtools view -bS - > reads_aligned.bam
"""
else
"""
minimap2 $index_size -k19 -w19 -A2 -B5 -O6,16 --end-bonus 20 -E3,1 -s200 -z200 -N50 --min-occ-floor=100 \
--secondary=yes -N 2 -y \
-a genome.fa ${reads} | \
awk -F '\\t' 'BEGIN{OFS="\\t"}{if(!/^@/){\$NF="vr:Z:"\$NF}; print \$0;}' | \
samtools view -bS - > reads_aligned.bam
"""
}
/*
* Sort BAM file by name
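* (presumably so that the two flank alignments of each variant end up adjacent for downstream pairing)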
*/
process sortByName {
label 'default_time', 'med_mem'
input:
path "reads_aligned.bam"
output:
path "reads_aligned_name_sorted.bam", emit: reads_aligned_sorted_bam
"""
samtools sort -n -O BAM -o reads_aligned_name_sorted.bam reads_aligned.bam
"""
}
/*
* Align the flanking sequences with bowtie2
*/
process alignWithBowtie {
label 'med_time'
// Memory required is 5 times the size of the new genome fasta in bytes, or at least 1 GiB
// Overwrite base_memory so that the standard retry strategy is used
ext base_memory: { Math.max(file(params.newgenome).size() * 5, 1073741824) }
input:
path "variant_read1.fa"
path "variant_read2.fa"
// Stage all the index files, keeping their original names, inside a 'bowtie_index' subdirectory
file "bowtie_index/*"
output:
path "reads_aligned.bam", emit: reads_aligned_bam
"""
bowtie2 -k 2 --end-to-end --np 0 --sam-append-comment -f -x bowtie_index/bowtie_index \
-1 variant_read1.fa -2 variant_read2.fa \
| awk -F '\\t' 'BEGIN{OFS="\\t"}{if(!/^@/){\$NF="vr:Z:"\$NF}; print \$0;}' \
| samtools view -bS - > reads_aligned.bam
"""
}
/*
* Take the reads and process them to get the remapped variants
*/
process readsToRemappedVariants {
label 'default_time', 'med_mem'
input:
path "reads.bam"
path "genome.fa"
val flank_length
val filter_align_with_secondary
output:
path "variants_remapped.vcf", emit: variants_remapped
path "variants_unmapped.vcf", emit: variants_unmapped
path "summary.yml", emit: summary_yml
script:
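// filter_align_with_secondary toggles the script's --filter_align_with_secondary flag,
// which (going by its name) rejects remapped variants whose reads also have secondary alignments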
if (filter_align_with_secondary)
"""
# Ensure that we will use the reads_to_remapped_variants.py from this repo
${baseDir}/variant_remapping_tools/reads_to_remapped_variants.py -i reads.bam \
-o variants_remapped.vcf --newgenome genome.fa --out_failed_file variants_unmapped.vcf \
--flank_length $flank_length --summary summary.yml --filter_align_with_secondary
"""
else
"""
# Ensure that we will use the reads_to_remapped_variants.py from this repo
${baseDir}/variant_remapping_tools/reads_to_remapped_variants.py -i reads*.bam \
-o variants_remapped.vcf --newgenome genome.fa --out_failed_file variants_unmapped.vcf \
--flank_length $flank_length --summary summary.yml
"""
}
/*
* Gather step for the remapped and unmapped variants and the per-chunk summary yaml files
*/
process merge_variants {
label 'short_time', 'small_mem'
input:
path "remapped*.vcf"
path "unmapped*.vcf"
path "summary*.yml"
output:
path "variants_remapped.vcf", emit: variants_remapped
path "variants_unmapped.vcf", emit: variants_unmapped
path "output_summary.yml", emit: summary_yml
"""
cat remapped*.vcf > variants_remapped.vcf
cat unmapped*.vcf > variants_unmapped.vcf
${baseDir}/variant_remapping_tools/merge_yaml.py --input summary*.yml --output output_summary.yml
"""
}
workflow process_split_reads_generic {
take:
source_vcf
old_genome_fa
old_genome_fa_fai
old_genome_chrom_sizes
new_genome_fa
new_genome_fa_fai
flank_length
filter_align_with_secondary
chunk_size
main:
convertVCFToBed(source_vcf)
flankingRegionBed(convertVCFToBed.out.variants_bed, old_genome_chrom_sizes, flank_length)
flankingRegionFasta(
flankingRegionBed.out.flanking_r1_bed, flankingRegionBed.out.flanking_r2_bed,
old_genome_fa, old_genome_fa_fai
)
extractVariantInfoToFastaHeader(
flankingRegionBed.out.flanking_r1_bed, flankingRegionBed.out.flanking_r2_bed,
flankingRegionFasta.out.variants_read1, flankingRegionFasta.out.variants_read2
)
split_fasta(
extractVariantInfoToFastaHeader.out.interleaved_fasta,
chunk_size
)
alignWithMinimap(
split_fasta.out.read_split,
new_genome_fa,
flank_length
)
sortByName(alignWithMinimap.out.reads_aligned_bam)
// Each name-sorted BAM chunk is processed separately; the per-chunk outputs are gathered in merge_variants below
readsToRemappedVariants(
sortByName.out.reads_aligned_sorted_bam, new_genome_fa,
flank_length, filter_align_with_secondary
)
merge_variants(
readsToRemappedVariants.out.variants_remapped.collect(),
readsToRemappedVariants.out.variants_unmapped.collect(),
readsToRemappedVariants.out.summary_yml.collect()
)
emit:
variants_remapped = merge_variants.out.variants_remapped
variants_unmapped = merge_variants.out.variants_unmapped
summary_yml = merge_variants.out.summary_yml
}
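/*
* Preset wrappers around process_split_reads_generic: short flanks (50 bp, 5M variants per chunk),
* mid flanks (2 kb, 500k variants per chunk) and long flanks (50 kb, 50k variants per chunk).
* Only the long preset sets filter_align_with_secondary = false.
*/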
workflow process_split_reads {
take:
source_vcf
old_genome_fa
old_genome_fa_fai
old_genome_chrom_sizes
new_genome_fa
new_genome_fa_fai
main:
flank_length = 50
filter_align_with_secondary = true
chunk_size = 5000000
process_split_reads_generic(
source_vcf, old_genome_fa, old_genome_fa_fai, old_genome_chrom_sizes,
new_genome_fa, new_genome_fa_fai, flank_length, filter_align_with_secondary,
chunk_size
)
emit:
variants_remapped = process_split_reads_generic.out.variants_remapped
variants_unmapped = process_split_reads_generic.out.variants_unmapped
summary_yml = process_split_reads_generic.out.summary_yml
}
workflow process_split_reads_mid {
take:
source_vcf
old_genome_fa
old_genome_fa_fai
old_genome_chrom_sizes
new_genome_fa
new_genome_fa_fai
main:
flank_length = 2000
filter_align_with_secondary = true
chunk_size = 500000
process_split_reads_generic(
source_vcf, old_genome_fa, old_genome_fa_fai, old_genome_chrom_sizes,
new_genome_fa, new_genome_fa_fai, flank_length, filter_align_with_secondary,
chunk_size
)
emit:
variants_remapped = process_split_reads_generic.out.variants_remapped
variants_unmapped = process_split_reads_generic.out.variants_unmapped
summary_yml = process_split_reads_generic.out.summary_yml
}
workflow process_split_reads_long {
take:
source_vcf
old_genome_fa
old_genome_fa_fai
old_genome_chrom_sizes
new_genome_fa
new_genome_fa_fai
main:
flank_length = 50000
filter_align_with_secondary = false
chunk_size = 50000
process_split_reads_generic(
source_vcf, old_genome_fa, old_genome_fa_fai, old_genome_chrom_sizes,
new_genome_fa, new_genome_fa_fai, flank_length, filter_align_with_secondary,
chunk_size
)
emit:
variants_remapped = process_split_reads_generic.out.variants_remapped
variants_unmapped = process_split_reads_generic.out.variants_unmapped
summary_yml = process_split_reads_generic.out.summary_yml
}
workflow process_split_reads_with_bowtie {
take:
source_vcf
old_genome_fa
old_genome_fa_fai
old_genome_chrom_sizes
new_genome_fa
new_genome_fa_fai
new_genome_bowtie_index
main:
flank_length = 50
convertVCFToBed(source_vcf)
flankingRegionBed(convertVCFToBed.out.variants_bed, old_genome_chrom_sizes, flank_length)
flankingRegionFasta(
flankingRegionBed.out.flanking_r1_bed, flankingRegionBed.out.flanking_r2_bed,
old_genome_fa, old_genome_fa_fai
)
extractVariantInfoToFastaHeader(
flankingRegionBed.out.flanking_r1_bed, flankingRegionBed.out.flanking_r2_bed,
flankingRegionFasta.out.variants_read1, flankingRegionFasta.out.variants_read2
)
alignWithBowtie(
extractVariantInfoToFastaHeader.out.variant_read1_with_info,
extractVariantInfoToFastaHeader.out.variant_read2_with_info,
new_genome_bowtie_index
)
readsToRemappedVariants(alignWithBowtie.out.reads_aligned_bam, new_genome_fa, flank_length, true)
emit:
variants_remapped = readsToRemappedVariants.out.variants_remapped
variants_unmapped = readsToRemappedVariants.out.variants_unmapped
summary_yml = readsToRemappedVariants.out.summary_yml
}