-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSnakefile
317 lines (279 loc) · 10.7 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
import gzip
import os
import tempfile
from tempfile import TemporaryDirectory
# Prepended to every shell command issued by the workflow.
shell.prefix(' set -euo pipefail ;')

# Prepended to the fasterq-dump calls in the WGS download rules.
VDB_CONFIG_PRELUDE = 'export VDB_CONFIG=/usr/local/apps/ncbi/config/biowulf.kfg'

# WES test data comes from a published "reference dataset" for WGS and WES
# https://www.nature.com/articles/s41597-021-01077-5#Sec28
WGS_accessions = {
    'normal': 'SRR7890855',
    'tumor': 'SRR7890854',
}

# Paths of the subset reference and of the two subset variant files.
reference = 'data/GRCh38.6.20.fa.gz'
known = 'data/known_variation_noiupac.vcf.gz'
dbnsfp = 'data/dbnsfp_6_20.vcf.gz'

# Upper bounds on the number of read names kept by the small_fastq rule:
# properly paired names inside the LIMIT regions, and unmapped names.
mapped_n = 2000000
unmapped_n = 1000
# Default target: the down-sampled paired FASTQs of every sample, the subset
# exon BED file, and the two subset variant files. The subset FASTA reference
# is built on the way as a dependency of these targets.
rule all:
    input:
        expand(
            'data/{sample}_R{n}.6.20.fq.gz',
            n=[1, 2],
            sample=list(WGS_accessions),
        ),
        'data/exons_subset.bed',
        known,
        dbnsfp
rule known_variation:
    """
    Download the known variation file. Remove iupac codes and subset.

    The Ensembl release-102 variation VCFs (and their .csi indexes) for
    chromosomes 6 and 20 are downloaded and concatenated, the header is
    rewritten from the subset reference's .fai, IUPAC alleles are fixed with
    `rbt vcf-fix-iupac-alleles`, and only the records overlapping
    data/LIMIT.bed are kept. The header and the selected records are collected
    in a plain-text VCF first and then bgzip-compressed, so the output really
    is a compressed file as its .vcf.gz name says.
    """
    input:
        fai='data/GRCh38.6.20.fa.gz.fai',
        limit='data/LIMIT.bed'
    output:
        'data/known_variation_noiupac.vcf.gz'
    resources:
        mem_mb=1024 * 16,
        disk_mb=1024 * 16,
        runtime=60
    run:
        # The shell steps run inside a scratch directory, so the relative
        # Snakemake paths are anchored to the directory we started from.
        workdir = os.getcwd()
        with tempfile.TemporaryDirectory() as tmpdir:
            shell(
                '( cd {tmpdir}; '
                'curl '
                '-O ftp://ftp.ensembl.org/pub/release-102/variation/vcf/homo_sapiens/homo_sapiens-chr6.vcf.gz '
                '-O ftp://ftp.ensembl.org/pub/release-102/variation/vcf/homo_sapiens/homo_sapiens-chr6.vcf.gz.csi '
                '-O ftp://ftp.ensembl.org/pub/release-102/variation/vcf/homo_sapiens/homo_sapiens-chr20.vcf.gz '
                '-O ftp://ftp.ensembl.org/pub/release-102/variation/vcf/homo_sapiens/homo_sapiens-chr20.vcf.gz.csi && '
                'bcftools concat -Oz --naive homo_sapiens-chr6.vcf.gz homo_sapiens-chr20.vcf.gz > concat.vcf.gz && '
                'bcftools reheader --fai {workdir}/{input.fai} concat.vcf.gz > tmp_known_variation.vcf.gz && '
                'rbt vcf-fix-iupac-alleles < tmp_known_variation.vcf.gz | bcftools view -Oz > tmp_no_iupac.vcf.gz && '
                # Header first, then the records inside the LIMIT regions.
                "zgrep '^#' tmp_no_iupac.vcf.gz > subset.vcf && "
                'tabix -p vcf tmp_no_iupac.vcf.gz && '
                'tabix -R {workdir}/{input.limit} tmp_no_iupac.vcf.gz >> subset.vcf && '
                # Compress only once the plain-text subset is complete.
                'bgzip -c subset.vcf > {workdir}/{output} ) '
            )
rule dbnsfp:
    """
    Build the dbNSFP extract for chromosomes 6 and 20.

    The archive is downloaded and unpacked in a temporary directory, the
    chr6 and chr20 variant tables are sorted by chromosome and position and
    put under the first line of the chr1* tables (used as header), and the
    result is restricted to the regions of data/LIMIT.bed. The bgzipped subset
    and its tabix index are the outputs.
    """
    input:
        bed='data/LIMIT.bed'
    output:
        vcf='data/dbnsfp_6_20.vcf.gz',
        tbi='data/dbnsfp_6_20.vcf.gz.tbi'
    threads: 16
    resources:
        mem_mb=1024 * 200,
        disk_mb=1024 * 200,
        runtime=60 * 8
    run:
        # Everything up to the final bgzip runs in a throw-away directory,
        # hence the absolute prefix on the Snakemake paths inside the subshell.
        launch_dir = os.getcwd()
        with tempfile.TemporaryDirectory() as scratch:
            shell(
                # Fetch and unpack; keep the first line as the header.
                '(cd {scratch}; wget -O- https://usf.box.com/shared/static/bvfzmkpgtphvbmmrvb2iyl2jl21o49kc > dbnsfp.zip && '
                'unzip dbnsfp.zip && zcat dbNSFP*_variant.chr1* | awk "NR<=1" > h && '
                # Body rows of chr6 and chr20, sorted by chromosome then position.
                'zgrep -v "^#" dbNSFP*_variant.chr6* > chrs && '
                'zgrep -v "^#" dbNSFP*_variant.chr20* >> chrs && '
                'sort -S 50% --parallel=8 chrs -k1,1 -k2,2n > chrs_sorted && '
                'cat h chrs_sorted > chrs_sorted_header && '
                # Index the full table so it can be cut down to the LIMIT regions.
                'bgzip -c chrs_sorted_header > tmp.vcf.gz && '
                'tabix -p vcf tmp.vcf.gz && '
                'zgrep ^# tmp.vcf.gz > full.tmp.vcf && '
                'tabix -R {launch_dir}/{input.bed} tmp.vcf.gz >> full.tmp.vcf && '
                'bgzip -c full.tmp.vcf > {launch_dir}/{output.vcf}) && '
                # Back in the launch directory: index the final subset.
                'tabix -s 1 -b 2 -e 2 {output.vcf} '
            )
rule exons:
    """
    Get the exons for chromosomes 6 and 20. The paper that published the reference dataset indicates a high coverage in chromosome 6 for Illumina HiSeq 4000 reads

    The Ensembl release-108 GFF3 files of both chromosomes are downloaded and
    appended into one gzip stream. Only rows whose feature-type column (the
    third column) is exactly "exon" are kept; matching the word "exon"
    anywhere on the line would also pick up other features that merely
    mention it in their attributes and that carry no "Parent=transcript:" tag.

    Output columns (tab-separated): seqid, start, end, score, strand and the
    parent transcript ID taken from the attributes column.
    NOTE(review): start/end are copied unchanged from the GFF3 - confirm
    whether downstream consumers expect a BED-style shifted start.

    The intermediate files gff, tmp and IDs are created in the working
    directory and removed at the end.
    """
    output:
        'data/full_exons.bed'
    resources:
        mem_mb=1024 * 2,
        disk_mb=1024 * 2,
        runtime=30
    shell:
        'wget https://ftp.ensembl.org/pub/release-108/gff3/homo_sapiens/Homo_sapiens.GRCh38.108.chromosome.6.gff3.gz -O- > gff ; '
        'wget https://ftp.ensembl.org/pub/release-108/gff3/homo_sapiens/Homo_sapiens.GRCh38.108.chromosome.20.gff3.gz -O- >> gff ; '
        '''zcat gff | awk '$3 == "exon" {{print $1, $4, $5, $6, $7}}' > tmp ; '''
        '''zcat gff | awk '$3 == "exon" {{print $9}}' > IDs ; '''
        'sed -i "s/Parent=transcript:\\(.*\\);Name.*/\\1/" IDs ; '
        'paste tmp IDs > {output} ; '
        'sed -i "s/[[:blank:]]/\\t/g" {output} ; '
        'rm tmp IDs gff '
rule LIMIT:
    """
    Get a limited bed file to subset the exon bed and select reads

    Two regions are written, one on chromosome 6 and one on chromosome 20,
    each with a name in the fourth column. The fields are separated by tabs:
    this file is handed to `bedtools intersect`, `tabix -R` and
    `samtools view -L` by other rules, and tab-delimited is the form all of
    them accept.
    """
    output:
        'data/LIMIT.bed'
    resources:
        mem_mb=1024 * 2,
        disk_mb=1024 * 2,
        runtime=20
    shell:
        'printf "6\\t42900000\\t42970000\\tchr6\\n" > {output}; '
        'printf "20\\t46900000\\t46970000\\tchr20\\n" >> {output}'
rule fasta:
    """
    Build the subset reference: the Ensembl release-108 sequences of
    chromosome 6 followed by chromosome 20, recompressed with bgzip.

    The two downloads are appended to a scratch file called `temp` in the
    working directory, which is removed once the output has been written.
    """
    output:
        'data/GRCh38.6.20.fa.gz',
    resources:
        mem_mb=1024 * 4,
        disk_mb=1024 * 4,
        runtime=60
    shell:
        'curl -L ftp://ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.6.fa.gz > temp && '
        'curl -L ftp://ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.20.fa.gz >> temp && '
        'zcat temp | bgzip -c > {output} && '
        'rm temp '
rule fasta_index:
    """
    Make index for fasta file

    `samtools faidx` called without regions writes the index itself, next to
    the FASTA, as <fasta>.fai - which is exactly the declared output. Its
    standard output is therefore not redirected: redirecting it onto the same
    .fai path would only have the shell truncate the file samtools is writing.
    """
    input:
        reference
    output:
        reference + '.fai'
    resources:
        mem_mb=1024 * 4,
        runtime=60
    shell:
        "samtools faidx {input} "
rule WGS_download_normal:
    """
    Get the paired-end reads (R1 and R2) of the normal WGS sample using its
    SRA accession code, and bgzip-compress them.

    fasterq-dump writes into a directory that is private to this accession,
    and only that directory is removed afterwards, so this rule and
    WGS_download_tumor can run at the same time without deleting each other's
    files. The shared parent directory is left in place.
    """
    output:
        normal_R1='data/normal_R1_full.fq.gz',
        normal_R2='data/normal_R2_full.fq.gz',
    resources:
        mem_mb=1024 * 64,
        disk_mb=1024 * 2000,
        runtime=24 * 60
    run:
        accession = WGS_accessions['normal']
        # $USER is expanded by the shell, not by Python.
        sra_dir = '/data/NICHD-core0/test/$USER/sra/' + accession
        shell('mkdir -p {sra_dir} ')
        shell('{VDB_CONFIG_PRELUDE}; fasterq-dump --split-files -t /lscratch/$SLURM_JOBID -O {sra_dir} {accession} ')
        shell('bgzip -c {sra_dir}/{accession}_1.fastq > {output.normal_R1} ')
        shell('bgzip -c {sra_dir}/{accession}_2.fastq > {output.normal_R2} ')
        shell('rm -rf {sra_dir} ')


rule WGS_download_tumor:
    """
    Get the paired-end reads (R1 and R2) of the tumor WGS sample using its
    SRA accession code, and bgzip-compress them.

    fasterq-dump writes into a directory that is private to this accession,
    and only that directory is removed afterwards, so this rule and
    WGS_download_normal can run at the same time without deleting each
    other's files. The shared parent directory is left in place.
    """
    output:
        tumor_R2='data/tumor_R2_full.fq.gz',
        tumor_R1='data/tumor_R1_full.fq.gz'
    resources:
        mem_mb=1024 * 64,
        disk_mb=1024 * 2000,
        runtime=24 * 60
    run:
        accession = WGS_accessions['tumor']
        # $USER is expanded by the shell, not by Python.
        sra_dir = '/data/NICHD-core0/test/$USER/sra/' + accession
        shell('mkdir -p {sra_dir} ')
        shell('{VDB_CONFIG_PRELUDE}; fasterq-dump --split-files -t /lscratch/$SLURM_JOBID -O {sra_dir} {accession} ')
        shell('bgzip -c {sra_dir}/{accession}_1.fastq > {output.tumor_R1} ')
        shell('bgzip -c {sra_dir}/{accession}_2.fastq > {output.tumor_R2} ')
        shell('rm -rf {sra_dir} ')
rule bwa_index:
    """
    Run `bwa index` (bwtsw algorithm) on the subset reference.

    The declared outputs are the five index files that share the reference
    path as prefix: .amb, .ann, .bwt, .pac and .sa.
    """
    input:
        reference,
    output:
        multiext(reference, '.amb', '.ann', '.bwt', '.pac', '.sa')
    resources:
        mem_mb=1024 * 16,
        disk_mb=1024 * 16,
        runtime=4 * 60
    shell:
        'bwa index -a bwtsw {input}'
rule align_reads:
    """
    Map a sample's full paired FASTQs with `bwa mem` and write a sorted BAM.

    The reference only holds chromosomes 6 and 20, so only reads from those
    chromosomes can align. The BAM is declared as temp().
    """
    input:
        reads=['data/{sample}_R1_full.fq.gz', 'data/{sample}_R2_full.fq.gz'],
        idx=rules.bwa_index.output,
    output:
        bam=temp('data/{sample}.sorted.bam')
    params:
        # bwa wants the index prefix: the first index file minus its extension.
        index=lambda wildcards, input: os.path.splitext(input.idx[0])[0],
    threads: 32
    resources:
        mem_mb=1024 * 32,
        disk_mb=1024 * 24,
        runtime=54 * 60
    shell:
        "bwa mem -t {threads} {params.index} {input.reads} | samtools view -bh | samtools sort -o {output} -O BAM "
rule small_fastq:
    """
    Down-sample a sample's full FASTQs to a small paired subset.

    1. Collect the names of reads inside the LIMIT regions that are paired
       and mapped in a proper pair; keep at most `mapped_n` of them.
    2. Collect the names of unmapped reads; keep at most `unmapped_n`.
    3. For each mate, pull the mapped names and then the unmapped names out
       of the full FASTQ with seqtk and bgzip the result.

    The complete name lists and the uncompressed per-mate FASTQs are scratch
    files in the working directory, prefixed with the sample name, and are
    removed as the last step.
    """
    input:
        bam=rules.align_reads.output,
        limits=rules.LIMIT.output,
        r1='data/{sample}_R1_full.fq.gz',
        r2='data/{sample}_R2_full.fq.gz',
    output:
        R1='data/{sample}_R1.6.20.fq.gz',
        R2='data/{sample}_R2.6.20.fq.gz',
        mapped_names='data/{sample}.names.mapped.list',
        unmapped_names='data/{sample}.names.unmapped.list'
    threads: 32
    resources:
        disk_mb=1024 * 100,
        mem_mb=1024 * 32,
        runtime=24 * 60
    run:
        # Scratch files holding the complete (untruncated) name lists.
        all_mapped = f'{wildcards.sample}.intermediate.file'
        all_unmapped = f'{wildcards.sample}.int2.file'

        # Each list is written in full and truncated by a separate command.
        shell('samtools view -h -L {input.limits} {input.bam} | samtools view -f 3 - | cut -f1 | sort -u > {all_mapped}')
        shell('head -n {mapped_n} {all_mapped} > {output.mapped_names} ')
        shell('samtools view -f 4 {input.bam} | cut -f1 | sort -u > {all_unmapped}')
        shell('head -n {unmapped_n} {all_unmapped} > {output.unmapped_names} ')

        # Same three steps for each mate: mapped reads, then unmapped reads
        # appended, then compression into the final output.
        for mate, full_fastq, subset_gz in (
            ('r1', input.r1, output.R1),
            ('r2', input.r2, output.R2),
        ):
            mate_tmp = f'{wildcards.sample}.{mate}.tmp'
            shell('seqtk subseq {full_fastq} {output.mapped_names} > {mate_tmp} ')
            shell('seqtk subseq {full_fastq} {output.unmapped_names} >> {mate_tmp} ')
            shell('bgzip -c {mate_tmp} > {subset_gz} ')

        shell('rm {all_mapped} {all_unmapped} {wildcards.sample}.r1.tmp {wildcards.sample}.r2.tmp')
rule small_bed:
    """
    Cut the chr 6 / chr 20 exon table down to the LIMIT regions.

    `bedtools intersect` keeps the parts of the exon file that overlap the
    LIMIT bed file, i.e. the same regions that were used to pick the reads of
    the small FASTQs.
    """
    input:
        bed=rules.exons.output,
        limit=rules.LIMIT.output
    output:
        'data/exons_subset.bed'
    resources:
        mem_mb=1024 * 2,
        disk_mb=1024 * 2,
        runtime=4 * 60
    shell:
        'bedtools intersect -a {input.bed} -b {input.limit} > {output} '