-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathVariantCalling.sh
249 lines (167 loc) · 10.9 KB
/
VariantCalling.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
#!/usr/bin/env bash
# Interactive setup for the variant-calling pipeline.
#   1. Appends the AIDD tool directory to PATH.
#   2. Asks for the working directory and makes it the current directory.
#   3. Asks for up to 18 SRA run IDs, stored in varname1 .. varname18.
# Every later step reads from and writes to $path/<sample>/ and expects
# $path/ref2.fa and $path/dbsnp.vcf, so the script stops here if the
# working directory cannot be used.
export PATH="$PATH:/home/user/AIDD/AIDD_tools/bin"
echo "Please enter the path to external hard drive with at least 1 terabyte of space if you want to run all 18 files"
# -r keeps any backslash in the typed path literal.
read -r path
# -p: the directory usually exists already (it holds the input BAM folders
# and the reference) and may be several levels deep.
if ! mkdir -p -- "$path"; then
  echo "Cannot create working directory: $path" >&2
  exit 1
fi
# Some later steps use paths relative to the current directory, so running
# on after a failed cd would read and write in the wrong place.
if ! cd -- "$path"; then
  echo "Cannot change into working directory: $path" >&2
  exit 1
fi
##/media/user/ExtraSpace3/AML2
echo "Please enter up to 18 sra file numbers seperated by a space with no punctuation for example SRR0000000 SRR0000000 SRR0000000."
# One variable per sample slot; slots that are not filled stay empty and
# are skipped by the per-sample loops. Any words beyond the 18th end up
# together in varname18.
read -r varname1 varname2 varname3 varname4 varname5 varname6 varname7 varname8 varname9 varname10 varname11 varname12 varname13 varname14 varname15 varname16 varname17 varname18
##SRR1575102 SRR1575103 SRR1575104 SRR1575105 SRR3895734 SRR3895735 SRR3895736 SRR3895737 SRR3895738 SRR3895739 SRR3895741 SRR3895742 SRR3895743 SRR3895744 SRR3895746 SRR3895747
# Step: Picard AddOrReplaceReadGroups.
#   in : $path/<sample>/<sample>.bam
#   out: $path/<sample>/<sample>2.bam
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Common "<dir>/<sample>" stem; quoted wherever used so a working
  # directory containing spaces stays a single argument.
  sample_prefix="$path/${samp}/${samp}"
  echo "Running java picard AddorReplaceReadGroups with RGID=4 RGLB=lib1 RGPL=illumina RGPU=unit1 RGSM=20 for ${samp}"
  java -jar /home/user/AIDD/AIDD_tools/picard.jar AddOrReplaceReadGroups \
    I="${sample_prefix}.bam" \
    O="${sample_prefix}2.bam" \
    RGID=4 RGLB=lib1 RGPL=illumina RGPU=unit1 RGSM=20
done
# Step: Picard ReorderSam against $path/ref2.fa, with CREATE_INDEX=TRUE.
#   in : $path/<sample>/<sample>2.bam
#   out: $path/<sample>/<sample>3.bam
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact.
  sample_prefix="$path/${samp}/${samp}"
  echo "Running java picard ReorderSam and creating an index for the new order for ${samp}"
  java -jar /home/user/AIDD/AIDD_tools/picard.jar ReorderSam \
    I="${sample_prefix}2.bam" \
    O="${sample_prefix}3.bam" \
    R="$path/ref2.fa" \
    CREATE_INDEX=TRUE
done
# Step: Picard CollectAlignmentSummaryMetrics.
#   in : $path/<sample>/<sample>3.bam, reference $path/ref2.fa
#   out: $path/<sample>/<sample>_alignment_metrics.txt
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact.
  sample_prefix="$path/${samp}/${samp}"
  echo "Running java picard CollectAlignmentSummaryMetrics to create text file for downstream variant calling for ${samp}"
  java -jar /home/user/AIDD/AIDD_tools/picard.jar CollectAlignmentSummaryMetrics \
    R="$path/ref2.fa" \
    I="${sample_prefix}3.bam" \
    O="${sample_prefix}_alignment_metrics.txt"
done
# Step: Picard CollectInsertSizeMetrics.
#   in : $path/<sample>/<sample>3.bam
#   out: $path/<sample>/<sample>_insert_metrics.txt
#        $path/<sample>/<sample>_insert_size_histogram.pdf
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact.
  sample_prefix="$path/${samp}/${samp}"
  echo "Running java picard CollectInsertSizeMetrics to creat both a text file and pdf file summarizing insert size for ${samp}"
  java -jar /home/user/AIDD/AIDD_tools/picard.jar CollectInsertSizeMetrics \
    INPUT="${sample_prefix}3.bam" \
    OUTPUT="${sample_prefix}_insert_metrics.txt" \
    HISTOGRAM_FILE="${sample_prefix}_insert_size_histogram.pdf"
done
# Step: samtools view -b -f2, written to a new BAM via stdout redirection.
#   in : $path/<sample>/<sample>3.bam
#   out: $path/<sample>/<sample>4.bam
# NOTE(review): per the samtools documentation -f2 keeps reads carrying
# the "properly paired" flag, which is not quite what the progress
# message below describes — confirm the intended filter.
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact (the redirection target too).
  sample_prefix="$path/${samp}/${samp}"
  echo "Running samtools to filter short or cutoff reads for more accurate variant calling for ${samp}"
  samtools view -b -f2 "${sample_prefix}3.bam" > "${sample_prefix}4.bam"
done
# Step: samtools depth, captured from stdout into a text file.
#   in : $path/<sample>/<sample>4.bam
#   out: $path/<sample>/<sample>depth_out.txt
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact (the redirection target too).
  sample_prefix="$path/${samp}/${samp}"
  echo "Running samtools depth to create depth text for downstream variant calling for ${samp}"
  samtools depth "${sample_prefix}4.bam" > "${sample_prefix}depth_out.txt"
done
# Step: Picard MarkDuplicates with a 20G heap and $path/tmp as both the
# JVM temp dir and Picard's TMP_DIR.
#   in : $path/<sample>/<sample>4.bam
#   out: $path/<sample>/<sample>dedup_reads.bam
#        $path/<sample>/<sample>metrics.txt
# NOTE(review): -d64 is rejected by newer JVMs (it was dropped after
# Java 9) — confirm the Java version this pipeline runs on.
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact.
  sample_prefix="$path/${samp}/${samp}"
  echo "java picard MarkDuplicates to annotate PCR duplicates for more accurate variant calling in RNA editing experiments for ${samp}"
  java -d64 -Xmx20G -XX:-UseGCOverheadLimit -XX:ParallelGCThreads=2 \
    -XX:ReservedCodeCacheSize=1024M \
    -Djava.io.tmpdir="$path/tmp" \
    -jar /home/user/AIDD/AIDD_tools/picard.jar MarkDuplicates \
    INPUT="${sample_prefix}4.bam" \
    OUTPUT="${sample_prefix}dedup_reads.bam" \
    METRICS_FILE="${sample_prefix}metrics.txt" \
    TMP_DIR="$path/tmp"
done
# Step: Picard BuildBamIndex on the de-duplicated BAM.
#   in : $path/<sample>/<sample>dedup_reads.bam
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact.
  sample_prefix="$path/${samp}/${samp}"
  echo "Running java picard to build Bam index for downstream variant calling for ${samp}"
  java -jar /home/user/AIDD/AIDD_tools/picard.jar BuildBamIndex \
    INPUT="${sample_prefix}dedup_reads.bam"
done
# Step: GATK RealignerTargetCreator (with --filter_reads_with_N_cigar).
#   in : $path/<sample>/<sample>dedup_reads.bam, reference $path/ref2.fa
#   out: $path/<sample>/<sample>realignment_targets.list
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact.
  sample_prefix="$path/${samp}/${samp}"
  echo "Starting realigners using java GATK with reference sequences previously downloads for ${samp}"
  java -jar /home/user/AIDD/AIDD_tools/GenomeAnalysisTK.jar -T RealignerTargetCreator \
    -R "$path/ref2.fa" \
    --filter_reads_with_N_cigar \
    -I "${sample_prefix}dedup_reads.bam" \
    -o "${sample_prefix}realignment_targets.list"
done
# Step: GATK IndelRealigner using the target list from the previous step.
#   in : $path/<sample>/<sample>dedup_reads.bam
#        $path/<sample>/<sample>realignment_targets.list
#   out: $path/<sample>/<sample>realigned_reads.bam
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact.
  sample_prefix="$path/${samp}/${samp}"
  echo "Re aligning indels with java GATK with same reference sequences as previous step for ${samp}"
  java -jar /home/user/AIDD/AIDD_tools/GenomeAnalysisTK.jar -T IndelRealigner \
    -R "$path/ref2.fa" \
    --filter_reads_with_N_cigar \
    -I "${sample_prefix}dedup_reads.bam" \
    -targetIntervals "${sample_prefix}realignment_targets.list" \
    -o "${sample_prefix}realigned_reads.bam"
done
# Step: first GATK HaplotypeCaller pass, with $path/dbsnp.vcf as --dbsnp,
# -dontUseSoftClippedBases and -stand_call_conf 20.0.
#   in : $path/<sample>/<sample>realigned_reads.bam
#   out: $path/<sample>/<sample>raw_variants.vcf
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact.
  sample_prefix="$path/${samp}/${samp}"
  echo "Starting GATK HaplotypeCaller using reference sequences from the previous step and known snp sites with special options for RNA editing detection for ${samp}"
  java -jar /home/user/AIDD/AIDD_tools/GenomeAnalysisTK.jar -T HaplotypeCaller \
    -R "$path/ref2.fa" \
    -I "${sample_prefix}realigned_reads.bam" \
    --dbsnp "$path/dbsnp.vcf" \
    -dontUseSoftClippedBases \
    -stand_call_conf 20.0 \
    -o "${sample_prefix}raw_variants.vcf"
done
# Step: GATK SelectVariants, -selectType SNP, on the first-pass calls.
#   in : $path/<sample>/<sample>raw_variants.vcf
#   out: $path/<sample>/<sample>raw_snps.vcf
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact.
  sample_prefix="$path/${samp}/${samp}"
  echo "Running GATK Select Variants to create vcf file of raw snps for filtering for ${samp}"
  java -jar /home/user/AIDD/AIDD_tools/GenomeAnalysisTK.jar -T SelectVariants \
    -R "$path/ref2.fa" \
    -V "${sample_prefix}raw_variants.vcf" \
    -selectType SNP \
    -o "${sample_prefix}raw_snps.vcf"
done
# Step: GATK SelectVariants, -selectType INDEL, on the first-pass calls.
#   in : $path/<sample>/<sample>raw_variants.vcf
#   out: $path/<sample>/<sample>raw_indels.vcf
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact.
  sample_prefix="$path/${samp}/${samp}"
  echo "Running GATK Select Variants to create vcf file of raw indels for filtering for ${samp}"
  java -jar /home/user/AIDD/AIDD_tools/GenomeAnalysisTK.jar -T SelectVariants \
    -R "$path/ref2.fa" \
    -V "${sample_prefix}raw_variants.vcf" \
    -selectType INDEL \
    -o "${sample_prefix}raw_indels.vcf"
done
# Step: GATK VariantFiltration on first-pass SNPs; records matching the
# expression are tagged with the filter name "basic_snp_filter".
#   in : $path/<sample>/<sample>raw_snps.vcf
#   out: $path/<sample>/<sample>filtered_snps.vcf
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact.
  sample_prefix="$path/${samp}/${samp}"
  echo "Starting first filtering step for raw snps using GATK for ${samp}"
  java -jar /home/user/AIDD/AIDD_tools/GenomeAnalysisTK.jar -T VariantFiltration \
    -R "$path/ref2.fa" \
    -V "${sample_prefix}raw_snps.vcf" \
    --filterExpression 'QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0 || SOR > 4.0' \
    --filterName "basic_snp_filter" \
    -o "${sample_prefix}filtered_snps.vcf"
done
# Step: GATK VariantFiltration on first-pass indels; records matching the
# expression are tagged with the filter name "basic_indel_filter".
#   in : $path/<sample>/<sample>raw_indels.vcf
#   out: $path/<sample>/<sample>filtered_indels.vcf
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact.
  sample_prefix="$path/${samp}/${samp}"
  echo "Starting first filtering step for raw indels using GATK for ${samp}"
  java -jar /home/user/AIDD/AIDD_tools/GenomeAnalysisTK.jar -T VariantFiltration \
    -R "$path/ref2.fa" \
    -V "${sample_prefix}raw_indels.vcf" \
    --filterExpression 'QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0 || SOR > 10.0' \
    --filterName "basic_indel_filter" \
    -o "${sample_prefix}filtered_indels.vcf"
done
# Step: GATK BaseRecalibrator, pass 1. The sample's own filtered SNP and
# indel calls from the first pass are supplied as -knownSites.
#   in : $path/<sample>/<sample>realigned_reads.bam
#        $path/<sample>/<sample>filtered_snps.vcf
#        $path/<sample>/<sample>filtered_indels.vcf
#   out: $path/<sample>/<sample>recal_data.table
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact.
  sample_prefix="$path/${samp}/${samp}"
  echo "Running GATK baseRecalibrator to incorporate snp and indels from first variant calling step into a table to use for second variant calling for ${samp}"
  java -jar /home/user/AIDD/AIDD_tools/GenomeAnalysisTK.jar -T BaseRecalibrator \
    -R "$path/ref2.fa" \
    -I "${sample_prefix}realigned_reads.bam" \
    -knownSites "${sample_prefix}filtered_snps.vcf" \
    -knownSites "${sample_prefix}filtered_indels.vcf" \
    -o "${sample_prefix}recal_data.table"
done
# Step: GATK BaseRecalibrator, pass 2. Same inputs as pass 1 plus the
# pass-1 table given via -BQSR.
#   in : $path/<sample>/<sample>realigned_reads.bam
#        $path/<sample>/<sample>filtered_snps.vcf
#        $path/<sample>/<sample>filtered_indels.vcf
#        $path/<sample>/<sample>recal_data.table
#   out: $path/<sample>/<sample>post_recal_data.table
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact.
  sample_prefix="$path/${samp}/${samp}"
  echo "Rerunniing GATK basRecalibrator as in the previous step but with BQSR option for ${samp}"
  java -jar /home/user/AIDD/AIDD_tools/GenomeAnalysisTK.jar -T BaseRecalibrator \
    -R "$path/ref2.fa" \
    -I "${sample_prefix}realigned_reads.bam" \
    -knownSites "${sample_prefix}filtered_snps.vcf" \
    -knownSites "${sample_prefix}filtered_indels.vcf" \
    -BQSR "${sample_prefix}recal_data.table" \
    -o "${sample_prefix}post_recal_data.table"
done
# Step: GATK AnalyzeCovariates comparing the two recalibration tables.
#   in : $path/<sample>/<sample>recal_data.table
#        $path/<sample>/<sample>post_recal_data.table
#   out: $path/<sample>/<sample>recalibration_plots.pdf
# The reference is given as $path/ref2.fa, like every other GATK step,
# instead of a bare "ref2.fa" that only resolves when the current
# directory happens to be $path.
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact.
  sample_prefix="$path/${samp}/${samp}"
  echo "Running GATK AnalyzeCovariates to filter for less false positives for ${samp}"
  java -jar /home/user/AIDD/AIDD_tools/GenomeAnalysisTK.jar -T AnalyzeCovariates \
    -R "$path/ref2.fa" \
    -before "${sample_prefix}recal_data.table" \
    -after "${sample_prefix}post_recal_data.table" \
    -plots "${sample_prefix}recalibration_plots.pdf"
done
# Step: GATK PrintReads applying the pass-1 recalibration table (-BQSR)
# to the realigned reads.
#   in : $path/<sample>/<sample>realigned_reads.bam
#        $path/<sample>/<sample>recal_data.table
#   out: $path/<sample>/<sample>recal_reads.bam
# The reference is given as $path/ref2.fa, like every other GATK step,
# instead of a bare "ref2.fa" that only resolves when the current
# directory happens to be $path.
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact.
  sample_prefix="$path/${samp}/${samp}"
  echo "Running GATK printreads to collect the previous filtering and annotations into the new .bam file for the last variant calling step for ${samp}"
  java -jar /home/user/AIDD/AIDD_tools/GenomeAnalysisTK.jar -T PrintReads \
    -R "$path/ref2.fa" \
    -I "${sample_prefix}realigned_reads.bam" \
    -BQSR "${sample_prefix}recal_data.table" \
    -o "${sample_prefix}recal_reads.bam"
done
# Step: second GATK HaplotypeCaller pass, same options as the first pass
# but run on the recalibrated BAM.
#   in : $path/<sample>/<sample>recal_reads.bam
#   out: $path/<sample>/<sample>raw_variants_recal.vcf
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact.
  sample_prefix="$path/${samp}/${samp}"
  echo "Running GATK HaplotypeCaller for a second time with previous discovered variants already annotated in the starting bam file for ${samp}"
  java -jar /home/user/AIDD/AIDD_tools/GenomeAnalysisTK.jar -T HaplotypeCaller \
    -R "$path/ref2.fa" \
    --dbsnp "$path/dbsnp.vcf" \
    -dontUseSoftClippedBases \
    -stand_call_conf 20.0 \
    -I "${sample_prefix}recal_reads.bam" \
    -o "${sample_prefix}raw_variants_recal.vcf"
done
# Step: GATK SelectVariants, -selectType SNP, on the second-pass calls.
#   in : $path/<sample>/<sample>raw_variants_recal.vcf
#   out: $path/<sample>/<sample>raw_snps_recal.vcf
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact.
  sample_prefix="$path/${samp}/${samp}"
  echo "Using GATK to select snps for ${samp}"
  java -jar /home/user/AIDD/AIDD_tools/GenomeAnalysisTK.jar -T SelectVariants \
    -R "$path/ref2.fa" \
    -V "${sample_prefix}raw_variants_recal.vcf" \
    -selectType SNP \
    -o "${sample_prefix}raw_snps_recal.vcf"
done
# Step: GATK SelectVariants, -selectType INDEL, on the second-pass calls.
#   in : $path/<sample>/<sample>raw_variants_recal.vcf
#   out: $path/<sample>/<sample>raw_indels_recal.vcf
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact.
  sample_prefix="$path/${samp}/${samp}"
  echo "Using GATK to select indels for ${samp}"
  java -jar /home/user/AIDD/AIDD_tools/GenomeAnalysisTK.jar -T SelectVariants \
    -R "$path/ref2.fa" \
    -V "${sample_prefix}raw_variants_recal.vcf" \
    -selectType INDEL \
    -o "${sample_prefix}raw_indels_recal.vcf"
done
# Step: GATK VariantFiltration on second-pass SNPs, using the same
# expression and filter name ("basic_snp_filter") as the first pass.
#   in : $path/<sample>/<sample>raw_snps_recal.vcf
#   out: $path/<sample>/<sample>filtered_snps_final.vcf
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact.
  sample_prefix="$path/${samp}/${samp}"
  echo "Filtering raw snp from the second variant calling step for ${samp}"
  java -jar /home/user/AIDD/AIDD_tools/GenomeAnalysisTK.jar -T VariantFiltration \
    -R "$path/ref2.fa" \
    -V "${sample_prefix}raw_snps_recal.vcf" \
    --filterExpression 'QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0 || SOR > 4.0' \
    --filterName "basic_snp_filter" \
    -o "${sample_prefix}filtered_snps_final.vcf"
done
# Step: GATK VariantFiltration on second-pass indels, using the same
# expression and filter name ("basic_indel_filter") as the first pass.
#   in : $path/<sample>/<sample>raw_indels_recal.vcf
#   out: $path/<sample>/<sample>filtered_indels_recal.vcf
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact.
  sample_prefix="$path/${samp}/${samp}"
  echo "Filtering raw indels from the second variant calling step for ${samp}"
  java -jar /home/user/AIDD/AIDD_tools/GenomeAnalysisTK.jar -T VariantFiltration \
    -R "$path/ref2.fa" \
    -V "${sample_prefix}raw_indels_recal.vcf" \
    --filterExpression 'QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0 || SOR > 10.0' \
    --filterName "basic_indel_filter" \
    -o "${sample_prefix}filtered_indels_recal.vcf"
done
# Step: snpEff annotation (database GRCh37.75) of the final SNP calls;
# the annotated VCF is captured from stdout.
#   in : $path/<sample>/<sample>filtered_snps_final.vcf
#   out: $path/<sample>/<sample>filtered_snps_final.ann.vcf
# Afterwards every $path/snpEff_* file is moved into the sample folder —
# presumably the report files snpEff drops in the current directory;
# this relies on the current directory being $path (TODO confirm).
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact (the redirection target too).
  sample_prefix="$path/${samp}/${samp}"
  echo "Running snpEff to predict effects of RNA editing events found in the variant calling on the protein stucture and function for ${samp}"
  java -jar /home/user/AIDD/AIDD_tools/snpEff.jar -v GRCh37.75 \
    "${sample_prefix}filtered_snps_final.vcf" \
    > "${sample_prefix}filtered_snps_final.ann.vcf"
  # Only the glob itself stays outside the quotes so it can still expand.
  mv -- "$path"/snpEff_* "$path/${samp}"
done
# Step: bedtools genomecov -bga on the recalibrated BAM, captured from
# stdout into a bedgraph file.
#   in : $path/<sample>/<sample>recal_reads.bam
#   out: $path/<sample>/<sample>genomecov.bedgraph
# The sample list is left unquoted on purpose: empty varnameN slots must
# disappear so that fewer than 18 samples can be processed.
for fn in $varname{1..18}; do
  samp=$(basename -- "${fn}")
  # Quoted stem keeps paths with spaces intact (the redirection target too).
  sample_prefix="$path/${samp}/${samp}"
  echo "Running bedtools on the bam file generated with the first haplotype variant calling annotated to create a bedgraph file for ${samp}"
  bedtools genomecov -bga -ibam "${sample_prefix}recal_reads.bam" \
    > "${sample_prefix}genomecov.bedgraph"
done