-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathGenomics2A_Linux_script.sh
150 lines (111 loc) · 4.88 KB
/
Genomics2A_Linux_script.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/bin/bash
# Stage 1: fetch the pipeline inputs from Zenodo record 2582555 — the r1/r2
# FASTQ files for samples SLGFSK-N_231335 and SLGFSK-T_231336, and the
# hg19.chr5_12_17 reference FASTA.
echo -e "\n Downloading data... \n"
mkdir -p raw_data
zenodo_url="https://zenodo.org/record/2582555/files"
# Download into raw_data/ with -P instead of cd-ing into it: every later step
# addresses its inputs (raw_data/..., list.txt, the reference) relative to the
# directory the script was started from, so the working directory must not
# change here.
for fastq in \
  SLGFSK-N_231335_r1_chr5_12_17.fastq.gz \
  SLGFSK-N_231335_r2_chr5_12_17.fastq.gz \
  SLGFSK-T_231336_r1_chr5_12_17.fastq.gz \
  SLGFSK-T_231336_r2_chr5_12_17.fastq.gz; do
  wget -P raw_data "${zenodo_url}/${fastq}"
done
echo -e "\n Downloading reference sequence... \n"
# The reference is kept in the working directory because the indexing,
# alignment and pileup steps refer to it as plain hg19.chr5_12_17.fa.
wget "${zenodo_url}/hg19.chr5_12_17.fa.gz"
# The reference is gzip-compressed (.gz), not a zip archive, so it has to be
# decompressed with gunzip; this leaves hg19.chr5_12_17.fa behind.
gunzip hg19.chr5_12_17.fa.gz
echo -e "\n Data Preprocessing... \n"
# FastQC and MultiQC output for the raw reads share a single report directory.
qc_dir=Fastqc_Reports
mkdir -p "${qc_dir}"
# Run FastQC on every raw FASTQ whose file name starts with a sample ID;
# list.txt supplies the whitespace-separated sample IDs.
for sample in $(cat list.txt); do
  fastqc "raw_data/${sample}"*.fastq.gz -o "${qc_dir}"
done
# Summarise the per-file FastQC reports into one MultiQC report.
multiqc "${qc_dir}" -o "${qc_dir}"
# Stage 3: trim each sample's paired reads with Trimmomatic, then run FastQC on
# the surviving read pairs and summarise the reports with MultiQC.
# Both output directories are created up front: fastqc is pointed at
# trimmed_reads/Fastqc_results with -o and does not create that directory
# itself.
mkdir -p trimmed_reads trimmed_reads/Fastqc_results
for sample in $(cat list.txt); do
  # Paired-end mode: two inputs (r1, r2) and four outputs (paired/unpaired for
  # each mate). TruSeq3-PE.fa is given as a relative path, so the adapter file
  # must be resolvable from the working directory.
  trimmomatic PE -threads 8 \
    "raw_data/${sample}_r1_chr5_12_17.fastq.gz" \
    "raw_data/${sample}_r2_chr5_12_17.fastq.gz" \
    "trimmed_reads/${sample}_r1_paired.fq.gz" \
    "trimmed_reads/${sample}_r1_unpaired.fq.gz" \
    "trimmed_reads/${sample}_r2_paired.fq.gz" \
    "trimmed_reads/${sample}_r2_unpaired.fq.gz" \
    ILLUMINACLIP:TruSeq3-PE.fa:2:30:10:8:keepBothReads \
    LEADING:3 TRAILING:10 MINLEN:25
  # QC only the paired outputs; those are the files the alignment step reads.
  fastqc "trimmed_reads/${sample}_r1_paired.fq.gz" \
    "trimmed_reads/${sample}_r2_paired.fq.gz" \
    -o trimmed_reads/Fastqc_results
done
multiqc trimmed_reads/Fastqc_results -o trimmed_reads/Fastqc_results
#Index reference file
bwa index hg19.chr5_12_17.fa
# -p matches the other mkdir calls in this script and keeps a re-run from
# failing when Mapping/ already exists.
mkdir -p Mapping
#Perform alignment
# Each sample is aligned separately so that it gets its own read group
# (ID = sample number, SM = Normal or Tumor). Only the paired trimmed reads are
# used as input; the SAM output is written to Mapping/<sample>.sam.
bwa mem -R '@RG\tID:231335\tSM:Normal' hg19.chr5_12_17.fa \
  trimmed_reads/SLGFSK-N_231335_r1_paired.fq.gz \
  trimmed_reads/SLGFSK-N_231335_r2_paired.fq.gz \
  > Mapping/SLGFSK-N_231335.sam
bwa mem -R '@RG\tID:231336\tSM:Tumor' hg19.chr5_12_17.fa \
  trimmed_reads/SLGFSK-T_231336_r1_paired.fq.gz \
  trimmed_reads/SLGFSK-T_231336_r2_paired.fq.gz \
  > Mapping/SLGFSK-T_231336.sam
# Stage 5: turn each sample's SAM alignment into a sorted, indexed BAM.
for sample in $(cat list.txt); do
  # Convert SAM to BAM and sort it
  samtools view -@ 20 -S -b "Mapping/${sample}.sam" \
    | samtools sort -@ 32 > "Mapping/${sample}.sorted.bam"
  # Index BAM file
  samtools index "Mapping/${sample}.sorted.bam"
done
# Filter the sorted BAM of every sample listed in list.txt.
# samtools view options used here: -q 1 sets a minimum mapping quality of 1,
# -f 0x2 requires SAM flag bit 0x2 to be set, -F 0x8 requires flag bit 0x8 to
# be unset, and -b writes BAM output.
for sample in $(cat list.txt); do
  sorted_bam="Mapping/${sample}.sorted.bam"
  filtered_bam="Mapping/${sample}.filtered1.bam"
  samtools view -q 1 -f 0x2 -F 0x8 -b "${sorted_bam}" > "${filtered_bam}"
done
# Duplicate removal with samtools markdup.
# The filtered BAM is passed through collate -> fixmate -m -> sort before
# markdup -r writes the final <sample>.clean.bam; each intermediate file is
# kept in Mapping/ under a name that reflects the step that produced it.
for sample in $(cat list.txt); do
  stem="Mapping/${sample}"
  samtools collate -o "${stem}.namecollate.bam" "${stem}.filtered1.bam"
  samtools fixmate -m "${stem}.namecollate.bam" "${stem}.fixmate.bam"
  samtools sort -@ 32 -o "${stem}.positionsort.bam" "${stem}.fixmate.bam"
  samtools markdup -@32 -r "${stem}.positionsort.bam" "${stem}.clean.bam"
done
# Alternative (not run): use rmdup on the sorted BAMs instead, e.g.
#samtools rmdup SLGFSK35.sorted.bam SLGFSK35.rdup
#samtools rmdup SLGFSK36.sorted.bam SLGFSK36.rdup
# Stream each de-duplicated BAM through bamleftalign (reference given with -f)
# and store the result as <sample>.leftAlign.bam.
for sample in $(cat list.txt); do
  clean_bam="Mapping/${sample}.clean.bam"
  aligned_bam="Mapping/${sample}.leftAlign.bam"
  cat "${clean_bam}" \
    | bamleftalign -f hg19.chr5_12_17.fa -m 5 -c > "${aligned_bam}"
done
# Recalibration step: run samtools calmd on each left-aligned BAM against the
# reference FASTA and write the BAM output (-b) to <sample>.recalibrate.bam.
for sample in $(cat list.txt); do
  input_bam="Mapping/${sample}.leftAlign.bam"
  output_bam="Mapping/${sample}.recalibrate.bam"
  samtools calmd -@ 32 -b "${input_bam}" hg19.chr5_12_17.fa > "${output_bam}"
done
#Refilter read mapping qualities
# Keep only alignments whose mapping quality is <= 254.
for sample in $(cat list.txt); do
  # The comparison has to be quoted: left bare, the shell treats "<" as an
  # input redirection from a file named "=254" and bamtools never receives the
  # -mapQuality value.
  bamtools filter -in "Mapping/${sample}.recalibrate.bam" -mapQuality "<=254" \
    > "Mapping/${sample}.refilter.bam"
done
#Variant Calling
# The VarScan jar used by the variant-calling command is not fetched by this
# script; the download below is left disabled.
#wget https://sourceforge.net/projects/varscan/files/VarScan.v2.3.9.jar
#Convert data to pileup
# -p matches the other mkdir calls in this script and keeps a re-run from
# failing when Variants/ already exists.
mkdir -p Variants
for sample in $(cat list.txt); do
  # Options are placed before the input BAM so the command does not depend on
  # the option parser accepting flags after positional arguments.
  samtools mpileup -f hg19.chr5_12_17.fa --min-MQ 1 --min-BQ 28 \
    "Mapping/${sample}.refilter.bam" > "Variants/${sample}.pileup"
done
# Somatic variant calling with VarScan.
# Arguments, in order: the SLGFSK-N (normal) pileup, the SLGFSK-T (tumor)
# pileup, and the output prefix Variants/SLGFSK; the later compression step
# reads Variants/SLGFSK.snp.vcf and Variants/SLGFSK.indel.vcf.
varscan_args=(
  somatic
  Variants/SLGFSK-N_231335.pileup
  Variants/SLGFSK-T_231336.pileup
  Variants/SLGFSK
  --normal-purity 1
  --tumor-purity 0.5
  --output-vcf 1
)
java -jar VarScan.v2.3.9.jar "${varscan_args[@]}"
#Merge vcf
# Compress and index the SNP and indel call sets, then combine them into a
# single VCF.
for kind in snp indel; do
  # -c makes bgzip write the compressed stream to stdout and leave the input
  # in place. Without it bgzip writes <file>.gz itself and sends nothing to
  # stdout, which collides with the file the redirection has just created.
  bgzip -c "Variants/SLGFSK.${kind}.vcf" > "Variants/SLGFSK.${kind}.vcf.gz"
  # -p vcf states the format explicitly instead of relying on detection.
  tabix -p vcf "Variants/SLGFSK.${kind}.vcf.gz"
done
# Both files describe the same samples, so they are concatenated rather than
# merged: bcftools merge combines files with *different* samples and refuses
# duplicate sample names. -a (--allow-overlaps) lets the SNP and indel records
# interleave by position and requires the indexes built above.
bcftools concat -a Variants/SLGFSK.snp.vcf.gz Variants/SLGFSK.indel.vcf.gz \
  > Variants/SLGFSK.vcf
# Variant Annotation
# Download snpEff, fetch its hg19 database, and annotate the combined VCF.
#download jar file
wget https://snpeff.blob.core.windows.net/versions/snpEff_latest_core.zip
# Unzip file
unzip snpEff_latest_core.zip
#download snpEff database
# The jar is addressed as snpEff/snpEff.jar here as well, matching the path the
# annotation command below uses for the unpacked archive.
java -jar snpEff/snpEff.jar download hg19
#annotate variants
# -Xmx8g raises the Java heap limit to 8 GB for the annotation run.
java -Xmx8g -jar snpEff/snpEff.jar hg19 Variants/SLGFSK.vcf \
  > Variants/SLGFSK.ann.vcf
#Clinical Annotation using gemini
#Installation
# Fetch the gemini installer script and run it with /usr/local and
# /usr/local/share/gemini as its two arguments.
# NOTE(review): writing under /usr/local normally needs elevated privileges —
# confirm how this is meant to be run.
wget https://raw.github.com/arq5x/gemini/master/gemini/scripts/gemini_install.py
python gemini_install.py /usr/local /usr/local/share/gemini
#Loading the Variants
# The database is written to Annotation/gemini.db, so the directory has to
# exist before gemini load is called.
mkdir -p Annotation
gemini load -v Variants/SLGFSK.ann.vcf -t snpEff Annotation/gemini.db