-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathminION_align.sh
executable file
·132 lines (87 loc) · 4.41 KB
/
minION_align.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/bin/sh
################################# minION_align.sh created by Rebecca Nance ########################################
## sequence data (fastq files) must be located in the /minION_align/data directory
## a reference sequence file (fasta) must be named <name>_ref.fasta (the script looks for *_ref.fasta) and be located in the minION_align directory
#make the output directory and its two subdirectories in one step
# mkdir -p creates the parent "output" directory as needed and succeeds when
# the directories already exist, so there is no need to cd into "output" and
# back. (An unchecked "cd output" followed by "cd ../" would leave the script
# one level above the project directory whenever "output" could not be entered.)
if ! mkdir -p output/consensus_sequence_fasta output/draft_assembly_gfa
then
    echo "ERROR: could not create the output directories" >&2
    exit 1
fi
#combine all fastq files that belong to the same barcode
# This works from the main directory instead of cd-ing into data: with an
# unchecked "cd data", a missing directory made the later "cd ../" climb out of
# the project directory, and every following step then ran in the wrong place.
if [ ! -d data ]
then
    echo "ERROR: data directory not found; run this script from the minION_align directory" >&2
    exit 1
fi
#the third underscore-separated field of each fastq file name is the barcode key;
#store the unique keys in data/list (a glob is used instead of parsing ls output)
for fastq_path in data/*.fastq
do
    #an unmatched glob stays literal, so skip it when no fastq file exists
    [ -e "$fastq_path" ] || continue
    printf '%s\n' "${fastq_path##*/}"
done | cut -d "_" -f 3 | sort -u > data/list
if [ ! -s data/list ]
then
    echo "ERROR: no .fastq files found in data" >&2
    exit 1
fi
#write one <barcode>_all.fastq per key straight into the main directory, so the
#combined files can never be picked up by the data/*.fastq globs themselves
while IFS= read -r barcode
do
    cat data/*"$barcode"*.fastq > "${barcode}_all.fastq"
done < data/list
#pool every per-barcode file and filter the reads with NanoFilt:
#  --length 500   minimum read length
#  --quality 12   minimum average read quality
#  --headcrop 50  trim 50 bases from the start of every read
cat *_all.fastq \
    | NanoFilt --length 500 --quality 12 --headcrop 50 \
    > trim.fastq
### Generate first assembly (from reference sequence, using minimap2)
#locate the reference fasta
# The bare glob *_ref.fasta also matches map_minimap_ref.fasta, which this
# section writes itself, so on a second run minimap2 would be handed two
# "reference" files. Skip our own output and insist on exactly one match.
ref_fasta=""
for candidate in *_ref.fasta
do
    #an unmatched glob stays literal, so ignore it
    [ -e "$candidate" ] || continue
    [ "$candidate" = "map_minimap_ref.fasta" ] && continue
    if [ -n "$ref_fasta" ]
    then
        echo "ERROR: more than one reference file matches *_ref.fasta ($ref_fasta, $candidate)" >&2
        exit 1
    fi
    ref_fasta=$candidate
done
if [ -z "$ref_fasta" ]
then
    echo "ERROR: no reference file matching *_ref.fasta found in the current directory" >&2
    exit 1
fi
#map the trimmed reads onto the reference seq with minimap2
minimap2 -x map-ont "$ref_fasta" trim.fastq | gzip -1 > map_minimap_ref.paf.gz
#assemble unitigs from the mapped reads
miniasm -f trim.fastq map_minimap_ref.paf.gz > map_minimap_ref.gfa
#convert the gfa segment (S) lines to fasta: field 2 is the name, field 3 the sequence
awk '/^S/{print ">"$2"\n"$3}' map_minimap_ref.gfa > map_minimap_ref.fasta
#map trimmed reads onto miniasm assembly (input for racon later on)
minimap2 map_minimap_ref.fasta trim.fastq | gzip -1 > map_minimap_ref.racon.paf.gz
echo "FIRST ASSEMBLY COMPLETE"
### Generate second assembly (from de novo, using minimap2)
#all-vs-all mapping of the trimmed reads against themselves to find overlaps
minimap2 -x ava-ont trim.fastq trim.fastq \
    | gzip -1 \
    > overlaps.paf.gz
#build the unitig graph from the read overlaps
miniasm -f trim.fastq overlaps.paf.gz \
    > untigs.gfa
#turn every gfa segment (S) line into a fasta record: >name, then sequence
awk '/^S/ { printf ">%s\n%s\n", $2, $3 }' untigs.gfa \
    > untigs.fasta
#map the trimmed reads back onto the unitigs (input for racon later on)
minimap2 untigs.fasta trim.fastq \
    | gzip -1 \
    > untigs.racon.paf.gz
echo "SECOND ASSEMBLY COMPLETE"
### Generate third assembly (from de novo, using shasta)
# Stop here if shasta fails. Previously the failure went unnoticed, the
# unchecked "cd ShastaRun" failed too, and the following "cd ../" moved the
# script out of the project directory so every later step ran in the wrong place.
if ! ./shasta-Linux-0.7.0 --input trim.fastq
then
    # NOTE(review): shasta is documented to refuse an existing output directory,
    # so a ShastaRun left by an earlier run is a likely cause - confirm.
    echo "ERROR: shasta assembly failed (if a ShastaRun directory from a previous run exists, remove or rename it and retry)" >&2
    exit 1
fi
#bring the shasta results into the main directory without changing directory
if ! mv ShastaRun/Assembly.gfa ShastaRun/Assembly.fasta ./
then
    echo "ERROR: shasta output (Assembly.gfa / Assembly.fasta) not found in ShastaRun" >&2
    exit 1
fi
#map trimmed reads onto shasta assembly (input for racon later on)
minimap2 Assembly.fasta trim.fastq | gzip -1 > shasta.racon.paf.gz
#reported only once the assembly and its read mapping are in place
echo "THIRD ASSEMBLY COMPLETE"
### Error correction/consensus sequence with Racon
#run racon with the trimmed reads against one draft assembly
#  $1 = reads-to-draft mapping (paf.gz)
#  $2 = draft assembly (fasta)
#  $3 = consensus file to write
racon_polish() {
    racon trim.fastq "$1" "$2" > "$3"
}
#first assembly (from reference seq, using minimap2)
racon_polish map_minimap_ref.racon.paf.gz map_minimap_ref.fasta map_minimap_ref.consensus.fasta
#second assembly (from de novo, using minimap2)
racon_polish untigs.racon.paf.gz untigs.fasta map_minimap_denovo.racon.consensus.fasta
#third assembly (from de novo, using shasta) - really just error correction
racon_polish shasta.racon.paf.gz Assembly.fasta map_shasta_denovo.consensus.fasta
#copy the gfa files into output/draft_assembly_gfa and the consensus.fasta files into output/consensus_sequence_fasta
# Both copies are always attempted; the success message is printed only if
# both worked, so it never points at directories that were left empty.
copy_status=0
cp *.gfa output/draft_assembly_gfa || copy_status=1
cp *.consensus.fasta output/consensus_sequence_fasta || copy_status=1
if [ "$copy_status" -ne 0 ]
then
    echo "ERROR: could not copy the assembly graphs and/or consensus sequences into output" >&2
    exit 1
fi
#the message names each directory by what was actually copied into it
echo "Script complete. Consensus sequences generated in /minION_align/output/consensus_sequence_fasta. Assembly graphs generated in /minION_align/output/draft_assembly_gfa. Examine these with Bandage."
############################ OPTIONAL ################################
##use medaka to generate consensus seq and generate variants
#medaka_haploid_variant -t 2 trim.fastq *_ref.fasta
##annotate medaka's vcf file so it can be uploaded into IGV
#medaka tools annotate consensus_to_ref.vcf *_ref.fasta mapped.sorted.bam variants.annotated.vcf
##use IGV to visualize
#igv
##in the IGV menus: genomes --> upload the reference fasta file
##in the IGV menus: file --> upload the annotated vcf file (variants.annotated.vcf) AND the sorted bam file (mapped.sorted.bam)
##use Bandage to visualize different gfa assembly files
#Bandage
##upload each of the gfa files into the Bandage gui
########## THIS HAS NO ERROR CORRECTION ##############
##use minimap2 to map to reference genome
#minimap2 -ax map-ont --MD *_ref.fasta trim.fastq > mapped.sam
##use samtools to convert sam to bam
#samtools view -bS mapped.sam > mapped.bam
##use samtools to sort the bam file
#samtools sort -o mapped.sorted.bam mapped.bam
##use samtools to create an index file
#samtools index mapped.sorted.bam