From a7cdac9b00c52a42511beef04558c9d96e82e2c8 Mon Sep 17 00:00:00 2001
From: davidaray
Date: Tue, 21 Jul 2015 14:19:40 -0500
Subject: [PATCH] working version

---
 bwa_map_generic_driver.sh | 325 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 325 insertions(+)
 create mode 100644 bwa_map_generic_driver.sh

diff --git a/bwa_map_generic_driver.sh b/bwa_map_generic_driver.sh
new file mode 100644
index 0000000..02e3bb2
--- /dev/null
+++ b/bwa_map_generic_driver.sh
@@ -0,0 +1,325 @@
+#!/bin/bash
+#$ -V
+#$ -cwd
+#$ -S /bin/bash
+#$ -N M8132_assembly
+#$ -o $JOB_NAME.o$JOB_ID
+#$ -e $JOB_NAME.e$JOB_ID
+#$ -q ray512cc,raycc
+#$ -pe fill 10
+#$ -P communitycluster
+
+# This file will process raw Illumina data using Trimmomatic. This will be followed
+# by mapping to a reference genome to create a new genome assembly.
+
+BASEDIR=/lustre/scratch/daray/Ray_low_cov_work
+WORKDIR=$BASEDIR/bwa_work2
+
+# -p keeps a re-run from failing when the directory already exists. Stop if the
+# working directory cannot be entered, so no output lands in the wrong place.
+mkdir -p "$WORKDIR"
+cd "$WORKDIR" || exit 1
+
+######
+# Set up variables - will vary according to your needs, files and file locations
+######
+# This will be the name you use to process your files. RP1, RP2, RU1 and RU2 just
+# below are built from it.
+ABBREV=M8132
+# These files will be generated by Trimmomatic
+RP1="${ABBREV}_R1_paired.fastq"
+RP2="${ABBREV}_R2_paired.fastq"
+RU1="${ABBREV}_R1_unpaired.fastq"
+RU2="${ABBREV}_R2_unpaired.fastq"
+
+# The "-pe fill 10" directive at the top requests 10 slots. If you change the
+# number of threads, make sure to change that directive as well as this value.
+THREADS=9
+
+refgenome=myoLuc2.fa             # your reference genome for the assembly
+CLEAN_REF=myoLuc2.fa.clean       # your cleaned reference genome
+REF_HOME=$BASEDIR/Mluc.reference # the location of your reference genome
+
+RAW_READS_HOME=$BASEDIR/data.raw             # the location of your raw data
+PROCESSED_READS_HOME=$BASEDIR/data_processed # the location of your processed data
+QUALITY_INFO=$PROCESSED_READS_HOME/quality_info # where the quality stats will be saved
+SUPPORT_FILES=$BASEDIR/support_files # where support files like the adapter sequences are located
+mkdir -p "$PROCESSED_READS_HOME" "$QUALITY_INFO" "$SUPPORT_FILES"
+
+######
+# Set up aliases for major programs
+######
+BWA_HOME=/lustre/work/apps/bwa-0.6.2
+SAMTOOLS_HOME=/lustre/work/apps/samtools-1.2
+SAMTOOLS1_8_HOME=/lustre/work/apps/samtools-0.1.18
+PICARD_HOME=/lustre/work/apps/picard-tools-1.91
+BCFTOOLS_HOME=/lustre/work/apps/samtools-0.1.18/bcftools
+RAY_SOFTWARE=/lustre/work/daray/software
+TRIM_HOME=/lustre/work/apps/Trimmomatic-0.27
+FASTX_HOME=/lustre/work/apps/fastx_toolkit-0.0.14/bin
+VCFTOOLS_HOME=/lustre/work/daray/software/vcftools_0.1.12b/bin
+BEDTOOLS_HOME=/lustre/work/apps/bedtools-2.17.0/bin
+
+# Address that receives the progress e-mails.
+NOTIFY_EMAIL=david.4.ray@gmail.com
+
+# Send a progress e-mail whose subject and body are both "$ABBREV" plus "_STEP".
+#   $1 - step name, e.g. qc_finished
+notify() {
+  local msg="${ABBREV}_$1"
+  echo "$msg" | mailx -s "$msg" "$NOTIFY_EMAIL"
+}
+
+######
+# Set up insert size. This will be specific to the insert size for your
+# particular taxon's library.
+######
+insSize=1000
+
+######
+# Make sure your genome file has no blank lines.
+# NOTE(review): this step was labelled "ALREADY DONE, NOT BEING USED HERE", but
+# the commands are active and run every time - confirm which one is intended.
+######
+sed '/^$/d' "$REF_HOME/$refgenome" >tempGenome
+cp tempGenome "$REF_HOME/${refgenome}.clean"
+
+#echo "spaces" | mailx -s "spaces" david.4.ray@gmail.com
+
+################################################################################
+# Map reads to genome with BWA
+#~~~~~~~~~~~
+
+#[1a] Use bwa to index the genome
+# NOTE(review): also labelled "ALREADY DONE, NOT BEING USED HERE", yet both
+# indexing commands below are active - confirm which one is intended.
+"$BWA_HOME/bwa" index \
+  -a bwtsw \
+  "$REF_HOME/$CLEAN_REF"
+
+"$SAMTOOLS_HOME/samtools" faidx \
+  "$REF_HOME/$CLEAN_REF"
+
+#######
+# A line-continuation "\" must be the last character on its line. A comment
+# placed after it breaks the command, so the notes for each argument are kept
+# in the comment block above the command instead.
+#######
+
+###########
+# Prepare the reads and get quality stats
+###########
+#[1] Use Trimmomatic to generate paired and unpaired reads files
+#   PE         we will be processing paired reads
+#   -threads   number of processors to use
+#   -phred33   base qualities use the Phred+33 encoding (not a quality cutoff)
+#   inputs     the R1 and R2 raw reads
+#   outputs    R1 paired, R1 unpaired, R2 paired, R2 unpaired reads files
+java -jar "$TRIM_HOME/trimmomatic-0.27.jar" \
+  PE \
+  -threads "$THREADS" \
+  -phred33 \
+  "$RAW_READS_HOME/${ABBREV}_R1.fastq" \
+  "$RAW_READS_HOME/${ABBREV}_R2.fastq" \
+  "$PROCESSED_READS_HOME/$RP1" \
+  "$PROCESSED_READS_HOME/$RU1" \
+  "$PROCESSED_READS_HOME/$RP2" \
+  "$PROCESSED_READS_HOME/$RU2" \
+  "ILLUMINACLIP:$SUPPORT_FILES/TruSeq4-PE.fa:2:30:10" \
+  LEADING:20 \
+  TRAILING:20 \
+  SLIDINGWINDOW:4:20 \
+  MINLEN:33
+
+#[2] Generate quality stats
+for FASTQ in "$PROCESSED_READS_HOME/$ABBREV"*.fastq; do
+  # An unmatched glob stays literal; skip it instead of feeding it to fastx.
+  [[ -e "$FASTQ" ]] || continue
+  PROCESSED_FILE=$(basename "$FASTQ" .fastq)
+
+  "$FASTX_HOME/fastx_quality_stats" \
+    -Q33 \
+    -o "$QUALITY_INFO/$PROCESSED_FILE.stats" \
+    -i "$PROCESSED_READS_HOME/$PROCESSED_FILE.fastq"
+
+  "$FASTX_HOME/fastx_nucleotide_distribution_graph.sh" \
+    -i "$QUALITY_INFO/$PROCESSED_FILE.stats" \
+    -o "$QUALITY_INFO/${PROCESSED_FILE}_NUC.png" \
+    -t "$QUALITY_INFO/${PROCESSED_FILE}_clipped"
+
+  "$FASTX_HOME/fastq_quality_boxplot_graph.sh" \
+    -i "$QUALITY_INFO/$PROCESSED_FILE.stats" \
+    -o "$QUALITY_INFO/${PROCESSED_FILE}_BOX.png" \
+    -t "$QUALITY_INFO/${PROCESSED_FILE}_clipped"
+done
+
+notify qc_finished
+
+sleep 5
+
+################################################################################
+# Map reads to genome with BWA
+#~~~~~~~~~~~
+
+#===================
+# [1b] Map the reads to the genome
+# Run bwa aln with the settings shared by every read set.
+#   $1 - read-set tag (R1, R2 or RX); the output file is ${ABBREV}_TAG.sai
+#   $2 - FASTQ file to align against the cleaned reference genome
+bwa_aln() {
+  "$BWA_HOME/bwa" aln \
+    -n 0.01 \
+    -l 28 \
+    -t "$THREADS" \
+    -q 20 \
+    -f "${ABBREV}_$1.sai" \
+    "$REF_HOME/$CLEAN_REF" \
+    "$2"
+}
+
+bwa_aln R1 "$PROCESSED_READS_HOME/$RP1" # the R1 paired reads
+notify R1_map
+
+#===================
+bwa_aln R2 "$PROCESSED_READS_HOME/$RP2" # the R2 paired reads
+notify R2_map
+
+# Concatenate the unpaired R1 and R2 reads so they are mapped in a single run
+cat "$PROCESSED_READS_HOME/$RU1" "$PROCESSED_READS_HOME/$RU2" \
+  >"$PROCESSED_READS_HOME/${ABBREV}_RX_cat.fastq"
+
+bwa_aln RX "$PROCESSED_READS_HOME/${ABBREV}_RX_cat.fastq"
+notify RX_map
+
+#===================
+# [2] calculate the max insertion size
+# Arithmetic expansion is required: a plain assignment would store the literal
+# text "insSize*2" instead of the number.
+maxInsSize=$((insSize * 2))
+
+#===================
+# [3] use sampe/samse and SAMtools to create bam files of the mapped reads
+# Create sam file from paired mapped reads
+#   -a          maximum insert size
+#   -f          output sam file
+#   arguments   reference sequence, R1 and R2 mapped reads (.sai), then the
+#               R1 and R2 paired reads
+"$BWA_HOME/bwa" sampe \
+  -a "$maxInsSize" \
+  -f "${ABBREV}_SAMPE.sam" \
+  "$REF_HOME/$CLEAN_REF" \
+  "${ABBREV}_R1.sai" \
+  "${ABBREV}_R2.sai" \
+  "$PROCESSED_READS_HOME/$RP1" \
+  "$PROCESSED_READS_HOME/$RP2"
+
+# Create sam file from unpaired mapped reads
+#   arguments   reference sequence, mapped unpaired reads (.sai), unpaired reads
+#   stdout      redirected to the output sam file
+"$BWA_HOME/bwa" samse \
+  "$REF_HOME/$CLEAN_REF" \
+  "${ABBREV}_RX.sai" \
+  "$PROCESSED_READS_HOME/${ABBREV}_RX_cat.fastq" \
+  >"${ABBREV}_RX_SAMSE.sam"
+
+# Convert paired sam file to bam
+#   -Sb         convert sam to bam
+#   -o          output bam file
+"$SAMTOOLS_HOME/samtools" view \
+  -Sb \
+  -o "${ABBREV}_SAMPE.bam" \
+  "${ABBREV}_SAMPE.sam"
+
+# Convert unpaired sam file to bam
+"$SAMTOOLS_HOME/samtools" view \
+  -Sb \
+  -o "${ABBREV}_RX_SAMSE.bam" \
+  "${ABBREV}_RX_SAMSE.sam"
+
+# Merge the paired and unpaired mapped reads to a single bam file
+#   arguments   output merged file, then input bam files 1 and 2
+"$SAMTOOLS_HOME/samtools" merge \
+  "${ABBREV}_merge.bam" \
+  "${ABBREV}_SAMPE.bam" \
+  "${ABBREV}_RX_SAMSE.bam"
+
+# Filter the merged reads (this step does not sort; sorting happens below)
+#   -F 4        skip reads whose "unmapped" flag (0x4) is set
+#   -q 20       skip alignments with mapping quality below 20
+#   -b          output in bam format
+#   -o          combined file. R3 represents all paired and unpaired reads
+"$SAMTOOLS_HOME/samtools" view \
+  -F 4 \
+  -q 20 \
+  -b \
+  -o "${ABBREV}_R3.bam" \
+  "${ABBREV}_merge.bam"
+
+#### for samtools v1.19
+# NOTE(review): presumably the legacy 0.1.19 release is meant - confirm.
+##--- $SAMTOOLS_HOME/samtools sort \
+#     $ABBREV"_R3.bam" \
+#     -@ $THREADS \
+#     $ABBREV"_R3_sorted"
+
+#### for samtools v1.2
+#   -O bam      write output as bam file
+#   -o          sorted output file
+#   -T          prefix to use for temporary files
+#   -@          number of threads to use
+"$SAMTOOLS_HOME/samtools" sort \
+  -O bam \
+  -o "${ABBREV}_R3_sorted.bam" \
+  -T "${ABBREV}_R3_sorted" \
+  -@ "$THREADS" \
+  "${ABBREV}_R3.bam"
+
+notify RX_sort
+
+#===================
+# [4] remove sequencing duplicates from the sorted bam file w/ PICARD
+# java.io.tmpdir and TMP_DIR both point at ./tmp, so create it up front.
+mkdir -p tmp
+java \
+  -Xmx24g \
+  -Djava.io.tmpdir=tmp \
+  -jar "$PICARD_HOME/MarkDuplicates.jar" \
+  "I=${ABBREV}_R3_sorted.bam" \
+  "O=${ABBREV}_R3_noDup.bam" \
+  "M=${ABBREV}_R3_dupMetric.out" \
+  REMOVE_DUPLICATES=true \
+  MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=100 \
+  VALIDATION_STRINGENCY=SILENT \
+  ASSUME_SORTED=TRUE \
+  TMP_DIR=tmp
+
+#==================
+#[5] create pileup from noDup.bam
+"$SAMTOOLS1_8_HOME/samtools" mpileup \
+  -C50 \
+  -f "$REF_HOME/$CLEAN_REF" \
+  "${ABBREV}_R3_noDup.bam" \
+  >"${ABBREV}_mPileUp_0_1_18.vcf"
+
+notify pileup_finished
+
+#=======================
+#[6] generate fasta consensus from pileup
+perl "$RAY_SOFTWARE/pileup2fasta_v1-4.pl" \
+  -i "${ABBREV}_mPileUp_0_1_18.vcf" \
+  -o "${ABBREV}.fa" \
+  -g "${ABBREV}.gff" \
+  -b 8 \
+  -s \
+  -V
+
+# Keep only the coverage lines that contain "genome"
+"$BEDTOOLS_HOME/bedtools" genomecov \
+  -ibam "${ABBREV}_R3_noDup.bam" \
+  -g "${ABBREV}.fa" \
+  | grep genome >"${ABBREV}_genome_cov.txt"
+
+notify fasta_finished
+
+notify assembly_finished
+sleep 5