working version

davidaray · Jul 21, 2015 · a7cdac9 · a7cdac9
1 parent 13a0d67
commit a7cdac9
Showing 1 changed file with 294 additions and 0 deletions.
diff --git a/bwa_map_generic_driver.sh b/bwa_map_generic_driver.sh
@@ -0,0 +1,294 @@
+#!/bin/bash
+#$ -V
+#$ -cwd
+#$ -S /bin/bash
+#$ -N M8132_assembly
+#$ -o $JOB_NAME.o$JOB_ID
+#$ -e $JOB_NAME.e$JOB_ID
+#$ -q ray512cc,raycc
+#$ -pe fill 10 
+#$ -P communitycluster
+
+#This file will process raw Illumina data using Trimmomatic.  This will be followed by mapping to a reference genome to create a new genome assembly.
+
+# Project root on scratch and the directory this run works in.
+BASEDIR=/lustre/scratch/daray/Ray_low_cov_work
+WORKDIR="$BASEDIR/bwa_work2"
+
+# -p keeps a rerun from failing when the directory already exists. Stop the
+# job if the directory cannot be entered, because the relative output paths
+# used further down would otherwise land in the wrong place.
+mkdir -p "$WORKDIR"
+cd "$WORKDIR" || exit 1
+
+######
+# Run-specific settings - edit these for your sample, files and file locations
+######
+ABBREV=M8132  # Sample prefix used to build the input and output file names.
+# Read files written by Trimmomatic, derived from ABBREV
+RP1="${ABBREV}_R1_paired.fastq"
+RP2="${ABBREV}_R2_paired.fastq"
+RU1="${ABBREV}_R1_unpaired.fastq"
+RU2="${ABBREV}_R2_unpaired.fastq"
+
+# Worker threads handed to trimmomatic, bwa aln and samtools sort. The
+# scheduler header near the top of this script requests 10 slots with -pe fill 10.
+# Keep the two values in step if you change either one.
+THREADS=9
+
+refgenome=myoLuc2.fa                                # reference genome for the assembly
+CLEAN_REF=myoLuc2.fa.clean                          # cleaned reference genome
+REF_HOME="$BASEDIR/Mluc.reference"                  # location of the reference genome
+
+RAW_READS_HOME="$BASEDIR/data.raw"                  # location of the raw data
+PROCESSED_READS_HOME="$BASEDIR/data_processed"      # location of the processed data
+QUALITY_INFO="$PROCESSED_READS_HOME/quality_info"   # where the quality stats are saved
+SUPPORT_FILES="$BASEDIR/support_files"              # support files such as the adapter sequences
+
+# -p does nothing when a directory already exists, so a rerun of the script
+# no longer prints mkdir errors.
+mkdir -p "$PROCESSED_READS_HOME" "$QUALITY_INFO" "$SUPPORT_FILES"
+
+######
+#set up the install locations of the major programs
+######
+# Toolkits installed under the shared apps tree
+BWA_HOME="/lustre/work/apps/bwa-0.6.2"
+TRIM_HOME="/lustre/work/apps/Trimmomatic-0.27"
+FASTX_HOME="/lustre/work/apps/fastx_toolkit-0.0.14/bin"
+PICARD_HOME="/lustre/work/apps/picard-tools-1.91"
+BEDTOOLS_HOME="/lustre/work/apps/bedtools-2.17.0/bin"
+
+# Two samtools releases are kept side by side. bcftools comes from the older one.
+SAMTOOLS_HOME="/lustre/work/apps/samtools-1.2"
+SAMTOOLS1_8_HOME="/lustre/work/apps/samtools-0.1.18"
+BCFTOOLS_HOME="$SAMTOOLS1_8_HOME/bcftools"
+
+# Locally maintained software
+RAY_SOFTWARE="/lustre/work/daray/software"
+VCFTOOLS_HOME="$RAY_SOFTWARE/vcftools_0.1.12b/bin"
+
+
+######
+#Set up insert size.  Set this to the insert size of the sequencing library for your particular taxon.
+######
+insSize=1000  # library insert size - doubled further down to cap the bwa sampe insert size
+
+######
+# Strip blank lines from the reference genome and save the result as CLEAN_REF.
+# NOTE - the old header marked this step as already done and not in use, but
+# the commands are live and run on every submission. Comment them out if the
+# cleaned reference already exists.
+######
+# Copy only when sed succeeded, so that an empty tempGenome is never written
+# over the cleaned reference that the mapping steps read.
+sed '/^$/d' "$REF_HOME/$refgenome" >tempGenome &&
+	cp tempGenome "$REF_HOME/$CLEAN_REF"
+
+#echo "spaces" |  mailx -s "spaces" [email protected]
+
+################################################################################
+# Index the cleaned reference
+#~~~~~~~~~~~
+
+#[1a] Build the BWA index with the bwtsw algorithm, then the samtools FASTA index.
+#     The old header tagged this step as already done and not in use, yet both
+#     commands are live and run every time the script does.
+	"$BWA_HOME/bwa" index -a bwtsw "$REF_HOME/$CLEAN_REF"
+
+	"$SAMTOOLS_HOME/samtools" faidx "$REF_HOME/$CLEAN_REF"
+
+
+#######
+###!!!!!!! A line-continuation "\" must be the very last character on its line.  A comment or whitespace after it breaks the command, so check every continued command below and get rid of anything that follows a \.
+#######
+
+
+###########
+#Prepare the reads and get quality stats
+###########
+#[1] Use trimmomatic to generate paired and unpaired reads files
+# PE selects paired-end mode, -threads uses THREADS workers and -phred33
+# declares the quality score encoding of the input.
+# Argument order after the options: R1 input, R2 input, then the outputs
+# R1 paired, R1 unpaired, R2 paired, R2 unpaired, followed by the trimming steps.
+# Comments are kept up here because nothing may follow a continuation
+# backslash - a trailing comment cuts the command off at that line.
+java -jar "$TRIM_HOME/trimmomatic-0.27.jar" \
+	PE \
+	-threads "$THREADS" \
+	-phred33 \
+	"$RAW_READS_HOME/${ABBREV}_R1.fastq" \
+	"$RAW_READS_HOME/${ABBREV}_R2.fastq" \
+	"$PROCESSED_READS_HOME/$RP1" \
+	"$PROCESSED_READS_HOME/$RU1" \
+	"$PROCESSED_READS_HOME/$RP2" \
+	"$PROCESSED_READS_HOME/$RU2" \
+	"ILLUMINACLIP:$SUPPORT_FILES/TruSeq4-PE.fa:2:30:10" \
+	LEADING:20 \
+	TRAILING:20 \
+	SLIDINGWINDOW:4:20 \
+	MINLEN:33
+
+#[2] Generate quality stats
+# For every trimmed FASTQ of this sample write three files into QUALITY_INFO -
+# a fastx stats table, a nucleotide distribution plot and a quality boxplot.
+for FASTQ in "$PROCESSED_READS_HOME/$ABBREV"*.fastq
+do
+	# When trimming produced no files the glob stays unexpanded. Skip that
+	# literal pattern rather than feeding a nonexistent file to fastx.
+	[[ -e "$FASTQ" ]] || continue
+	PROCESSED_FILE=$(basename "$FASTQ" .fastq)
+
+	"$FASTX_HOME/fastx_quality_stats" \
+		-Q33 \
+		-o "$QUALITY_INFO/$PROCESSED_FILE.stats" \
+		-i "$PROCESSED_READS_HOME/$PROCESSED_FILE.fastq"
+
+	"$FASTX_HOME/fastx_nucleotide_distribution_graph.sh" \
+		-i "$QUALITY_INFO/$PROCESSED_FILE.stats" \
+		-o "$QUALITY_INFO/${PROCESSED_FILE}_NUC.png" \
+		-t "$QUALITY_INFO/${PROCESSED_FILE}_clipped"
+
+	"$FASTX_HOME/fastq_quality_boxplot_graph.sh" \
+		-i "$QUALITY_INFO/$PROCESSED_FILE.stats" \
+		-o "$QUALITY_INFO/${PROCESSED_FILE}_BOX.png" \
+		-t "$QUALITY_INFO/${PROCESSED_FILE}_clipped"
+
+done
+
+# Mail a notice that trimming and the quality checks are finished, then pause.
+echo "${ABBREV}_qc_finished" | mailx -s "${ABBREV}_qc_finished" [email protected]
+
+sleep 5
+
+################################################################################
+# Map reads to genome with BWA
+#~~~~~~~~~~~
+
+
+#===================
+# [1b] Map the reads to the genome
+# Align the R1 paired reads against the cleaned reference. -t sets the thread
+# count and -f names the .sai output file. Comments sit above the command
+# because any text after a continuation backslash ends the command there.
+	"$BWA_HOME/bwa" aln \
+		-n 0.01 \
+		-l 28 \
+		-t "$THREADS" \
+		-q 20 \
+		-f "${ABBREV}_R1.sai" \
+		"$REF_HOME/$CLEAN_REF" \
+		"$PROCESSED_READS_HOME/$RP1"
+
+echo "${ABBREV}_R1_map" | mailx -s "${ABBREV}_R1_map" [email protected]
+
+#===================
+# Same alignment settings as for R1, applied to the R2 paired reads.
+	"$BWA_HOME/bwa" aln -n 0.01 -l 28 -t "$THREADS" -q 20 \
+		-f "${ABBREV}_R2.sai" \
+		"$REF_HOME/$CLEAN_REF" "$PROCESSED_READS_HOME/$RP2"
+
+echo "${ABBREV}_R2_map" | mailx -s "${ABBREV}_R2_map" [email protected]
+
+# Pool the unpaired R1 and R2 reads into one FASTQ, then align that file with
+# the same settings as the paired reads.
+cat "$PROCESSED_READS_HOME/$RU1" "$PROCESSED_READS_HOME/$RU2" \
+	>"$PROCESSED_READS_HOME/${ABBREV}_RX_cat.fastq"
+
+	"$BWA_HOME/bwa" aln -n 0.01 -l 28 -t "$THREADS" -q 20 \
+		-f "${ABBREV}_RX.sai" \
+		"$REF_HOME/$CLEAN_REF" "$PROCESSED_READS_HOME/${ABBREV}_RX_cat.fastq"
+
+echo "${ABBREV}_RX_map" | mailx -s "${ABBREV}_RX_map" [email protected]
+
+
+#===================
+# [2] calculate the max insertion size
+# Use shell arithmetic. A bare insSize*2 stores that literal text, while
+# bwa sampe -a needs a number.
+	maxInsSize=$((insSize * 2))
+
+#===================
+# [3] use sampe and SAMtools to create bam files of the mapped reads
+#create sam file from paired mapped reads
+# -a caps the insert size at maxInsSize and -f names the output SAM file.
+# Positional arguments are the reference, the R1 and R2 .sai alignments, then
+# the R1 and R2 paired reads. Comments stay above the command because any
+# text after a continuation backslash cuts the command short.
+	"$BWA_HOME/bwa" sampe \
+		-a "$maxInsSize" \
+		-f "${ABBREV}_SAMPE.sam" \
+		"$REF_HOME/$CLEAN_REF" \
+		"${ABBREV}_R1.sai" \
+		"${ABBREV}_R2.sai" \
+		"$PROCESSED_READS_HOME/$RP1" \
+		"$PROCESSED_READS_HOME/$RP2"
+
+#create sam file from unpaired mapped reads
+# Arguments are the reference, the RX .sai alignments and the concatenated
+# unpaired reads. The SAM output is redirected into ABBREV_RX_SAMSE.sam.
+# No comment and no backslash may follow the final redirect.
+	"$BWA_HOME/bwa" samse \
+		"$REF_HOME/$CLEAN_REF" \
+		"${ABBREV}_RX.sai" \
+		"$PROCESSED_READS_HOME/${ABBREV}_RX_cat.fastq" \
+		>"${ABBREV}_RX_SAMSE.sam"
+
+#convert paired sam file to bam
+# -Sb reads SAM and writes BAM, -o names the BAM file and the last argument
+# is the SAM input. Nothing may follow a continuation backslash.
+	"$SAMTOOLS_HOME/samtools" view \
+		-Sb \
+		-o "${ABBREV}_SAMPE.bam" \
+		"${ABBREV}_SAMPE.sam"
+
+#convert unpaired sam file to bam
+# -Sb reads SAM and writes BAM, -o names the BAM file and the last argument
+# is the SAM input. Nothing may follow a continuation backslash.
+	"$SAMTOOLS_HOME/samtools" view \
+		-Sb \
+		-o "${ABBREV}_RX_SAMSE.bam" \
+		"${ABBREV}_RX_SAMSE.sam"
+
+#merge the paired and unpaired mapped reads to a single bam file
+# The first file name is the merged output and the other two are the inputs.
+# The continuation backslash must be the very last character on its line.
+	"$SAMTOOLS_HOME/samtools" merge \
+		"${ABBREV}_merge.bam" \
+		"${ABBREV}_SAMPE.bam" \
+		"${ABBREV}_RX_SAMSE.bam"
+
+# Filter the merged BAM - this step does not sort.
+# -F 4 drops reads flagged as unmapped, -q 20 drops alignments with mapping
+# quality below 20 and -b writes BAM. R3 stands for all paired and unpaired
+# reads combined. Comments stay above the command because nothing may
+# follow a continuation backslash.
+	"$SAMTOOLS_HOME/samtools" view \
+		-F 4 \
+		-q 20 \
+		-b \
+		-o "${ABBREV}_R3.bam" \
+		"${ABBREV}_merge.bam"
+
+#### for samtools v1.19		
+##---	$SAMTOOLS_HOME/samtools sort 			\
+#		$ABBREV"_R3.bam"	\
+#		-@ $THREADS 		\
+#		$ABBREV"_R3_sorted"
+
+#### for samtools v1.2
+# -O bam selects BAM output, -o names the sorted file, -T gives the prefix
+# for temporary files and -@ sets the number of threads. The last argument
+# is the input BAM and must not be followed by a backslash.
+	"$SAMTOOLS_HOME/samtools" sort \
+		-O bam \
+		-o "${ABBREV}_R3_sorted.bam" \
+		-T "${ABBREV}_R3_sorted" \
+		-@ "$THREADS" \
+		"${ABBREV}_R3.bam"
+
+echo "${ABBREV}_RX_sort" | mailx -s "${ABBREV}_RX_sort" [email protected]
+
+#===================
+# [4] remove sequencing duplicates from the sorted bam file w/ PICARD
+# Reads ABBREV_R3_sorted.bam and writes ABBREV_R3_noDup.bam with duplicates
+# removed plus the metrics file ABBREV_R3_dupMetric.out. The JVM heap limit
+# is 24g and temporary files go to the relative directory tmp.
+	java -Xmx24g -Djava.io.tmpdir=tmp \
+		-jar "$PICARD_HOME/MarkDuplicates.jar" \
+		I="${ABBREV}_R3_sorted.bam" \
+		O="${ABBREV}_R3_noDup.bam" \
+		M="${ABBREV}_R3_dupMetric.out" \
+		REMOVE_DUPLICATES=true \
+		MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=100 \
+		VALIDATION_STRINGENCY=SILENT \
+		ASSUME_SORTED=TRUE \
+		TMP_DIR=tmp
+
+#==================
+#[5] create pileup from noDup.bam
+# Runs the samtools 0.1.18 build against the cleaned reference. The output
+# keeps its .vcf file name because the next step reads it under that name.
+
+"$SAMTOOLS1_8_HOME/samtools" mpileup \
+	-C50 \
+	-f "$REF_HOME/$CLEAN_REF" \
+	"${ABBREV}_R3_noDup.bam" \
+	>"${ABBREV}_mPileUp_0_1_18.vcf"
+
+echo "${ABBREV}_pileup_finished" | mailx -s "${ABBREV}_pileup_finished" [email protected]
+
+
+#=======================
+#[6] generate fasta consensus from pileup
+# Presumably -i is the pileup input, -o the consensus FASTA and -g a GFF
+# output. The -b 8, -s and -V switches are passed as they were -
+# TODO confirm all of these against the perl script.
+
+perl "$RAY_SOFTWARE/pileup2fasta_v1-4.pl" \
+	-i "${ABBREV}_mPileUp_0_1_18.vcf" \
+	-o "${ABBREV}.fa" -g "${ABBREV}.gff" \
+	-b 8 -s -V
+
+
+# Coverage report for the deduplicated BAM. Only output lines that contain
+# the word genome are kept.
+# NOTE review: -g is handed the consensus FASTA here - confirm that this is
+# what bedtools genomecov expects for that option.
+"$BEDTOOLS_HOME/bedtools" genomecov -ibam "${ABBREV}_R3_noDup.bam" -g "${ABBREV}.fa" |
+	grep genome >"${ABBREV}_genome_cov.txt"
+
+
+# Final notices - one for the consensus FASTA and one for the whole run.
+echo "${ABBREV}_fasta_finished" | mailx -s "${ABBREV}_fasta_finished" [email protected]
+
+
+echo "${ABBREV}_assembly_finished" | mailx -s "${ABBREV}_assembly_finished" [email protected]
+sleep 5
+