Skip to content

Commit

Permalink
working version
Browse files Browse the repository at this point in the history
  • Loading branch information
davidaray committed Jul 21, 2015
1 parent 13a0d67 commit a7cdac9
Showing 1 changed file with 294 additions and 0 deletions.
294 changes: 294 additions & 0 deletions bwa_map_generic_driver.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,294 @@
#!/bin/bash
#$ -V
#$ -cwd
#$ -S /bin/bash
#$ -N M8132_assembly
#$ -o $JOB_NAME.o$JOB_ID
#$ -e $JOB_NAME.e$JOB_ID
#$ -q ray512cc,raycc
#$ -pe fill 10
#$ -P communitycluster

#This file will process raw Illumina data using Trimmomatic. This will be followed by mapping to a reference genome to create a new genome assembly.

# Root of this project on scratch storage; the other data locations are derived from it.
BASEDIR=/lustre/scratch/daray/Ray_low_cov_work
# Directory in which this run writes its alignment files (.sai/.sam/.bam).
WORKDIR=$BASEDIR/bwa_work2

# -p: do not complain when the directory is already there (e.g. when the job is re-submitted).
mkdir -p "$WORKDIR"
# Every output below uses a relative path, so stop rather than write into the wrong directory.
cd "$WORKDIR" || { echo "ERROR: cannot change to work directory $WORKDIR" >&2; exit 1; }

######
#set up variables - will vary according to your needs, files and file locations
######
ABBREV=M8132 #Sample name; it is the prefix of every file this run reads or writes. RP1, RP2, RU1 and RU2 below are derived from it automatically.
#These files will be generated by trimmomatic
RP1="${ABBREV}_R1_paired.fastq"
RP2="${ABBREV}_R2_paired.fastq"
RU1="${ABBREV}_R1_unpaired.fastq"
RU2="${ABBREV}_R2_unpaired.fastq"

THREADS=9 #Threads handed to the tools. The "#$ -pe fill 10" directive at the top requests 10 slots; if you change one, change the other to match.

refgenome=myoLuc2.fa # your reference genome for the assembly
CLEAN_REF="${refgenome}.clean" # your cleaned reference genome (derived so the two names cannot drift apart)
REF_HOME=$BASEDIR/Mluc.reference #the location of your reference genome

RAW_READS_HOME=$BASEDIR/data.raw #the location of your raw data
PROCESSED_READS_HOME=$BASEDIR/data_processed #the location of your processed data
QUALITY_INFO=$PROCESSED_READS_HOME/quality_info #Where the quality stats will be saved
SUPPORT_FILES=$BASEDIR/support_files #where the support files like the adapter sequences will be located.

# -p creates missing parents and is silent when a directory already exists, so re-runs are clean.
mkdir -p "$PROCESSED_READS_HOME" "$QUALITY_INFO" "$SUPPORT_FILES"

######
#Install locations of the major programs
######
# Cluster-wide installs
BWA_HOME="/lustre/work/apps/bwa-0.6.2"
SAMTOOLS_HOME="/lustre/work/apps/samtools-1.2"
SAMTOOLS1_8_HOME="/lustre/work/apps/samtools-0.1.18"
BCFTOOLS_HOME="${SAMTOOLS1_8_HOME}/bcftools"
PICARD_HOME="/lustre/work/apps/picard-tools-1.91"
TRIM_HOME="/lustre/work/apps/Trimmomatic-0.27"
FASTX_HOME="/lustre/work/apps/fastx_toolkit-0.0.14/bin"
BEDTOOLS_HOME="/lustre/work/apps/bedtools-2.17.0/bin"

# Lab-local installs
RAY_SOFTWARE="/lustre/work/daray/software"
VCFTOOLS_HOME="${RAY_SOFTWARE}/vcftools_0.1.12b/bin"


######
#Insert size of the sequencing library. Specific to your particular taxon's library.
######
insSize=1000

######
#Remove blank lines from the reference genome. The result is written straight to
#$CLEAN_REF, which is the file every later step reads.
#NOTE: this step and the indexing below run on every submission. Comment them out
#if the cleaned, indexed reference already exists.
######
sed '/^$/d' "$REF_HOME/$refgenome" >"$REF_HOME/$CLEAN_REF" \
  || { echo "ERROR: could not clean reference $REF_HOME/$refgenome" >&2; exit 1; }

#echo "spaces" | mailx -s "spaces" [email protected]

################################################################################
# Index the cleaned reference genome
#~~~~~~~~~~~

#[1a] Use bwa to build its index of the cleaned genome with the bwtsw algorithm
"$BWA_HOME/bwa" index \
  -a bwtsw \
  "$REF_HOME/$CLEAN_REF"

#Build the samtools FASTA index (.fai) of the cleaned genome
"$SAMTOOLS_HOME/samtools" faidx \
  "$REF_HOME/$CLEAN_REF"


#######
# NOTE: never put a comment (or anything else, even a space) after a trailing "\".
# The backslash only continues the command when it is the very last character on
# the line, so options are documented in the comment block above each command.
#######


###########
#Prepare the reads and get quality stats
###########
#[1] Use trimmomatic to generate paired and unpaired reads files
#   PE                   process paired-end reads
#   -threads             number of threads to use
#   -phred33             quality scores use the phred+33 encoding
#   arguments 1-2        input raw reads: R1, then R2
#   arguments 3-6        outputs, in order: R1 paired, R1 unpaired, R2 paired, R2 unpaired
#   ILLUMINACLIP         clip the adapters listed in TruSeq4-PE.fa
#                        (seed mismatches 2, palindrome threshold 30, simple threshold 10)
#   LEADING/TRAILING:20  trim bases with quality below 20 from the start/end of each read
#   SLIDINGWINDOW:4:20   cut the read once the mean quality of a 4-base window drops below 20
#   MINLEN:33            discard reads shorter than 33 bases after trimming
java -jar "$TRIM_HOME/trimmomatic-0.27.jar" \
  PE \
  -threads "$THREADS" \
  -phred33 \
  "$RAW_READS_HOME/${ABBREV}_R1.fastq" \
  "$RAW_READS_HOME/${ABBREV}_R2.fastq" \
  "$PROCESSED_READS_HOME/$RP1" \
  "$PROCESSED_READS_HOME/$RU1" \
  "$PROCESSED_READS_HOME/$RP2" \
  "$PROCESSED_READS_HOME/$RU2" \
  "ILLUMINACLIP:$SUPPORT_FILES/TruSeq4-PE.fa:2:30:10" \
  LEADING:20 \
  TRAILING:20 \
  SLIDINGWINDOW:4:20 \
  MINLEN:33

#[2] Generate quality stats for every processed fastq file of this sample.
#    For each file: a per-position stats table (-Q33 = phred+33 qualities), then a
#    nucleotide-distribution graph and a quality boxplot drawn from that table
#    (-t sets the graph title).
for FASTQ in "$PROCESSED_READS_HOME/$ABBREV"*.fastq; do
  # When nothing matches, bash leaves the pattern itself in FASTQ; skip it
  # instead of running the tools on a file that does not exist.
  [[ -e "$FASTQ" ]] || continue

  PROCESSED_FILE=$(basename "$FASTQ" .fastq)

  "$FASTX_HOME/fastx_quality_stats" \
    -Q33 \
    -o "$QUALITY_INFO/$PROCESSED_FILE.stats" \
    -i "$FASTQ"

  "$FASTX_HOME/fastx_nucleotide_distribution_graph.sh" \
    -i "$QUALITY_INFO/$PROCESSED_FILE.stats" \
    -o "$QUALITY_INFO/${PROCESSED_FILE}_NUC.png" \
    -t "$QUALITY_INFO/${PROCESSED_FILE}_clipped"

  "$FASTX_HOME/fastq_quality_boxplot_graph.sh" \
    -i "$QUALITY_INFO/$PROCESSED_FILE.stats" \
    -o "$QUALITY_INFO/${PROCESSED_FILE}_BOX.png" \
    -t "$QUALITY_INFO/${PROCESSED_FILE}_clipped"
done

# Progress notification by mail
echo "${ABBREV}_qc_finished" | mailx -s "${ABBREV}_qc_finished" [email protected]

sleep 5

################################################################################
# Map reads to genome with BWA
#~~~~~~~~~~~


#===================
# [1b] Map the reads to the genome

# Align one fastq file to the cleaned reference genome with "bwa aln".
# Globals:   BWA_HOME, THREADS, REF_HOME, CLEAN_REF (read)
# Arguments: $1 - output .sai file
#            $2 - input fastq file
# Options:   -n 0.01  allowed fraction of missing alignments
#            -l 28    seed length
#            -t       number of threads
#            -q 20    quality threshold for read trimming
#            -f       output file
# Returns:   exit status of bwa
run_bwa_aln() {
  "$BWA_HOME/bwa" aln \
    -n 0.01 \
    -l 28 \
    -t "$THREADS" \
    -q 20 \
    -f "$1" \
    "$REF_HOME/$CLEAN_REF" \
    "$2"
}

# R1 paired reads
run_bwa_aln "${ABBREV}_R1.sai" "$PROCESSED_READS_HOME/$RP1"

echo "${ABBREV}_R1_map" | mailx -s "${ABBREV}_R1_map" [email protected]

#===================
# R2 paired reads
run_bwa_aln "${ABBREV}_R2.sai" "$PROCESSED_READS_HOME/$RP2"

echo "${ABBREV}_R2_map" | mailx -s "${ABBREV}_R2_map" [email protected]

# The unpaired reads of both mates are pooled into one file (RX) and mapped as single-end reads
cat "$PROCESSED_READS_HOME/$RU1" "$PROCESSED_READS_HOME/$RU2" >"$PROCESSED_READS_HOME/${ABBREV}_RX_cat.fastq"

run_bwa_aln "${ABBREV}_RX.sai" "$PROCESSED_READS_HOME/${ABBREV}_RX_cat.fastq"

echo "${ABBREV}_RX_map" | mailx -s "${ABBREV}_RX_map" [email protected]


#===================
# [2] calculate the max insertion size (twice the library insert size)
# Arithmetic expansion is required here: a plain assignment would store the
# literal text "insSize*2" instead of a number.
maxInsSize=$((insSize * 2))

#===================
# [3] use sampe and SAMtools to create bam files of the mapped reads
# (No comments after a trailing "\" - they break the line continuation.)

#create sam file from paired mapped reads
#   -a          maximum insert size
#   -f          output sam file
#   arguments:  reference, R1 .sai, R2 .sai, R1 paired reads, R2 paired reads
"$BWA_HOME/bwa" sampe \
  -a "$maxInsSize" \
  -f "${ABBREV}_SAMPE.sam" \
  "$REF_HOME/$CLEAN_REF" \
  "${ABBREV}_R1.sai" \
  "${ABBREV}_R2.sai" \
  "$PROCESSED_READS_HOME/$RP1" \
  "$PROCESSED_READS_HOME/$RP2"

#create sam file from unpaired mapped reads
#   arguments:  reference, RX .sai, pooled unpaired reads; the sam output goes to stdout
"$BWA_HOME/bwa" samse \
  "$REF_HOME/$CLEAN_REF" \
  "${ABBREV}_RX.sai" \
  "$PROCESSED_READS_HOME/${ABBREV}_RX_cat.fastq" \
  >"${ABBREV}_RX_SAMSE.sam"

# (No comments after a trailing "\" - they break the line continuation.)

#convert paired sam file to bam
#   -S input is sam, -b output is bam, -o output file
"$SAMTOOLS_HOME/samtools" view \
  -Sb \
  -o "${ABBREV}_SAMPE.bam" \
  "${ABBREV}_SAMPE.sam"

#convert unpaired sam file to bam
"$SAMTOOLS_HOME/samtools" view \
  -Sb \
  -o "${ABBREV}_RX_SAMSE.bam" \
  "${ABBREV}_RX_SAMSE.sam"

#merge the paired and unpaired mapped reads to a single bam file
#   arguments: output bam, then the input bam files
"$SAMTOOLS_HOME/samtools" merge \
  "${ABBREV}_merge.bam" \
  "${ABBREV}_SAMPE.bam" \
  "${ABBREV}_RX_SAMSE.bam"

#filter the merged alignments (this does not sort; sorting is the next step)
#   -F 4   drop reads whose flag has bit 4 set, i.e. unmapped reads
#   -q 20  drop alignments with mapping quality below 20
#   -b     output in bam format
#   -o     output file. R3 represents all paired and unpaired reads
"$SAMTOOLS_HOME/samtools" view \
  -F 4 \
  -q 20 \
  -b \
  -o "${ABBREV}_R3.bam" \
  "${ABBREV}_merge.bam"

#### legacy "samtools sort <in.bam> <out.prefix>" syntax (samtools 0.1.x), kept for reference
# $SAMTOOLS_HOME/samtools sort \
#   -@ $THREADS \
#   $ABBREV"_R3.bam" \
#   $ABBREV"_R3_sorted"

#### for samtools v1.2
# (No comments after a trailing "\" - they break the line continuation.)
#   -O bam  write output as bam
#   -o      output file
#   -T      prefix for the temporary files written while sorting
#   -@      number of sorting threads
#   last argument: input file
"$SAMTOOLS_HOME/samtools" sort \
  -O bam \
  -o "${ABBREV}_R3_sorted.bam" \
  -T "${ABBREV}_R3_sorted" \
  -@ "$THREADS" \
  "${ABBREV}_R3.bam"

echo "${ABBREV}_RX_sort" | mailx -s "${ABBREV}_RX_sort" [email protected]

#===================
# [4] remove sequencing duplicates from the sorted bam file w/ PICARD
# The command line is collected in an array, one argument per element, and then run.
picard_cmd=(
  java
  -Xmx24g
  -Djava.io.tmpdir=tmp
  -jar "$PICARD_HOME/MarkDuplicates.jar"
  # input (sorted bam), output (duplicates removed) and duplication metrics file
  "I=${ABBREV}_R3_sorted.bam"
  "O=${ABBREV}_R3_noDup.bam"
  "M=${ABBREV}_R3_dupMetric.out"
  REMOVE_DUPLICATES=true
  MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=100
  VALIDATION_STRINGENCY=SILENT
  ASSUME_SORTED=TRUE
  TMP_DIR=tmp
)
"${picard_cmd[@]}"

#==================
#[5] create pileup from noDup.bam (uses the older samtools 0.1.18 install)
#   -C50  coefficient for downgrading the mapping quality of reads with excessive mismatches
#   -f    indexed reference fasta
# NOTE(review): without -g/-u mpileup writes plain pileup text, not VCF, despite the
# ".vcf" suffix. The name is kept because the next step reads this exact file.

"$SAMTOOLS1_8_HOME/samtools" mpileup \
  -C50 \
  -f "$REF_HOME/$CLEAN_REF" \
  "${ABBREV}_R3_noDup.bam" \
  >"${ABBREV}_mPileUp_0_1_18.vcf"

# Was "--echo", which is not a command, so this notification was never sent.
echo "${ABBREV}_pileup_finished" | mailx -s "${ABBREV}_pileup_finished" [email protected]


#=======================
#[6] generate fasta consensus from pileup
# Reads the pileup written by the previous step; writes $ABBREV.fa and $ABBREV.gff.

perl "$RAY_SOFTWARE/pileup2fasta_v1-4.pl" \
  -i "${ABBREV}_mPileUp_0_1_18.vcf" \
  -o "${ABBREV}.fa" \
  -g "${ABBREV}.gff" \
  -b 8 \
  -s \
  -V


# Coverage of the de-duplicated alignments; only the "genome" lines of the
# bedtools output are kept.
"$BEDTOOLS_HOME/bedtools" genomecov \
  -ibam "${ABBREV}_R3_noDup.bam" \
  -g "${ABBREV}.fa" \
  | grep genome >"${ABBREV}_genome_cov.txt"


# Progress notifications by mail
printf '%s\n' "${ABBREV}_fasta_finished" | mailx -s "${ABBREV}_fasta_finished" [email protected]


printf '%s\n' "${ABBREV}_assembly_finished" | mailx -s "${ABBREV}_assembly_finished" [email protected]
sleep 5

0 comments on commit a7cdac9

Please sign in to comment.