-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
294 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,294 @@ | ||
#!/bin/bash
#$ -V
#$ -cwd
#$ -S /bin/bash
#$ -N M8132_assembly
#$ -o $JOB_NAME.o$JOB_ID
#$ -e $JOB_NAME.e$JOB_ID
#$ -q ray512cc,raycc
#$ -pe fill 10
#$ -P communitycluster

# Process raw Illumina reads with Trimmomatic, then map them to a reference
# genome to create a new genome assembly.

BASEDIR=/lustre/scratch/daray/Ray_low_cov_work
WORKDIR=$BASEDIR/bwa_work2

# -p: do not fail when the directory is left over from an earlier run.
mkdir -p "$WORKDIR"
# Abort if the working directory is unusable; otherwise every output below
# would be written into whatever directory the job happened to start in.
cd "$WORKDIR" || exit 1

######
# Set up variables - these vary according to your needs, files and file locations.
######
# Sample name used to build every file name in this script. RP1, RP2, RU1 and
# RU2 below are derived from it.
ABBREV=M8132
# These files will be generated by Trimmomatic.
RP1="${ABBREV}_R1_paired.fastq"
RP2="${ABBREV}_R2_paired.fastq"
RU1="${ABBREV}_R1_unpaired.fastq"
RU2="${ABBREV}_R2_unpaired.fastq"

# Worker threads passed to the tools that take a thread count. The
# "#$ -pe fill 10" directive at the top of the file requests 10 slots; if you
# change one, change the other as well.
THREADS=9

refgenome=myoLuc2.fa              # your reference genome for the assembly
CLEAN_REF=myoLuc2.fa.clean        # your cleaned reference genome
REF_HOME=$BASEDIR/Mluc.reference  # the location of your reference genome

RAW_READS_HOME=$BASEDIR/data.raw                 # the location of your raw data
PROCESSED_READS_HOME=$BASEDIR/data_processed     # the location of your processed data
QUALITY_INFO=$PROCESSED_READS_HOME/quality_info  # where the quality stats will be saved
SUPPORT_FILES=$BASEDIR/support_files             # support files such as the adapter sequences

# Create the output directories after their variables are defined.
# -p keeps a re-run from failing on directories that already exist.
mkdir -p "$PROCESSED_READS_HOME" "$QUALITY_INFO" "$SUPPORT_FILES"

######
# Install locations of the major programs.
######
BWA_HOME=/lustre/work/apps/bwa-0.6.2
SAMTOOLS_HOME=/lustre/work/apps/samtools-1.2
# NOTE: despite the variable name this points at samtools 0.1.18.
SAMTOOLS1_8_HOME=/lustre/work/apps/samtools-0.1.18
PICARD_HOME=/lustre/work/apps/picard-tools-1.91
BCFTOOLS_HOME=/lustre/work/apps/samtools-0.1.18/bcftools
RAY_SOFTWARE=/lustre/work/daray/software
TRIM_HOME=/lustre/work/apps/Trimmomatic-0.27
FASTX_HOME=/lustre/work/apps/fastx_toolkit-0.0.14/bin
VCFTOOLS_HOME=/lustre/work/daray/software/vcftools_0.1.12b/bin
BEDTOOLS_HOME=/lustre/work/apps/bedtools-2.17.0/bin

######
# Insert size of the sequencing library. This is specific to your particular
# taxon's library.
######
insSize=1000

######
# Make sure the genome file has no blank lines: delete every empty line and
# save the result next to the original with a ".clean" suffix.
# NOTE(review): this step was labelled "ALREADY DONE, NOT BEING USED HERE",
# yet the commands are live. Comment it out if the cleaned file already exists.
######
sed '/^$/d' "$REF_HOME/$refgenome" >"$REF_HOME/$refgenome.clean"

# NOTE(review): the recipient address is redacted in this copy of the script.
#echo "spaces" | mailx -s "spaces" [email protected]

################################################################################
# Index the cleaned reference genome for BWA and samtools
#~~~~~~~~~~~

# [1a] Build the BWA index of the cleaned reference (-a bwtsw selects the
# index construction algorithm).
# NOTE(review): this step was labelled "ALREADY DONE, NOT BEING USED HERE",
# yet the commands are live. Comment them out if the index files already exist.
"$BWA_HOME/bwa" index \
  -a bwtsw \
  "$REF_HOME/$CLEAN_REF"

# Build the FASTA index of the same reference.
"$SAMTOOLS_HOME/samtools" faidx \
  "$REF_HOME/$CLEAN_REF"

#######
# WARNING: never put a comment after a line-continuation "\". The shell ends
# the command at that line and the remaining arguments are lost. The
# Trimmomatic call below keeps its per-argument notes inside a bash array,
# where trailing comments are legal.
#######

###########
# Prepare the reads and get quality stats
###########
# [1] Use Trimmomatic to generate paired and unpaired reads files
trim_args=(
  PE                                    # process paired-end reads
  -threads "$THREADS"                   # number of worker threads
  -phred33                              # base qualities are Phred+33 encoded
  "$RAW_READS_HOME/${ABBREV}_R1.fastq"  # input R1 reads
  "$RAW_READS_HOME/${ABBREV}_R2.fastq"  # input R2 reads
  "$PROCESSED_READS_HOME/$RP1"          # output paired reads file for R1
  "$PROCESSED_READS_HOME/$RU1"          # output unpaired reads file for R1
  "$PROCESSED_READS_HOME/$RP2"          # output paired reads file for R2
  "$PROCESSED_READS_HOME/$RU2"          # output unpaired reads file for R2
  "ILLUMINACLIP:$SUPPORT_FILES/TruSeq4-PE.fa:2:30:10"  # clip adapters listed in this file
  LEADING:20                            # trim leading bases below quality 20
  TRAILING:20                           # trim trailing bases below quality 20
  SLIDINGWINDOW:4:20                    # cut when a 4-base window averages below 20
  MINLEN:33                             # drop reads shorter than 33 bases
)
java -jar "$TRIM_HOME/trimmomatic-0.27.jar" "${trim_args[@]}"

#[2] Generate quality stats and plots for every processed FASTQ of this sample
for FASTQ in "$PROCESSED_READS_HOME/$ABBREV"*.fastq; do
  # With no matching files the glob stays literal; skip that bogus entry
  # instead of handing a non-existent path to the tools.
  [[ -e "$FASTQ" ]] || continue
  PROCESSED_FILE=$(basename "$FASTQ" .fastq)

  # Quality statistics table (-Q33: qualities are Phred+33 encoded).
  "$FASTX_HOME/fastx_quality_stats" \
    -Q33 \
    -o "$QUALITY_INFO/$PROCESSED_FILE.stats" \
    -i "$PROCESSED_READS_HOME/$PROCESSED_FILE.fastq"

  # Nucleotide distribution plot built from the stats table (-t: graph title).
  "$FASTX_HOME/fastx_nucleotide_distribution_graph.sh" \
    -i "$QUALITY_INFO/$PROCESSED_FILE.stats" \
    -o "$QUALITY_INFO/${PROCESSED_FILE}_NUC.png" \
    -t "$QUALITY_INFO/${PROCESSED_FILE}_clipped"

  # Quality box plot built from the same stats table.
  "$FASTX_HOME/fastq_quality_boxplot_graph.sh" \
    -i "$QUALITY_INFO/$PROCESSED_FILE.stats" \
    -o "$QUALITY_INFO/${PROCESSED_FILE}_BOX.png" \
    -t "$QUALITY_INFO/${PROCESSED_FILE}_clipped"
done

# Progress notification.
# NOTE(review): the recipient address is redacted in this copy; restore the real one.
printf '%s\n' "${ABBREV}_qc_finished" | mailx -s "${ABBREV}_qc_finished" '[email protected]'

sleep 5

################################################################################
# Map reads to genome with BWA
#~~~~~~~~~~~

#===================
# [1b] Map the R1 paired reads to the genome.
# Per-argument notes live inside the array because a comment after a
# line-continuation "\" would cut the command short.
aln_r1_args=(
  -n 0.01                       # mismatch tolerance (see the bwa aln manual)
  -l 28                         # seed length
  -t "$THREADS"                 # number of threads
  -q 20                         # read-trimming quality parameter
  -f "${ABBREV}_R1.sai"         # the R1 output file
  "$REF_HOME/$CLEAN_REF"        # cleaned reference genome
  "$PROCESSED_READS_HOME/$RP1"  # the R1 paired reads
)
"$BWA_HOME/bwa" aln "${aln_r1_args[@]}"

# Progress notification.
# NOTE(review): the recipient address is redacted in this copy; restore the real one.
printf '%s\n' "${ABBREV}_R1_map" | mailx -s "${ABBREV}_R1_map" '[email protected]'

#===================
# Map the R2 paired reads to the genome with the same settings as R1;
# the alignment is written to <ABBREV>_R2.sai.
"$BWA_HOME/bwa" aln \
  -n 0.01 \
  -l 28 \
  -t "$THREADS" \
  -q 20 \
  -f "${ABBREV}_R2.sai" \
  "$REF_HOME/$CLEAN_REF" \
  "$PROCESSED_READS_HOME/$RP2"

# Progress notification.
# NOTE(review): the recipient address is redacted in this copy; restore the real one.
printf '%s\n' "${ABBREV}_R2_map" | mailx -s "${ABBREV}_R2_map" '[email protected]'

# Pool the unpaired R1 and R2 reads into one file so they can be mapped
# together as single-end reads.
cat "$PROCESSED_READS_HOME/$RU1" "$PROCESSED_READS_HOME/$RU2" \
  >"$PROCESSED_READS_HOME/${ABBREV}_RX_cat.fastq"

# Map the pooled unpaired reads with the same settings as the paired reads;
# the alignment is written to <ABBREV>_RX.sai.
"$BWA_HOME/bwa" aln \
  -n 0.01 \
  -l 28 \
  -t "$THREADS" \
  -q 20 \
  -f "${ABBREV}_RX.sai" \
  "$REF_HOME/$CLEAN_REF" \
  "$PROCESSED_READS_HOME/${ABBREV}_RX_cat.fastq"

# Progress notification.
# NOTE(review): the recipient address is redacted in this copy; restore the real one.
printf '%s\n' "${ABBREV}_RX_map" | mailx -s "${ABBREV}_RX_map" '[email protected]'

#===================
# [2] calculate the max insertion size: twice the library insert size.
# Arithmetic expansion is required here; a bare "insSize*2" would store that
# literal text instead of a number.
maxInsSize=$(( insSize * 2 ))

#===================
# [3] use sampe and SAMtools to create bam files of the mapped reads
# Create the sam file from the paired mapped reads.
# Per-argument notes live inside the array because a comment after a
# line-continuation "\" would cut the command short.
sampe_args=(
  -a "$maxInsSize"              # maximum insert size
  -f "${ABBREV}_SAMPE.sam"      # output sam file
  "$REF_HOME/$CLEAN_REF"        # reference sequence
  "${ABBREV}_R1.sai"            # input mapped paired reads (R1)
  "${ABBREV}_R2.sai"            # input mapped paired reads (R2)
  "$PROCESSED_READS_HOME/$RP1"  # input paired reads (R1)
  "$PROCESSED_READS_HOME/$RP2"  # input paired reads (R2)
)
"$BWA_HOME/bwa" sampe "${sampe_args[@]}"

# Create the sam file from the unpaired mapped reads.
# Per-argument notes live inside the array because a comment after a
# line-continuation "\" would cut the command short.
samse_args=(
  "$REF_HOME/$CLEAN_REF"                          # reference sequence
  "${ABBREV}_RX.sai"                              # input mapped unpaired reads
  "$PROCESSED_READS_HOME/${ABBREV}_RX_cat.fastq"  # input unpaired reads
)
# samse writes the alignments to stdout; capture them in the output sam file.
"$BWA_HOME/bwa" samse "${samse_args[@]}" >"${ABBREV}_RX_SAMSE.sam"

# Convert the paired (SAMPE) and unpaired (RX_SAMSE) sam files to bam.
# -S: input is sam, -b: write bam, -o: output file.
for sam_stem in "${ABBREV}_SAMPE" "${ABBREV}_RX_SAMSE"; do
  "$SAMTOOLS_HOME/samtools" view \
    -Sb \
    -o "$sam_stem.bam" \
    "$sam_stem.sam"
done

# Merge the paired and unpaired mapped reads into a single bam file.
# The first file name is the output; the remaining ones are the inputs.
"$SAMTOOLS_HOME/samtools" merge \
  "${ABBREV}_merge.bam" \
  "${ABBREV}_SAMPE.bam" \
  "${ABBREV}_RX_SAMSE.bam"

# Filter the merged alignments (this step does not sort; sorting comes next).
# Per-argument notes live inside the array because a comment after a
# line-continuation "\" would cut the command short.
filter_args=(
  -F 4                   # skip reads with the "unmapped" flag (0x4) set
  -q 20                  # skip alignments with mapping quality below 20
  -b                     # output in bam format
  -o "${ABBREV}_R3.bam"  # combined file; R3 represents all paired and unpaired reads
  "${ABBREV}_merge.bam"  # input file
)
"$SAMTOOLS_HOME/samtools" view "${filter_args[@]}"

#### Older samtools sort syntax, kept for reference.
#### NOTE(review): originally labelled "v1.19"; presumably 0.1.19 - confirm.
##--- $SAMTOOLS_HOME/samtools sort \
#    $ABBREV"_R3.bam" \
#    -@ $THREADS \
#    $ABBREV"_R3_sorted"

#### for samtools v1.2
# Per-argument notes live inside the array because a comment after a
# line-continuation "\" would cut the command short.
sort_args=(
  -O bam                        # write output as bam file
  -o "${ABBREV}_R3_sorted.bam"  # sorted output file
  -T "${ABBREV}_R3_sorted"      # prefix for the temporary files written while sorting
  -@ "$THREADS"                 # number of threads to use
  "${ABBREV}_R3.bam"            # input file
)
"$SAMTOOLS_HOME/samtools" sort "${sort_args[@]}"

# Progress notification.
# NOTE(review): the recipient address is redacted in this copy; restore the real one.
printf '%s\n' "${ABBREV}_RX_sort" | mailx -s "${ABBREV}_RX_sort" '[email protected]'

#===================
# [4] remove sequencing duplicates from the sorted bam file w/ PICARD
# Both the JVM (java.io.tmpdir) and Picard (TMP_DIR) are pointed at ./tmp,
# so make sure that directory exists before they try to use it.
mkdir -p tmp
# REMOVE_DUPLICATES=true drops the duplicates from the output;
# the duplication metrics go to the M= file.
java \
  -Xmx24g \
  -Djava.io.tmpdir=tmp \
  -jar "$PICARD_HOME/MarkDuplicates.jar" \
  I="${ABBREV}_R3_sorted.bam" \
  O="${ABBREV}_R3_noDup.bam" \
  M="${ABBREV}_R3_dupMetric.out" \
  REMOVE_DUPLICATES=true \
  MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=100 \
  VALIDATION_STRINGENCY=SILENT \
  ASSUME_SORTED=TRUE \
  TMP_DIR=tmp

#==================
#[5] create pileup from noDup.bam
# Uses the samtools 0.1.18 binary (SAMTOOLS1_8_HOME).
# NOTE(review): the output is named ".vcf", but without -g/-u this samtools
# version presumably writes plain pileup text - confirm before treating it as VCF.
"$SAMTOOLS1_8_HOME/samtools" mpileup \
  -C50 \
  -f "$REF_HOME/$CLEAN_REF" \
  "${ABBREV}_R3_noDup.bam" \
  >"${ABBREV}_mPileUp_0_1_18.vcf"

# Progress notification (the command was written as "--echo", which the shell
# cannot run, so this mail was never sent).
# NOTE(review): the recipient address is redacted in this copy; restore the real one.
printf '%s\n' "${ABBREV}_pileup_finished" | mailx -s "${ABBREV}_pileup_finished" '[email protected]'

#=======================
#[6] generate fasta consensus from pileup
# NOTE(review): pileup2fasta_v1-4.pl is a local script. Judging by the file
# names, -i is the pileup input, -o the consensus fasta and -g a gff file;
# the meaning of -b 8, -s and -V is not documented here - confirm against the script.
perl "$RAY_SOFTWARE/pileup2fasta_v1-4.pl" \
  -i "${ABBREV}_mPileUp_0_1_18.vcf" \
  -o "${ABBREV}.fa" \
  -g "${ABBREV}.gff" \
  -b 8 \
  -s \
  -V

# Coverage of the de-duplicated alignments; only the lines containing
# "genome" are kept in the output file.
# NOTE(review): -g is given the consensus fasta here. bedtools normally expects
# a chromosome-sizes file for -g and may not need it with -ibam - confirm.
"$BEDTOOLS_HOME/bedtools" genomecov \
  -ibam "${ABBREV}_R3_noDup.bam" \
  -g "${ABBREV}.fa" \
  | grep genome >"${ABBREV}_genome_cov.txt"

# Final notifications.
# NOTE(review): the recipient address is redacted in this copy; restore the real one.
printf '%s\n' "${ABBREV}_fasta_finished" | mailx -s "${ABBREV}_fasta_finished" '[email protected]'

printf '%s\n' "${ABBREV}_assembly_finished" | mailx -s "${ABBREV}_assembly_finished" '[email protected]'
sleep 5
