-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmouse_microbiome_ASV.sh
executable file
·155 lines (128 loc) · 6.77 KB
/
mouse_microbiome_ASV.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
## Pipeline for 16S microbiome analysis for mouse stool
## Started August 2017 by Jocelyn Sietsma Penington
## Modified to use de-noised sequences as OTUs, instead of clusters, Oct 2017
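## This is a record of the steps taken; it has not been run from start to finish as a
## single script, which is why some steps are written as "nohup <command> &".
## To run the whole file in one go, the '&' would need to be removed so that each
## step finishes before the next one starts.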
## Pipeline decisions: I'm not going to filter by alignment position using Mothur
## I will assign taxonomy using Silva
## Blend of QIIME (version 1.9.1), usearch (version 9.2) and vsearch.
## Path shortcuts used by the rest of the pipeline.
## Each later path is built from the ones defined before it; all seven are
## exported in one go at the end.
PAPDIR=/wehisan/bioinf/bioinf-data/Papenfuss_lab/projects
BASEDIR="${PAPDIR}/metagenomics/currentProject"
DATADIR="${BASEDIR}/sequence_data"
OUT1DIR="${BASEDIR}/01_processed_data"
TOOLDIR="${BASEDIR}/analysis_tools"
SILVADIR="${PAPDIR}/reference_genomes/bacterial16S/silva/SILVA123_QIIME_release"
GGDIR=/usr/local/bioinfsoftware/qiime/qiime_v2-1.8.0/qiime-deploy/qiime_software/
export PAPDIR BASEDIR DATADIR OUT1DIR TOOLDIR SILVADIR GGDIR
## Quality inspection of raw data.
## -p: do not fail if rawQC is left over from an earlier run.
mkdir -p "$DATADIR/rawQC"
## The glob is left outside the quotes so it still expands to every *fastq.gz file.
fastqc -o "$DATADIR/rawQC" --noextract "$DATADIR"/*fastq.gz
# Uncompress to new, shorter-named files (gzip -c writes to stdout, so the .gz files are kept)
gzip -cd "$DATADIR/MISEQ2367_S1_L001_R1_001.fastq.gz" > "$DATADIR/MISEQ2367_R1.fastq"
gzip -cd "$DATADIR/MISEQ2367_S1_L001_R2_001.fastq.gz" > "$DATADIR/MISEQ2367_R2.fastq"
# How many sequences contain Illumina universal forward primer ?
python "$TOOLDIR/find_amplicon_primers.py" "$DATADIR/MISEQ2367_R1.fastq" GTGACCTATGAACTCAGGAGTC
## Reverse Illumina adapter :
python "$TOOLDIR/find_amplicon_primers.py" "$DATADIR/MISEQ2367_R2.fastq" CTGAGACTTGCACATCGCAGC -rc
## each present in majority of sequences
## Confirming amplicon primers (forward primer searched in R1, reverse primer in R2)
F341=CCTACGGGNGGCWGCAG
R805=GACTACHVGGGTATCTAATCC
python "$TOOLDIR/find_amplicon_primers.py" "$DATADIR/MISEQ2367_R1.fastq" "$F341" -rc
python "$TOOLDIR/find_amplicon_primers.py" "$DATADIR/MISEQ2367_R2.fastq" "$R805" -rc
## present in 14.6mil (R1) and 14.1mil (R2) sequences in normal orientation, almost none in rc.
## Merge overlapping paired-end reads with PEAR.
## Options: -v is min overlap, -m is max assembled length, -n is min assembled length
## -q is quality threshold (not used), -j is number of threads
## -o is the output prefix; the merged reads are then picked up below as
## MISEQ2367.assembled.fastq
module load pear
pear -f "$DATADIR/MISEQ2367_R1.fastq" -r "$DATADIR/MISEQ2367_R2.fastq" \
  -v 50 -m 600 -n 300 -j "$(nproc)" -o "$DATADIR/MISEQ2367"
## Later steps write some outputs relative to the working directory, so move there.
cd "$OUT1DIR"
## The link destination is spelled out in full so that the symlink lands in
## $OUT1DIR even if the cd above did not succeed.
ln -s "$DATADIR/MISEQ2367.assembled.fastq" "$OUT1DIR/MISEQ2367.assembled.fastq"
## NOTE(review): lamboot presumably starts the LAM/MPI runtime needed by a later
## step - confirm it is still required.
lamboot
## Use Qiime extract_barcodes.py to remove the 8-mer barcodes that are at each end of the
## merged sequences.
## The output directory is given in full (rather than as a bare "bar_exed" relative to
## the working directory) so that it always matches the $OUT1DIR/bar_exed paths read
## by split_libraries_fastq.py below.
extract_barcodes.py -f "$OUT1DIR/MISEQ2367.assembled.fastq" \
  -o "$OUT1DIR/bar_exed" -c barcode_paired_stitched -l 8 -L 8 \
  -m "$OUT1DIR/mapping.txt"
## split_libraries_fastq: label sequences with sample ID based on index sequences.
## --barcode_type 16 presumably corresponds to the two 8-base barcodes extracted above,
## joined together - confirm against bar_exed/barcodes.fastq.
split_libraries_fastq.py \
  -i "$OUT1DIR/bar_exed/reads.fastq" \
  -b "$OUT1DIR/bar_exed/barcodes.fastq" -m "$OUT1DIR/mapping.txt" \
  --barcode_type 16 -q 29 -n 1 -o "$OUT1DIR/labelled_hiqual"
## Remove universal primers and amplicon primers
## The trimming script is given bare file names, so it is chained to the cd with &&:
## if the directory cannot be entered, the script is skipped instead of being run
## against whatever seqs.fna happens to be in the current directory.
cd "$OUT1DIR/labelled_hiqual/" &&
  python "$TOOLDIR/trim_fasta_amplicons.py" -i seqs.fna -o trimmed_seqs.fna
## Edit sequence headers to format suitable for usearch:
## QIIME format is <sample_id>_<seq_counter> , and we need just <sample_id> at start
## The seqid including underscore is added to the end.
## i.e. a header ">S1_42 rest" becomes ">S1 rest seqid=S1_42"
perl -pe '$_ =~s />(.+?)(_\d+)(.*$)/>$1$3 seqid=$1$2/' \
  "$OUT1DIR/labelled_hiqual/trimmed_seqs.fna" > \
  "$OUT1DIR/labelled_hiqual/trimmed_seqs_sampleID.fna"
OUT2DIR="$BASEDIR/02_uSearch_OUT"
## -p: do not fail if the directory already exists from an earlier run.
mkdir -p "$OUT2DIR"
## As I am using uSearch pipeline, need to "dereplicate" i.e. extract unique sequence set
## The input sequences to denoise2 must be a set of unique sequences sorted in order
## of decreasing abundance with size annotations in the labels.
## File too large for usearch, so used vsearch equivalent
## Use all but 2 CPU cores
## Started in the background: it must have finished writing unique_w_sizesV.fasta
## before the usearch step below is started.
nohup vsearch --derep_full "$OUT1DIR/labelled_hiqual/trimmed_seqs_sampleID.fna" \
  --output "$OUT2DIR/unique_w_sizesV.fasta" --sizeout --threads $(( $(nproc) - 2 )) \
  > nohup_vsearch.out &
#### Use usearch unoise2 (unoise3 not available in v9.2, which is what we have avail)
#### Accept defaults of -minampsize 4 , -unoise_alpha 2.0
nohup usearch -unoise2 "$OUT2DIR/unique_w_sizesV.fasta" \
  -fastaout "$OUT2DIR/denoised.fasta" &
## In each OTU header, put a space where the semicolon after the OTU ID was.
perl -pe '$_ =~s /(>Otu\d+)(;)(.*)/$1 $3/' \
  "${OUT2DIR}/denoised.fasta" > "${OUT2DIR}/denoised_OTU_ID.fasta"
## Map every labelled read onto the denoised OTUs at a 99% identity cut-off.
## Author's note: when several OTUs match with equal identity the read goes to the
## first match, which will be the largest matching OTU because
## unique_w_sizesV.fasta was size-sorted large to small.
## vsearch is used because the file is too large for usearch.
vsearch \
  --usearch_global "${OUT1DIR}/labelled_hiqual/trimmed_seqs_sampleID.fna" \
  --db "${OUT2DIR}/denoised_OTU_ID.fasta" \
  --strand plus \
  --id 0.99 \
  --otutabout "${OUT2DIR}/otutab99.txt" \
  --biomout "${OUT2DIR}/otutab99.biom"
## Assign taxonomy using Silva 16S bacterial database (and QIIME)
OUT3DIR="$BASEDIR/03_OTUs_w_meta"
## Create the output directory here, in the same way as $OUT2DIR;
## -p makes this a no-op when the directory already exists.
mkdir -p "$OUT3DIR"
TAXO_FP="$SILVADIR/taxonomy/16S_only/99/majority_taxonomy_7_levels.txt"
REFSEQ="$SILVADIR/rep_set/rep_set_16S_only/99/99_otus_16S.fasta"
## merge and delete step fails if there are too many processors -
## set to 10 instead of nproc-2
nohup parallel_assign_taxonomy_uclust.py -i "$OUT2DIR/denoised_OTU_ID.fasta" \
  -o "$OUT3DIR/denoised_Silva123" -O10 -t "$TAXO_FP" -r "$REFSEQ" \
  -v \
  > nohup_assign_silva.out &
## Taxonomy to OTU mapping file needs a header line
## (to be run once the background assignment job above has written the file;
## sed -i edits the file in place, inserting the header before line 1)
sed -i '1 i\#OTU_ID\ttaxonomy\tconsensus_fraction\tnum_accepts' \
  "$OUT3DIR/denoised_Silva123/denoised_OTU_ID_tax_assignments.txt"
## Attach the sample mapping and the taxonomy assignments to the OTU table,
## writing the annotated table to denoised_allmeta.biom.
biom add-metadata \
  -i "${OUT2DIR}/otutab99.biom" \
  -o "${OUT3DIR}/denoised_allmeta.biom" \
  --sample-metadata-fp "${OUT1DIR}/mapping.txt" \
  --observation-metadata-fp "${OUT3DIR}/denoised_Silva123/denoised_OTU_ID_tax_assignments.txt" \
  --sc-separated taxonomy
## Build a phylogenetic tree from the denoised OTUs.
## With only ~700 OTUs, small ones are not discarded beforehand.
## The tree needs an alignment first: the OTUs are aligned against the reference
## template (author's note: using secondary structure).
TEMPLATE="${SILVADIR}/core_alignment/core_alignment_SILVA123.fasta"
nohup parallel_align_seqs_pynast.py \
  -i "${OUT2DIR}/denoised_OTU_ID.fasta" \
  -t "${TEMPLATE}" \
  -O10 \
  -o "${OUT3DIR}/pynast_aligned_denoised/" \
  > nohup_align_silva.out &
## Tree building reads the aligned file produced by the background job above.
nohup make_phylogeny.py \
  -i "${OUT3DIR}/pynast_aligned_denoised/denoised_OTU_ID_aligned.fasta" \
  -l "${OUT3DIR}/fasttree.log" &
## Post-processing of OTU table ####
cd "$OUT3DIR/"
## Merge by sample (combining PCR wells/barcodes) i.e. mouse ID
## Output paths are given in full so the files land in $OUT3DIR even if the cd
## above did not succeed.
collapse_samples.py -b "$OUT3DIR/denoised_allmeta.biom" \
  -m "$OUT1DIR/mapping.txt" \
  --collapse_fields mouseID \
  --output_biom_fp "$OUT3DIR/denoised_byMouse.biom" \
  --output_mapping_fp "$OUT1DIR/mapping_byMouse.txt"
## Inspect otu_table statistics
## The input is the per-mouse table written by collapse_samples.py just above
## (denoised_byMouse.biom).
biom summarize-table -i "$OUT3DIR/denoised_byMouse.biom" \
  -o "$OUT3DIR/denoised_byMouse.summary"
more "$OUT3DIR/denoised_byMouse.summary"