
Commit aa7a63e

V0.1 - no guarantees
1 parent 8ef2be1 commit aa7a63e

File tree

1 file changed: +131 -0 lines


SRA_Download_and_Process.sh

+131
@@ -0,0 +1,131 @@
#!/bin/bash

############################
# SRA online search terms:
#   ITS1
#
# SRA online filters:
#   Platform = Illumina
############################

# This script takes the standard downloads from the SRA Run Selector (the metadata table and the accession
# list) and uses them to download all of the associated fastq files from the Sequence Read Archive.
# It then filters for read quality (default: 95% of bases with a phred score of at least 25), removes
# reverse reads and empty files, converts the fastqs to fasta files, and makes a valid QIIME mapping file
# based on the sample names.
# Next, it constructs one BIG fasta file, ready for OTU picking or other pre-processing steps.


# This script assumes the following:
#   1. You have the SRA Toolkit installed on your machine. Check the version and install location, and
#      adjust the fastq-dump path in the download step below if yours differs.
#   2. You have downloaded a metadata table and a list of accession numbers from the SRA Run Selector
#      website. The table contains the metadata for each accession.
#   3. You have QIIME and the fastx_toolkit installed and in your $PATH.

# usage: bash SRA_Download_and_Process.sh /PATH/TO/SRA_RUN_TABLE.TXT /PATH/TO/SRA_ACCESSION_LIST.TXT /OUTPUT/DIRECTORY/PATH/FOR/READS_AND_MAPPING_FILES
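# Example invocation (illustrative only; these paths are hypothetical placeholders, not files provided here):
#   bash SRA_Download_and_Process.sh ~/Downloads/SraRunTable.txt ~/Downloads/SRR_Acc_List.txt .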


# Determine the total disk size of the downloads from the metadata table (field 16). This is positional and
# may not be robust across run tables; ideally it would select the column by its name ("Mbytes") instead.
cut -f 16 "$1" > file_sizes
paste <(awk '{sum += $1} END {print sum}' file_sizes) <(echo "Mbytes Total... Downloads will start in 10 seconds.")
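# A more robust alternative (a sketch only; assumes the run table is tab-delimited with a header column
# named "MBytes"/"Mbytes" - adjust the name if your table uses a different label):
#   awk -F'\t' 'NR==1 {for (i=1; i<=NF; i++) if ($i=="MBytes" || $i=="Mbytes") col=i; next} col {sum += $col} END {print sum " Mbytes Total"}' "$1"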

# Pause so the size estimate can sink in, and give the user time to cancel
sleep 10

echo "Downloading fastq files associated with SRA accession numbers..."

# Use the SRA Toolkit's fastq-dump to download the fastq files for each SRA accession
# (forward and reverse runs split into separate files, bzip2 compression, no technical reads)
cat "$2" | xargs ~/sratoolkit.2.8.1-2-ubuntu64/bin/fastq-dump --split-files --bzip2 --skip-technical --readids --dumpbase --outdir "$3"
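# Note: the remaining steps operate on the current working directory, so run this script from inside the
# output directory (or pass "." / the absolute path of your working directory as the third argument) so the
# downloaded files are found. If fastq-dump is already on your $PATH, the hard-coded toolkit path above can
# be replaced with a plain call (a sketch using the same options):
#   cat "$2" | xargs fastq-dump --split-files --bzip2 --skip-technical --readids --dumpbase --outdir "$3"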

echo "Removing reverse reads...screw 'em!"

# Deal with the reverse reads: just dump them for now, at least until PEAR read merging is incorporated
# (probably not worth it for this workflow anyway)
rm *_2.fastq.bz2
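# If paired-end merging is ever added, PEAR could be run per accession before the reverse reads are dropped
# (a sketch only; assumes PEAR is installed, and SRRxxxxxxx stands in for a real accession):
#   pear -f SRRxxxxxxx_1.fastq -r SRRxxxxxxx_2.fastq -o SRRxxxxxxx_merged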

echo "Unzipping fastq files..."

# Decompress the bzip2-compressed fastqs
bzip2 -d *.bz2

echo "Filtering fastqs to remove low-quality reads..."

# Quality filter each fastq (keep only reads in which at least 95% of bases have a phred score of 25 or higher)
for fn in *.fastq; do fastq_quality_filter -i $fn -q 25 -p 95 -o $fn.QC_25-95 -v >> QC-25-95.out; done
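# Note: depending on your fastx_toolkit version and the quality encoding of the downloaded reads,
# fastq_quality_filter may also need "-Q 33" for Sanger/Illumina 1.8+ (phred+33) encoded fastqs;
# check your installation's help output before relying on this.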

echo "Converting fastq to fasta..."

# Convert the quality-filtered fastqs to fasta (and qual) files with QIIME
for fn in *.QC_25-95; do convert_fastaqual_fastq.py -f $fn -c fastq_to_fastaqual -F; done

echo "Removing empty files..."

# Remove empty files (e.g. fastas from samples in which no reads passed the quality filter)
find . -type f -empty -exec rm {} \;

echo "Making list of file names..."

#make list of filenames
ls -1 *.fna > filenames


###make lists of sampleIDs/barcodes/linkerprimers/descriptions


echo "Making mapping file for QIIME..."

#make list of valid (non-empty) samples to build mapping file for QIIME
cut -d "_" -f1 filenames > valid_sampleIDs
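# The sample ID is whatever precedes the first underscore in each fasta filename,
# e.g. a file starting with "SRR1234567_1..." yields the sample ID "SRR1234567" (illustrative accession).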

#count number of valid samples and assign to variable
count="$(wc -l valid_sampleIDs | cut -d " " -f 1)"

# Make unique descriptions using the number of valid samples (each sample ID gets an "_SRAExp" suffix)
paste -d "_" valid_sampleIDs <(echo $(yes SRAExp | head -n$count) | tr " " "\n") > valid_descriptions

# Make placeholder ("bogus") barcodes and linker primers, one per valid sample; the QIIME mapping format
# requires these columns even though the SRA reads are already demultiplexed
echo $(yes ACGTACGTACGT | head -n$count) | tr " " "\n" > valid_barcodes
echo $(yes TAGATAG | head -n$count) | tr " " "\n" > valid_linkers


# Add the header labels to the mapping file: the required QIIME columns, then the run-table metadata headers,
# then a Description column
paste <(echo -e "#SampleID\tBarcodeSequence\tLinkerPrimerSequence\tInputFileName") <(head -1 "$1") <(echo "Description") >> mapping_file.tsv


# Construct the mapping file: for each valid sample, pair its ID, placeholder barcode and linker primer with
# its input fasta filename, its metadata row from the run table, and its unique description
paste valid_sampleIDs valid_barcodes valid_linkers <(while read line; do paste <(grep $line filenames) <(grep $line "$1") <(grep $line valid_descriptions); done < valid_sampleIDs) >> mapping_file.tsv
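# Each resulting row looks roughly like this (tab-separated; accession and metadata values are illustrative):
#   SRR1234567  ACGTACGTACGT  TAGATAG  SRR1234567_1...fna  <run-table fields for SRR1234567>  SRR1234567_SRAExp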


echo "Cleaning up intermediate files..."

# Remove the qual score files produced by the fastq-to-fasta conversion
rm *.qual

# Package up the fastqs: zip the quality-filtered and raw fastqs, then stash the raw copies (gzipped) under ./Fastqs/Raw
mkdir ./Fastqs
mkdir ./Fastqs/Raw
zip QC_fastqs *.QC_25-95
rm *.QC_25-95
zip Raw_fastqs *.fastq
mv Raw_fastqs.zip ./Fastqs
mv QC_fastqs.zip ./Fastqs
mv *.fastq ./Fastqs/Raw
gzip ./Fastqs/Raw/*.fastq
mv QC-25-95.out ./Fastqs


echo "Creating main fasta file, indexed for QIIME..."

# Combine and relabel all per-sample fastas into one fasta, matching each file to a sample via the
# InputFileName column of the mapping file
add_qiime_labels.py -i ./ -m ./mapping_file.tsv -c InputFileName
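# The combined, relabeled fasta (typically written as combined_seqs.fna; check your QIIME version's defaults)
# is the file to feed into OTU picking.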

echo "Validating mapping file..."

# Check the mapping file against QIIME's format requirements
validate_mapping_file.py -m mapping_file.tsv
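# validate_mapping_file.py should also write a corrected copy of the mapping file (with a "_corrected"
# suffix) alongside the .html/.log reports removed below; that corrected copy is what the final message
# refers to if any errors are flagged.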
sleep 3

# Remove the validator's HTML/JS report files
rm mapping_file.tsv.html mapping_file.tsv.log overlib.js


# Stash the intermediate list files, then package up the per-sample fastas
mkdir Intermed_files
mv valid* file* Intermed_files
mkdir ./Fastas
mv *.fastq.fna ./Fastas
zip QC_Fastas ./Fastas/*.fna

echo -e "Process complete.\nReady for OTU-Picking.\n\nIf errors were raised in mapping file validation, use the corrected mapping file in QIIME."
