-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmake_qiime_database_from_fasta.sh
156 lines (93 loc) · 6.7 KB
/
make_qiime_database_from_fasta.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/bin/bash
# make_qiime_database_from_fasta.sh
#
# Takes a fasta file and creates a QIIME-compatible database for OTU-picking
# and taxonomic assignment.
# This script comes with no guarantees. Use at your own peril.
#
# Usage:
#   bash make_qiime_database_from_fasta.sh \
#     /ABS/PATH/TO/INPUT_FASTA_FILE \
#     /ABS/PATH/TO/DIRECTORY/FOR/entrez_qiime.py/ \
#     /ABS/PATH/TO/NCBI/TAXONOMY/DIRECTORY/ \
#     /ABS/PATH/TO/OUTPUT/DIRECTORY/
# (the three directory arguments are expected to end with a trailing slash)
#
# It is useful for taking the results of an Entrez NCBI query and turning them
# into a database, and was developed due to the limitations of the UNITE fungal
# database currently in use in QIIME, but can be used with any fasta file.
# For example, you may want to build a database from all the ITS sequences
# available from NCBI between a certain length. To do this, simply execute an
# entrez search, download the resulting output, and feed it into this script.
#
# This script assumes you have a local copy of entrez_qiime.py on your machine,
# available here:
#   https://raw.githubusercontent.com/bakerccm/entrez_qiime/master/entrez_qiime.py
# It also assumes you have a local copy of the NCBI taxonomy database on your
# machine, obtainable as follows:
### Download NCBI names and taxonomy information (check md5sums to ensure proper downloads) ###
#######################
# accession_to_taxid:
#   wget ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz
#   gunzip nucl_gb.accession2taxid.gz
# taxdump:
#   wget ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz
#   tar -zxvf taxdump.tar.gz
#######################
# If you don't have ENTREZ on your machine, install EDirect as follows:
#######################
#   cd ~
#   wget ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/edirect.zip
#   unzip -u -q edirect.zip
#   rm edirect.zip
#   export PATH=$PATH:$HOME/edirect
#   ./edirect/setup.sh
#   echo "export PATH=\$PATH:\$HOME/edirect" >> $HOME/.bash_profile
#   exec bash
#######################
# You can now execute a search of the NCBI database. For example:
#   esearch -db nuccore -query "\"\(internal transcribed spacer 1\"[All Fields] AND \(300[SLEN] : 600[SLEN]\)\) NOT \"uncultured Neocallimastigales\"[porgn] NOT \"bacteria\"[Filter]" | efetch -format fasta -mode text > ~/NCBI_ITS1_DB_raw.fasta
# This downloads a fasta with ALL ncbi seqs of ITS1 between 300:600 bp that
# aren't bacterial or "uncultured gut fungi" (416,912 sequences as of
# Dec 16, 2016) and saves them as a fasta text file.
# Depending on the size of the query results, this can take some time, so be
# careful about hangups and make sure your connection is good.
#######################

# Positional arguments (paths may contain spaces, so they are quoted throughout).
IN_FASTA=$1      # input fasta file
SCRIPT_DIR=$2    # directory containing entrez_qiime.py (trailing slash)
TAX_DIR=$3       # directory with nucl_gb.accession2taxid + taxdump files (trailing slash)
OUT_DIR=$4       # output directory (trailing slash)

### Search for and remove any empty sequences ###
gawk 'BEGIN {RS = ">" ; FS = "\n" ; ORS = ""} {if ($2) print ">"$0}' "$IN_FASTA" > "$IN_FASTA.tidy"

### Obtain NCBI taxonomy lineages for the input fasta
# NOTE(review): 'phyllum' looks like a typo for 'phylum' — kept as-is; confirm
# entrez_qiime.py accepts this rank spelling before changing it.
python "${SCRIPT_DIR}entrez_qiime.py" -i "$IN_FASTA.tidy" -o "${OUT_DIR}NCBI_Taxonomy.txt" \
  -r kingdom,phyllum,class,order,family,genus,species \
  -a "${TAX_DIR}nucl_gb.accession2taxid" -n "$TAX_DIR"

### Validate and tidy up files ###
### Edit output file to include rank IDs (QIIME needs them for some scripts).
# Each s/;/>X__/ consumes the next semicolon in order; the final s/>/;/g turns
# the temporary '>' markers back into semicolons.
sed -e 's/\t/\tk__/' -e 's/;/>p__/' -e 's/;/>c__/' -e 's/;/>o__/' \
    -e 's/;/>f__/' -e 's/;/>g__/' -e 's/;/>s__/' -e 's/>/;/g' \
    "${OUT_DIR}NCBI_Taxonomy.txt" > "${OUT_DIR}NCBI_QIIME_Taxonomy.txt"

### Edit database to single-line fasta format
awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' \
  < "$IN_FASTA.tidy" > "$IN_FASTA.tidy.oneline.fasta"
### Remove first blank line (introduced by the awk reformat above)
sed -i '/^$/d' "$IN_FASTA.tidy.oneline.fasta"
### Remove trailing descriptions after the accession number on header lines
sed -i 's/ .*//' "$IN_FASTA.tidy.oneline.fasta"

### Compare read counts in fasta and txt files.
### If the numbers differ, duplicates were introduced by entrez_qiime.py.
grep -c "^>" "$IN_FASTA.tidy.oneline.fasta"
wc -l "${OUT_DIR}NCBI_QIIME_Taxonomy.txt"

### If duplicates appear in the fasta file (i.e., more reads than taxonomy IDs),
### get lists of sequence/taxonomy IDs and remove duplicates from the fasta file.
# Fixed: Tax_Names was previously written to the current directory but removed
# from the output directory, leaving a stray file behind.
cut -f 1 "${OUT_DIR}NCBI_QIIME_Taxonomy.txt" > "${OUT_DIR}Tax_Names"
grep "^>" "$IN_FASTA.tidy.oneline.fasta" | cut -d " " -f 1 | sed 's/>//g' > "${OUT_DIR}DB_Names"
sort "${OUT_DIR}DB_Names" | uniq -d > "${OUT_DIR}Duplicated_IDs"
grep -A1 -f "${OUT_DIR}Duplicated_IDs" "$IN_FASTA.tidy.oneline.fasta" | sed '/^--/d' > "${OUT_DIR}Duplicated_fastas"
# Keep the first half of the duplicated records and add them back later.
# NOTE(review): this assumes each duplicated ID occurs exactly twice so that
# half the file is one copy of each — confirm for inputs with >2 copies.
dup_lines=$(wc -l < "${OUT_DIR}Duplicated_fastas")
head -n $(( dup_lines / 2 )) "${OUT_DIR}Duplicated_fastas" > "${OUT_DIR}add_back"
grep -v -f "${OUT_DIR}Duplicated_IDs" "$IN_FASTA.tidy.oneline.fasta" > "${OUT_DIR}tidy.no_reps.fasta"
cat "${OUT_DIR}tidy.no_reps.fasta" "${OUT_DIR}add_back" > "${OUT_DIR}DB_raw.fasta"

### Sort fasta database to same order as taxonomy map
echo "Sorting Database...This will take some time."
cut -f 1 "${OUT_DIR}NCBI_QIIME_Taxonomy.txt" > "${OUT_DIR}IDs_in_order.txt"
# One grep per accession: this will take quite a long time to run.
while read -r ID ; do grep -m 1 -A 1 "^>$ID" "${OUT_DIR}DB_raw.fasta" ; done < "${OUT_DIR}IDs_in_order.txt" > "${OUT_DIR}DB.fasta"
mv "${OUT_DIR}NCBI_QIIME_Taxonomy.txt" "${OUT_DIR}Taxonomy.txt"
rm "${OUT_DIR}DB_Names" "${OUT_DIR}DB_raw.fasta" "${OUT_DIR}Duplicated_fastas" \
   "${OUT_DIR}Duplicated_IDs" "${OUT_DIR}IDs_in_order.txt" "${OUT_DIR}NCBI_Taxonomy.txt" \
   "${OUT_DIR}Tax_Names" "${OUT_DIR}tidy.no_reps.fasta" \
   "$IN_FASTA.tidy.oneline.fasta" "$IN_FASTA.tidy"
# Show the entrez_qiime run log.
# NOTE(review): confirm entrez_qiime.py writes its log to "$IN_FASTA.log"
# rather than "$IN_FASTA.tidy.log".
cat "$IN_FASTA.log"
grep "^>" "${OUT_DIR}DB.fasta" | sed 's/>//' > "${OUT_DIR}good_acc_list"
echo "Cleaning Taxonomy to match Database...This may take some time."
while read -r ID ; do grep -m 1 -- "$ID" "${OUT_DIR}Taxonomy.txt" ; done < "${OUT_DIR}good_acc_list" > "${OUT_DIR}Taxonomy_ordered.txt"
### Collect accessions with no usable taxonomy (all-NA lineage, or malformed
### lines starting with ':'), then drop those lines from the taxonomy file.
grep "k__NA;p__NA;c__NA;o__NA;f__NA;g__NA;s__NA\|^:" "${OUT_DIR}Taxonomy_ordered.txt" | cut -f1 > bad_acc_list
sed -e '/k__NA;p__NA;c__NA;o__NA;f__NA;g__NA;s__NA/d' "${OUT_DIR}Taxonomy_ordered.txt" > "${OUT_DIR}Taxonomy_clean1.txt"
sed -e '/^:/d' "${OUT_DIR}Taxonomy_clean1.txt" > "${OUT_DIR}Taxonomy.txt"
echo "Final cleanup to remove bad accessions..."
# Delete each bad accession's header line and the sequence line that follows it.
while read -r bad; do echo "Removing $bad" ; sed -i -e "/$bad/,+1d" "${OUT_DIR}DB.fasta" ; done < bad_acc_list
sed -i -e '/^>:/,+1d' "${OUT_DIR}DB.fasta"
### Rebuild the taxonomy file in the exact order of the cleaned database.
grep "^>" "${OUT_DIR}DB.fasta" | sed 's/>//' > "${OUT_DIR}DB_IDs_ordered"
while read -r ID; do grep -- "$ID" "${OUT_DIR}Taxonomy_ordered.txt" ; done < "${OUT_DIR}DB_IDs_ordered" > Taxonomy_final.txt
rm "${OUT_DIR}Taxonomy_clean1.txt" "${OUT_DIR}Taxonomy_ordered.txt"
mv bad_acc_list bad_acc_list.txt
# Fixed: the completion message previously named files (DB_ordered.fasta,
# Taxonomy_ordered.txt) that this script never produces or has already deleted.
printf 'Process complete. Final database is DB.fasta, and associated taxonomy is Taxonomy_final.txt\nAccessions that were removed are in bad_acc_list.txt\n'