-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmake_qiime_database_from_fasta.sh
156 lines (93 loc) · 6.7 KB
/
make_qiime_database_from_fasta.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/bin/bash
# make_qiime_database_from_fasta.sh
#
# Takes a fasta file and creates a QIIME-compatible database for OTU-picking
# and taxonomic assignment.
# This script comes with no guarantees. Use at your own peril.
#
# Usage:
#   bash make_qiime_database_from_fasta.sh \
#     /ABS/PATH/TO/INPUT_FASTA_FILE \
#     /ABS/PATH/TO/DIRECTORY/FOR/entrez_qiime.py/ \
#     /ABS/PATH/TO/NCBI/TAXONOMY/DIRECTORY/ \
#     /ABS/PATH/TO/OUTPUT/DIRECTORY/
# (the three directory arguments are expected to end with a trailing slash)
#
# It is useful for taking the results of an Entrez NCBI query and turning them
# into a database, and was developed due to the limitations of the UNITE fungal
# database currently in use in QIIME, but can be used with any fasta file.
# For example, you may want to build a database from all the ITS sequences
# available from NCBI between a certain length. To do this, simply execute an
# entrez search, download the resulting output, and feed it into this script.
#
# This script assumes you have a local copy of entrez_qiime.py on your machine,
# available here:
#   https://raw.githubusercontent.com/bakerccm/entrez_qiime/master/entrez_qiime.py
# It also assumes you have a local copy of the NCBI taxonomy database on your
# machine, obtainable as follows:
### Download NCBI names and taxonomy information (check md5sums to ensure proper downloads) ###
#######################
# accession_to_taxid:
#   wget ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz
#   gunzip nucl_gb.accession2taxid.gz
# taxdump:
#   wget ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz
#   tar -zxvf taxdump.tar.gz
#######################
# If you don't have ENTREZ on your machine, install EDirect as follows:
#######################
#   cd ~
#   wget ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/edirect.zip
#   unzip -u -q edirect.zip
#   rm edirect.zip
#   export PATH=$PATH:$HOME/edirect
#   ./edirect/setup.sh
#   echo "export PATH=\$PATH:\$HOME/edirect" >> $HOME/.bash_profile
#   exec bash
#######################
# You can now execute a search of the NCBI database. For example:
#   esearch -db nuccore -query "\"\(internal transcribed spacer 1\"[All Fields] AND \(300[SLEN] : 600[SLEN]\)\) NOT \"uncultured Neocallimastigales\"[porgn] NOT \"bacteria\"[Filter]" | efetch -format fasta -mode text > ~/NCBI_ITS1_DB_raw.fasta
# This downloads a fasta with ALL ncbi seqs of ITS1 between 300:600 bp that
# aren't bacterial or "uncultured gut fungi" (416,912 sequences as of
# Dec 16, 2016) and saves them as a fasta text file.
# Depending on the size of the query results, this can take some time, so be
# careful about hangups and make sure your connection is good.
#######################

# Positional arguments (paths may contain spaces, so they are quoted throughout).
IN_FASTA=$1      # input fasta file
SCRIPT_DIR=$2    # directory containing entrez_qiime.py (trailing slash)
TAX_DIR=$3       # directory with nucl_gb.accession2taxid + taxdump files (trailing slash)
OUT_DIR=$4       # output directory (trailing slash)

### Search for and remove any empty sequences ###
gawk 'BEGIN {RS = ">" ; FS = "\n" ; ORS = ""} {if ($2) print ">"$0}' "$IN_FASTA" > "$IN_FASTA.tidy"

### Obtain NCBI taxonomy lineages for the input fasta
# NOTE(review): 'phyllum' looks like a typo for 'phylum' — kept as-is; confirm
# entrez_qiime.py accepts this rank spelling before changing it.
python "${SCRIPT_DIR}entrez_qiime.py" -i "$IN_FASTA.tidy" -o "${OUT_DIR}NCBI_Taxonomy.txt" \
  -r kingdom,phyllum,class,order,family,genus,species \
  -a "${TAX_DIR}nucl_gb.accession2taxid" -n "$TAX_DIR"

### Validate and tidy up files ###
### Edit output file to include rank IDs (QIIME needs them for some scripts).
# Each s/;/>X__/ consumes the next semicolon in order; the final s/>/;/g turns
# the temporary '>' markers back into semicolons.
sed -e 's/\t/\tk__/' -e 's/;/>p__/' -e 's/;/>c__/' -e 's/;/>o__/' \
    -e 's/;/>f__/' -e 's/;/>g__/' -e 's/;/>s__/' -e 's/>/;/g' \
    "${OUT_DIR}NCBI_Taxonomy.txt" > "${OUT_DIR}NCBI_QIIME_Taxonomy.txt"

### Edit database to single-line fasta format
awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' \
  < "$IN_FASTA.tidy" > "$IN_FASTA.tidy.oneline.fasta"
### Remove first blank line (introduced by the awk reformat above)
sed -i '/^$/d' "$IN_FASTA.tidy.oneline.fasta"
### Remove trailing descriptions after the accession number on header lines
sed -i 's/ .*//' "$IN_FASTA.tidy.oneline.fasta"

### Compare read counts in fasta and txt files.
### If the numbers differ, duplicates were introduced by entrez_qiime.py.
grep -c "^>" "$IN_FASTA.tidy.oneline.fasta"
wc -l "${OUT_DIR}NCBI_QIIME_Taxonomy.txt"

### If duplicates appear in the fasta file (i.e., more reads than taxonomy IDs),
### get lists of sequence/taxonomy IDs and remove duplicates from the fasta file.
# Fixed: Tax_Names was previously written to the current directory but removed
# from the output directory, leaving a stray file behind.
cut -f 1 "${OUT_DIR}NCBI_QIIME_Taxonomy.txt" > "${OUT_DIR}Tax_Names"
grep "^>" "$IN_FASTA.tidy.oneline.fasta" | cut -d " " -f 1 | sed 's/>//g' > "${OUT_DIR}DB_Names"
sort "${OUT_DIR}DB_Names" | uniq -d > "${OUT_DIR}Duplicated_IDs"
grep -A1 -f "${OUT_DIR}Duplicated_IDs" "$IN_FASTA.tidy.oneline.fasta" | sed '/^--/d' > "${OUT_DIR}Duplicated_fastas"
# Keep the first half of the duplicated records and add them back later.
# NOTE(review): this assumes each duplicated ID occurs exactly twice so that
# half the file is one copy of each — confirm for inputs with >2 copies.
dup_lines=$(wc -l < "${OUT_DIR}Duplicated_fastas")
head -n $(( dup_lines / 2 )) "${OUT_DIR}Duplicated_fastas" > "${OUT_DIR}add_back"
grep -v -f "${OUT_DIR}Duplicated_IDs" "$IN_FASTA.tidy.oneline.fasta" > "${OUT_DIR}tidy.no_reps.fasta"
cat "${OUT_DIR}tidy.no_reps.fasta" "${OUT_DIR}add_back" > "${OUT_DIR}DB_raw.fasta"

### Sort fasta database to same order as taxonomy map
echo "Sorting Database...This will take some time."
cut -f 1 "${OUT_DIR}NCBI_QIIME_Taxonomy.txt" > "${OUT_DIR}IDs_in_order.txt"
# One grep per accession: this will take quite a long time to run.
while read -r ID ; do grep -m 1 -A 1 "^>$ID" "${OUT_DIR}DB_raw.fasta" ; done < "${OUT_DIR}IDs_in_order.txt" > "${OUT_DIR}DB.fasta"
mv "${OUT_DIR}NCBI_QIIME_Taxonomy.txt" "${OUT_DIR}Taxonomy.txt"
rm "${OUT_DIR}DB_Names" "${OUT_DIR}DB_raw.fasta" "${OUT_DIR}Duplicated_fastas" \
   "${OUT_DIR}Duplicated_IDs" "${OUT_DIR}IDs_in_order.txt" "${OUT_DIR}NCBI_Taxonomy.txt" \
   "${OUT_DIR}Tax_Names" "${OUT_DIR}tidy.no_reps.fasta" \
   "$IN_FASTA.tidy.oneline.fasta" "$IN_FASTA.tidy"
# Show the entrez_qiime run log.
# NOTE(review): confirm entrez_qiime.py writes its log to "$IN_FASTA.log"
# rather than "$IN_FASTA.tidy.log".
cat "$IN_FASTA.log"
grep "^>" "${OUT_DIR}DB.fasta" | sed 's/>//' > "${OUT_DIR}good_acc_list"
echo "Cleaning Taxonomy to match Database...This may take some time."
while read -r ID ; do grep -m 1 -- "$ID" "${OUT_DIR}Taxonomy.txt" ; done < "${OUT_DIR}good_acc_list" > "${OUT_DIR}Taxonomy_ordered.txt"
### Collect accessions with no usable taxonomy (all-NA lineage, or malformed
### lines starting with ':'), then drop those lines from the taxonomy file.
grep "k__NA;p__NA;c__NA;o__NA;f__NA;g__NA;s__NA\|^:" "${OUT_DIR}Taxonomy_ordered.txt" | cut -f1 > bad_acc_list
sed -e '/k__NA;p__NA;c__NA;o__NA;f__NA;g__NA;s__NA/d' "${OUT_DIR}Taxonomy_ordered.txt" > "${OUT_DIR}Taxonomy_clean1.txt"
sed -e '/^:/d' "${OUT_DIR}Taxonomy_clean1.txt" > "${OUT_DIR}Taxonomy.txt"
echo "Final cleanup to remove bad accessions..."
# Delete each bad accession's header line and the sequence line that follows it.
while read -r bad; do echo "Removing $bad" ; sed -i -e "/$bad/,+1d" "${OUT_DIR}DB.fasta" ; done < bad_acc_list
sed -i -e '/^>:/,+1d' "${OUT_DIR}DB.fasta"
### Rebuild the taxonomy file in the exact order of the cleaned database.
grep "^>" "${OUT_DIR}DB.fasta" | sed 's/>//' > "${OUT_DIR}DB_IDs_ordered"
while read -r ID; do grep -- "$ID" "${OUT_DIR}Taxonomy_ordered.txt" ; done < "${OUT_DIR}DB_IDs_ordered" > Taxonomy_final.txt
rm "${OUT_DIR}Taxonomy_clean1.txt" "${OUT_DIR}Taxonomy_ordered.txt"
mv bad_acc_list bad_acc_list.txt
# Fixed: the completion message previously named files (DB_ordered.fasta,
# Taxonomy_ordered.txt) that this script never produces or has already deleted.
printf 'Process complete. Final database is DB.fasta, and associated taxonomy is Taxonomy_final.txt\nAccessions that were removed are in bad_acc_list.txt\n'