-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathassign_ncbi_taxonomy_from_blast_results.sh
71 lines (44 loc) · 4.64 KB
/
assign_ncbi_taxonomy_from_blast_results.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/bin/sh
#author Geoff Zahn ([email protected])
#this script takes the output of a blast search (e.g. the .txt file downloaded after blasting rep_set.fna to the NCBI server) and attaches the top blast hit along with the NCBI taxonomy lineage
#This can be useful if you want to see whether your current database (UNITE, for example) is returning taxonomic assignments that are harmonious with the larger NCBI database, or to simply assign taxonomy to your OTUs using NCBI lineages
#taxonomy lineage assignment is called using the entrez_qiime.py python script, for which you must specify an ABSOLUTE PATH as third input variable - this script can be found at: https://raw.githubusercontent.com/bakerccm/entrez_qiime/master/entrez_qiime.py
#script usage> bash assign_ncbi_taxonomy_from_blast_results.sh blast_results_text_file ABS/PATH/TO/NCBI/TAXONOMY/FILES/DIRECTORY/ ABS/PATH/TO/DRIECTORY/FOR/entrez_qiime.py
# If you do not already have a local copy of the NCBI's taxonomy data, you will need to download it. Download links are available from http://www.ncbi.nlm.nih.gov/guide/taxonomy/. The files can also be downloaded directly from the command line:
# approx 37MB
# ftp ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz
# tar -zxvf taxdump.tar.gz
# approx 900MB
# ftp ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz
# gunzip nucl_gb.accession2taxid.gz
#QC, as far as removing uncultured or badly assigned reads, should be done as part of the BLAST search prior to running this script
#absolute paths are required for entrez_qiime.py so they must be supplied
#########################
cat $1 > blast_no_uncultured_or_unknown.txt
grep "Query= " blast_no_uncultured_or_unknown.txt | cut -d " " -f2 > query_names
#in case removing "uncultured", and "endophytes" leaves you with no valid blast hits....
grep -A8 -Fwf query_names blast_no_uncultured_or_unknown.txt | grep -v "Sequences producing \|Length=\|Score"| sed '/^$/d' | sed '/^--/d' > sequential_hits
echo -e "These OTUs were not assigned to a top blast hit. Consider loosening blast parameters (return 100 instead of 10 hits, for example, or do not filter uncultured hits from the result file).\nThese have been removed from the output file: otu_to_ncbi_lineage.tsv" >> ERRORS.txt
awk '/Query=/{getline; print}' sequential_hits | grep "Query= " | cut -d " " -f2 > otus_with_no_hits
awk '/Query=/{getline; print}' sequential_hits | grep "Query= " | cut -d " " -f2 >> ERRORS.txt
grep -A8 -Fwf query_names blast_no_uncultured_or_unknown.txt | grep -v "Sequences producing \|Query= \|Length=\|Score"| sed '/^$/d' | sed '/^--/d' | sed 's/Cf. //' | sed 's/\[//' | sed 's/\]//' > top_blast_hits
cut -d " " -f1 top_blast_hits > accession_list
#run entrez-qiime.py to get lineage from accession list
python $3/entrez_qiime.py -L $PWD/accession_list -n $2 -r kingdom,phylum,class,order,family,genus,species -a $2nucl_gb.accession2taxid -o $PWD/$1_accession_to_tax_lineage.txt
#add QIIME format labels to each taxonomy level
cat $PWD/$1_accession_to_tax_lineage.txt | sed 's/\t/\tk__/' | sed 's/;/+/'| sed 's/;/+/'| sed 's/;/+/'| sed 's/;/+/'| sed 's/;/+/'| sed 's/;/;s__/'| sed 's/+/;p__/'|sed 's/+/;c__/'| sed 's/+/;o__/'| sed 's/+/;f__/'| sed 's/+/;g__/' > $PWD/$1_accession_to_ncbi_lineage.txt
paste <(grep -v -Fwf otus_with_no_hits query_names) top_blast_hits > otu_to_top_blast
echo -e "OTU_ID\tNCBI_ACCESSION\tNCBI_LINEAGE" > otu_to_ncbi_lineage.tsv
paste <(grep -v -Fwf otus_with_no_hits query_names) <(while read ID; do grep $ID $1_accession_to_ncbi_lineage.txt; done < accession_list) >> otu_to_ncbi_lineage.tsv
#cleanup intermediate files
mv accession_list.log entrez_qiime.log
#rm blast_no_uncultured_or_unknown.txt accession_list $1_accession_to_tax_lineage.txt query_names top_blast_hits $1_accession_to_ncbi_lineage.txt otus_with_no_hits sequential_hits
cat ERRORS.txt
echo "Process complete. Check ERRORS.txt for OTUs that were not assigned a blast hit."
echo "Main output file is otu_to_ncbi_lineage.tsv - A tab-separated file listing OTU_ID, NCBI_ACCESSION for top blast hit, and NCBI_LINEAGE in QIIME-compatible format"
##########################
#output files are:
# otu_to_ncbi_lineage.tsv tab-separated file listing OTU_ID, NCBI_ACCESSION for top blast hit, NCBI_LINEAGE in QIIME-compatible format
# entrez_qiime.log notes and errors from entrez_qiime.py - This will list accession numbers that did not have associated TaxIDs available. You will most likely need to fill these in by hand!
# otu_to_top_blast tab-separated file listing OTU_ID, NCBI_ACCESSION for top blast hit, NCBI Name/Strain Info, BLAST Alignment Score, BLAST E-Value
##########################