-
Notifications
You must be signed in to change notification settings - Fork 9
/
CollectMetadata.sh
executable file
·62 lines (54 loc) · 1.9 KB
/
CollectMetadata.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/bin/bash
#########################################################################################################################################################
## This script takes a file with a list of protein accessions (one per line) and collects metadata for each
## of them through the NCBI EDirect tools (esearch, elink, efetch, xtract).
##
## Usage: CollectMetadata.sh <accession_file> <output_csv> [genus_field]
##   $1  file with one protein accession per line (blank lines are skipped)
##   $2  output file; an existing regular file is replaced
##   $3  field list handed to `cut -d';' -f` to pick the genus out of the ';'-separated taxonomy record;
##       when omitted, the genus column is written as "undef"
##
## One comma-separated record is appended to $2 per accession:
##   ProteinAccession__GenomeAcc,GenomeAcc_ScientificName,<taxonomy record>,GenomeAccession,Genus,Species,
##   GenomeAcc_Lineage,ProteinAccession,GenomeAccession
## where <taxonomy record> is the xtract output for TaxId, ScientificName and Lineage.
## Every value that could not be retrieved is written as "undef".
##
## Header line matching the record, kept for reference. Its last column is labelled URL, but the script
## writes the genome accession there:
#echo "ProteinAccession__GenomeAcc,GenomeAcc_ScientificName,TaxonomyID,ScientificName,Lineage,GenomeAccession,Genus,Species,GenomeAcc_Lineage,ProteinAccession,URL" > $2
#########################################################################################################################################################

# Print a diagnostic message on stderr.
warn() {
  printf '%s\n' "$*" >&2
}

# Join the non-empty lines of $1 with single spaces and print the result
# without a trailing newline, so a multi-line tool answer still produces
# exactly one output record.
flatten() {
  local -a parts=()
  local piece
  while IFS= read -r piece || [[ -n "$piece" ]]; do
    if [[ -n "$piece" ]]; then
      parts+=("$piece")
    fi
  done <<<"$1"
  local IFS=' '
  printf '%s' "${parts[*]}"
}

# Print the taxonomy record for protein accession $1.
# The sed step removes the blank after ';', turns ", " into "-" and turns
# tabs into commas. The tab is passed literally via $'...' so the expression
# does not depend on sed understanding "\t".
# NOTE(review): the field splitting done by the caller expects xtract to
# separate TaxId / ScientificName / Lineage with tabs; confirm that the
# trailing `-tab " "` does not change that.
fetch_taxonomy() {
  esearch -db protein -query "$1" \
    | elink -target taxonomy \
    | efetch -format xml \
    | xtract -pattern Taxon -first TaxId ScientificName Lineage -tab " " \
    | sed $'s/; /;/g;s/, /-/g;s/\t/,/g'
}

# Print the genome accession linked to protein accession $1.
# Accessions containing "_" are resolved to the AssemblyAcc of the linked
# nuccore document summary; all others to the linked nuccore accession.
fetch_genome_accession() {
  if [[ "$1" == *"_"* ]]; then
    elink -db protein -id "$1" -target nuccore -batch \
      | efetch -format docsum \
      | xtract -pattern DocumentSummary -element AssemblyAcc
  else
    elink -db protein -id "$1" -target nuccore -batch \
      | efetch -format acc
  fi
}

# Entry point.
# Arguments: $1 accession file, $2 output file, $3 optional genus field.
# Returns:   0 on success, 1 on an I/O problem, 2 on a usage error.
main() {
  if (( $# < 2 )); then
    warn "Usage: ${0##*/} <accession_file> <output_csv> [genus_field]"
    return 2
  fi

  local accession_file=$1
  local output=$2
  local genus_field=${3:-}

  if [[ ! -r "$accession_file" ]]; then
    warn "Cannot read accession file: $accession_file"
    return 1
  fi

  # Start from an empty output file and fail early if it cannot be written.
  if [[ -f "$output" ]]; then
    rm -f -- "$output"
  fi
  if ! { true >>"$output"; } 2>/dev/null; then
    warn "Cannot write output file: $output"
    return 1
  fi

  local acc taxonomy genomeacc genus species sciname lineage
  # The list is read on descriptor 9 so that the EDirect tools, which
  # inherit stdin, cannot swallow the remaining accessions.
  while IFS= read -r acc <&9 || [[ -n "$acc" ]]; do
    acc=${acc%$'\r'} # tolerate CRLF line endings
    if [[ -z "$acc" ]]; then
      continue
    fi

    taxonomy=$(flatten "$(fetch_taxonomy "$acc")")
    genomeacc=$(flatten "$(fetch_genome_accession "$acc")")

    # All derived fields are cut from the raw taxonomy record, i.e. before
    # an empty record is replaced by "undef".
    genus=
    if [[ -n "$genus_field" ]]; then
      genus=$(printf '%s\n' "$taxonomy" | cut -d ';' -f "$genus_field")
    fi
    species=$(printf '%s\n' "$taxonomy" | awk -F ';' '{print $NF}')
    sciname=$(printf '%s\n' "$taxonomy" | cut -d ',' -f 2)
    lineage=$(printf '%s\n' "$taxonomy" | cut -d ',' -f 3)

    genus=${genus:-undef}
    genomeacc=${genomeacc:-undef}
    lineage=${lineage:-undef}
    species=${species:-undef}
    sciname=${sciname:-undef}
    taxonomy=${taxonomy:-undef}

    if ! printf '%s\n' \
      "${acc}__${genomeacc},${genomeacc}_${sciname},${taxonomy},${genomeacc},${genus},${species},${genomeacc}_${lineage},${acc},${genomeacc}" \
      >>"$output"; then
      warn "Failed to append record for $acc to $output"
      return 1
    fi
  done 9<"$accession_file"

  return 0
}

# Last command of the script: its status becomes the script's exit status.
main "$@"