forked from rnnh/bioinfo-notebook
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathUniProt_downloader.sh
72 lines (65 loc) · 2.02 KB
/
UniProt_downloader.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#! /bin/bash
# https://github.com/rnnh/bioinfo-notebook.git
# Help/usage text.
#
# Shown for -h|--help and after argument errors, always via an *unquoted*
# `echo -e $usage`. Two consequences follow from that calling convention:
#   - the "\n" and "\t" sequences below are literal backslash escapes that
#     `echo -e` turns into newlines and tabs at print time;
#   - the physical line breaks inside this assignment are collapsed to single
#     spaces by word splitting, so only the explicit "\n" escapes end a line
#     in the printed help.
# Keep both in mind when editing the text: quoting the expansion at a call
# site, or dropping a "\n", changes the layout of the help output.
usage="$(basename "$0") [-h|--help] [-p|--processors n -o|--output] -i|--input \n
\n
This script takes a list of UniProt primary accession numbers (*.list), and \n
downloads the corresponding protein sequences from UniProt as a FASTA amino \n
acid (.faa) file.\n
\n
This list can be generated by searching UniProtKB for a desired term (e.g. \n
'taxonomy:147537' for the Saccharomycotina subphylum), selecting 'Download' \n
and 'Format: List' to download the accession numbers of the corresponding \n
results.\n
\n
arguments: \n
\t -h | --help\t\t show this help text and exit \n
\t -i | --input\t\t the list of UniProt proteins to download \n
\t -p | --processors\t optional: set the number (n) of processors to \n
\t\t\t\t use (default: 1) \n
\t -o | --output\t\t optional: name of the output .faa file \n
\t\t\t\t (default: uniprot_{date}.faa) \n
"
# Default settings; each can be overridden by the matching command-line option.
PROCESSORS=1
OUTPUT=uniprot_$(date +%Y%m%d).faa

# usage_error MESSAGE
#
# Prints "ERROR: MESSAGE" to stderr, then the help text to stdout, and exits
# with status 1.
usage_error() {
  echo -e "ERROR: $1 \n" >&2
  # Deliberately unquoted: the help text relies on word splitting to collapse
  # the line breaks of its assignment; only its "\n" escapes end a line.
  # shellcheck disable=SC2086
  echo -e $usage
  exit 1
}

# parse_args ARG...
#
# Walks the command-line arguments and sets the globals INPUT, PROCESSORS and
# OUTPUT from -i|--input, -p|--processors and -o|--output. -h|--help prints
# the help text and exits 0. Parsing stops at "--". Any other argument, or an
# option that is missing its value, is reported and the script exits 1.
parse_args() {
  while (( $# )); do
    case "$1" in
      -h|--help)
        # shellcheck disable=SC2086
        echo -e $usage
        exit
        ;;
      -i|--input|-p|--processors|-o|--output)
        # `shift 2` fails without shifting anything when only the option
        # itself is left, which would make this loop spin forever.
        (( $# >= 2 )) || usage_error "$1 requires a value."
        case "$1" in
          -i|--input)
            INPUT=$2
            ;;
          -p|--processors)
            [[ $2 =~ ^[0-9]+$ ]] \
              || usage_error "$1 expects a whole number, got '$2'."
            PROCESSORS=$2
            ;;
          -o|--output)
            OUTPUT=$2
            ;;
        esac
        shift 2
        ;;
      --) # end argument parsing
        shift
        break
        ;;
      -*) # unsupported flags (this pattern also covers --long-options)
        usage_error "$1 is an invalid option."
        ;;
      *)
        # Without this arm a bare word is never shifted off and the loop
        # never terminates.
        usage_error "$1 is an unexpected argument."
        ;;
    esac
  done
}

parse_args "$@"
# Validate the input list before any download is attempted.
if [[ -z "${INPUT:-}" ]]; then
  echo -e "ERROR: No input file given. \n" >&2
  # Deliberately unquoted: the help text relies on word splitting to collapse
  # the line breaks of its assignment; only its "\n" escapes end a line.
  # shellcheck disable=SC2086
  echo -e $usage
  exit 1
fi
# Fail early on a missing/unreadable path (or a directory) instead of running
# the download step with no accessions and reporting success.
if [[ ! -r "$INPUT" || -d "$INPUT" ]]; then
  printf "ERROR: Cannot read input file '%s'.\n\n" "$INPUT" >&2
  exit 1
fi
# Fetch one FASTA record per accession listed in $INPUT and append them all to
# $OUTPUT, running up to $PROCESSORS curl processes at a time.
echo "$(date +%Y/%m/%d\ %H:%M) Downloading UniProt sequences..."
# xargs: -I already takes one input line per command, so the conflicting
#        `-n 1` is dropped.
# curl:  -f  do not write HTTP error pages into the FASTA output;
#        -sS stay quiet but still print real errors to stderr;
#        -L  follow redirects, in case the URL below is served from another
#            location.
# NOTE(review): with more than one processor the records are appended in
# completion order, not list order; concurrent writers may also interleave
# large records -- confirm this is acceptable for downstream tools.
if ! xargs -P "$PROCESSORS" -I % \
    curl -fsSL "https://www.uniprot.org/uniprot/%.fasta" \
    < "$INPUT" >> "$OUTPUT"; then
  echo "$(date +%Y/%m/%d\ %H:%M) ERROR: one or more downloads failed;" \
    "$OUTPUT may be incomplete." >&2
  exit 1
fi
echo "$(date +%Y/%m/%d\ %H:%M) Script finished."