forked from unipept/unipept-database
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtype_strains.sh.in
executable file
·50 lines (40 loc) · 1.79 KB
/
type_strains.sh.in
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/bin/bash
# Queries the Entrez "assembly" database for entries matching the filter
# "sequence from type" and prints the text of their <Genbank> elements,
# sorted, one per line, on stdout.
#
# Arguments: none
#
# The <<<NAME>>> tokens are template placeholders; presumably they are
# substituted with concrete values/commands before the script is executed.

# Please crash on first mistake.
set -o pipefail -e

# Base URL the esearch.fcgi / esummary.fcgi endpoints are appended to, and
# the number of document summaries requested per esummary call (retmax).
ENTREZ_URL=<<<ENTREZ_URL>>>
ENTREZ_BATCH_SIZE=<<<ENTREZ_BATCH_SIZE>>>

# Scratch file that accumulates the ids of every fetched batch.
tempfile="$(<<<CMD_MKTEMP>>> tmp.XXXXXXXX)"
# Remove the scratch file on every exit path. Without this, any abort caused
# by `set -e` (failed request, failed pipeline stage) leaves the file behind,
# because the explicit removal only happens on the very last line.
trap 'rm -f -- "$tempfile"' EXIT
# Make sure the scratch file starts out empty.
: > "$tempfile"
# Run the search once with usehistory=y so the result set can be referenced
# afterwards through its QueryKey/WebEnv pair instead of a list of ids. Only
# the response lines mentioning QueryKey or WebEnv are kept, joined into a
# single line so that each sed call below sees both elements.
# --fail makes curl exit non-zero on an HTTP error status; otherwise the
# error page would be parsed as if it were a search result.
header="$(curl --fail \
    -d 'db=assembly' \
    -d 'term="sequence from type"' \
    -d 'field=filter' \
    -d 'usehistory=y' \
    "$ENTREZ_URL/esearch.fcgi" \
  | grep -e 'QueryKey' -e 'WebEnv' | tr -d '\n' \
)"

# Text between <QueryKey> and </QueryKey>.
query_key="$(printf '%s\n' "$header" \
  | <<<CMD_SED>>> -n 's/.*<QueryKey>\(.*\)<\/QueryKey>.*/\1/p' \
)"

# Text between <WebEnv> and </WebEnv>.
web_env="$(printf '%s\n' "$header" \
  | <<<CMD_SED>>> -n 's/.*<WebEnv>\(.*\)<\/WebEnv>.*/\1/p' \
)"

# `sed -n ... p` prints nothing and still exits 0 when its element is
# missing, so `set -e` does not catch a response that carries only one of the
# two values. Stop here rather than issue follow-up requests with empty keys.
if [[ -z "$query_key" || -z "$web_env" ]]; then
  printf 'error: esearch response lacks a QueryKey or WebEnv value\n' >&2
  exit 1
fi
# Page through the stored result set with esummary, requesting
# ENTREZ_BATCH_SIZE document summaries per call. Every pass keeps the lines
# containing <Genbank>, strips the tags and whitespace, appends them to the
# scratch file and counts them. A batch that yields a different number of
# lines than ENTREZ_BATCH_SIZE ends the loop.
returned="$ENTREZ_BATCH_SIZE"
retstart='1'
while ((returned == ENTREZ_BATCH_SIZE)); do
  # grep exits 1 when no line matches. That is what happens on the final
  # request whenever the total number of results is an exact multiple of the
  # batch size (or the search is empty), and with pipefail + errexit it would
  # abort the script before anything is printed. A "no match" status is
  # therefore accepted, while genuine grep failures (status > 1) still fail
  # the pipeline. --fail keeps HTTP-level errors fatal.
  # NOTE(review): an HTTP 200 response that carries an error document instead
  # of summaries now ends the loop quietly rather than aborting; confirm
  # whether that case needs an explicit check.
  returned="$(curl --fail \
      -d 'db=assembly' \
      -d "query_key=$query_key" \
      -d "WebEnv=$web_env" \
      -d "retmax=$ENTREZ_BATCH_SIZE" \
      -d "retstart=$retstart" \
      "$ENTREZ_URL/esummary.fcgi" \
    | { grep '<Genbank>' || (($? == 1)); } \
    | <<<CMD_SED>>> -e 's/<[^>]*>//g' -e 's/[ \t][ \t]*//g' \
    | tee -a "$tempfile" \
    | wc -l \
  )"
  # Continue right after the last summary that was just received.
  retstart="$((retstart + returned))"
done

# write out the type strain assembly ids sorted
<<<CMD_SORT>>> "$tempfile"
rm -f -- "$tempfile"