-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_names_and_accessions
executable file
·43 lines (30 loc) · 1.77 KB
/
get_names_and_accessions
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/bin/bash
# Strict mode, spelled out with long option names:
#   errexit  - abort as soon as a command returns a non-zero exit status
#   errtrace - let functions and subshells inherit the ERR trap
#   nounset  - expanding a variable that was never set is an error
#   pipefail - a pipeline reports the status of its last failing stage,
#              not just the status of the final command
set -o errexit -o errtrace -o nounset -o pipefail
# When a command fails, report the line number and the command to stderr, then
# exit with the failing status
# (see https://olivergondza.github.io/2019/10/01/bash-strict-mode.html).
trap 's=$?; echo >&2 "$0: Error on line "$LINENO": $BASH_COMMAND"; exit $s' ERR
# The single required argument is the path to the input list file. Check it up
# front so a missing argument yields a usage message rather than a bare
# "unbound variable" error.
if (( $# < 1 )); then
  printf 'Usage: %s <input_list_file>\n' "${0##*/}" >&2
  exit 2
fi
input_list=$1
# The list is read more than once further down, so it must be a readable file.
if [[ ! -r "$input_list" ]]; then
  printf '%s: cannot read input list: %s\n' "${0##*/}" "$input_list" >&2
  exit 2
fi
# Split the path into file name and directory; derived output files are later
# written into input_dir under names based on input_base.
# The expansions are quoted and option parsing is ended with `--` so that paths
# containing spaces, glob characters, or a leading dash are passed intact.
input_base=$(basename -- "$input_list")
input_dir=$(dirname -- "$input_list")
#######################################
# Build the path of a derived output file located next to the input list.
# Globals:   input_dir (read), input_base (read)
# Arguments: $1 - label placed in front of the input file's base name
# Outputs:   writes "<input_dir>/<label>.<input_base>" to stdout
#######################################
function build_name() {
  # Keep the label local so calling this function does not leak a global.
  local prefix=$1
  printf '%s\n' "$input_dir/$prefix.$input_base"
}
# Build the name -> accession lookup table once and cache it in the current
# directory as names_to_ID.hmdb_metabolites.txt (tab-separated, sorted).
# Later runs skip this whole section when that file already exists.
if [[ ! -f "names_to_ID.hmdb_metabolites.txt" ]]; then
  if [[ ! -f "hmdb_metabolites.xml" ]]; then
    # -O pins the archive name. Without it, wget saves to
    # hmdb_metabolites.zip.1 when an older (possibly truncated) archive is
    # still present, and unzip would then read the stale file.
    wget -O hmdb_metabolites.zip https://hmdb.ca/system/downloads/current/hmdb_metabolites.zip
    unzip hmdb_metabolites.zip
  fi

  # Work in a private scratch directory so intermediate files cannot clobber
  # unrelated names.txt / accessions.txt files, and remove it on any exit.
  # It lives in the current directory so the final mv is a same-filesystem
  # rename.
  hmdb_build_dir=$(mktemp -d ./hmdb_names_build.XXXXXX)
  trap 'rm -rf -- "${hmdb_build_dir:?}"' EXIT

  # Pull the text of every <name> and <accession> element whose line starts
  # with the exact leading whitespace in the pattern, so elements at other
  # indentation levels are not picked up.
  # NOTE(review): confirm the leading whitespace in both patterns matches the
  # indentation used in the current HMDB XML export.
  grep -P '^ <name>' hmdb_metabolites.xml \
    | sed -e 's/ <name>//' -e 's/<\/name>//' > "$hmdb_build_dir/names.txt"
  grep -P '^ <accession>' hmdb_metabolites.xml \
    | sed -e 's/ <accession>//' -e 's/<\/accession>//' > "$hmdb_build_dir/accessions.txt"

  # paste pairs the two files line by line; if the counts differ, every pair
  # after the first gap would be wrong, so stop instead of caching bad data.
  hmdb_name_count=$(( $(wc -l < "$hmdb_build_dir/names.txt") ))
  hmdb_accession_count=$(( $(wc -l < "$hmdb_build_dir/accessions.txt") ))
  if (( hmdb_name_count != hmdb_accession_count )); then
    printf '%s: extracted %s names but %s accessions from hmdb_metabolites.xml; refusing to pair them\n' \
      "$0" "$hmdb_name_count" "$hmdb_accession_count" >&2
    exit 1
  fi

  # Sorted because the table is later fed to join, which needs sorted input.
  paste "$hmdb_build_dir/names.txt" "$hmdb_build_dir/accessions.txt" \
    | sort > "$hmdb_build_dir/names_to_ID.txt"

  # Publish the finished table with a rename so an aborted run never leaves a
  # partial cache file that later runs would treat as valid.
  mv -- "$hmdb_build_dir/names_to_ID.txt" names_to_ID.hmdb_metabolites.txt

  rm -rf -- "${hmdb_build_dir:?}"
  trap - EXIT
fi
# Look up an accession for every line of the input list and write the result
# in the input's original line order. All derived files are created next to
# the input list, with names produced by build_name.
numbered_file=$(build_name numbered)
sorted_file=$(build_name sorted)
combined_file=$(build_name name_sorted.combined)
ordered_file=$(build_name original_order)
clean_file=$(build_name clean_original_order)

# 1. Prefix each line with its line number so the order can be restored later,
#    then sort on the tab-separated second field (the join key used in step 4).
awk 'BEGIN{FS="\t"; OFS="\t"} {print NR,$0}' "$input_list" \
  | sort -t$'\t' -k2,2 > "$numbered_file"

# 2. Sorted copy of the list for the lookup join. -u drops repeated lines:
#    join emits every pairing of equal keys, so a name listed N times would
#    otherwise come out N*N times after step 4.
sort -u "$input_list" > "$sorted_file"

# 3. Attach accessions from the lookup table; -a 2 keeps list entries that
#    have no match in the table (they get no accession column).
join -a 2 -t$'\t' names_to_ID.hmdb_metabolites.txt "$sorted_file" > "$combined_file"

# 4. Re-attach the line numbers and sort on them numerically. A plain text
#    sort would order them as 1, 10, 11, 2, ... and scramble longer lists.
join -1 2 -2 1 -t$'\t' "$numbered_file" "$combined_file" \
  | sort -t$'\t' -k2,2n > "$ordered_file"

# 5. Drop the line-number column, leaving the first and third columns.
awk 'BEGIN{FS="\t"; OFS="\t"} {print $1,$3}' "$ordered_file" > "$clean_file"