-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_blast.sh
executable file
·147 lines (111 loc) · 5.21 KB
/
run_blast.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/bin/bash
## ANALYSIS OF AMINO ACID SUBSTITUTION SITES
## as provided by Shiri Tsour.
## SOFTWARE DEPENDENCIES
## In addition to standard command-line tools (cat, grep, cut, awk,
## gunzip, rsync), this script requires NCBI's blast and a recent
## R version.
## All input and output paths of this script are built from $DECODE.
## Stop right away if it is unset or empty: otherwise the paths used
## further down would resolve to the filesystem root (e.g. /log).
if [[ -z "${DECODE}" ]]; then
  echo "Please define a path for input and output of this script:"
  echo "export DECODE=<YOURDATAPATH>"
  exit 1
fi
## NUMBER OF CPUs used for blast
nthreads=7

## OUTPUT DATA PATHS
## All input and output is stored below $DECODE; set it with
## `export DECODE=<YOURDATAPATH>` before running this script.
## generate output paths
## NOTE: ${DECODE:?...} aborts the script if DECODE is unset or empty,
## so that the directories are never created at the filesystem root.
mkdir -p -- \
  "${DECODE:?DECODE must be set to the data path}/log" \
  "${DECODE}/figures" \
  "${DECODE}/processedData" \
  "${DECODE}/originalData"
### DATA COLLECTION

## 1) REQUIRED MANUAL DOWNLOADS
## Download additionalData.zip, which provides human genome data files
## generated by the genomeBrowser project, AND unzip it in $DECODE/, e.g.:
## cp -a ~/data/mistrans/additionalData.zip $DECODE/
## cd $DECODE; unzip additionalData.zip
echo "testing existence of required input data"

## only this one file of the unpacked archive is tested
required="${DECODE}/additionalData/features_GRCh38.110.tsv.gz"
[ -f "$required" ] || {
  echo "MANUAL STEPS REQIRED: Please download the additionalData.zip"
  echo "and unzip as instructed."
  exit 1
}
echo "testing availabilty of NCBI blast"

## Ensure blast and R are installed: both programs must be found in
## PATH; stop at the first one that is missing (blastp is tested
## first, then R).
for tool in blastp R; do
  command -v "$tool" > /dev/null 2>&1 && continue
  echo "$tool needs to be isntalled and in your PATH."
  exit 1
done
## 2) COLLECT BP/SAAP SEQUENCES
## produces $DECODE/processedData/unique_bp.fas and
## $DECODE/processedData/unique_saap.tsv

## unique_columns FIELDS HEADER FILE...
## Decompress each FILE with gunzip, keep the tab-separated columns
## FIELDS (cut -f syntax) and print the sorted, unique rows of all
## files to stdout. Rows matching the grep pattern HEADER (the column
## header row) and empty rows are dropped.
unique_columns() {
  local fields=$1 header=$2 f
  shift 2
  for f in "$@"; do
    gunzip -c "$f" | cut -f "$fields"
  done | grep -v -e "$header" -e '^$' | sort | uniq
}

## collect_bp_saap
## Reads the two SAAP quantification tables from $DECODE/additionalData
## and writes unique_bp.fas and unique_saap.tsv to $DECODE/processedData.
## Data is streamed through pipes, so no temporary files are left behind.
collect_bp_saap() {
  local in_dir="${DECODE}/additionalData"
  local out_dir="${DECODE}/processedData"
  ## NOTE(review): the patient-level table is referenced without the
  ## .gz suffix, unlike the TMT-level table - confirm the file name
  local -a inputs=(
    "${in_dir}/All_SAAP_TMTlevel_quant_df.txt.gz"
    "${in_dir}/All_SAAP_patient_level_quant_df.txt"
  )

  ## write BP (column 5) into a fasta file for use with blast; each
  ## sequence is used as its own fasta header
  unique_columns 5 '^BP$' "${inputs[@]}" \
    | awk '{print ">" $0 ORS $0}' \
    > "${out_dir}/unique_bp.fas"

  ## BP/SAAP (columns 4 and 5) as simple table, basis for search in
  ## proteins; $'...' provides the literal tab of the header row
  unique_columns 4,5 $'^SAAP\tBP' "${inputs[@]}" \
    > "${out_dir}/unique_saap.tsv"
}

echo "extracting BP and SAAP sequences"
collect_bp_saap
## 3) GET Ensembl PROTEIN SEQUENCES
## copies the Ensembl release 110 human protein fasta into
## $DECODE/originalData/
echo "downloading Ensembl protein fasta"
ensembl_pep=rsync://ftp.ensembl.org/ensembl/pub/release-110/fasta/homo_sapiens/pep/Homo_sapiens.GRCh38.pep.all.fa.gz
if ! rsync -av "$ensembl_pep" "$DECODE/originalData/"; then
  ## a failed transfer (e.g. when re-running offline) is only tolerated
  ## if a copy from an earlier run is already present
  if [[ -f "$DECODE/originalData/${ensembl_pep##*/}" ]]; then
    echo "WARNING: rsync failed, using existing ${ensembl_pep##*/}" >&2
  else
    echo "ERROR: could not download ${ensembl_pep##*/}" >&2
    exit 1
  fi
fi
## 4) ADD PATIENT-SPECIFIC SINGLE AMINO REPLACEMENTS
## produces $DECODE/processedData/all_proteins.fa
## NOTE: get_mutated_proteins.R is read from the current working
## directory; all R output (stdout and stderr) goes to the log file.
echo "adding mutations to fasta"
if ! R --vanilla < get_mutated_proteins.R \
  &> "$DECODE/log/mutated_proteins.txt"; then
  ## the R messages are hidden in the log, so point the user to it
  ## instead of silently continuing with missing or stale output
  echo "ERROR: get_mutated_proteins.R failed," \
    "see $DECODE/log/mutated_proteins.txt" >&2
  exit 1
fi
## 5) BLAST BP IN Ensembl+MUTATION PROTEINS
## produces $DECODE/processedData/unique_bp_blast.tsv
echo "RUNNING BLAST SEARCH OF BP"
blast_db="$DECODE/processedData/all_proteins.fa"
if ! makeblastdb -in "$blast_db" -parse_seqids \
  -title "ensembl hg38 proteins" -dbtype prot \
  &> "$DECODE/log/blast_database.txt"; then
  echo "ERROR: makeblastdb failed, see $DECODE/log/blast_database.txt" >&2
  exit 1
fi

## NOTE: blast hits are pre-filtered for query length=alignment length
## and >75% identity; in the output format below these are the
## columns 6 (qlen), 5 (length) and 3 (pident). Lines starting
## with '#' are dropped as well.
format="6 qseqid sacc pident mismatch length qlen slen sstart send evalue bitscore"
blastp -num_threads "${nthreads}" -task blastp-short \
  -query "$DECODE/processedData/unique_bp.fas" \
  -db "$blast_db" \
  -outfmt "$format" 2> "$DECODE/log/unique_bp_blast.txt" \
  | awk '!/^#/ && $5 == $6 && $3 > 75' \
  > "$DECODE/processedData/unique_bp_blast.tsv"
## the exit status of the pipeline is that of awk; get the status of
## blastp itself (must be read directly after the pipeline)
blast_status=${PIPESTATUS[0]}
if (( blast_status != 0 )); then
  echo "ERROR: blastp failed (exit status ${blast_status})," \
    "see $DECODE/log/unique_bp_blast.txt" >&2
  exit 1
fi
## 6) SELECT BEST-MATCHING PROTEIN HIT FOR EACH BP
## produces $DECODE/processedData/bp_mapped.tsv
## NOTE: get_protein_match.R is read from the current working
## directory; all R output (stdout and stderr) goes to the log file.
echo "getting best blast hit for each BP"
if ! R --vanilla < get_protein_match.R \
  &> "$DECODE/log/protein_match.txt"; then
  ## the R messages are hidden in the log, so point the user to it
  ## instead of silently continuing with missing or stale output
  echo "ERROR: get_protein_match.R failed," \
    "see $DECODE/log/protein_match.txt" >&2
  exit 1
fi
## 7) COLLECT ALL DATA FOR UNIQUE BP/SAAP PAIRS
## produces $DECODE/processedData/saap_mapped.tsv
## NOTE: map_peptides.R depends on large additional data that is
## produced by the genomeBrowser project, partially on high
## performance computing infrastructure. Run it after these
## calculations are done and the correct path to the resulting data
## directory is provided; see
## https://gitlab.com/raim/genomeBrowser/-/blob/master/data/mammary/setup.sh
## As an alternative, the resulting file saap_mapped.tsv is part of
## the downloaded additionalData folder, which allows running the
## R analysis scripts that produce the final published figures.
printf '%s %s %s\n' \
  "collecting BP protein, transcript and genome coordinates" \
  "and various structural data. Note that this script will abort" \
  "if the genomeBrowser mammary data setup is not present."

## output is deliberately left on the terminal (no log file), so
## that the user sees the error
R --vanilla < map_peptides.R