-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathBIN-ALI-Optimal_sequence_alignment.R
365 lines (305 loc) · 13.3 KB
/
BIN-ALI-Optimal_sequence_alignment.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
# tocID <- "BIN-ALI-Optimal_sequence_alignment.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
#
# ==============================================================================
# Version: 1.7.1
#
# Date: 2017-09 - 2020-10
# Author: Boris Steipe ([email protected])
#
# Versions:
# 1.7.1 add jsonlite:: to fromJSON() in code sample and ./myScripts/
# 1.7 2020 updates
# 1.6 Maintenance
# 1.5 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.4 Pull s2c() from seqinr package, rather than loading the
# entire library.
# 1.3 Updated confirmation task with correct logic
# 1.2 Added missing load of seqinr package
# 1.1 Update annotation file logic - it could already have been
# prepared in the BIN-FUNC-Annotation unit.
# 1.0.1 bugfix
# 1.0 First 2017 live version.
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------------------------------
#TOC> 1 Prepare 58
#TOC> 2 Biostrings Pairwise Alignment 75
#TOC> 2.1 Optimal global alignment 93
#TOC> 2.2 Optimal local alignment 156
#TOC> 3 APSES Domain annotation by alignment 180
#TOC> 4 Update your database script 261
#TOC> 4.1 Preparing an annotation file ... 267
#TOC> 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit 269
#TOC> 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit 314
#TOC> 4.2 Execute and Validate 338
#TOC>
#TOC> ==========================================================================
# = 1 Prepare =============================================================
# seqinr is used below through seqinr::s2c(); install it only when it is not
# already available. It is never attached with library().
if (!requireNamespace("seqinr", quietly = TRUE)) {
  install.packages("seqinr")
}

# Package information is available with the following commands:
#  library(help = seqinr)       # basic information
#  browseVignettes("seqinr")    # available vignettes
#  data(package = "seqinr")     # available datasets

# Recreate the protein database that you have constructed in the
# BIN-Storing_data unit. The code below reads it as myDB.
source("./myScripts/makeProteinDB.R")
# = 2 Biostrings Pairwise Alignment =======================================
# Biostrings is installed through BiocManager::install(), so make sure
# BiocManager itself is available first.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

if (!requireNamespace("Biostrings", quietly = TRUE)) {
  BiocManager::install("Biostrings")
}
# Package information:
# library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets
# Biostrings stores sequences in "XString" objects. Once we have converted our
# target sequences to AAString objects, the alignment itself is straightforward.
# == 2.1 Optimal global alignment ==========================================
# The pairwiseAlignment() function was written to behave
# exactly like the functions you encountered on the EMBOSS server.
# First: make AAString objects ...
# Yeast Mbp1 ...
sel <- myDB$protein$name == "MBP1_SACCE"
aaMBP1_SACCE <- Biostrings::AAString(myDB$protein$sequence[sel])

# ... and the Mbp1 protein of MYSPE; its name is "MBP1_" followed by the
# code that biCode() returns for MYSPE.
sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))
aaMBP1_MYSPE <- Biostrings::AAString(myDB$protein$sequence[sel])

?pairwiseAlignment

# ... and align.
# Global optimal alignment with end-gap penalties is default.
ali1 <- Biostrings::pairwiseAlignment(
  aaMBP1_SACCE,
  aaMBP1_MYSPE,
  substitutionMatrix = "BLOSUM62",
  gapOpening = 10,
  gapExtension = 0.5
)
str(ali1) # ... it's complicated
# This is a Biostrings alignment object. But we can use Biostrings functions to
# tame it:
ali1
Biostrings::writePairwiseAlignments(ali1) # That should look familiar
# And we can make the internal structure work for us (@ is for classes as
# $ is for lists ...). The pattern slot belongs to the first sequence that was
# passed to pairwiseAlignment() (MBP1_SACCE here), the subject slot to the
# second one (MBP1_MYSPE).
str(ali1@pattern)
ali1@pattern
ali1@pattern@range
ali1@pattern@indel
ali1@pattern@mismatch
# or work with "normal" R functions, after converting the aligned sequences to
# ordinary strings with as.character()
# the alignment length
nchar(as.character(ali1@pattern))
# the number of identities: seqinr::s2c() splits each aligned string into
# single characters, == compares them position by position, and sum() counts
# the positions that agree
sum(seqinr::s2c(as.character(ali1@pattern)) ==
seqinr::s2c(as.character(ali1@subject)))
# ... e.g. to calculate the percentage of identities: identities divided by
# the alignment length, times 100
100 *
sum(seqinr::s2c(as.character(ali1@pattern)) ==
seqinr::s2c(as.character(ali1@subject))) /
nchar(as.character(ali1@pattern))
# ... which should be the same as reported in the writePairwiseAlignments()
# output. Awkward to type? Then it calls for a function:
#
percentID <- function(al) {
  # Percent identity of a Biostrings alignment object.
  #
  # al: alignment object with @pattern and @subject slots that
  #     as.character() turns into the two aligned strings.
  # Value: 100 * (number of positions at which the two aligned strings carry
  #        the same character) / (length of the aligned pattern string).
  alignedPattern <- as.character(al@pattern)
  alignedSubject <- as.character(al@subject)

  nIdentical <- sum(seqinr::s2c(alignedPattern) == seqinr::s2c(alignedSubject))

  100 * nIdentical / nchar(alignedPattern)
}
percentID(ali1)

# == 2.2 Optimal local alignment ===========================================
# For comparison: a local optimal alignment (like EMBOSS Water) of the same
# two sequences, with much higher gap penalties than the global run.
ali2 <- Biostrings::pairwiseAlignment(
  aaMBP1_SACCE,
  aaMBP1_MYSPE,
  type = "local",
  substitutionMatrix = "BLOSUM62",
  gapOpening = 50,
  gapExtension = 10
)

Biostrings::writePairwiseAlignments(ali2)

# This has probably only aligned the N-terminal DNA binding domain - but that
# one has quite high sequence identity:
percentID(ali2)
# == TASK: ==
# Compare the two alignments. I have weighted the local alignment heavily
# towards an ungapped alignment by setting very high gap penalties. Try changing
# the gap penalties and see what happens: how does the number of indels change,
# how does the length of indels change...
# = 3 APSES Domain annotation by alignment ================================
# In this section we define the MYSPE APSES sequence by performing a global,
# optimal sequence alignment of the yeast APSES domain with the full length
# protein sequence of the protein that was the most similar to the yeast APSES
# domain.
#
# I have annotated the yeast APSES domain as a feature in the
# database. To view the annotation, we can retrieve it via the proteinID and
# featureID. Here is the yeast protein ID:
(proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"])

# ... and if you look at the feature table, you can identify the feature ID
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])

# ... and with the two IDs we can get the ID of the annotation record that
# joins this protein with this feature
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
                               myDB$annotation$featureID == ftrID])

# Show that annotation record. The row has to be selected through its
# proteinID and featureID columns: the ID column holds the annotation's own
# key, so comparing it to proID and ftrID does not select this record.
myDB$annotation[myDB$annotation$proteinID == proID &
                  myDB$annotation$featureID == ftrID, ]

# The annotation record contains the start and end coordinates which we can use
# to define the APSES domain sequence with a substr() expression.
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
                 start,
                 end))
# Lots of code. But don't get lost. Let's recapitulate what we have done: we
# have selected from the sequence column of the protein table the sequence whose
# name is "MBP1_SACCE", and selected from the annotation table the start
# and end coordinates of the annotation that joins an "APSES fold" feature with
# the sequence, and used the start and end coordinates to extract a substring.
# Let's convert this to an AAString and assign it:
aaMB1_SACCE_APSES <- Biostrings::AAString(apses)

# The domain is much shorter than the full-length MYSPE protein. Align the two
# with type "overlap": this turns the end-gap penalties off, which is crucially
# important for sequences of such different length.
aliApses <- Biostrings::pairwiseAlignment(
  aaMB1_SACCE_APSES,
  aaMBP1_MYSPE,
  type = "overlap",
  substitutionMatrix = "BLOSUM62",
  gapOpening = 10,
  gapExtension = 0.5
)
# Inspect the result. The aligned sequences should be clearly
# homologous, and have (almost) no indels. The entire "pattern"
# sequence from QIYSAR ... to ... KPLFDF should be matched
# with the "subject" (the MYSPE sequence). Is this correct?
Biostrings::writePairwiseAlignments(aliApses)
# If this is correct, you can extract the matched sequence from
# the alignment object. The syntax is a bit different from what
# you have seen before: this is an "S4 object", not a list. No
# worries: as.character() returns a normal string.
# The subject slot belongs to the second sequence passed to
# pairwiseAlignment(), i.e. the MYSPE protein.
as.character(aliApses@subject)
# Now, what are the aligned start and end coordinates? You can read them from
# the output of writePairwiseAlignments(), or you can get them from the range of
# the match. These are the coordinates asked for in the annotation file edits
# described in section 4.
str(aliApses@subject@range)
# start is:
aliApses@subject@range@start
# ... and end is (the range is stored as a start and a width, so the last
# matched position is start + width - 1):
aliApses@subject@range@start + aliApses@subject@range@width - 1
# = 4 Update your database script =========================================
# Since we have this feature defined now, we can create a feature annotation
# right away and store it in myDB.
# == 4.1 Preparing an annotation file ... ==================================
#
# === 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit
#
#
# You DON'T already have a file called "<MYSPE>-Annotations.json" in the
# ./myScripts/ directory:
#
# - Make a copy of the file "./data/refAnnotations.json" and put it in your
# myScripts/ directory.
#
# - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
# if MYSPE is called "Cryptococcus neoformans", your file should be called
# "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
# "MBP1_CRYNE").
#
# - Open the file in the RStudio editor and delete all blocks for
# the Mbp1 protein annotations except the first one.
#
# - From that block, delete all lines except for the line that says:
#
# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
#
# - Then delete the comma at the end of the line (your file will just have
# this one annotation).
#
# - Edit that annotation: change MBP1_SACCE to MBP1_<MYSPE> and change the
# "start" and "end" features to the coordinates you just discovered for the
# APSES domain in your sequence.
#
# - Save the file in your myScripts/ directory
#
# - Validate your file online at https://jsonlint.com/
#
# - Update your "./myScripts/makeProteinDB.R" script to load your new
# annotation when you recreate the database. Open the script in the
# RStudio editor, and add the following command at the end:
#
# myDB <- dbAddAnnotation(myDB,
# jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
# ^^^^^^^
# edit this!
# - save and close the file.
#
# Then SKIP the next section.
#
#
# === 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit
#
#
# You DO already have a file called "<MYSPE>-Annotations.json" in the
# ./myScripts/ directory:
#
# - Open the file in the RStudio editor.
#
# - Below the last feature lines (but before the closing "]") add the
# following feature line (without the "#")
#
# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}
#
# - Edit that annotation: change MBP1_SACCE to MBP1_<MYSPE> and change the
# "start" and "end" features to the coordinates you just discovered for the
# APSES domain in your sequence.
#
# - Add a comma after the preceding feature line.
#
# - Save your file.
#
# - Validate your file online at https://jsonlint.com/
#
#
# == 4.2 Execute and Validate ==============================================
#
# - source() your database creation script:
#
# source("./myScripts/makeProteinDB.R")
#
# This should run without errors or warnings. If it doesn't work and you
# can't figure out quickly what's happening, ask on the mailing list for
# help.
#
# - Confirm
# The following commands should retrieve the correct start and end
# coordinates and sequence of the MBP1_MYSPE APSES domain:
# Row of the MYSPE Mbp1 protein in the protein table ...
sel <- which(myDB$protein$name == paste0("MBP1_", biCode(MYSPE)))

# ... its protein ID, the ID of the "APSES fold" feature, and the ID of the
# annotation record that joins the two ...
(proID <- myDB$protein$ID[sel])
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
                               myDB$annotation$featureID == ftrID])

# ... and from that record the coordinates and the domain sequence.
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
                 start,
                 end))
# [END]