-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathnumericData.R
392 lines (303 loc) · 13.3 KB
/
numericData.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
# numericData.R
#
# +-----------------------------------------------------------------+
# | |
# | Do not edit this file! Edit "myNumericData.R" instead. |
# | |
# +-----------------------------------------------------------------+
#
# Purpose: Sample solutions for the Numeric Data workshop unit.
#
# Version: 1.1
#
# Date: 2019 05 12
# Author: Boris Steipe ([email protected])
#
# V 1.1 2019 Updates
# V 1.0 First code 2018
#
# TODO:
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------------------
#TOC> 1 SCENARIO 40
#TOC> 2 Introduction to the bio3D package 62
#TOC> 3 A Ramachandran plot 148
#TOC> 4 Density plots 175
#TOC> 4.1.1 ... as overlay on a colored grid 228
#TOC> 4.1.2 ... as filled contour 245
#TOC> 4.1.3 ... as a perspective plot 286
#TOC> 4.1.4 ... advanced perspective plot 303
#TOC>
#TOC> ==========================================================================
# = 1 SCENARIO ============================================================
# In this example of working with numeric data, we ...
# - load the library "bio3D" which supports work with
# protein structure files;
# - explore some elementary functions of the library;
# - consider implicit and explicit sequence in a structure file,
# and the important issue of sequence numbers;
# - explore plotting of density values with scatterplots.
# We wish to create a plot that looks like this:
# (The script path is relative, so it is resolved against the current working
# directory. NOTE(review): presumably the sourced script draws the target
# plot - confirm against the sample solutions.)
source("./sampleSolutions/numericDataSampleSolutions-ShowPlot.R")
# This is a plot of the density of backbone torsion angles (how often they are
# observed) for each combination of phi-psi values in the PDB structure file
# 6AU6 - the GNAS-2 protein.
#
# The protein structure of GNAS-2 has been determined to high resolution.
# Let's explore the structure file.
# = 2 Introduction to the bio3D package ===================================
# Install bio3d on first use. requireNamespace() expects the package name as a
# character string: an unquoted name is evaluated as a variable and fails with
# "object 'bio3d' not found". The package is not attached; its functions are
# called with the bio3d:: prefix throughout this script.
if (! requireNamespace("bio3d", quietly = TRUE)) {
  install.packages("bio3d")
}
# Package information:
# library(help = bio3d) # basic information
# browseVignettes("bio3d") # available vignettes
# data(package = "bio3d") # available datasets
# bio3d can load molecules directly from the PDB servers; you don't _have_ to
# store them locally, but you can.
GNASpdb <- bio3d::read.pdb("6AU6") # load a molecule directly from the PDB
# via the Internet. (This is not the local version in the project's
# ./data folder.)
# check what we have (typing the object's name at the prompt prints it):
GNASpdb
# what is this object actually? str() shows its internal structure.
str(GNASpdb)
# Compare this to the actual structure file from the PDB. This displays the
# local copy in the project's ./data folder; the path is relative to the
# current working directory.
file.show("./data/6au6.pdb")
# bio3d's pdb objects are simple lists. Great! You know lists!
# You see that there is a list element called $atom which is a data frame in
# which the columns are vectors of the same length - namely the number of atoms
# in the structure file. And there is a matrix of (x, y, z) triplets called xyz.
# And there is a vector that holds sequence, and two tables called helix and
# sheet. Let's pull out a few values to confirm how selection and subsetting
# works here:
# selection by atom ...
i <- 5
GNASpdb$atom[i,]
GNASpdb$atom[i, c("x", "y", "z")] # here we are selecting with column names!
GNASpdb$xyz[c(i*3-2, i*3-1, i*3)] # here we are selecting by position: the
# coordinates of atom i are read from elements 3i-2, 3i-1 and 3i of xyz
# all atoms of a residue ... (rows whose residue number "resno" equals i)
i <- 48
GNASpdb$atom[GNASpdb$atom[,"resno"] == i, ]
# sequence of the first ten residues
GNASpdb$seqres[1:10] # the "A"s here identify chain "A"
# Convert this to one letter code
bio3d::aa321(GNASpdb$seqres[1:10])
# Task 2.1 List the implicit sequence contained in the file. Note: the
# "explicit" sequence is spelled out in the SEQRES records, and
# is the one that is stored in the databases to correspond to the
# PDB file. The "implicit" sequence however is the one that's actually
# contained in the coordinates. This is not necessarily the same,
# there may be modifications, or N- or C- termini or loops may
# be invisible in the coordinates. In general, the explicit sequence
# is what the crystallographer puts into the experiment, the
# implicit sequence is how she interprets the resulting electron
# density map.
# Task 2.2 Do explicit and implicit sequence have the same length?
# Task 2.3 Compare the implicit sequence with the genomic sequence.
# - Do we need to renumber the PDB file to match the positions
# of recorded mutations (cf. IntOGen data)?
# List the CA atoms of all arginine residues: build a logical mask over the
# rows of the atom table (residue name "ARG" and atom type "CA") ...
sel <- with(GNASpdb$atom, resid == "ARG" & elety == "CA")
# ... and show the identifying columns of the selected rows.
GNASpdb$atom[sel, c("eleno", "elety", "resid", "chain", "resno", "insert")]
# The introduction to bio3d tutorial at
#   http://thegrantlab.org/bio3d/tutorials/structure-analysis
# has the following example. It plots the values of the "b" column for the
# atoms selected by GNASpdb$calpha as a line, and passes the structure itself
# as the "sse" argument.
bio3d::plot.bio3d(GNASpdb$atom$b[GNASpdb$calpha],
                  ylab = "B-factor",
                  typ = "l",
                  sse = GNASpdb)
# = 3 A Ramachandran plot =================================================
# Task 2.1 Calculate a Ramachandran plot for the structure. Hint: the
# bio3d::torsion.pdb() function calculates all dihedral angles for
# backbone and sidechain bonds, NA where the bond does not exist in an
# amino acid. Assign the values to the variable name "tor".
# As you can see, there are a number of points in the upper-right
# quadrant of the plot. This combination of phi-psi angles defines
# the conformation of a left-handed alpha helix and is generally
# only observed for glycine residues.
# Task 2.2 Replot the data and color the points for glycine residues green.
# As you see, eight residues in the upper-right quadrant are
# not glycine. But what residues are these? Is there an
# error in our script?
# Task 2.3 Identify the outlier residues.
# Task 2.4 Check the residues in your favourite molecular viewer.
# Is there anything unusual about these residues?
# = 4 Density plots =======================================================
# Such x, y scatter-plots of data that is sampled from a distribution can tell
# us a lot about what shapes the distribution. The distribution is governed by
# the free energy of the phi-psi landscape in folded proteins, since folded
# proteins generally minimize the free energy of their conformations. We observe
# empirically, from comparing frequency statistics and mutation experiments,
# that this generally follows a Boltzmann distribution, where the free energy
# changes we observe in experiments that change one conformation into another are
# proportional to the log-ratio of the number of times we observe each
# observation in the protein structure database (after correcting for
# observation bias). The proper way to visualize such 2D landscapes is with
# contour plots.
# The best way to plot such data is provided by the function contour():
?contour
# Contour plots are not produced along the sampled values of a data
# set, but on a regular grid. This means, we need to convert observed values
# into estimated densities. Density estimation is an important topic for
# exploratory data analysis, base R has the density() function for 1D
# distributions. But for 2D data like our phi-psi plots, we need a function from
# the MASS package: kde2d()
# (requireNamespace() needs the package name as a character string.)
if (! requireNamespace("MASS", quietly = TRUE)) {
  install.packages("MASS")
}
# Package information:
#  library(help = MASS)       # basic information
#  browseVignettes("MASS")    # available vignettes
#  data(package = "MASS")     # available datasets
?MASS::kde2d

# The torsion angles are expected in the object "tor" (cf. the Ramachandran
# plot task). Stop with an explicit message if it has not been created yet.
if (! exists("tor")) {
  stop("Object \"tor\" not found: calculate the torsion angles first, ",
       "e.g. with bio3d::torsion.pdb().",
       call. = FALSE)
}

# How many residues have no phi, or no psi angle? Angles that do not exist
# are NA, and kde2d() does not accept missing values.
sum(is.na(tor$phi))
sum(is.na(tor$psi))

# Keep only the residues for which both angles are defined.
sel <- !(is.na(tor$phi) | is.na(tor$psi))
phi <- tor$phi[sel]
psi <- tor$psi[sel]

# Estimate the 2D density on a 60 x 60 grid that spans the full angle range
# (-180 to 180 degrees) in both dimensions.
dPhiPsi <- MASS::kde2d(phi, psi,
                       n = 60,
                       lims = c(-180, 180, -180, 180))
str(dPhiPsi)
# This is a list, with gridpoints in x and y, and the estimated densities in z.
# Generic plot with default parameters
contour(dPhiPsi)
# === 4.1.1 ... as overlay on a colored grid
#
# Custom color ramp for the density plots. It has to be defined before its
# first use in image() below. colorRampPalette() returns a function that
# takes the number of colors to interpolate.
myColorRamp <- colorRampPalette(c("#99AACC",
                                  "#3399CC",
                                  "#2266DD",
                                  "#CC00AA"))

# colored grid of the estimated densities ...
image(dPhiPsi,
      col = myColorRamp(100),
      main = "Ramachandran plot for 6AU6",
      xlab = expression(phi),
      ylab = expression(psi))
# ... overlaid with dashed contour lines, ...
contour(dPhiPsi, col = "royalblue",
        add = TRUE,
        method = "edge",
        nlevels = 10,
        lty = 2)
# ... the observed (phi, psi) pairs, and thin lines through the origin.
points(phi, psi, col = "#00338866", pch = 3, cex = 0.7)
abline(h = 0, lwd = 0.5, col = "#00000044")
abline(v = 0, lwd = 0.5, col = "#00000044")

# === 4.1.2 ... as filled contour
#
# using the custom color-ramp myColorRamp; preview N of its colors:
N <- 10
barplot(rep(1, N), col = myColorRamp(N))

# filled.contour() takes the palette function itself, not a vector of colors.
filled.contour(dPhiPsi,
               xlim = c(-180, 180), ylim = c(-180, 180),
               nlevels = 10,
               color.palette = myColorRamp,
               main = "Ramachandran plot for 6AU6",
               xlab = expression(phi),
               ylab = expression(psi))
# Note: we can pass additional plotting and overlay commands to the contour plot
# in a block of expressions passed via the plot.axes parameter. Supplying
# plot.axes replaces the default axis-drawing code of filled.contour(), so the
# axes are drawn explicitly as the first step of the block.
filled.contour(dPhiPsi,
               xlim = c(-180, 180), ylim = c(-180, 180),
               nlevels = 10,
               color.palette = myColorRamp,
               main = "Ramachandran plot for 6AU6",
               xlab = expression(phi),
               ylab = expression(psi),
               plot.axes = {
                 # tick marks every 60 degrees on both axes
                 axis(1, at = seq(-180, 180, by = 60))
                 axis(2, at = seq(-180, 180, by = 60))
                 # dashed contour lines on top of the filled levels
                 contour(dPhiPsi, col = "#00000044",
                         add = TRUE,
                         method = "edge",
                         nlevels = 10,
                         lty = 2)
                 # observed (phi, psi) pairs and lines through the origin
                 points(phi, psi, col = "#00338866", pch = 3, cex = 0.7)
                 abline(h = 0, lwd = 0.5, col = "#00000044")
                 abline(v = 0, lwd = 0.5, col = "#00000044")
               })
# === 4.1.3 ... as a perspective plot
# Default viewpoint. The grid vectors and the density matrix are taken from
# the kde2d() result and passed explicitly.
persp(x = dPhiPsi$x,
      y = dPhiPsi$y,
      z = dPhiPsi$z,
      zlab = "Density",
      ylab = "psi",
      xlab = "phi")
# The same surface, colored and seen from a different direction. Note: the
# arguments "theta" and "phi" are the viewing angles of persp(); this "phi" is
# not the torsion angle.
persp(x = dPhiPsi$x,
      y = dPhiPsi$y,
      z = dPhiPsi$z,
      col = "#99AACC",
      theta = 40,
      phi = 10,
      zlab = "Density",
      ylab = "psi",
      xlab = "phi")
# === 4.1.4 ... advanced perspective plot
# Install plot3D on first use. As with bio3d and MASS in this script, the
# package is only checked for (with its name passed as a character string) and
# not attached; persp3D() is called with the plot3D:: prefix below.
if (! requireNamespace("plot3D", quietly = TRUE)) {
  install.packages("plot3D")
}
# Package information:
#  library(help = plot3D)       # basic information
#  browseVignettes("plot3D")    # available vignettes
#  data(package = "plot3D")     # available datasets

# example with custom tickmarks
# cf http://entrenchant.blogspot.ca/2014/03/custom-tick-labels-in-r-perspective.html

# set up axis parameters
# x- and y-axis: full angle range with a tick position every 60 degrees
minX <- -180
maxX <-  180
xPos <- seq(minX, maxX, by = 60)
minY <- minX
maxY <- maxX
yPos <- xPos
# z-axis: the densities are multiplied by zScale before plotting; tick
# positions are given on that scaled axis
zScale <- 500000
minZ <- 0
maxZ <- 150
zPos <- seq(minZ, maxZ, by = 50)

# draw the plot, save the perspective matrix. axes = FALSE because the axes
# are drawn by hand afterwards, using the returned matrix.
pMat <- plot3D::persp3D(z = dPhiPsi$z * zScale,
                        x = dPhiPsi$x,
                        y = dPhiPsi$y,
                        xlab = "phi",
                        ylab = "psi",
                        zlab = "density",
                        main = "phi/psi plot of 6AU6",
                        axes = FALSE,
                        theta = 50,
                        phi = 15,
                        expand = 2,
                        facets = TRUE,
                        scale = FALSE,
                        col = myColorRamp(40),
                        shade = 0.2,
                        border = "#FFFFFF22",
                        clab = "density",
                        colkey = list(side = 4, length = 0.6))
# Transformed axes are drawn "by hand" from lines(), segments(), and text()
# given the transformation matrix pMat that is returned by the plot.
# trans3d() projects 3D coordinates into the 2D coordinates of the plot.
#
# draw the axis lines
lines(trans3d(xPos, minY, minZ, pMat), col = "#222255", lwd = 3)  # x-axis
lines(trans3d(maxX, yPos, minZ, pMat), col = "#222255", lwd = 3)  # y-axis
lines(trans3d(minX, minY, zPos, pMat), col = "#222255", lwd = 3)  # z-axis

# draw tick marks: each tick is a short segment that starts on the axis line
# and points away from the plot; its length is 5% of the x-range
tickLength <- (maxX - minX) * 0.05

# ... x-axis ticks
tickStart <- trans3d(xPos, minY, minZ, pMat)
tickEnd   <- trans3d(xPos, (minY - tickLength), minZ, pMat)
segments(tickStart$x, tickStart$y, tickEnd$x, tickEnd$y)

# ... y-axis ticks
tickStart <- trans3d(maxX, yPos, minZ, pMat)
tickEnd   <- trans3d((maxX + tickLength), yPos, minZ, pMat)
segments(tickStart$x, tickStart$y, tickEnd$x, tickEnd$y)

# ... z-axis ticks
tickStart <- trans3d(minX, minY, zPos, pMat)
tickEnd   <- trans3d(minX, (minY - tickLength), zPos, pMat)
segments(tickStart$x, tickStart$y, tickEnd$x, tickEnd$y)

# add tick mark labels, placed a little beyond the end of the tick marks
labelOffset <- (maxX - minX) * 0.075

# ... x-axis labels (rotated)
labelPos <- trans3d(xPos, (minY - labelOffset), minZ, pMat)
text(labelPos$x, labelPos$y,
     labels = as.character(xPos),
     adj = c(0, NA), srt = 270, cex = 0.6)

# ... y-axis labels: taken from yPos, the positions that were used to place
# them, so that the labels stay correct if the y-range is changed
labelPos <- trans3d((maxX + labelOffset), yPos, minZ, pMat)
text(labelPos$x, labelPos$y,
     labels = as.character(yPos),
     adj = c(0, NA), cex = 0.6)

# ... z-axis labels (right-aligned)
labelPos <- trans3d(minX, (minY - labelOffset), zPos, pMat)
text(labelPos$x, labelPos$y,
     labels = as.character(zPos),
     adj = c(1, NA), cex = 0.6)
# [END]