numericData.R

# numericData.R
#
#   +-----------------------------------------------------------------+
#   |                                                                 |
#   |  Do not edit this file! Edit "myNumericData.R" instead.         |
#   |                                                                 |
#   +-----------------------------------------------------------------+
#
# Purpose:  Sample solutions for the Numeric Data workshop unit.
#
# Version: 1.1
#
# Date:    2019  05  12
# Author:  Boris Steipe (boris.steipe@utoronto.ca)
#
# V 1.1    2019 Updates
# V 1.0    First code 2018
#
# TODO:
#
# ==============================================================================


#TOC> ==========================================================================
#TOC> 
#TOC>   Section  Title                                      Line
#TOC> ----------------------------------------------------------
#TOC>   1        SCENARIO                                     40
#TOC>   2        Introduction to the bio3D package            62
#TOC>   3        A Ramachandran plot                         148
#TOC>   4        Density plots                               175
#TOC>   4.1.1          ... as overlay on a colored grid      228
#TOC>   4.1.2          ... as filled contour                 245
#TOC>   4.1.3          ... as a perspective plot             286
#TOC>   4.1.4          ... advanced perspective plot         303
#TOC> 
#TOC> ==========================================================================


# =    1  SCENARIO  ============================================================

# In this example of working with numeric data, we ...
#   - load the library "bio3D" which supports work with
#     protein structure files;
#   - explore some elementary functions of the library;
#   - consider implicit and explicit sequence in a structure file,
#       and the important issue of sequence numbers;
#   - explore plotting of density values with scatterplots.

# We wish to create a plot that looks like this:

# Run the helper script that (per its file name) shows the target plot.
# The path is relative, so the working directory must be the project root.
source("./sampleSolutions/numericDataSampleSolutions-ShowPlot.R")

# This is a plot of the density of backbone torsion angles (how often they are
# observed) for each combination of phi-psi values in the PDB structure file
# 6AU6 - the GNAS-2 protein.
#
# The protein structure of GNAS-2 has been determined to high resolution.
# Let's explore the structure file.


# =    2  Introduction to the bio3D package  ===================================

# Install bio3d only if it is not yet available on this machine.
# requireNamespace() expects the package name as a character string; an
# unquoted name is evaluated as a variable and fails with "object not found".
if (! requireNamespace("bio3d", quietly=TRUE)) {
    install.packages("bio3d")
}
# Package information:
#  library(help = bio3d)       # basic information
#  browseVignettes("bio3d")    # available vignettes
#  data(package = "bio3d")     # available datasets


# bio3d can load molecules directly from the PDB servers, you don't _have_ to
# store them locally, but you can

GNASpdb <- bio3d::read.pdb("6AU6")  # load a molecule directly from the PDB
# via the Internet. (This is not the local version in the project's
# ./data folder.) The result is kept in GNASpdb and is used by all of the
# following examples.

# check what we have: typing the object name at the top level prints it
GNASpdb

# what is this object actually? str() shows the internal structure
str(GNASpdb)

# Compare this to the actual structure file from the PDB: file.show() displays
# the local copy of the file that is stored in the project's ./data folder.
file.show("./data/6au6.pdb")

# bio3d's pdb objects are simple lists. Great! You know lists!

# You see that there is a list element called $atom which is a data frame in
# which the columns are vectors of the same length - namely the number of atoms
# in the structure file. And there is a matrix of (x, y, z) triplets called xyz.
# And there is a vector that holds sequence, and two tables called helix and
# sheet. Let's pull out a few values to confirm how selection and subsetting
# works here:

# selection by atom ...
i <- 5                              # index of the atom (row of $atom) to show
GNASpdb$atom[i,]                    # the complete row for atom i
GNASpdb$atom[i, c("x", "y", "z")]   # here we are selecting with column names!
GNASpdb$xyz[c(i*3-2, i*3-1, i*3)]   # here we are selecting by position: the
                                    # three coordinates of atom i are elements
                                    # 3i-2, 3i-1 and 3i of $xyz

# all atoms of a residue ...
i <- 48                             # i is reused: now a residue number
GNASpdb$atom[GNASpdb$atom[,"resno"] == i, ]   # rows whose "resno" equals i

# sequence of the first ten residues
GNASpdb$seqres[1:10]  # the "A"s here identify chain "A"

# Convert this to one letter code
bio3d::aa321(GNASpdb$seqres[1:10])


# Task 2.1  List the implicit sequence contained in the file. Note: the
#           "explicit" sequence is spelled out in the SEQRES records, and
#           is the one that is stored in the databases to correspond to the
#           PDB file. The "implicit" sequence however is the one that's actually
#           contained in the coordinates. This is not necessarily the same,
#           there may be modifications, or N- or C- termini or loops may
#           be invisible in the coordinates. In general, the explicit sequence
#           is what the crystallographer puts into the experiment, the
#           implicit sequence is how she interprets the resulting electron
#           density map.


# Task 2.2  Do explicit and implicit sequence have the same length?


# Task 2.3 Compare the implicit sequence with the genomic sequence.
#          - Do we need to renumber the PDB file to match the positions
#            of recorded mutations (cf. IntOGen data)?


# Get a list of all CA atoms of arginine residues: build a logical mask over
# the rows of the atom table, then print the identifying columns of the rows
# that match.
sel <- GNASpdb$atom$resid == "ARG" & GNASpdb$atom$elety == "CA"
GNASpdb$atom[sel, c("eleno", "elety", "resid", "chain", "resno", "insert")]

# The introduction to bio3d tutorial at
#   http://thegrantlab.org/bio3d/tutorials/structure-analysis
# has the following example: a line plot of the B-factor column ($atom$b),
# restricted to the atoms selected by GNASpdb$calpha. The pdb object itself is
# passed as "sse" (presumably to annotate secondary structure in the plot
# margins - see ?bio3d::plot.bio3d).
# The plot type is given with its full argument name "type" so that the call
# does not depend on partial argument matching.
bio3d::plot.bio3d(GNASpdb$atom$b[GNASpdb$calpha],
                  sse = GNASpdb,
                  type = "l",
                  ylab = "B-factor")


# =    3  A Ramachandran plot  =================================================

# Task 2.1  Calculate a Ramachandran plot for the structure. Hint: the
#           bio3d::torsion.pdb() function calculates all dihedral angles for
#           backbone and sidechain bonds, NA where the bond does not exist in an
#           amino acid. Assign the values to the variable name "tor".


# As you can see, there are a number of points in the upper-right
# quadrant of the plot. This combination of phi-psi angles defines
# the conformation of a left-handed alpha helix and is generally
# only observed for glycine residues.

# Task 2.2  Replot the data and color the points for glycine residues green.


# As you see, eight residues in the upper-right quadrant are
# not glycine. But what residues are these? Is there an
# error in our script?

# Task 2.3  Identify the outlier residues.


# Task 2.4  Check the residues in your favourite molecular viewer.
#           Is there anything unusual about these residues?


# =    4  Density plots  =======================================================

# Such x, y scatter-plots of data that is sampled from a distribution can tell
# us a lot about what shapes the distribution. The distribution is governed by
# the free energy of the phi-psi landscape in folded proteins, since folded
# proteins generally minimize the free energy of their conformations. We observe
# empirically, from comparing frequency statistics and mutation experiments,
# that this generally follows a Boltzmann distribution, where the free energy
# changes we observe in experiments that change one conformation into another are
# proportional to the log-ratio of the number of times we observe each
# observation in the protein structure database (after correcting for
# observation bias). The proper way to visualize such 2D landscapes is with
# contour plots.


# The best way to plot such data is provided by the function contour():
# (the next line opens its help page)
?contour

# Contour plots are not produced along the sampled values of a data
# set, but on a regular grid. This means, we need to convert observed values
# into estimated densities. Density estimation is an important topic for
# exploratory data analysis, base R has the density() function for 1D
# distributions. But for 2D data like our phi-psi plots, we need a function from
# the MASS package: kde2d()

# Install MASS only if it is not yet available on this machine.
# requireNamespace() expects the package name as a character string; an
# unquoted name is evaluated as a variable and fails with "object not found".
if (! requireNamespace("MASS", quietly=TRUE)) {
    install.packages("MASS")
}
# Package information:
#  library(help = MASS)       # basic information
#  browseVignettes("MASS")    # available vignettes
#  data(package = "MASS")     # available datasets

?MASS::kde2d

# "tor" holds the torsion angles computed in Section 3. Angles that do not
# exist for a residue are NA. How many phi and psi values are missing?
sum(is.na(tor$phi))
sum(is.na(tor$psi))

# Keep only the residues for which both phi and psi are defined, so that
# kde2d() receives complete (phi, psi) pairs.
sel <- !(is.na(tor$phi) | is.na(tor$psi))
phi <- tor$phi[sel]
psi <- tor$psi[sel]


# Estimate the 2D density on a 60 x 60 grid that covers the full range of both
# angles (-180 to 180 degrees).
dPhiPsi <- MASS::kde2d(phi, psi,
                       n = 60,
                       lims = c(-180, 180, -180, 180))

str(dPhiPsi)
# This is a list, with gridpoints in x and y, and the estimated densities in z.

# Generic plot with default parameters
contour(dPhiPsi)


# ===   4.1.1  ... as overlay on a colored grid 

# The colored grid needs the custom color ramp. It is defined here, before its
# first use, so that this section runs when the script is executed from top to
# bottom. (It is the same palette that section 4.1.2 defines and explains.)
myColorRamp <- colorRampPalette(c("#99AACC",
                                  "#3399CC",
                                  "#2266DD",
                                  "#CC00AA"))

# Colored grid of the estimated densities ...
image(dPhiPsi,
      col = myColorRamp(100),
      main = "Ramachandran plot for 6AU6",
      xlab = expression(phi),
      ylab = expression(psi))
# ... overlaid with dashed contour lines, ...
contour(dPhiPsi, col = "royalblue",
        add = TRUE,
        method = "edge",
        nlevels = 10,
        lty = 2)
# ... the observed (phi, psi) pairs, and thin lines through the origin.
points(phi, psi, col = "#00338866", pch = 3, cex = 0.7)
abline(h = 0, lwd = 0.5, col = "#00000044")
abline(v = 0, lwd = 0.5, col = "#00000044")


# ===   4.1.2  ... as filled contour            
#
# using a custom color-ramp

# myColorRamp(n) returns n colors interpolated along four anchor colors.
myColorRamp <- colorRampPalette(
    colors = c("#99AACC", "#3399CC", "#2266DD", "#CC00AA")
)

# Preview the ramp: N bars of equal height, one bar per color.
N <- 10
barplot(height = rep(1, times = N),
        col = myColorRamp(N))


# Filled contour plot of the density estimate, ten levels, custom color ramp.
filled.contour(dPhiPsi,
               main = "Ramachandran plot for 6AU6",
               xlab = expression(phi),
               ylab = expression(psi),
               color.palette = myColorRamp,
               nlevels = 10,
               xlim = c(-180, 180),
               ylim = c(-180, 180))

# Note: we can pass additional plotting and overlay commands to the contour plot
# in a block of expressions passed via the plot.axes parameter. When plot.axes
# is supplied, filled.contour() no longer draws the default axes, so the block
# has to draw them itself: axis(1) and axis(2) restore the tick marks and tick
# labels for phi and psi.

filled.contour(dPhiPsi,
               xlim = c(-180, 180), ylim = c(-180, 180),
               nlevels = 10,
               color.palette = myColorRamp,
               main = "Ramachandran plot for 6AU6",
               xlab = expression(phi),
               ylab = expression(psi),
               plot.axes = {
                   axis(1)
                   axis(2)
                   contour(dPhiPsi, col = "#00000044",
                           add = TRUE,
                           method = "edge",
                           nlevels = 10,
                           lty = 2)
                   points(phi, psi, col = "#00338866", pch = 3, cex = 0.7)
                   abline(h = 0, lwd = 0.5, col = "#00000044")
                   abline(v = 0, lwd = 0.5, col = "#00000044")
               })

# ===   4.1.3  ... as a perspective plot        

# Density surface with the default viewing direction.
persp(x = dPhiPsi,
      zlab = "Density",
      ylab = "psi",
      xlab = "phi")


# The same surface, viewed from different angles and with colored facets.
persp(x = dPhiPsi,
      zlab = "Density",
      ylab = "psi",
      xlab = "phi",
      col = "#99AACC",
      phi = 10,
      theta = 40)


# ===   4.1.4  ... advanced perspective plot    

# Install plot3D only if it is not yet available (same check as for the other
# packages in this file), then attach it: the code below calls persp3D()
# without a package prefix. library() stops with an error if the package still
# cannot be loaded, whereas require() would only return FALSE.
if (! requireNamespace("plot3D", quietly=TRUE)) {
    install.packages("plot3D")
}
library(plot3D)
# Package information:
#  library(help = plot3D)       # basic information
#  browseVignettes("plot3D")    # available vignettes
#  data(package = "plot3D")     # available datasets

# example with custom tickmarks
# cf http://entrenchant.blogspot.ca/2014/03/custom-tick-labels-in-r-perspective.html

# set up axis parameters
#
# phi (x) axis: range and tick positions every 60 degrees
minX <- -180
maxX <-  180
xPos <- seq(from = minX, to = maxX, by = 60)

# psi (y) axis: same range and same tick positions as the phi axis
minY <- minX
maxY <- maxX
yPos <- xPos

# density (z) axis: factor applied to the density values when plotting, and
# the range and tick positions on that scaled axis
zScale <- 500000
minZ <- 0
maxZ <- 150
zPos <- seq(from = minZ, to = maxZ, by = 50)

# draw the plot, save the perspective matrix
#
# The densities are multiplied by zScale before plotting. The built-in axes are
# switched off (axes = FALSE) because axis lines, tick marks and labels are
# drawn by hand further below. The value returned by persp3D() is kept in pMat;
# trans3d() needs it to project 3D positions onto the 2D plot.
pMat <- persp3D(z = dPhiPsi$z * zScale,
                x = dPhiPsi$x,
                y = dPhiPsi$y,
                xlab = "phi",
                ylab = "psi",
                zlab = "density",
                main = "phi/psi plot of 6AU6",
                axes = FALSE,
                theta = 50,
                phi = 15,
                expand = 2,
                facets = TRUE,
                scale = FALSE,
                col = myColorRamp(40),
                shade = 0.2,
                border = "#FFFFFF22",
                clab = "density",
                colkey = list(side = 4, length = 0.6))


# Transformed axes are drawn "by hand" from lines(), segments(), and text()
# given the transformation matrix pMat that is returned by the plot.
#
# draw the axis lines
# Each call projects the tick positions of one axis with trans3d() and connects
# them: phi axis along y = minY, psi axis along x = maxX, density axis at the
# corner (minX, minY).
lines(trans3d(xPos, minY, minZ, pMat) , col="#222255", lwd = 3)
lines(trans3d(maxX, yPos, minZ, pMat) , col="#222255", lwd = 3)
lines(trans3d(minX, minY, zPos, pMat) , col="#222255", lwd = 3)


# draw tick marks
# Every tick is a short segment between a point on the axis and a point that is
# tickLength (5 % of the phi range) away from it. Both end points are projected
# with trans3d() before they are connected.
tickLength <- 0.05 * (maxX - minX)

# phi axis: ticks point towards smaller psi values
tickStart <- trans3d(x = xPos, y = minY,              z = minZ, pmat = pMat)
tickEnd   <- trans3d(x = xPos, y = minY - tickLength, z = minZ, pmat = pMat)
segments(x0 = tickStart$x, y0 = tickStart$y,
         x1 = tickEnd$x,   y1 = tickEnd$y)

# psi axis: ticks point towards larger phi values
tickStart <- trans3d(x = maxX,              y = yPos, z = minZ, pmat = pMat)
tickEnd   <- trans3d(x = maxX + tickLength, y = yPos, z = minZ, pmat = pMat)
segments(x0 = tickStart$x, y0 = tickStart$y,
         x1 = tickEnd$x,   y1 = tickEnd$y)

# density axis: ticks point towards smaller psi values
tickStart <- trans3d(x = minX, y = minY,              z = zPos, pmat = pMat)
tickEnd   <- trans3d(x = minX, y = minY - tickLength, z = zPos, pmat = pMat)
segments(x0 = tickStart$x, y0 = tickStart$y,
         x1 = tickEnd$x,   y1 = tickEnd$y)


# add tick mark labels
# Labels are placed labelOffset (7.5 % of the phi range) away from each axis,
# i.e. a little beyond the ends of the tick marks. Positions are projected with
# trans3d() before text() writes the labels.
labelOffset <- (maxX - minX) * 0.075

# phi axis: labels are taken from xPos and rotated (srt = 270)
labelPos <- trans3d(xPos, (minY - labelOffset), minZ, pMat)
text(labelPos$x, labelPos$y,
     labels = as.character(xPos),
     adj = c(0, NA), srt = 270, cex = 0.6)

# psi axis: labels are taken from yPos, the tick positions of this axis
labelPos <- trans3d((maxX + labelOffset), yPos, minZ, pMat)
text(labelPos$x, labelPos$y,
     labels = as.character(yPos),
     adj = c(0, NA), cex = 0.6)

# density axis: labels are taken from zPos, right-aligned (adj = c(1, NA))
labelPos <- trans3d(minX, (minY - labelOffset), zPos, pMat)
text(labelPos$x, labelPos$y,
     labels = as.character(zPos),
     adj = c(1, NA), cex = 0.6)

# [END]