-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathBIN-PPI-Analysis.R
323 lines (239 loc) · 11.7 KB
/
BIN-PPI-Analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
# tocID <- "BIN-PPI-Analysis.R"
#
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PPI-Analysis unit.
#
# Version: 1.4
#
# Date: 2017-08 - 2020-10
# Author: Boris Steipe ([email protected])
#
# Versions:
# 1.4 Update vector ID's for betweenness centrality.
# 1.3 Bugfix: called the wrong function on ENSPsel in l. 220
# 1.2 2020 Updates; Rewrite for new STRING V11;
# Deprecate save()/load() for saveRDS()/readRDS()
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use BiocManager:: not biocLite()
# 1.0 First live version
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------------------
#TOC> 1 Setup and data 50
#TOC> 2 Functional Edges in the Human Proteome 86
#TOC> 2.1 Cliques 129
#TOC> 2.2 Communities 170
#TOC> 2.3 Betweenness Centrality 184
#TOC> 3 biomaRt 231
#TOC> 4 Task for submission 302
#TOC>
#TOC> ==========================================================================
# = 1 Setup and data ======================================================
# The analysis of PPI networks relies on the igraph package. Install it first
# if it cannot be found on this machine; nothing is attached to the search
# path, all later calls use the igraph:: prefix.
if (!requireNamespace("igraph", quietly = TRUE)) {
  install.packages("igraph")
}
# Package information:
#  library(help = igraph)       # basic information
#  browseVignettes("igraph")    # available vignettes
#  data(package = "igraph")     # available datasets
# In order for you to explore some real, biological networks, I give you a
# dataframe of functional relationships of human proteins that I have downloaded
# from the STRING database. The full table has 8.5 million records; here is a
# subset of records with high combined confidence scores.
# (NOTE: two different cutoffs have been quoted for this subset, > 980 and
# > 964 - inspect the score values of the loaded table to see which applies.)
# The selected set of edges is a dataframe with about
# 50,000 edges and 8,400 unique proteins. Incidentally, that's about the size of
# a fungal proteome. You can load the saved dataframe here (To read more about
# what the scores mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
# Read the saved edge table from the data directory and inspect its first rows.
STRINGedges <- readRDS("./data/STRINGedges.rds")
head(STRINGedges)

# STRING prepends the taxonomy ID of Homo sapiens - 9606 - and a dot to the
# Ensembl protein identifiers (the ones that start with ENSP). This helper
# removes that prefix from the start of every ID it is given.
dropTaxPrefix <- function(ids) {
  gsub("^9606\\.", "", ids)
}

# Clean up both endpoint columns of the edge table, then look again.
STRINGedges$a <- dropTaxPrefix(STRINGedges$a)
STRINGedges$b <- dropTaxPrefix(STRINGedges$b)
head(STRINGedges)
# = 2 Functional Edges in the Human Proteome ==============================
# There are many possibilities to explore interesting aspects of biological
# networks, we will keep with some very simple procedures here but you have
# to be aware that this is barely scratching the surface of possibilities.
# However, once the network exists in your computer, it is comparatively
# easy to find information online about the many, many options to analyze.
# Turn the edge table into an undirected igraph graph object. Have a look at
# the help page first to see how the dataframe is interpreted.
help("graph_from_data_frame", package = "igraph")
gSTR <- igraph::graph_from_data_frame(d = STRINGedges, directed = FALSE)
# CAUTION you DON'T want to plot a graph with 8,000 nodes and 50,000 edges -
# layout of such large graphs is possible, but requires specialized code. Google
# for <layout large graphs> if you are curious. Also, consider what one can
# really learn from plotting such a graph ...
# Of course simple computations on this graph are reasonably fast:
compSTR <- igraph::components(gSTR)
summary(compSTR)   # a single component: our graph is connected!

# Histogram of the (natural) log of all node degrees
hist(log(igraph::degree(gSTR)), col = "#FEE0AF")
# this actually does look rather scale-free

# Tabulate the degrees: the names of freqRank are the degree values that occur
# in the graph, its entries count how many nodes have that degree.
(freqRank <- table(igraph::degree(gSTR)))

# Log-log plot of frequency against degree. The x-values are the degrees taken
# from names(freqRank) (plus one), so the axis is labelled as degree.
plot(log10(as.numeric(names(freqRank)) + 1),
     log10(as.numeric(freqRank)), type = "b",
     pch = 21, bg = "#FEE0AF",
     xlab = "log10(degree + 1)", ylab = "log10(frequency)",
     main = "8,400 nodes from the human functional interaction network")
# This looks very scale-free indeed.

# Fit a straight line to the log-log data and add it to the plot.
(regressionLine <- lm(log10(as.numeric(freqRank)) ~
                        log10(as.numeric(names(freqRank)) + 1)))
abline(regressionLine, col = "firebrick")
# Now explore some more:
# == 2.1 Cliques ===========================================================
# Let's find the largest cliques. Remember: a clique is a fully connected
# subgraph, i.e. a subgraph in which every node is connected to every other.
# Biological complexes often appear as cliques in interaction graphs.
# Size of the largest clique in the graph
igraph::clique_num(gSTR)
# The largest clique has 81 members.

# largest_cliques() returns a list of vertex sets; we take the first one.
(C <- igraph::largest_cliques(gSTR)[[1]])
# Pick one of the proteins and find out what this fully connected cluster of 81
# proteins is (you can simply Google for any of the IDs). Is this expected?

# Plot this ...
R <- igraph::induced_subgraph(gSTR, C)  # a graph from a selected set of vertices

# color the vertices along a color spectrum
vCol <- rainbow(igraph::gorder(R))  # "order" of a graph == number of nodes

# Color every edge like the first of its two endpoints. ends() returns one row
# per edge holding the numeric IDs of the two vertices it joins, so eCol gets
# exactly one color per edge of R, in edge order. (Repeating each vertex color
# gorder(R) times does not line up with the edge list: the number of edges in
# a clique is not the square of the number of nodes.)
edgeEnds <- igraph::ends(R, igraph::E(R), names = FALSE)
eCol <- vCol[edgeEnds[, 1]]

oPar <- par(mar = rep(0, 4))  # Turn margins off, remember the old settings
plot(R,
     layout = igraph::layout_in_circle(R),
     vertex.size = 3,
     vertex.color = vCol,
     edge.color = eCol,
     edge.width = 0.1,
     vertex.label = NA)
par(oPar)  # restore the previous graphics parameters
# ... well: remember: a clique means every node is connected to every other
# node. Among 81 nodes there are 81 * 80 / 2 = 3,240 distinct pairs, and every
# single one of them is joined by an edge. This is what a matrix model of PPI
# networks looks like for large complexes.
# == 2.2 Communities =======================================================
# Partition the graph into communities with igraph's infomap clustering. The
# seed is fixed immediately before the call so that the result can be repeated,
# and the RNG is re-initialized right afterwards so that later code is not
# affected by the fixed seed.
set.seed(112358) # set RNG seed for repeatable randomness
gSTRclusters <- igraph::cluster_infomap(gSTR)
set.seed(NULL) # reset the RNG
igraph::modularity(gSTRclusters) # ... measures how separated the different
# membership types are from each other
# Count the members of each community: one table entry per community.
tMem <- table(igraph::membership(gSTRclusters))
length(tMem) # About 700 communities identified
hist(tMem, breaks = 50, col = "skyblue") # most clusters are small ...
range(tMem) # ... but one has > 200 members
# == 2.3 Betweenness Centrality ============================================
# Let's find the nodes with the ten highest betweenness centralities.
#
BC <- igraph::centr_betw(gSTR)

# remember: BC$res contains the results
head(BC$res)

BC$res[1]   # betweenness centrality of node 1 in the graph ...
# ... which one is node 1?
igraph::V(gSTR)[1]

# to get the ten-highest nodes, we simply label the elements of BC$res with
# their index ... (seq_along() instead of 1:length(), which would produce the
# labels "1" and "0" for an empty result)
names(BC$res) <- as.character(seq_along(BC$res))

# ... and then we sort:
sBC <- sort(BC$res, decreasing = TRUE)
head(sBC)

# This ordered vector means: node 3 has the highest betweenness centrality,
# node 721 has the second highest, etc.
# head(x, 10) returns at most ten labels, so a graph with fewer than ten nodes
# does not produce NA indices here.
(BCsel <- as.numeric(head(names(sBC), 10)))

# We can use the first ten labels to subset the nodes in gSTR and fetch the
# IDs...
(ENSPsel <- names(igraph::V(gSTR)[BCsel]))
# Task:
# =====
# IMPORTANT, IF YOU INTEND TO SUBMIT YOUR ANALYSIS FOR CREDIT
# We are going to use these IDs to produce some output for a submitted task:
# therefore I need you to execute the following line, note the "seal" that this
# returns, and not change myENSPsel later:
# NOTE(review): selectENSP() is not defined in this file - presumably it is
# provided by the course's utility scripts; confirm it is loaded before running.
myENSPsel <- selectENSP(ENSPsel)
# Next, to find what these proteins are...
# We could now Google for all of these IDs to learn more about them. But really,
# googling for IDs one after the other, that would be lame. Let's instead use
# the very, very useful biomaRt package to translate these Ensembl IDs into
# gene symbols.
# = 3 biomaRt =============================================================
# IDs are just labels, but for _bio_informatics we need to learn more about the
# biological function of the genes or proteins that we retrieve via graph data
# mining. biomaRt is the tool of choice. It's a package distributed by the
# bioconductor project. This here is not a biomaRt tutorial (that's for another
# day), simply a few lines of sample code to get you started on the specific use
# case of retrieving descriptions for ensembl protein IDs.
# biomaRt is installed through BiocManager, so make sure that BiocManager
# itself is available first ...
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
# ... and then install biomaRt if it cannot be found.
if (!requireNamespace("biomaRt", quietly = TRUE)) {
  BiocManager::install("biomaRt")
}
# Package information:
# library(help = biomaRt) # basic information
# browseVignettes("biomaRt") # available vignettes
# data(package = "biomaRt") # available datasets
# Select the mart and the dataset to query ... this takes a while for download
myMart <- biomaRt::useMart(biomart = "ensembl",
                           dataset = "hsapiens_gene_ensembl")

# Which filters does this mart define?
(filters <- biomaRt::listFilters(myMart))

# And which attributes does it list?
(attributes <- biomaRt::listAttributes(myMart))

# Soooo many options - narrow them down by searching the description column.
# First: filters whose description mentions ENSP IDs ...
filters[grepl("ENSP", filters$description), ]

# ... then the attributes that describe gene symbols and descriptions.
attributes[grepl("symbol", attributes$description, ignore.case = TRUE), ]
attributes[grepl("description", attributes$description, ignore.case = TRUE), ]
# Putting this together - a syntax example: request four annotation columns
# for a single ID, using the peptide-ID filter on the mart selected above.
biomaRt::getBM(attributes = c("hgnc_symbol",
                              "wikigene_description",
                              "interpro_description",
                              "phenotype_description"),
               filters = "ensembl_peptide_id",
               values = "ENSP00000000442",
               mart = myMart)
# Fetch the same four annotation columns for each of our ten most central
# genes from the human subset of STRING. We can't know in advance how many
# rows a query returns, so every result dataframe is stored in a list, under
# the ID it was retrieved for.
CPdefs <- list()
for (ID in myENSPsel) {
  CPdefs[[ID]] <- biomaRt::getBM(attributes = c("hgnc_symbol",
                                                "wikigene_description",
                                                "interpro_description",
                                                "phenotype_description"),
                                 filters = "ensembl_peptide_id",
                                 values = ID,
                                 mart = myMart)
}
# So what are the proteins with the ten highest betweenness centralities?
# ... are you surprised? (I am! Really.)
# = 4 Task for submission =================================================
# Write a loop that will go through your personalized list of Ensembl IDs and
# for each ID:
# -- print the ID,
# -- print the first row's HGNC symbol,
# -- print the first row's wikigene description.
# -- print the first row's phenotype.
#
# Write your thoughts about this group of genes.
#
# (Hint, you can structure your loop in the same way as the loop that
# created CPdefs. )
# Submit the "seal" for your ENSP vector, the ENSP vector itself, the R code
# for this loop and its output into your report if you are submitting
# anything for credit for this unit. Please read the requirements carefully.
# [END]