-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathBIN-Sequence.R
394 lines (291 loc) · 13.9 KB
/
BIN-Sequence.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
# tocID <- "BIN-Sequence.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-Sequence unit.
#
# Version: 1.5
#
# Date: 2017-09 - 2020-09
# Author: Boris Steipe ([email protected])
#
# Versions:
# 1.5 2020 Updates
# 1.4 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use Biocmanager:: not biocLite()
# 1.3 Update set.seed() usage
# 1.2 Removed irrelevant task. How did that even get in there? smh
# 1.1 Add chartr()
# 1.0 First live version 2017.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------------
#TOC> 1 Prepare 63
#TOC> 2 Storing Sequence 80
#TOC> 3 String properties 109
#TOC> 4 Substrings 116
#TOC> 5 Creating strings: sprintf() 137
#TOC> 6 Changing strings 172
#TOC> 6.1.1 Changing case 174
#TOC> 6.1.2 Reverse 179
#TOC> 6.1.3 Change characters 183
#TOC> 6.1.4 Substitute characters 211
#TOC> 6.2 stringi and stringr 231
#TOC> 6.3 dbSanitizeSequence() 241
#TOC> 7 Permuting and sampling 253
#TOC> 7.1 Permutations 260
#TOC> 7.2 Sampling 306
#TOC> 7.2.1 Equiprobable characters 308
#TOC> 7.2.2 Defined probability vector 350
#TOC>
#TOC> ==========================================================================
# = 1 Prepare =============================================================
# Much basic sequence handling is supported by the Bioconductor package
# Biostrings.
if (! requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
if (! requireNamespace("Biostrings", quietly = TRUE)) {
BiocManager::install("Biostrings")
}
# Package information:
# library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets
# = 2 Storing Sequence ====================================================
# Sequences can be represented and stored as vectors of single characters ...
(v <- c("D", "I", "V", "M", "T", "Q"))
# ... as strings ...
(s <- "DIVMTQ")
# ... or as more complex objects with rich metadata e.g. as a Biostrings
# DNAstring, RNAstring, AAString, etc.
(a <- Biostrings::AAString("DIVMTQ"))
# ... and all of these representations can be interconverted:
# string to vector ...
unlist(strsplit(s, ""))
# vector to string ...
paste(v, sep = "", collapse = "")
# ... and AAstring to plain string.
as.character(a)
# Since operations with character vectors trivially follow all other vector
# conventions and syntax, and we will look at Biostrings methods in more
# detail in a later unit, we will focus on basic strings in the following.
# = 3 String properties ===================================================
length(s) # why ???
nchar(s) # Aha!
# = 4 Substrings ==========================================================
# Use the substr() function
substr(s, 2, 4)
# or the similar substring()
substring(s, 2, 4)
# Note: both functions are vectorized (i.e. they operate on vectors
# of arguments, you don't need to loop over input)...
myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA")
substr( myBiCodes, 1, 3)
substring(myBiCodes, 1, 3)
# ... however only substring() will also use vectors for start and stop
s <- "gatattgtgatgacccagtaa" # a DNA sequence
(vI <- seq(1, nchar(s), by = 3)) # an index vector
substr( s, vI, vI+2) # ... returns only the first nucleotide triplet
substring(s, vI, vI+2) # ... returns all triplets
# = 5 Creating strings: sprintf() =========================================
# Sprintf is a very smart, very powerful function and has cognates in all
# other programming languages. It has a bit of a learning curve, but this is
# totally worth it:
# the function takes a format string, and a list of other arguments. It returns
# a formatted string. Here are some examples - watch carefully for sprintf()
# calls elsewhere in the code.
sprintf("Just a string.")
sprintf("A string and the number %d.", 5)
sprintf("More numbers: %d ate %d.", 7, 9) # Sorry
sprintf("Pi is ~ %1.2f ...", pi)
sprintf("or more accurately ~ %1.11f.", pi)
x <- "bottles of beer"
N <- 99
sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.",
N, x, N, x, "one down, and pass it around", N - 1, x)
# Note that in the last example, the value of the string was displayed with
# R's usual print-formatting function and therefore the line-break "\n" did
# not actually break the line. To have line breaks, tabs etc, you need to use
# cat() to display the string:
for (i in N:(N-4)) {
cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n",
i, x, i, x, "one down, and pass it around", i - 1, x))
}
# sprintf() is vectorized: if one of its parameters is a vector, it
# will generate one output string for each of the vector's elements:
cat(sprintf("\n%s fish", c("one", "two", "red", "blue")))
# = 6 Changing strings ====================================================
# === 6.1.1 Changing case
tolower(s)
toupper(tolower(s))
# === 6.1.2 Reverse
# (This used to work in Biostrings, apparently it doesn't work anymore. Why?)
# Biostrings::str_rev(s)
# The following works, of course, but awkward:
s
paste0(rev(unlist(strsplit(s, ""))), collapse = "")
# reverse complement
COMP <- c("t", "g", "c", "a")
names(COMP) <- c("a", "c", "g", "t") # mapping the complement via names
s
paste0(COMP[rev(unlist(strsplit(s, "")))], collapse = "")
# === 6.1.3 Change characters
# chartr(old, new, x) maps all characters in x that appear in "old" to the
# correpsonding character in "new." Kind of like the COMP vector above ...
chartr("aeio", "uuuu", "We hold these truths to be self-evident ...")
# One could implement toupper() and tolower() with this - remember that R has
# character vectors of uppercase and lowercase letters as language constants.
chartr(paste0(letters, collapse = ""),
paste0(LETTERS, collapse = ""),
"Twinkle, twinkle little star, how I wonder what you are.")
# One amusing way to use the function is for a reversible substitution
# cypher.
alBet <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,;:?0123456789"
set.seed(112358) # set RNG seed for repeatable randomness
( myCypher <- paste0(sample(unlist(strsplit(alBet, ""))), collapse = "") )
set.seed(NULL) # reset the RNG
# encode ...
(x <- chartr(alBet, myCypher, "... seven for a secret, never to be told."))
# decode ...
chartr(myCypher, alBet, x)
# (Nb. substitution cyphers are easy to crack!)
# === 6.1.4 Substitute characters
# gsub can change lengths.
# Example: implementing the binary Fibonacci sequence:
# 0 -> 1; 1 -> 10 , in three nested gsub() statements
( s <- 1 )
( s <- gsub("2", "10", gsub("0", "1", gsub("1", "2", s))) )
# Iterate this line a few times ...
#
# cf. http://www.maths.surrey.ac.uk/hosted-sites/R.Knott/Fibonacci/fibrab.html
# for the features of the sequence.
# I use gsub() often to delete unwanted characters ...
# ... select something, and substitute the empty string for it.
(s <- gsub("-", "", s))
# For example: clean up a sequence
# copy/paste from UniProt
(s <- " 10 20 30 40 50
MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
# remove numbers
(s <- gsub("[0-9]", "", s))
# remove "whitespace" (spaces, tabs, line breaks)...
(s <- gsub("\\s", "", s))
# == 6.2 stringi and stringr ===============================================
# But there are also specialized functions eg. to remove leading/trailing
# whitespace which may be important to sanitize user input etc. Have a look at
# the function descriptions for the stringr and the stringi package. stringr is
# part of the tidyverse, and for the most part a wrapper for stringi functions.
# https://github.com/tidyverse/stringr
# == 6.3 dbSanitizeSequence() ==============================================
# In our learning units, we use a function dbSanitizeSequence() to clean up
# sequences that may be copy/pasted from Web-sources
cat( s <- ">FASTA header will be removed
10 20 30 40 50
MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR " )
dbSanitizeSequence(s)
# = 7 Permuting and sampling ==============================================
# An important aspect of working with strings is generating random strings
# with given statistical properties: reference items to evaluate significance.
# == 7.1 Permutations ======================================================
# One way to produce such reference items is to permute a string. A permuted
# string has the same composition as the original, but all positional
# information is lost. The sample() function can be used to permute:
# This is the sequence of the ompA secretion signal
(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
(x <- sample(s, length(s))) # permuted
# Here's a small example how such permuted strings may be useful. As you look
# at the ompA sequence, you suspect that the two lysines near the +-charged
# N-terminus may not be accidental, but selected for a positively charged
# N-terminus. What is the chance that such a sequence has two lysines close to
# the N-terminus simply by chance? Or put differently: what is the average
# distance of two lysines in such a sequence to the N-terminus. First, we
# need an expression that measures the distance. A simple use of the which()
# function will do just fine.
which(s == "K") # shows they are in position 2 and 3, so ...
mean(which(s == "K")) # ... gives us the average, and ...
mean(which(x == "K")) # ... gives us the average of the permuted sequence.
# So what does the distribution look like? Lets do 10,000 trials.
(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
N <- 10000
d <- numeric(N)
set.seed(112358) # set RNG seed for repeatable randomness
for (i in 1:N) {
d[i] <- mean(which(sample(s, length(s)) == "K"))
}
set.seed(NULL) # reset the RNG
hist(d, breaks = 20)
abline(v = 2.5, lwd = 2, col = "firebrick")
sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
# N-terminus or more. That's just below the signifcance
# threshold of 5 %. It's a trend, but to be sure we are looking
# at a biological effect we would need to see more
# sequences.
# == 7.2 Sampling ==========================================================
# === 7.2.1 Equiprobable characters
# Assume you need a large random-nucleotide string for some statistical model.
# How to create such a string? sample() can easily create it:
nuc <- c("A", "C", "G", "T")
N <- 100
set.seed(16818) # set RNG seed for repeatable randomness
v <- sample(nuc, N, replace = TRUE)
set.seed(NULL) # reset the RNG
(mySeq <- paste(v, collapse = ""))
# What's the GC content?
table(v)
sum(table(v)[c("G", "C")]) # 51 is close to expected
# What's the number of CpG motifs? Easy to check with the stringi
# stri_match_all() function
if (! requireNamespace("stringi", quietly = TRUE)) {
install.packages("stringi")
}
# Package information:
# library(help = stringi) # basic information
# browseVignettes("stringi") # available vignettes
# data(package = "stringi") # available datasets
(x <- stringi::stri_match_all(mySeq, regex = "CG"))
length(unlist(x))
# Now you could compare that number with yeast DNA sequences, and determine
# whether there are more or less CpG motifs than expected by chance.
# (cf. https://en.wikipedia.org/wiki/CpG_site)
# But hold on: is that a fair comparison? sample() gives us all four nucleotides
# with the same probability. But the yeast genomic DNA GC content is only
# 38%. So you would expect fewer CpG motifs based on the statistical properties
# of the smaller number of Cs and Gs - before biology even comes into play. How
# do we account for that?
# === 7.2.2 Defined probability vector
# This is where we need to know how to create samples with specific probability
# distributions. A crude hack would be to create a sampling source vector with
# 19 C, 19 G, 31 A and 31 T
c(rep("C", 19), rep("G", 19), rep(c("A"), 31), rep(c("T"), 31))
# ... but that doesn't scale if the numeric accuracy needs to be higher.
#
# However sample() has an argument that takes care of that: you can explicitly
# specify the probabilities with which each element of the the sampling vector
# should be chosen:
nuc <- c("A", "C", "G", "T")
N <- 100
myProb <- c(0.31, 0.19, 0.19, 0.31) # sampling probabilities
set.seed(16818) # set RNG seed for repeatable randomness
v <- sample(nuc, N, prob = myProb, replace = TRUE)
set.seed(NULL) # reset the RNG
(mySeq <- paste(v, collapse = ""))
# What's the GC content?
table(v)
sum(table(v)[c("G", "C")]) # Close to expected
# What's the number of CpG motifs?
(x <- stringi::stri_match_all(mySeq, regex = "CG"))
# ... not a single one in this case.
# [END]