-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmenthu2.0AccessoryFunctions.R
331 lines (265 loc) · 12 KB
/
menthu2.0AccessoryFunctions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
#' calculateMenthu2
#'
#' Computes the MENTHU 2.0 score for one sequence: the ratio of the best to
#' the second-best pattern score among deletion patterns whose microhomology
#' is at least 3 bp long, subject to a maximum distance between the two
#' microhomology arms of the top-scoring pattern.
#'
#' @param inData A DNA sequence window/context
#' @param cutSite The expected index of the nucleotide to the left of the cut; -1 if cut site is in middle of wildtype sequence
#' @param weight A weight factor for calculating pattern score; default is 20, as in Bae paper
#' @param maxdbm Maximum distance between microhomologies
#'
#' @return A one-row data frame with columns \code{seq}, \code{menthuScore},
#'   \code{frameShift} ("Yes", "No" or the string "NA"), \code{topDel} and
#'   \code{topMH}. When \code{baeRevised} reports no deletion patterns at all,
#'   \code{seq}, \code{topDel} and \code{topMH} are empty strings and
#'   \code{menthuScore} is -1.
calculateMenthu2 <- function(inData, cutSite = -1, weight = 20, maxdbm = 5){
  # If the cut site isn't specified, assume it is in the middle of the input
  if(cutSite == -1){
    cutSite <- floor(nchar(inData) / 2)
  }

  # Get pattern scores for every deletion pattern in the wildtype sequence
  patternScoreDF <- baeRevised(inData, cutSite, weight)

  # No microhomologies at all: return an empty-ish frame
  if(nrow(patternScoreDF) == 0){
    return(data.frame(seq              = "",
                      menthuScore      = -1,
                      frameShift       = "NA",
                      topDel           = "",
                      topMH            = "",
                      stringsAsFactors = FALSE))
  }

  # Sort by pattern score (descending), then keep only microhomologies of 3+ bp
  patternScoreDF <- patternScoreDF[order(-patternScoreDF$patternScore), ]
  threePlus      <- patternScoreDF[nchar(patternScoreDF$microhomology) >= 3, ]

  # Criterion: the top 3+ bp pattern must have at most 'maxdbm' bases between
  # its two microhomology arms (deletion length minus microhomology length).
  # Both values are read from the same row of 'threePlus'; an empty 'threePlus'
  # fails the criterion instead of feeding NA to if().
  critOne <- nrow(threePlus) > 0 &&
    (threePlus$delLength[1] - nchar(threePlus$microhomology[1])) <= maxdbm

  # A deletion whose length is not a multiple of 3 shifts the reading frame
  topFrameShift <- function(){
    if(threePlus$delLength[1] %% 3 == 0) "No" else "Yes"
  }

  if(!critOne){
    # Criterion not met: ratio is zero and the frameshift is reported as "NA"
    ratio  <- 0
    fShift <- "NA"

  } else if(nrow(threePlus) == 1){
    # Only one candidate: nothing competes with it, so the ratio is infinite
    ratio  <- Inf
    fShift <- topFrameShift()

  } else {
    # MENTHU score 2.0: best pattern score over second-best pattern score
    ratio <- threePlus$patternScore[1] / threePlus$patternScore[2]

    if(is.nan(ratio)){
      # 0/0 (both pattern scores are zero): treat as no usable result
      ratio  <- 0
      fShift <- "NA"
    } else {
      fShift <- topFrameShift()
    }
  }

  # Organize output values as a data frame
  outFrame <- data.frame(seq              = inData,
                         menthuScore      = ratio,
                         frameShift       = fShift,
                         topDel           = threePlus$delSeq[1],
                         topMH            = threePlus$microhomology[1],
                         stringsAsFactors = FALSE)

  return(outFrame)
}
#' calculateSlopeCompetition
#'
#' Computes the Bae-style microhomology score and out-of-frame score over all
#' deletion patterns with a microhomology of 2+ bp, and the slope of a linear
#' regression through the pattern scores of the top-ranked 3+ bp patterns.
#'
#' @param inData A DNA sequence/window/context
#' @param cutSite The expected index of the nucleotide to the left of the cut; -1 if cut site is in middle of wildtype sequence
#' @param weight A weight factor for calculating pattern score; default is 20, as in Bae paper
#' @param top The number of entries to consider when generating the linear regression
#'
#' @return A one-row data frame with columns \code{seq},
#'   \code{microhomologyScore}, \code{outOfFrameScore}, \code{slopeMH3Plus},
#'   \code{frameShift} and \code{topDel}. If no deletion patterns are found the
#'   numeric columns are 0, \code{seq}/\code{topDel} are empty strings and
#'   \code{frameShift} is the string "NA". If patterns exist but none has a
#'   3+ bp microhomology, \code{slopeMH3Plus} and \code{topDel} are \code{NA}.
calculateSlopeCompetition <- function(inData, cutSite = -1, weight = 20, top = 10){
  # If the cut site isn't specified, assume it is in the middle of the input
  if(cutSite == -1){
    cutSite <- floor(nchar(inData) / 2)
  }

  # Pattern scores for every deletion pattern with a 2+ bp microhomology.
  # The formal argument of baeRevised is 'mhL'; spell it out rather than
  # relying on partial argument matching.
  patternScoreDF <- baeRevised(inData, cutSite, weight, mhL = 2)

  # No pattern scores: return a placeholder row with the same columns as the
  # populated result so that results can be row-bound together
  if(nrow(patternScoreDF) == 0){
    return(data.frame(seq                = "",
                      microhomologyScore = 0,
                      outOfFrameScore    = 0,
                      slopeMH3Plus       = 0,
                      frameShift         = "NA",
                      topDel             = "",
                      stringsAsFactors   = FALSE))
  }

  patternScoreDF <- patternScoreDF[order(-patternScoreDF$patternScore), ]

  # Frameshift status of the top-scoring deletion pattern
  fShift <- if(patternScoreDF$delLength[1] %% 3 == 0) "No" else "Yes"

  # Bae-style score of a set of patterns:
  # sum((MH length + GC count) * exp(-deletion length / weight) * 100)
  baeScore <- function(df){
    mhUpper <- toupper(df$microhomology)
    sum((nchar(df$microhomology) +
           stringr::str_count(mhUpper, "G") +
           stringr::str_count(mhUpper, "C")) *
          (1 / exp((df$delLength) / weight)) * 100)
  }

  # Microhomology score over all patterns; out-of-frame score is the
  # percentage of it contributed by deletions not divisible by 3
  outOfFrameInst  <- patternScoreDF[patternScoreDF$delLength %% 3 != 0, ]
  mhScore         <- baeScore(patternScoreDF)
  outOfFrameScore <- (baeScore(outOfFrameInst) / mhScore) * 100

  # Restrict to 3 bp or greater microhomologies and take at most 'top' of them
  threePlus <- patternScoreDF[nchar(patternScoreDF$microhomology) >= 3, ]
  end3      <- min(nrow(threePlus), top)

  if(end3 > 0){
    topThreePlus <- threePlus[seq_len(end3), ]

    # Linear regression of pattern score against rank; the slope is NA when
    # only a single pattern is available
    regData   <- data.frame(patternScore = topThreePlus$patternScore,
                            rank         = seq_len(end3))
    linModel3 <- lm(patternScore ~ rank, data = regData)
    slope     <- unname(linModel3$coefficients[2])

    # Deletion context of the top-ranked 3+ bp pattern
    topDel    <- topThreePlus$delSeq[1]
  } else {
    # Only 2 bp microhomologies: there is nothing to regress on
    slope  <- NA_real_
    topDel <- NA_character_
  }

  tempFrame <- data.frame(seq                = inData,
                          microhomologyScore = mhScore,
                          outOfFrameScore    = outOfFrameScore,
                          slopeMH3Plus       = slope,
                          frameShift         = fShift,
                          topDel             = topDel,
                          stringsAsFactors   = FALSE)

  rownames(tempFrame) <- NULL
  return(tempFrame)
}
#' baeRevised
#'
#' Enumerates microhomology-mediated deletion patterns around a cut site and
#' scores each one as
#' \code{100 * round(exp(-delLength / weight), 3) * (MH length + GC count)}.
#' Patterns that yield the same post-deletion sequence as a pattern with a
#' longer (or, at equal length, better-scoring) microhomology are dropped.
#'
#' @param seq DNA sequence
#' @param cutPosition Index of the nucleotide immediately to the left of the
#'   cut; if missing or -1 the sequence is split in the middle
#' @param weight deletion length weight factor
#' @param mhL minimum microhomology length to consider
#'
#' @return A data frame, sorted by descending pattern score, with columns
#'   \code{seq}, \code{microhomology}, \code{delLength}, \code{patternScore}
#'   and \code{delSeq} (the sequence with deleted bases shown as '-'). It has
#'   zero rows when no microhomology is found.
#' @export
baeRevised <- function(seq, cutPosition, weight = 20.0, mhL = 3){
  # Zero-row result with the same columns as a populated result
  emptyFrame <- data.frame(seq              = character(),
                           microhomology    = character(),
                           delLength        = numeric(),
                           patternScore     = numeric(),
                           delSeq           = character(),
                           stringsAsFactors = FALSE)

  seqLen <- nchar(seq)

  # Fall back to the midpoint when no cut position is given (-1 is the
  # sentinel used by the calculate* functions in this file)
  if(missing(cutPosition) || cutPosition == -1){
    cutPosition <- floor(seqLen / 2)
  }

  # Split the sequence into upstream and downstream of the cut site
  upstream   <- substring(seq, 1, cutPosition)
  downstream <- substring(seq, cutPosition + 1, seqLen)
  upLen      <- nchar(upstream)

  # All substrings shared by the two halves; discard anything shorter than the
  # requested minimum so empty or too-short matches never reach the matcher
  commonSubstrings <- allsubstr(upstream, downstream, mhL)
  commonSubstrings <- commonSubstrings[nchar(commonSubstrings) >= mhL]

  if(length(commonSubstrings) == 0){
    return(emptyFrame)
  }

  # Locate every occurrence of each common substring in both halves.
  # stack() turns the named list into columns 'values' (match start) and
  # 'ind' (the substring).
  matchLocsUp       <- stack(sapply(commonSubstrings, Biostrings::gregexpr2, text = upstream))
  matchLocsUp$ind   <- as.character(matchLocsUp$ind)
  matchLocsDown     <- stack(sapply(commonSubstrings, Biostrings::gregexpr2, text = downstream))
  matchLocsDown$ind <- as.character(matchLocsDown$ind)

  # Upstream coordinates are already relative to 'seq'
  mhDFup <- data.frame(seq              = matchLocsUp$ind,
                       upStart          = matchLocsUp$values,
                       upEnd            = matchLocsUp$values + nchar(matchLocsUp$ind) - 1,
                       stringsAsFactors = FALSE)

  # Downstream coordinates are shifted by the upstream length. Note that
  # downEnd is the position just AFTER the downstream microhomology copy,
  # i.e. the first base retained to the right of the deletion.
  mhDFdown <- data.frame(seq              = matchLocsDown$ind,
                         downStart        = matchLocsDown$values + upLen,
                         downEnd          = matchLocsDown$values + nchar(matchLocsDown$ind) + upLen,
                         stringsAsFactors = FALSE)

  # Order both frames by microhomology length, then alphabetically
  mhDFup   <- mhDFup[order(nchar(mhDFup$seq), mhDFup$seq), ]
  mhDFdown <- mhDFdown[order(nchar(mhDFdown$seq), mhDFdown$seq), ]

  # Pair every upstream occurrence with every downstream occurrence of the
  # same microhomology (join on the shared 'seq' column)
  mhDF <- suppressMessages(plyr::join(mhDFup, mhDFdown))

  mh   <- mhDF$seq
  # Number of bases removed: everything between upEnd and downEnd (exclusive)
  leng <- mhDF$downEnd - mhDF$upEnd - 1

  # Sequence context with the deleted bases shown as '-'
  delSeqs <- paste0(substring(seq, 1, mhDF$upEnd),
                    strrep("-", leng),
                    substring(seq, mhDF$downEnd, seqLen))

  # Sequence that would actually be observed after the deletion
  delPattern <- gsub('-', '', delSeqs, fixed = TRUE)

  patternScore <- 100 * round(1 / exp(leng / weight), 3) *
    (stringr::str_count(mh, 'G') + stringr::str_count(mh, 'C') + nchar(mh))

  delFrame <- data.frame(seq              = rep(seq, length(delSeqs)),
                         microhomology    = mh,
                         delLength        = leng,
                         patternScore     = patternScore,
                         delSeq           = delSeqs,
                         delPattern       = delPattern,
                         stringsAsFactors = FALSE)

  # Order by microhomology length (descending), then pattern score
  # (descending), so that for each resulting sequence the longest,
  # best-scoring microhomology comes first
  delFrame <- delFrame[order(-nchar(delFrame$microhomology), -delFrame$patternScore), ]

  # Drop rows whose deletion product was already produced by an earlier row.
  # A logical mask is used on purpose: a negative index built from an empty
  # which() result would select zero rows and discard everything.
  delFrame <- delFrame[!duplicated(delFrame$delPattern), ]

  # Final ordering by pattern score (descending)
  delFrame <- delFrame[order(-delFrame$patternScore), ]

  # Create the deletion/pattern score data frame
  psDF <- data.frame(seq              = delFrame$seq,
                     microhomology    = delFrame$microhomology,
                     delLength        = delFrame$delLength,
                     patternScore     = delFrame$patternScore,
                     delSeq           = delFrame$delSeq,
                     stringsAsFactors = FALSE)

  return(psDF)
}
#' allsubstr
#'
#' Finds every substring of length \code{mh} or more that occurs in both the
#' upstream and the downstream section.
#'
#' @param upstream upstream section of DNA sequence
#' @param downstream downstream section of DNA sequence
#' @param mh minimum substring length to consider
#'
#' @return A character vector of the distinct common substrings, ordered by
#'   length and then by first position in \code{upstream}; \code{character(0)}
#'   if either section is shorter than \code{mh} or nothing is shared.
#' @export
allsubstr <- function(upstream, downstream, mh = 3){
  # A common substring cannot be longer than the shorter of the two sections
  maxLen <- min(nchar(upstream), nchar(downstream))

  # Guard: without this, mh:maxLen would count DOWN and yield substrings
  # shorter than the requested minimum
  if(is.na(maxLen) || maxLen < mh){
    return(character(0))
  }

  # All distinct substrings of exactly 'len' characters, in order of position
  substringsOfLength <- function(x, len){
    starts <- seq_len(nchar(x) - len + 1)
    unique(substring(x, starts, starts + len - 1))
  }

  lens <- seq(mh, maxLen)
  upstreamStrings   <- unlist(lapply(lens, substringsOfLength, x = upstream))
  downstreamStrings <- unlist(lapply(lens, substringsOfLength, x = downstream))

  # Find the intersection (common strings) between the upstream and downstream section
  commonStrings <- intersect(upstreamStrings, downstreamStrings)
  return(commonStrings)
}