forked from esherm/intSiteCaller
-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathdebugTrim.R
182 lines (150 loc) · 5.74 KB
/
debugTrim.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# This source code file is a component of the larger INSPIIRED genomic analysis software package.
# Copyright (C) 2016 Frederic Bushman
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
## this is debugging for:
## run: run20150609
## replicate: GTSP0440-1
##
## problem was that
## replicates of GTSP0440 have 165819 reads but only 178
## had all primer, ltrbit, linker.
## see https://microb215.med.upenn.edu/Download/data/share/GTSPReports/bushmanlab/html/run20150609.stat.html
##
## debugging data was located
## microb244:~/run20150609/
##
## The following debugging info seems to suggest that
## the code performed as expected. Sample GTSP0440 might
## have some problem.
## Load the pipeline code: codeDir.RData stores the directory that holds
## intSiteLogic.R, which is sourced to bring the pipeline functions into scope.
codeDir <- get(load("codeDir.RData"))
source(file.path(codeDir, "intSiteLogic.R"))

## here goes the id for GTSP0440-1
## (row index into the table stored in completeMetadata.RData)
sampleID <- 29
completeMetadata <- get(load("completeMetadata.RData"))[sampleID, ]
print(t(as.data.frame(completeMetadata)), quote = FALSE)

## Work from inside the directory named after the sample alias.
alias <- completeMetadata$alias
workingDir <- alias
setwd(workingDir)

## One-row data frame that collects read counts as the script proceeds.
stats.bore <- data.frame(sample = alias)

## Unpack the per-sample settings into the variable names used below.
qualityThreshold <- completeMetadata$qualityThreshold
badQuality <- completeMetadata$badQualityBases
qualityWindow <- completeMetadata$qualitySlidingWindow
primer <- completeMetadata$primer
ltrbit <- completeMetadata$ltrBit
largeLTRFrag <- completeMetadata$largeLTRFrag
linker <- completeMetadata$linkerSequence
linker_common <- completeMetadata$linkerCommon
mingDNA <- completeMetadata$mingDNA
vectorSeq <- completeMetadata$vectorSeq

## Drop the first "GTSPRun/" from each FASTQ path, then make sure both files
## can be found from the current working directory before going further.
## NOTE(review): presumably the metadata paths were recorded relative to a
## different root than this debugging copy -- confirm.
read1 <- sub("GTSPRun/", "", completeMetadata$read1)
read2 <- sub("GTSPRun/", "", completeMetadata$read2)
stopifnot(all(file.exists(read1, read2)))
## inside the function
## Read both FASTQ files. The inner sapply() runs readFastq() over each path,
## so reads[[1]] holds what was read for read1 and reads[[2]] for read2.
reads <- lapply(list(read1, read2), sapply, readFastq)

## Number of reads in the read1 file(s), recorded as the "barcoded" count.
stats.bore$barcoded <- sum(vapply(reads[[1]], length, integer(1)))

## For each read file: drop reads containing N, then return the sequences and
## their quality strings, both named "<alias>%<second '-'-separated field of
## the read id>" (the read id is first cut at its last space).
r <- lapply(reads, function(x) {
  seqs <- x[[1]]

  ## Quality trimming was tried and then disabled; kept here for reference.
  ##remove anything after 5 bases under Q30 in 10bp window
  ##trimmed <- trimTailw(seqs, badQuality, qualityThreshold,
  ##                     round(qualityWindow/2))
  ## this step is not necessary at all
  ## trim if 5 bases are below '0'(fred score 15) in a window of 10 bases
  ## trimmed <- trimTailw(seqs, 5, '+', 5)
  ## trimmed <- trimTailw(seqs, 5, '#', 5)
  ##trimmed <- trimmed[width(trimmed) > 65]

  ## this step is necessary because many shortreads functions work on ACGT only
  trimmed <- seqs[!grepl("N", sread(seqs))]

  ## Always derive the outputs from `trimmed`, even when it is empty.
  ## Previously they were only assigned inside nested if-blocks, so an input
  ## with no reads (or only N-containing reads) failed with
  ## "object 'trimmedSeqs' not found" at the list() call below.
  trimmedSeqs <- sread(trimmed)
  trimmedqSeqs <- quality(quality(trimmed))

  if (length(trimmed) > 0) {
    readNames <- vapply(
      sub("(.+) .+", "\\1", ShortRead::id(trimmed)),
      function(z) paste0(alias, "%", strsplit(z, "-")[[1]][2]),
      character(1)
    )
    names(trimmedSeqs) <- readNames
    names(trimmedqSeqs) <- readNames
  }

  list(trimmedSeqs, trimmedqSeqs)
})

## Split the per-file results into sequences and qualities. lapply() always
## returns a list, so reads[[i]] / qualities[[i]] do not depend on whether
## sapply() would have chosen to simplify.
reads <- lapply(r, "[[", 1)
qualities <- lapply(r, "[[", 2)
R1Quality <- qualities[[1]]

## Release the intermediate copy before the alignment steps.
rm(r)
gc()
## Run the real function once and record how many reads come back.
reads.p <- trim_Ltr_side_reads(reads[[2]], primer, ltrbit)
stats.bore$LTRed <- length(reads.p)
## note very few reads left

## debugging trim_Ltr_side_reads()
## Reset reads.p to the function's input so the steps below can be run by hand.
reads.p <- reads[[2]]
maxMisMatch <- 2

## inside trim_Ltr_side_reads()
## Same preconditions as before, checked in the same order.
stopifnot(
  class(reads.p) %in% "DNAStringSet",
  !any(duplicated(names(reads.p))),
  length(primer) == 1,
  length(ltrbit) == 1
)

## Scoring: 1 per matching base, 0 per mismatch.
submat1 <- nucleotideSubstitutionMatrix(
  match = 1,
  mismatch = 0,
  baseOnly = TRUE
)

## p for primer
## search for primer from the beginning
## The window taken from each read is one base longer than the primer.
aln.p <- pairwiseAlignment(
  pattern = subseq(reads.p, 1, 1 + nchar(primer)),
  subject = primer,
  substitutionMatrix = submat1,
  gapOpening = 0,
  gapExtension = 1,
  type = "overlap"
)
aln.p.df <- PairwiseAlignmentsSingleSubject2DF(aln.p)

## l for ltrbit
## search for ltrbit following primer
## note, for SCID trial, there are GGG between primer and ltr bit and hence 5
## for extra bases
## NOTE(review): the note above mentions 5 extra bases, but the window below
## starts right after the primer and is only one base longer than the ltrbit
## -- confirm which is intended.
aln.l <- pairwiseAlignment(
  pattern = subseq(
    reads.p,
    nchar(primer) + 1,
    nchar(primer) + nchar(ltrbit) + 1
  ),
  subject = ltrbit,
  substitutionMatrix = submat1,
  gapOpening = 0,
  gapExtension = 1,
  type = "overlap"
)
aln.l.df <- PairwiseAlignmentsSingleSubject2DF(
  aln.l,
  shift = nchar(primer) - 1
)
## check primer
## Tabulate the first nchar(primer) bases of every read and show the most
## frequent ones. The tables are print()ed explicitly so they also appear when
## this script is run through source(), where bare top-level expressions are
## not auto-printed.
print(primer)
## [1] "CCTCGGG"
tdf <- as.data.frame(table(unname(as.character(
  subseq(reads.p, 1, nchar(primer))
))))
print(head(tdf[order(-tdf$Freq), ]))
##         Var1  Freq
## 331 CCTCGGG 13494
## 330 CCTCGGC 11596
## 345 CCTCTTG   193
## 332 CCTCGGT    86
## 329 CCTCGGA    57
## 355 CCTGGGC    28
## most reads match primer

## check ltrbit
## Same tabulation for the nchar(ltrbit) bases that directly follow the primer.
print(ltrbit)
## [1] "GGTCTTTCA"
tdf <- as.data.frame(table(unname(as.character(
  subseq(reads.p, nchar(primer) + 1, nchar(primer) + nchar(ltrbit))
))))
print(head(tdf[order(-tdf$Freq), ]))
##           Var1  Freq
## 476  CTCCCAAAG 12698
## 1008 GTGGAGGGT  2908
## 500  CTCCCAGAG  1461
## 551  CTCCGAAAG  1109
## 240  CAGGTTACG  1071
## 663  CTTCCAAAG   748
## In the recorded output above, none of the six most frequent sequences
## equals the expected ltrbit.