forked from esherm/intSiteCaller
-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathhiReadsProcessor.R
6513 lines (5858 loc) · 278 KB
/
hiReadsProcessor.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#' Functions to process LM-PCR reads from 454/Illumina data.
#'
#' hiReadsProcessor contains a set of functions which allow users to process
#' LM-PCR products sequenced using any platform. Given an excel/txt file
#' containing parameters for demultiplexing and sample metadata, the functions
#' automate trimming of adaptors and identification of the genomic product.
#' Genomic products are further processed for QC and abundance quantification.
#'
#' @import Biostrings GenomicAlignments hiAnnotator BiocParallel xlsx plyr
#' sonicLength BiocGenerics
#' @docType package
#' @name hiReadsProcessor
#' @author Nirav V Malani
NULL
#' Sample Integration Sites Sequencing Data
#'
#' This is a processed data object containing raw sequences and respective
#' alignments to UCSC freeze hg18 from 112 integration site samples. The object
#' is of SimpleList class and follows a certain structural hierarchy explained
#' by the Introductory vignette.
#'
#' @docType data
#' @keywords datasets
#' @format A SimpleList object
#' @name seqProps
NULL
#' PSL file output
#'
#' Sample BLAT PSL file output from samples included Integration Sites
#' Sequencing Data \code{\link{seqProps}}
#'
#' @docType data
#' @keywords datasets
#' @format a data frame of 1000 rows and 21 columns
#' @name psl
NULL
#' Read contents of a sequencing folder and make a SimpleList object
#'
#' Given a sequencing folder path, sample information file path, and sequence
#' file extension pattern, the function returns a list of variables required to
#' process the data. The function also calls \code{\link{read.sampleInfo}}
#' which reads in sample processing metadata and formats it if needed.
#'
#' @param sequencingFolderPath full or relative path to the sequencing folder
#' @param sampleInfoFilePath full or relative path to the sample information
#' file, which holds samples to quadrant/lane associations along with other
#' metadata required to trim sequences or process it. Defaults to NULL, in
#' which case the function looks in the sequencing folder for an xls or tab
#' delimited txt file whose name resembles 'sampleinfo' and presents you with
#' a choice of files to select from.
#' @param seqfilePattern regex/string to describe sequence file endings.
#' See examples. Default is NULL.
#' @param interactive whether to prompt each time the function encounters an
#' issue or use the defaults. Default is TRUE.
#'
#' @return a SimpleList list which is used by other functions to process and
#' decode the data.
#'
#' @note
#' \itemize{
#' \item One must make sure that each sequencing file has sector name/number
#' prefixed at the beginning, else \code{\link{findBarcodes}} will fail trying
#' to find the filename.
#' \item For paired end Illumina runs, make sure the filenames include R1, R2,
#' and I1 somewhere in the name denoting pair1, pair2, and index/barcode
#' reads, respectively.
#' }
#'
#' @seealso \code{\link{read.sampleInfo}}, \code{\link{findBarcodes}},
#' \code{\link{splitByBarcode}}
#'
#' @export
#'
#' @examples
#' runData <- system.file("extdata/FLX_sample_run/",
#' package = "hiReadsProcessor")
#' read.SeqFolder(runData, seqfilePattern=".+fna.gz$")
#' \dontrun{
#' read.SeqFolder(".", seqfilePattern="\\.TCA.454Reads.fna$")
#' read.SeqFolder(".", seqfilePattern=".+fastq$")
#' read.SeqFolder(".", seqfilePattern=".+sff$")
#' }
read.SeqFolder <- function(sequencingFolderPath=NULL, sampleInfoFilePath=NULL,
                           seqfilePattern=NULL, interactive=TRUE) {
  if (is.null(sequencingFolderPath)) {
    stop("No Sequencing Folder Path provided.")
  }

  ## get the sequencingFolderPath right! (errors if the folder does not exist)
  sequencingFolderPath <- normalizePath(sequencingFolderPath, mustWork=TRUE)

  ## every file below the folder whose name matches seqfilePattern
  ## (list.files() treats a NULL pattern as "match everything")
  seqFilePaths <- list.files(path=sequencingFolderPath, recursive=TRUE,
                             full.names=TRUE, pattern=seqfilePattern)
  if (length(seqFilePaths) == 0) {
    stop("No files found in the folder ", sequencingFolderPath,
         " matching following pattern: ", seqfilePattern)
  }

  ## find the sample info file when no path was supplied
  if (is.null(sampleInfoFilePath)) {
    possibleFiles <- list.files(path=sequencingFolderPath,
                                recursive=TRUE, full.names=TRUE,
                                pattern=".*sampleinfo.+", ignore.case=TRUE)
    if (length(possibleFiles) == 0) {
      stop("No sample information file found in folder: ",
           sequencingFolderPath)
    }

    if (interactive && length(possibleFiles) > 1) {
      message("Please choose a sample information file to read the meta data\n",
              "from:\n",
              paste(seq_along(possibleFiles), possibleFiles,
                    sep=": ", collapse="\n"))
      choice <- scan(what=integer(0), n=1, quiet=TRUE, multi.line=FALSE)
      ## an empty or out-of-range answer would otherwise select no file and
      ## fail later with an unrelated error
      if (length(choice) != 1 || is.na(choice) ||
          choice < 1 || choice > length(possibleFiles)) {
        stop("Invalid choice of sample information file.")
      }
    } else {
      ## non-interactive, or only one candidate: take the first one
      choice <- 1
    }
    message("Choosing ", possibleFiles[choice],
            " as sample information file.")
    sampleInfoFilePath <- possibleFiles[choice]
  }

  sampleInfo <- read.sampleInfo(sampleInfoFilePath, interactive=interactive)

  ## do a quick test of filenames if any samples are from paired end illumina
  ## NOTE(review): read.sampleInfo() is called with its default
  ## splitBySector=TRUE, so sampleInfo is a per-sector SimpleList here and
  ## `$pairedend` presumably resolves to NULL, making this check a no-op --
  ## confirm before relying on it.
  if (any(sampleInfo$pairedend)) {
    sectors <- unique(sampleInfo$sector[sampleInfo$pairedend])
    for (sector in sectors) {
      vals <- grep(gsub("I1|R1|R2", "", sector), seqFilePaths, value=TRUE)
      if (length(vals) != 3) {
        stop("Sector ",sector," is missing one of the files: R1, R2, or I1.")
      }
    }
  }

  ## compare the number of sectors against the number of sector files, where
  ## R1/R2/I1 files of the same run collapse into one. The same count is used
  ## for the test and the message; the message previously stripped
  ## seqfilePattern instead, which errors when the pattern is NULL.
  sectorFileCount <- length(unique(gsub("I1|R1|R2", "", seqFilePaths)))
  if (length(sampleInfo) != sectorFileCount) {
    warning("Number of sectors (", length(sampleInfo),
            ") in sample information file doesn't match # of sector files (",
            sectorFileCount,
            ") found in the folder.")
  }

  return(SimpleList("sequencingFolderPath"=sequencingFolderPath,
                    "seqFilePaths"=seqFilePaths,
                    "seqfilePattern"=seqfilePattern,
                    "sampleInfoFilePath"=sampleInfoFilePath,
                    "sectors"=sampleInfo, "callHistory"=match.call()))
}
#' Read a sample information file and format appropriate metadata.
#'
#' Given a sample information file, the function checks if it includes required
#' information to process samples present on each sector/quadrant/region/lane.
#' The function also adds other columns required for processing with default
#' values if not already defined ahead of time.
#'
#' @param sampleInfoPath full or relative path to the sample information file,
#' which holds samples to quadrant/lane associations along with other metadata
#' required to trim sequences or process it.
#' @param splitBySector split the data frame into a list by sector column.
#' Default is TRUE.
#' @param interactive whether to prompt each time the function encounters an
#' issue, or use the defaults. Default is TRUE.
#'
#' @details
#' \itemize{
#' \item Required Column Description:
#' \itemize{
#' \item sector => region/quadrant/lane of the sequencing plate the sample
#' comes from. If files have been split by samples a priori, then the filename
#' associated per sample without the extension. If this is a filename, then
#' be sure to enable 'alreadyDecoded' parameter in \code{\link{findBarcodes}},
#' since contents of this column is pasted together with 'seqfilePattern'
#' parameter in \code{\link{read.SeqFolder}} to find the appropriate file
#' needed. For paired end data, this is basename of the FASTA/Q file holding
#' the sample data from the LTR side. For example, files such as
#' Lib3_L001_R2_001.fastq.gz or Lib3_L001_R2_001.fastq would be
#' Lib3_L001_R2_001, and consequently Lib3_L001_R1_001 would be used as the
#' second pair!
#' \item barcode => unique 4-12bp DNA sequence which identifies the sample.
#' If providing filename as sector, then leave this blank since it is assumed
#' that the data is already demultiplexed.
#' \item primerltrsequence => DNA sequence of the viral LTR primer
#' with/without the viral LTR sequence following the primer landing site.
#' If already trimmed, then mark this as SKIP.
#' \item sampleName => Name of the sample associated with the barcode
#' \item sampleDescription => Detailed description of the sample
#' \item gender => sex of the sample: male or female or NA
#' \item species => species of the sample: Homo sapiens, Mus musculus, etc.
#' \item freeze => UCSC freeze to which the sample should be aligned to.
#' \item linkerSequence => DNA sequence of the linker adaptor following the
#' genomic sequence. If already trimmed, then mark this as SKIP.
#' \item restrictionEnzyme => Restriction enzyme used for digestion and
#' sample recovery. Can also be one of: Fragmentase or Sonication!
#' }
#' \item Metadata Parameter Column Description:
#' \itemize{
#' \item ltrBitSequence => DNA sequence of the viral LTR following the
#' primer landing site. Default is last 7bps of the primerltrsequence.
#' \item ltrBitIdentity => percent of LTR bit sequence to match during the
#' alignment. Default is 1.
#' \item primerLTRidentity => percent of primer to match during the
#' alignment. Default is .85
#' \item linkerIdentity => percent of linker sequence to match during the
#' alignment. Default is 0.55. Only applies to non-primerID/random tag based
#' linker search.
#' \item primerIdInLinker => whether the linker adaptor used has
#' primerID/random tag in it? Default is FALSE.
#' \item primerIdInLinkerIdentity1 => percent of sequence to match before
#' the random tag. Default is 0.75. Only applies to primerID/random tag
#' based linker search and when primeridinlinker is TRUE.
#' \item primerIdInLinkerIdentity2 => percent of sequence to match after the
#' random tag. Default is 0.50. Only applies to primerID/random tag based
#' linker search and when primeridinlinker is TRUE.
#' \item celltype => celltype information associated with the sample
#' \item user => name of the user who prepared or processed the sample
#' \item pairedEnd => is the data paired end? Default is FALSE.
#' \item vectorFile => fasta file containing the vector sequence
#' }
#' \item Processing Parameter Column Description:
#' \itemize{
#' \item startWithin => upper bound limit of where the alignment should
#' start within the query. Default is 3.
#' \item alignRatioThreshold => cutoff for (alignment span/read length).
#' Default is 0.7.
#' \item genomicPercentIdentity => cutoff for (1-(misMatches/matches)).
#' Default is 0.98.
#' \item clusterSitesWithin => cluster integration sites within a defined
#' window size based on frequency which corrects for any sequencing errors.
#' Default is 5.
#' \item keepMultiHits => whether to keep sequences/reads that return
#' multiple best hits, aka ambiguous locations.
#' \item processingDate => the date of processing
#' }
#' }
#'
#' @return if splitBySector=TRUE, then an object of SimpleList named by
#' quadrant/lane information defined in sampleInfo file, else a dataframe.
#'
#' @seealso \code{\link{read.SeqFolder}}, \code{\link{findBarcodes}},
#' \code{\link{splitByBarcode}}
#'
#' @export
#'
#' @examples
#' runData <- system.file("extdata/FLX_sample_run",
#' package = "hiReadsProcessor")
#' read.sampleInfo(file.path(runData,"sampleInfo.xls"))
read.sampleInfo <- function(sampleInfoPath=NULL, splitBySector=TRUE,
                            interactive=TRUE) {
  ## read file and make sampleInfo object with sample to metadata associations
  if (is.null(sampleInfoPath)) {
    stop("No sample information file path provided.")
  }

  sampleInfoPath <- normalizePath(sampleInfoPath, mustWork=TRUE)

  requiredCols <- c('sector', 'barcode', 'primerltrsequence', 'samplename',
                    'sampledescription', 'gender', 'species', 'freeze',
                    'linkersequence', 'restrictionenzyme')

  ## optional columns and their defaults. c() coerces these mixed values to a
  ## character vector, so the defaults enter the data frame as strings.
  ## NOTE(review): 'vectorFile' is the only mixed-case name; column names read
  ## from the file are lower-cased below, so it can never match a file column
  ## and this default is always appended -- confirm that is intended.
  metaDataCols <- c('ltrbitsequence'='', 'ltrbitidentity'=1,
                    'primerltridentity'=.85, 'linkeridentity'=.55,
                    'primeridinlinker'=FALSE, 'primeridinlinkeridentity1'=.75,
                    'primeridinlinkeridentity2'=.50, 'celltype'='',
                    'user'=Sys.getenv("USER"), 'startwithin'=3,
                    'alignratiothreshold'=.7, 'clustersiteswithin'=5,
                    'keepmultihits'=TRUE , 'genomicpercentidentity'=.98,
                    'processingdate'=format(Sys.time(), "%Y-%m-%d "),
                    'pairedend'=FALSE, 'vectorFile'='')

  ## TRUE where a character value is missing or empty (NA-safe)
  isBlank <- function(x) {
    is.na(x) | x == ""
  }

  ## Excel workbooks go through read.xlsx, everything else is read as a tab
  ## delimited text file. The dot is escaped so only a real extension matches.
  if (grepl('\\.xls.?$', sampleInfoPath)) {
    sampleInfo <- unique(read.xlsx(sampleInfoPath,
                                   sheetIndex=1, stringsAsFactors=FALSE))
  } else {
    sampleInfo <- unique(read.delim(sampleInfoPath, stringsAsFactors=FALSE))
  }
  ## normalise headers: drop '.', '-' and '_' and lower-case the rest
  names(sampleInfo) <- tolower(gsub("\\.|-|_", "", names(sampleInfo)))

  # check for required columns
  ColsNotThere <- !requiredCols %in% names(sampleInfo)
  if (any(ColsNotThere)) {
    absentCols <- requiredCols[ColsNotThere]
    stop("Following required column(s) is absent from the Sample Info file: ",
         paste(absentCols,sep="", collapse=", "))
  }

  # add missing meta data columns
  metaColsNotThere <- !names(metaDataCols) %in% names(sampleInfo)
  if (any(metaColsNotThere)) {
    sampleInfo <- cbind(sampleInfo,
                        as.data.frame(t(metaDataCols[metaColsNotThere]),
                                      stringsAsFactors = FALSE))
  }

  # do some formatting to avoid later hassles!
  ## strip spaces everywhere; sequences and enzyme names are also upper-cased
  for (column in c('sector', 'barcode', 'primerltrsequence', 'ltrbitsequence',
                   'samplename', 'linkersequence', 'restrictionenzyme')) {
    sampleInfo[[column]] <- gsub(" ", "", sampleInfo[[column]])
    if (column %in% c('barcode', 'primerltrsequence', 'ltrbitsequence',
                      'linkersequence', 'restrictionenzyme')) {
      sampleInfo[[column]] <- toupper(sampleInfo[[column]])
    }
  }

  for (column in c('pairedend', 'keepmultihits', 'primeridinlinker')) {
    sampleInfo[[column]] <- as.logical(sampleInfo[[column]])
  }

  # confirm ltr bit is correct
  ## %in% never yields NA, so a missing primer cannot break the if() below
  primerTrimmed <- sampleInfo$primerltrsequence %in% "SKIP"
  if (any(primerTrimmed)) {
    ## add SKIP to ltrbit as well if primerltrsequence has been trimmed
    tofix <- which(primerTrimmed)
    message("adding SKIP to ltrbitsequence to ",length(tofix),
            " sample since primerltrsequence has been trimmed.")
    sampleInfo$ltrbitsequence[tofix] <- "SKIP"
  }

  ltrbitMissing <- isBlank(sampleInfo$ltrbitsequence)
  if (any(ltrbitMissing)) {
    tofix <- which(ltrbitMissing)
    if (interactive) {
      message("LTR bit not found for ",length(tofix)," samples.\n",
              "Use last 7 bases of the LTR primer as the LTR bit? (y/n)")
      choice <- scan(what=character(0), n=1, quiet=TRUE, multi.line=FALSE)
    } else {
      message("LTR bit not found for ",length(tofix),
              " samples. Using last 7 bases of the LTR primer as the LTR bit.")
      choice <- "y"
    }

    ## isTRUE() also covers an empty answer from scan()
    if (isTRUE(tolower(choice) == "y")) {
      ## only touch the rows that lack an LTR bit: rows with a supplied LTR
      ## bit, or marked SKIP, must keep their primer and LTR bit untouched
      primers <- sampleInfo$primerltrsequence[tofix]
      primerLen <- nchar(primers)
      sampleInfo$ltrbitsequence[tofix] <- substr(primers, primerLen-6,
                                                 primerLen)
      sampleInfo$primerltrsequence[tofix] <- substr(primers, 1, primerLen-7)
    } else {
      warning("No LTR bit sequence found for following samples: ",
              paste(sampleInfo$samplename[tofix], sep="", collapse=", "),
              immediate.=TRUE)
    }
  }

  # check if samplenames are up to the expectations
  samplenametest <- isBlank(sampleInfo$samplename)
  if (any(samplenametest)) {
    stop("No sample names found in following rows of the sample information file ",
         sampleInfoPath, " : ", paste(which(samplenametest),
                                      sep="", collapse=", "))
  }

  # check for sectors and their usage
  sectortest <- isBlank(sampleInfo$sector)
  if (any(sectortest)) {
    tofix <- which(sectortest)
    if (interactive) {
      message("Sector information not found for ", length(tofix),
              " samples. Which sector are they from? (1,2,4,etc)")
      choice <- scan(what=character(0), quiet=TRUE, multi.line=FALSE)
    } else {
      message("Sector information not found for ", length(tofix),
              " samples. Assuming they are from sector 1.")
      choice <- "1"
    }

    if (length(choice) > 0) {
      sampleInfo$sector[tofix] <- unlist(strsplit(choice,","))
    } else {
      stop("No Sector information found for following samples: ",
           paste(sampleInfo$samplename[tofix], sep="", collapse=", "))
    }
  }

  ## excel sometimes converts integers to doubles...
  ## make sure to remove the trailing 0
  sampleInfo$sector <- gsub("\\.0$", "", as.character(sampleInfo$sector))

  ## a sample name may appear only once per sector; report the offending
  ## "<samplename> <sector>" pairs rather than their counts
  sampleSectorTest <- table(paste(sampleInfo$samplename, sampleInfo$sector))
  if (any(sampleSectorTest > 1)) {
    duplicated.pairs <- names(sampleSectorTest)[sampleSectorTest > 1]
    stop("Duplicate sample names found on same quadrant in the ",
         "sample information file ", sampleInfoPath, " : ",
         paste(duplicated.pairs, sep="", collapse=", "))
  }

  # prepare the sample info object!
  ## nested layout: sector -> "samples" -> sample name -> list of columns
  if (splitBySector) {
    sampleInfo <- SimpleList(split(sampleInfo, sampleInfo$sector))
    for (sector in seq_along(sampleInfo)) {
      sampleData <- SimpleList(split(sampleInfo[[sector]],
                                     sampleInfo[[sector]]$samplename))
      for (sample.i in seq_along(sampleData)) {
        sampleData[[sample.i]] <- SimpleList(as.list(sampleData[[sample.i]]))
      }
      sampleInfo[[sector]] <- SimpleList("samples"=sampleData)
    }
  }
  return(sampleInfo)
}
#' Removes duplicate sequences from DNAStringSet object.
#'
#' Given a DNAStringSet object, the function dereplicates reads and
#' adds counts=X to the definition line to indicate replication.
#'
#' @param dnaSet DNAStringSet object to dereplicate.
#'
#' @return DNAStringSet object with names describing frequency of repeat.
#'
#' @seealso \code{\link{replicateReads}}, \code{\link{removeReadsWithNs}},
#' \code{\link{findBarcodes}}, \code{\link{splitByBarcode}}
#'
#' @export
#'
#' @examples
#' dnaSet <- c("CCTGAATCCTGGCAATGTCATCATC", "ATCCTGGCAATGTCATCATCAATGG",
#' "ATCAGTTGTCAACGGCTAATACGCG", "ATCAATGGCGATTGCCGCGTCTGCA",
#' "CCGCGTCTGCAATGTGAGGGCCTAA", "GAAGGATGCCAGTTGAAGTTCACAC",
#' "CCTGAATCCTGGCAATGTCATCATC", "ATCCTGGCAATGTCATCATCAATGG",
#' "ATCAGTTGTCAACGGCTAATACGCG", "ATCAATGGCGATTGCCGCGTCTGCA",
#' "CCGCGTCTGCAATGTGAGGGCCTAA", "GAAGGATGCCAGTTGAAGTTCACAC")
#' dereplicateReads(dnaSet)
dereplicateReads <- function(dnaSet) {
  ## accept anything DNAStringSet() can coerce, e.g. a character vector
  if (!is(dnaSet, "DNAStringSet")) {
    dnaSet <- DNAStringSet(dnaSet)
  }

  ## nothing to dereplicate (and no names to build) for an empty set
  if (length(dnaSet) == 0) {
    return(dnaSet)
  }

  if (is.null(names(dnaSet))) {
    message("No names attribute found in dnaSet object...",
            "using artifically generated names")
    names(dnaSet) <- paste("read", seq_along(dnaSet), sep="-")
  }

  ## sort, tabulate identical sequences, then keep one copy of each
  dnaSet <- dnaSet[order(dnaSet)]
  counts <- BiocGenerics::table(dnaSet)
  dnaSet <- unique(dnaSet)

  ## look each unique read up in the table by its sequence. Indexing
  ## names(counts) with the read names returned NA for every read, which
  ## produced "counts=NA" deflines.
  rows <- match(as.character(dnaSet), names(counts))
  names(dnaSet) <- paste0(names(dnaSet),
                          "counts=",
                          as.integer(counts[rows]))
  return(dnaSet)
}
#' Replicate sequences from DNAStringSet object using counts identifier or vector
#'
#' Given a DNAStringSet object, the function replicates reads using counts=X
#' marker at the end of definition line.
#'
#' @param dnaSet DNAStringSet object to replicate.
#' @param counts an integer or a numeric vector of length length(dnaSet)
#' indicating how many times to repeat each sequence. Default is NULL,
#' in which it uses counts=X notation from the definition line to
#' replicate reads.
#'
#' @return DNAStringSet object.
#'
#' @seealso \code{\link{dereplicateReads}}, \code{\link{removeReadsWithNs}},
#' \code{\link{findBarcodes}}, \code{\link{splitByBarcode}}
#'
#' @export
#'
#' @examples
#' dnaSet <- c("CCTGAATCCTGGCAATGTCATCATC", "ATCCTGGCAATGTCATCATCAATGG",
#' "ATCAGTTGTCAACGGCTAATACGCG", "ATCAATGGCGATTGCCGCGTCTGCA",
#' "CCGCGTCTGCAATGTGAGGGCCTAA", "GAAGGATGCCAGTTGAAGTTCACAC",
#' "CCTGAATCCTGGCAATGTCATCATC", "ATCCTGGCAATGTCATCATCAATGG",
#' "ATCAGTTGTCAACGGCTAATACGCG", "ATCAATGGCGATTGCCGCGTCTGCA",
#' "CCGCGTCTGCAATGTGAGGGCCTAA", "GAAGGATGCCAGTTGAAGTTCACAC")
#' dnaSet <- dereplicateReads(dnaSet)
#' replicateReads(dnaSet)
replicateReads <- function(dnaSet, counts=NULL) {
  stopifnot(is(dnaSet, "DNAStringSet"))

  readNames <- names(dnaSet)

  ## no counts supplied: recover them from the counts=X marker in the names
  if (is.null(counts)) {
    if (is.null(readNames)) {
      stop("No names attribute found in dnaSet object")
    }
    counts <- as.numeric(sub(".+counts=(\\d+)", "\\1", readNames))
    if (all(is.na(counts))) {
      stop("No counts=X marker found at the end of definition line or ",
           "names attribute in dnaSet object")
    }
  }

  ## a single count applies to every read
  if (length(counts) == 1) {
    counts <- rep(counts, length(dnaSet))
  }

  ## running index 1..n within each read's block of copies
  copyIndex <- unlist(lapply(counts, seq_len))

  ## original defline without the counts=X marker, one entry per copy
  baseNames <- sub("(.+)counts=.+", "\\1", readNames)
  expandedNames <- paste0(rep(baseNames, times=counts), "_", copyIndex)

  expanded <- rep(dnaSet, times=counts)
  names(expanded) <- expandedNames
  expanded
}
#' Remove sequences with ambiguous nucleotides.
#'
#' Given a DNAStringSet object, the function removes any reads in which either the consecutive or the total number of Ns is greater than the maxNs threshold.
#'
#' @param dnaSet DNAStringSet object to evaluate.
#' @param maxNs integer value denoting the threshold of maximum allowed Ns.
#' Default is 5.
#' @param consecutive boolean flag denoting whether the Ns to filter on are consecutive or total. Default is TRUE.
#'
#' @return DNAStringSet object.
#'
#' @seealso \code{\link{dereplicateReads}}, \code{\link{replicateReads}},
#' \code{\link{findBarcodes}}, \code{\link{splitByBarcode}}
#'
#' @export
#'
#' @examples
#' dnaSet <- c("CCTGAATCCTNNCAATGTCATCATC", "ATCCTGGCNATGTCATCATCAATGG",
#' "ATCAGTTGTCAACGGCTAATACGCG", "ATCAATGGCGATTGCCGCGTCTGCA",
#' "CCGNNTCTGCAATGTGNGGNCCTAN", "GAAGNNNNNNGTTGAAGTTCACAC")
#' removeReadsWithNs(dnaSet)
#' removeReadsWithNs(dnaSet, maxNs=4, consecutive=FALSE)
removeReadsWithNs <- function(dnaSet, maxNs=5, consecutive=TRUE) {
  ## coerce plain input (e.g. a character vector) before filtering
  if (!is(dnaSet, "DNAStringSet")) {
    dnaSet <- DNAStringSet(dnaSet)
  }

  ## total mode: keep reads whose overall N count is within the threshold
  if (!consecutive) {
    nTotals <- alphabetFrequency(dnaSet)[,"N"]
    return(dnaSet[nTotals <= maxNs])
  }

  ## consecutive mode: drop reads containing a run of maxNs+1 Ns
  nRun <- paste(rep("N",maxNs+1), collapse="")
  hasLongRun <- grepl(nRun, dnaSet, fixed=TRUE)
  dnaSet[!hasLongRun]
}
#' Breaks an object into chunks of N size.
#'
#' Given a linear/vector like object, the function breaks it into equal sized chunks based on chunkSize. This is a helper function used by functions in 'See Also' section where each chunk is sent to a parallel node for processing.
#'
#' @param x a linear object.
#' @param chunkSize number of rows to use per chunk of query. Defaults to length(x)/detectCores() or length(query)/bpworkers() depending on parallel backend registered.
#'
#' @return a list of object split into chunks.
#'
#' @seealso \code{\link{primerIDAlignSeqs}}, \code{\link{vpairwiseAlignSeqs}}, \code{\link{pairwiseAlignSeqs}}
#'
#' @export
#'
#' @examples
#' x <- c("GAGGCTGTCACCGACAAGGTTCTGA", "AATAGCGTGGTGACAGCCCACATGC",
#' "GGTCTTCTAGGGAACCTACGCCACA", "TTTCCGGCGGCAGTCAGAGCCAAAG",
#' "TCCTGTCAACTCGTAGATCCAATCA", "ATGGTCACCTACACACAACGGCAAT",
#' "GTCAGGACACCTAATCACAAACGGA", "AGACGCAGGTTCAGGCTGGACTAGA",
#' "ATCGTTTCCGGAATTCGTGCACTGG", "CAATGCGGGCACACGCTCTTACAGT")
#' chunkize(DNAStringSet(x), 5)
chunkize <- function(x, chunkSize = NULL) {
  total <- length(x)

  ## nothing to split
  if (total == 0) {
    return(list())
  }

  if (is.null(chunkSize)) {
    ## documented default: length(x) divided by the number of registered
    ## workers, falling back to the core count, and to 1 if neither is usable
    workers <- bpworkers()
    if (is.null(workers)) {
      workers <- detectCores()
    }
    if (length(workers) != 1 || is.na(workers) || workers < 1) {
      workers <- 1
    }
    chunkSize <- total / workers
  }

  if (!is.numeric(chunkSize) || length(chunkSize) != 1 ||
      is.na(chunkSize) || chunkSize <= 0) {
    stop("chunkSize must be a single positive number.")
  }

  ## whole number of elements per chunk, at least one
  chunkSize <- max(1L, as.integer(ceiling(chunkSize)))

  ## chunkSize is documented as the number of elements per chunk, so it is
  ## handed over by name. The earlier code passed length(x)/chunkSize as the
  ## second positional argument.
  ## NOTE(review): the meaning of breakInChunks()'s second positional argument
  ## is believed to differ between IRanges releases (chunksize vs nchunk) --
  ## confirm against the installed version.
  chunks <- breakInChunks(totalsize = total, chunksize = chunkSize)

  ## one subset of x per chunk, returned as an unnamed list
  mapply(function(from, to) x[from:to], start(chunks), end(chunks),
         SIMPLIFY = FALSE, USE.NAMES = FALSE)
}
#' Split DNAStringSet object using first X number of bases defined by a vector.
#'
#' Given a character vector of barcodes/MID to sample association and a DNAStringSet object, the function splits/demultiplexes the DNAStringSet object by first few bases dictated by length of barcodes/MID supplied. This is an accessory function used by \code{\link{findBarcodes}}
#'
#' @param barcodesSample a character vector of barcodes to sample name associations. Ex: c("ACATCCAT"="Sample1", "GAATGGAT"="Sample2",...)
#' @param dnaSet DNAStringSet object to evaluate.
#' @param trimFrom integer value serving as start point to trim the sequences from. When NULL (the default), this is calculated internally as barcode length + 1.
#' @param showStats boolean flag denoting whether to show decoding statistics per sample & barcode. Default is FALSE.
#' @param returnUnmatched boolean flag denoting whether to return unmatched reads. Default is FALSE.
#'
#' @return DNAStringSet object split by sample name found in barcodesSample.
#'
#' @seealso \code{\link{findBarcodes}}, \code{\link{dereplicateReads}},
#' \code{\link{replicateReads}}
#'
#' @export
#'
#' @examples
#' dnaSet <- DNAStringSet(c("read1"="ACATCCATAGAGCTACGACGACATCGACATA",
#' "read2"="GAATGGATGACGACTACAGCACGACGAGCAGCTACT",
#' "read3"="GAATGGATGCGCTAAGAAGAGA", "read4"="ACATCCATTCTACACATCT"))
#' splitByBarcode(c("ACATCCAT"="Sample1", "GAATGGAT"="Sample2"), dnaSet,
#' showStats=TRUE)
splitByBarcode <- function(barcodesSample, dnaSet, trimFrom=NULL,
                           showStats=FALSE, returnUnmatched=FALSE) {
  if (is.null(barcodesSample) || length(barcodesSample) == 0) {
    stop("No barcodes to samples association vector provided in parameter ",
         "barcodesSample.")
  }

  if (is.null(dnaSet) || length(dnaSet) == 0) {
    stop("No sequences provided in parameter dnaSet.")
  }

  if (is.null(names(dnaSet))) {
    stop("No names attribute found in dnaSet object")
  }

  ## the barcodes are the names of barcodesSample
  if (is.null(names(barcodesSample))) {
    stop("No barcodes found in names attribute of parameter barcodesSample.")
  }

  ## a single prefix length is required; mixed lengths would be silently
  ## recycled by substr() below and decode reads against the wrong prefix
  barcodelen <- unique(nchar(names(barcodesSample)))
  if (length(barcodelen) != 1) {
    stop("Multiple barcode lengths found.")
  }

  message("Using following schema for barcode to sample associations")
  print(as.data.frame(barcodesSample))

  ## subset barcode string from the rest of sequence ##
  seqbarcodes <- substr(dnaSet, 1, barcodelen)

  ## index rows that match your list of barcodes ##
  good.row <- seqbarcodes %in% names(barcodesSample)
  if (!any(good.row)) {
    stop("No matching barcoded sequences found on this quadrant.")
  }

  sampleNames <- barcodesSample[seqbarcodes[good.row]]
  ## keep only the first whitespace-delimited token of each definition line
  deflines <- sub("^(\\S+) .+$", "\\1", names(dnaSet)[good.row], perl=TRUE)

  ## if primer bases were utilized for tiebreaker, use the original length
  ## instead of modified for trimming.
  if (is.null(trimFrom)) {
    trimFrom <- barcodelen+1
  }

  ## remove sequences with unknown barcode and trim barcode itself ##
  unmatched <- DNAStringSet(dnaSet[!good.row])
  dnaSet <- DNAStringSet(dnaSet[good.row], start=trimFrom)
  names(dnaSet) <- deflines

  if (showStats) {
    ## sum() reports 0 when every read matched; looking up 'FALSE' in
    ## table(good.row) printed NA in that case
    message("Number of Sequences with no matching barcode: ",
            sum(!good.row))
    message("Number of Sequences decoded:")
    print(as.data.frame(table(sampleNames)))
  }

  ## one list element per sample name
  dnaSet <- as.list(split(dnaSet, as.character(sampleNames)))

  if (returnUnmatched) {
    dnaSet <- c(dnaSet, "unDecodedSeqs"=unmatched)
  }

  return(dnaSet)
}
#' Demultiplex reads by their barcodes
#'
#' Given a sample information object, the function reads in the raw fasta/fastq file, demultiplexes reads by their barcodes, and appends it back to the sampleInfo object. Calls \code{\link{splitByBarcode}} to perform the actual splitting of file by barcode sequences. If supplied with a character vector and reads themselves, the function behaves a bit differently. See the examples.
#'
#' @param sampleInfo sample information SimpleList object created using
#' \code{\link{read.sampleInfo}}, which holds barcodes and sample names per
#' sector/quadrant/lane or a character vector of barcodes to sample name
#' associations. Ex: c("ACATCCAT"="Sample1", "GAATGGAT"="Sample2",...)
#' @param sector If sampleInfo is a SimpleList object, then a numeric/character
#' value or vector representing sector(s) in sampleInfo. Optionally if on high
#' memory machine sector='all' will decode/demultiplex sequences from all
#' sectors/quadrants. This option is ignored if sampleInfo is a character vector.
#' Default is NULL.
#' @param dnaSet DNAStringSet object containing sequences to be decoded or
#' demultiplexed. Default is NULL. If sampleInfo is a SimpleList object, then
#' reads are automatically extracted using \code{\link{read.seqsFromSector}}
#' and parameters defined in sampleInfo object.
#' @param showStats toggle output of search statistics. Default is FALSE.
#' @param returnUnmatched return unmatched sequences. Returns results as a list
#' where x[["unDecodedSeqs"]] has culprits. Default is FALSE.
#' @param dereplicate return dereplicated sequences. Calls
#' \code{\link{dereplicateReads}}, which appends counts=X to sequence
#' names/deflines. Default is FALSE. Not applicable for paired end data since
#' it can cause the read pairs to fall out of sync.
#' @param alreadyDecoded if reads have already been decoded and split into
#' respective files per sample and 'seqfilePattern' parameter in
#' \code{\link{read.SeqFolder}} is set to reading sample files and not the
#' sector files, then set this to TRUE. Default is FALSE. Enabling this
#' parameter skips the barcode detection step and loads the sequence file as is
#' into the sampleInfo object.
#'
#' @return If sampleInfo is an object of SimpleList then decoded sequences are
#' appended to respective sample slots, else a named list of DNAStringSet
#' object. If returnUnmatched=TRUE, then x[["unDecodedSeqs"]] has the
#' unmatched sequences.
#'
#' @seealso \code{\link{splitByBarcode}}, \code{\link{dereplicateReads}},
#' \code{\link{replicateReads}}
#'
#' @export
#'
#' @aliases decodeByBarcode
#'
#' @examples
#' dnaSet <- DNAStringSet(c("read1"="ACATCCATAGAGCTACGACGACATCGACATA",
#' "read2"="GAATGGATGACGACTACAGCACGACGAGCAGCTACT",
#' "read3"="GAATGGATGCGCTAAGAAGAGA", "read4"="ACATCCATTCTACACATCT"))
#' findBarcodes(sampleInfo=c("ACATCCAT"="Sample1", "GAATGGAT"="Sample2"),
#' dnaSet=dnaSet, showStats=TRUE, returnUnmatched=TRUE)
#' \dontrun{
#' load(file.path(system.file("data", package = "hiReadsProcessor"),
#' "FLX_seqProps.RData"))
#' findBarcodes(seqProps, sector="all", showStats=TRUE)
#' }
findBarcodes <- function(sampleInfo, sector=NULL, dnaSet=NULL,
                         showStats=FALSE, returnUnmatched=FALSE,
                         dereplicate=FALSE, alreadyDecoded=FALSE) {
  ## Two modes, selected by the class of sampleInfo:
  ##  * SimpleList: reads are loaded per sector with read.seqsFromSector()
  ##    (any dnaSet passed in is overwritten), decoded, and stored back into
  ##    the sampleInfo object, which is returned.
  ##  * anything else: sampleInfo and dnaSet are handed to splitByBarcode()
  ##    and its result is returned as is.
  ## tried PDict()...and its slower than this awesome code! ##
  if(is(sampleInfo,"SimpleList")) {
    if(is.null(sector)) {
      stop("No sector provided in parameter sector.")
    }
    sectors <- sector <- as.character(sector)
    ## scalar test: '&&' short-circuits so a vector of several sectors never
    ## reaches the comparison and never yields a length > 1 condition
    if(length(sectors)==1 && tolower(sectors)=="all") {
      sectors <- names(sampleInfo$sectors)
    }
    notFound <- sectors[!sectors %in% names(sampleInfo$sectors)]
    if(length(notFound)>0) {
      stop("Following sectors not found in names(sampleInfo$sectors): ",
           paste(notFound, collapse=", "))
    }

    ## keeps only the token between the last '-' of the first word and the
    ## first space of each sequence name; names not matching are left untouched
    cleanReadNames <- function(x) {
      names(x) <- sub("^\\S+-(\\S+) .+$", "\\1", names(x), perl=TRUE)
      x
    }

    for(sector in sectors) {
      ## check everything is cool with the provided barcodes first before
      ## reading the sequences!
      message("Decoding sector: ",sector)
      isPaired <- any(as.logical(extractFeature(sampleInfo, sector,
                                                feature='pairedend')[[sector]]))

      ## prepare a vector of barcode to sample associations ##
      sampleBarcodes <- toupper(extractFeature(sampleInfo, sector=sector,
                                               feature="barcode")[[sector]])
      barcodesSample <- structure(names(sampleBarcodes),
                                  names=as.character(sampleBarcodes))

      if (length(table(nchar(as.character(sampleBarcodes))))>1) {
        stop("Multiple barcode lengths found.")
      }

      ## length of barcodes before any modifications done later if any! ##
      realbarcodelen <- unique(nchar(as.character(sampleBarcodes)))

      if (any(table(as.character(sampleBarcodes))>1)) {
        message("Duplicate barcode found on this sector.\n",
                "Please choose from one of the options below:\n",
                "\t1: Pick first few bases of primer for tiebreaker? ",
                "(This could be dangerous if the run has too many errors!)\n",
                "\t2: Use the last sample associated with the duplicate ",
                "as the primary sample?\n",
                "\t3: Do not do anything.")

        ## scan() returns a zero-length vector when nothing is entered;
        ## isTRUE() routes that case to the abort branch below
        choice <- scan(what=integer(0), n=1, quiet=TRUE, multi.line=FALSE)

        if(isTRUE(choice==1)) {
          message("Enter # of bases to use from primer:")
          howmany <- scan(what=integer(0), n=1, quiet=TRUE, multi.line=FALSE)
          if(length(howmany)==0) {
            stop("No number of bases provided for the primer tiebreaker.")
          }
          samplePrimers <- extractFeature(sampleInfo,
                                          sector=sector,
                                          feature="primerltrsequence")[[sector]]
          samplePrimers <- toupper(samplePrimers)
          ## extend each barcode with the leading primer bases
          newBarcodes <- toupper(paste0(sampleBarcodes,
                                        substr(samplePrimers,1,howmany)))
          counts <- table(newBarcodes)

          ## only take 1 to 1 associations!
          rows <- newBarcodes %in% names(which(counts==1))

          if (any(counts>1)) {
            message("Tie breaking failed...",
                    "try choosing high number of bases from primer possibly? ",
                    "Here are the failed barcodes: ",
                    paste(names(which(counts>1)),collapse=", "))
            message("Ignore samples associated with those barcodes and ",
                    "continue processing? (y/n)")
            whatsup <- scan(what=character(0), n=1, quiet=TRUE,
                            multi.line=FALSE)
            ## no answer is treated like 'n': do not drop samples silently
            if(length(whatsup)==0 || whatsup=='n') {
              stop("Aborting processing due to ambiguous barcode ",
                   "association for samples: ",
                   paste(names(sampleBarcodes[!rows]),collapse=", "))
            } else {
              message("Ignoring following samples due to duplicate barcodes: ",
                      paste(names(sampleBarcodes[!rows]),collapse=", "))
            }
          }

          barcodesSample <- structure(names(sampleBarcodes[rows]),
                                      names=newBarcodes[rows])
        } else if(isTRUE(choice==2)) {
          ## barcodesSample is left as built above (duplicate names included)
          message("Overwriting duplicate samples associated with the same barcode...")
        } else {
          stop("Aborting due to duplicate barcode found on this sector")
        }
      }

      dnaSet <- read.seqsFromSector(sampleInfo, sector, isPaired)

      if(alreadyDecoded) {
        if(length(barcodesSample)>1) {
          stop("alreadyDecoded parameter is set to TRUE. There shouldn't be more ",
               "than one sample associated to a sequence file.")
        }

        if(is.list(dnaSet)) {
          ## lapply always returns a list, whatever the element lengths are
          dnaSet <- lapply(dnaSet, cleanReadNames)
        } else {
          dnaSet <- cleanReadNames(dnaSet)
        }

        if(isPaired) {
          ## no need to store barcode/index reads if alreadyDecoded!
          dnaSet <- list(dnaSet[c("pair1","pair2")])
        } else {
          if(dereplicate) {
            dnaSet <- list(dereplicateReads(dnaSet))
          } else {
            dnaSet <- list(dnaSet)
          }
        }

        ## single element, named after the one sample of this sector
        names(dnaSet) <- as.character(barcodesSample)
      } else {
        if(isPaired) {
          ## decode on the barcode/index reads, then pull the matching
          ## pair1/pair2 reads by read name
          bc <- splitByBarcode(barcodesSample, dnaSet[["barcode"]],
                               trimFrom=realbarcodelen+1, showStats=showStats,
                               returnUnmatched=returnUnmatched)
          p1 <- lapply(bc, function(x) dnaSet[['pair1']][names(x)])
          p2 <- lapply(bc, function(x) dnaSet[['pair2']][names(x)])
          bcLengths <- vapply(bc, length, integer(1))
          stopifnot(identical(bcLengths, vapply(p1, length, integer(1))))
          stopifnot(identical(bcLengths, vapply(p2, length, integer(1))))
          dnaSet <- mapply(function(x,y) list("pair1"=x, "pair2"=y), p1, p2,
                           SIMPLIFY=FALSE)
          rm("bc","p1","p2")
        } else {
          dnaSet <- splitByBarcode(barcodesSample, dnaSet,
                                   trimFrom=realbarcodelen+1,
                                   showStats=showStats,
                                   returnUnmatched=returnUnmatched)
          if(dereplicate) {
            dnaSet <- dereplicateReads(dnaSet)
          }
        }
      }

      ## store results: unmatched reads go to the sector metadata, everything
      ## else to the 'decoded' slot of the respective sample
      for(samplename in names(dnaSet)) {
        if(samplename=="unDecodedSeqs") {
          metadata(sampleInfo$sectors[[sector]]) <-
            append(metadata(sampleInfo$sectors[[sector]]),
                   list("unDecodedSeqs"=dnaSet[[samplename]]))
        } else {
          sampleInfo$sectors[[sector]]$samples[[samplename]]$decoded <-
            dnaSet[[samplename]]
        }
      }

      ## record which barcode to sample map was used for this sector
      metadata(sampleInfo$sectors[[sector]]) <-
        append(metadata(sampleInfo$sectors[[sector]]),
               list("decodedBy"=barcodesSample))
    }

    sampleInfo$callHistory <- append(sampleInfo$callHistory, match.call())
    decoded <- sampleInfo
    cleanit <- gc()
  } else {
    ## NOTE(review): 'dereplicate' and 'alreadyDecoded' are not used in this
    ## branch -- confirm that is intended
    decoded <- splitByBarcode(sampleInfo, dnaSet, trimFrom=NULL,
                              showStats=showStats,
                              returnUnmatched=returnUnmatched)
    cleanit <- gc()
  }
  return(decoded)
}
# decodeByBarcode is a second name bound to the findBarcodes function object.
#' @export
decodeByBarcode <- findBarcodes
#' Align a short pattern to variable length target sequences.
#'
#' Align a fixed length short pattern sequence (i.e. primers or adaptors) to subject sequences using \code{\link{pairwiseAlignment}}. This function uses default of type="overlap", gapOpening=-1, and gapExtension=-1 to align the patternSeq against subjectSeqs. One can adjust these parameters if preferred, but it is not recommended. This function is meant for aligning a short pattern onto a large collection of subjects. If you are looking to align a vector sequence to subjects, then please use BLAT or see one of the following: \code{\link{blatSeqs}}, \code{\link{findAndRemoveVector}}
#'
#' @param subjectSeqs DNAStringSet object containing sequences to be searched for the pattern. This is generally bigger than patternSeq, and cases where subjectSeqs is smaller than patternSeq will be ignored in the alignment.
#' @param patternSeq DNAString object or a sequence containing the query sequence to search. This is generally smaller than subjectSeqs.
#' @param side which side of the sequence to perform the search: left, right or middle. Default is 'left'.
#' @param qualityThreshold percent of patternSeq to match. Default is 1, full match.
#' @param showStats toggle output of search statistics. Default is FALSE.
#' @param bufferBases use x number of bases in addition to patternSeq length to perform the search. Beneficial in cases where the pattern has homopolymers or indels compared to the subject. Default is 5. Doesn't apply when side='middle'.
#' @param doRC perform reverse complement search of the defined pattern. Default is TRUE.
#' @param returnUnmatched return sequences which had no or less than 5\% match
#' to the patternSeq. Default is FALSE.
#' @param returnLowScored return sequences which had quality score less than
#' the defined qualityThreshold. Default is FALSE.
#' @param parallel use parallel backend to perform calculation with
#' \code{\link{BiocParallel}}. Defaults to FALSE. If no parallel backend is
#' registered, then a serial version is run using \code{\link{SerialParam}}.
#' @param ... extra parameters for \code{\link{pairwiseAlignment}}
#' @note
#' \itemize{
#' \item For qualityThreshold, the alignment score is calculated by
#' (matches*2)-(mismatches+gaps) which programmatically translates to
#' round(nchar(patternSeq)*qualityThreshold)*2.
#' \item Gaps and mismatches are weighed equally with value of -1 which can
#' be overridden by defining extra parameters 'gapOpening' & 'gapExtension'.
#' \item If qualityThreshold is 1, then it is a full match, if 0, then any
#' match is accepted which is useful in searching linker sequences at 3' end.
#' Beware, this function only searches for the pattern sequence in one
#' orientation. If you are expecting to find the pattern in both orientation,
#' you might be better off using BLAST/BLAT!
#' \item If parallel=TRUE, then be sure to have a parallel backend registered
#' before running the function. One can use any of the following
#' \code{\link{MulticoreParam}} \code{\link{SnowParam}}
#' }
#'
#' @return
#' \itemize{
#' \item IRanges object with starts, stops, and names of the aligned sequences.
#' \item If returnLowScored or returnUnmatched = TRUE, then a
#' CompressedIRangesList where x[["hits"]] has the good scoring hits,
#' x[["Rejected"]] has the hits that failed to match qualityThreshold, and
#' x[["Absent"]] has the hits where the aligned bit is <=10\% match to the
#' patternSeq.
#' }
#'
#' @seealso \code{\link{primerIDAlignSeqs}}, \code{\link{vpairwiseAlignSeqs}},
#' \code{\link{doRCtest}}, \code{\link{findAndTrimSeq}}, \code{\link{blatSeqs}},
#' \code{\link{findAndRemoveVector}}
#'
#' @export
#'
#' @examples
#' subjectSeqs <- c("CCTGAATCCTGGCAATGTCATCATC", "ATCCTGGCAATGTCATCATCAATGG",
#' "ATCAGTTGTCAACGGCTAATACGCG", "ATCAATGGCGATTGCCGCGTCTGCA",
#' "CCGCGTCTGCAATGTGAGGGCCTAA", "GAAGGATGCCAGTTGAAGTTCACAC")
#' subjectSeqs <- DNAStringSet(xscat("AAAAAAAAAA", subjectSeqs))
#' pairwiseAlignSeqs(subjectSeqs, "AAAAAAAAAA", showStats=TRUE)
#' pairwiseAlignSeqs(subjectSeqs, "AAATAATAAA", showStats=TRUE,
#' qualityThreshold=0.5)
#'
pairwiseAlignSeqs <- function(subjectSeqs=NULL, patternSeq=NULL, side="left",
qualityThreshold=1, showStats=FALSE, bufferBases=5,
doRC=TRUE, returnUnmatched=FALSE,
returnLowScored=FALSE, parallel=FALSE, ...) {
dp <- NULL
.checkArgs_SEQed()
if(parallel) {
subjectSeqs2 <- chunkize(subjectSeqs)
hits <- bplapply(subjectSeqs2, function(x)
pairwiseAlignSeqs(x, patternSeq, side, qualityThreshold, showStats=FALSE,
bufferBases, doRC, returnUnmatched, returnLowScored,
parallel=FALSE, ...), BPPARAM=dp)
hits <- do.call(c, hits)
if(is(hits,"CompressedIRangesList")) {
attrs <- unique(names(hits))
hits <- sapply(attrs,
function(x) unlist(hits[names(hits)==x],use.names=FALSE))
IRangesList(hits)
} else {
hits
}
} else {
qualityThreshold <- as.numeric(qualityThreshold)
## only get the relevant side of subject sequence with extra bufferBases to
## account for indels/mismatches & save memory while searching and avoid
## searching elsewhere in the sequence
if(tolower(side)=="left") {
badSeqs <- DNAStringSet()
culprits <- width(subjectSeqs) < (nchar(patternSeq)+bufferBases)
if(any(culprits)) {
badSeqs <- subjectSeqs[culprits]
message(length(badSeqs),
" sequences were removed from aligning since they were",
" shorter than pattern getting aligned: ",
(nchar(patternSeq)+bufferBases),"bp")
subjectSeqs <- subjectSeqs[!culprits]
}
subjectSeqs2 <- subseq(subjectSeqs, start=1,
end=(nchar(patternSeq)+bufferBases))
overFromLeft <- rep(0,length(subjectSeqs))
} else if (tolower(side)=="right") {
overFromLeft <- width(subjectSeqs)-(nchar(patternSeq)+bufferBases)
overFromLeft[overFromLeft<1] <- 1
subjectSeqs2 <- subseq(subjectSeqs, start=overFromLeft)
} else {
subjectSeqs2 <- subjectSeqs
overFromLeft <- rep(0, length(subjectSeqs))
}
## search both ways to test which side yields more hits!
if(doRC) {
patternSeq <- tryCatch(doRCtest(subjectSeqs2, patternSeq,
qualityThreshold),
error=function(e) patternSeq)
}
## type=overlap is best for primer trimming...see Biostrings Alignment vignette
if(any(names(match.call()) %in% c("type","gapOpening","gapExtension"))) {
hits <- pairwiseAlignment(subjectSeqs2, patternSeq, ...)
} else {
hits <- pairwiseAlignment(subjectSeqs2, patternSeq, type="overlap",
gapOpening=-1, gapExtension=-1, ...)
}
stopifnot(length(hits)==length(subjectSeqs2))