diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml
index 946e96ef..e32789e2 100644
--- a/.github/workflows/check-bioc.yml
+++ b/.github/workflows/check-bioc.yml
@@ -249,7 +249,7 @@ jobs:
rcmdcheck::rcmdcheck(
args = c("--no-manual", "--no-vignettes", "--timings"),
build_args = c("--no-manual", "--keep-empty-dirs", "--no-resave-data"),
- error_on = "warning",
+ error_on = "error",
check_dir = "check"
)
shell: Rscript {0}
diff --git a/DESCRIPTION b/DESCRIPTION
index 77e0fc50..2dc759a1 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,12 +1,13 @@
Package: FRASER
Type: Package
Title: Find RAre Splicing Events in RNA-Seq Data
-Version: 1.11.0
-Date: 2021-04-05
+Version: 1.99.0
+Date: 2022-11-25
Authors@R: c(
person("Christian", "Mertes", role=c("aut", "cre"),
email="mertes@in.tum.de"),
person("Ines", "Scheller", role=c("aut"), email="scheller@in.tum.de"),
+ person("Karoline", "Lutz", role=c("aut")),
person("Vicente", "Yepez", role=c("ctb"), email="yepez@in.tum.de"),
person("Julien", "Gagneur", role=c("aut"), email="gagneur@in.tum.de"))
Description: Detection of rare aberrant splicing events in transcriptome
@@ -28,7 +29,7 @@ biocViews:
License: MIT + file LICENSE
URL: https://github.com/gagneurlab/FRASER
BugRepots: https://github.com/gagneurlab/FRASER/issues
-RoxygenNote: 7.1.2
+RoxygenNote: 7.2.2
Encoding: UTF-8
VignetteBuilder: knitr
Depends:
@@ -82,9 +83,13 @@ Suggests:
covr,
TxDb.Hsapiens.UCSC.hg19.knownGene,
org.Hs.eg.db,
+ rtracklayer,
+ SGSeq,
+ ggbio,
+ biovizBase
LinkingTo:
- Rcpp,
- RcppArmadillo
+ RcppArmadillo,
+ Rcpp
Collate:
variables.R
getNSetterFuns.R
@@ -114,3 +119,4 @@ Collate:
fitCorrectionMethods.R
plotMethods.R
zzz.R
+ resultAnnotations.R
diff --git a/NAMESPACE b/NAMESPACE
index 35e2d66a..907f1408 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -6,6 +6,7 @@ export("condition<-")
export("currentType<-")
export("dontWriteHDF5<-")
export("featureExclusionMask<-")
+export("fitMetrics<-")
export("name<-")
export("nonSplicedReads<-")
export("pairedEnd<-")
@@ -22,12 +23,16 @@ export(K)
export(N)
export(aberrant)
export(addCountsToFraserDataSet)
+export(annotateIntronReferenceOverlap)
+export(annotatePotentialImpact)
export(annotateRanges)
export(annotateRangesWithTxDb)
+export(availableFDRsubsets)
export(bamFile)
export(bestQ)
export(calculatePSIValues)
export(calculatePadjValues)
+export(calculatePadjValuesOnSubset)
export(calculatePvalues)
export(calculateZscore)
export(condition)
@@ -44,6 +49,8 @@ export(filterExpression)
export(filterExpressionAndVariability)
export(filterVariability)
export(fit)
+export(fitMetrics)
+export(flagBlacklistRegions)
export(getNonSplitReadCountsForAllSamples)
export(getSplitReadCountsForAllSamples)
export(hyperParams)
@@ -60,19 +67,22 @@ export(pVals)
export(padjVals)
export(pairedEnd)
export(plotAberrantPerSample)
+export(plotBamCoverage)
+export(plotBamCoverageFromResultTable)
export(plotCountCorHeatmap)
export(plotEncDimSearch)
export(plotExpectedVsObservedPsi)
export(plotExpression)
export(plotFilterExpression)
export(plotFilterVariability)
+export(plotManhattan)
export(plotQQ)
+export(plotSpliceMetricRank)
export(plotVolcano)
export(predictedMeans)
export(pseudocount)
export(psiTypes)
export(results)
-export(resultsByGenes)
export(rho)
export(samples)
export(saveFraserDataSet)
@@ -98,6 +108,7 @@ exportMethods(assays)
exportMethods(bamFile)
exportMethods(condition)
exportMethods(filterExpression)
+exportMethods(filterVariability)
exportMethods(length)
exportMethods(name)
exportMethods(nonSplicedReads)
@@ -105,6 +116,7 @@ exportMethods(pairedEnd)
exportMethods(plotAberrantPerSample)
exportMethods(plotCountCorHeatmap)
exportMethods(plotEncDimSearch)
+exportMethods(plotManhattan)
exportMethods(plotQQ)
exportMethods(plotVolcano)
exportMethods(results)
@@ -158,27 +170,40 @@ importFrom(GenomeInfoDb,seqlengths)
importFrom(GenomeInfoDb,seqlevels)
importFrom(GenomeInfoDb,seqlevelsStyle)
importFrom(GenomeInfoDb,seqnames)
+importFrom(GenomeInfoDb,sortSeqlevels)
importFrom(GenomeInfoDb,standardChromosomes)
importFrom(GenomicAlignments,junctions)
importFrom(GenomicAlignments,readGAlignmentPairs)
importFrom(GenomicAlignments,readGAlignments)
importFrom(GenomicAlignments,summarizeJunctions)
+importFrom(GenomicFeatures,exons)
+importFrom(GenomicFeatures,fiveUTRsByTranscript)
importFrom(GenomicFeatures,genes)
importFrom(GenomicFeatures,intronsByTranscript)
importFrom(GenomicFeatures,makeTxDbFromGFF)
+importFrom(GenomicFeatures,seqlevels0)
+importFrom(GenomicFeatures,threeUTRsByTranscript)
+importFrom(GenomicRanges,"end<-")
+importFrom(GenomicRanges,"seqinfo<-")
+importFrom(GenomicRanges,"start<-")
importFrom(GenomicRanges,GRanges)
importFrom(GenomicRanges,GRangesList)
+importFrom(GenomicRanges,end)
importFrom(GenomicRanges,findOverlaps)
importFrom(GenomicRanges,granges)
importFrom(GenomicRanges,invertStrand)
importFrom(GenomicRanges,makeGRangesFromDataFrame)
+importFrom(GenomicRanges,start)
importFrom(HDF5Array,HDF5Array)
importFrom(HDF5Array,loadHDF5SummarizedExperiment)
importFrom(HDF5Array,path)
importFrom(HDF5Array,saveHDF5SummarizedExperiment)
importFrom(HDF5Array,writeHDF5Array)
+importFrom(IRanges,"%over%")
importFrom(IRanges,IRanges)
+importFrom(IRanges,distance)
importFrom(IRanges,from)
+importFrom(IRanges,nearest)
importFrom(IRanges,ranges)
importFrom(IRanges,subsetByOverlaps)
importFrom(IRanges,to)
@@ -204,15 +229,18 @@ importFrom(Rsamtools,scanBamHeader)
importFrom(Rsubread,featureCounts)
importFrom(S4Vectors,"mcols<-")
importFrom(S4Vectors,"metadata<-")
+importFrom(S4Vectors,"values<-")
importFrom(S4Vectors,DataFrame)
importFrom(S4Vectors,Rle)
importFrom(S4Vectors,SimpleList)
+importFrom(S4Vectors,elementMetadata)
importFrom(S4Vectors,end)
importFrom(S4Vectors,mcols)
importFrom(S4Vectors,metadata)
importFrom(S4Vectors,queryHits)
importFrom(S4Vectors,start)
importFrom(S4Vectors,subjectHits)
+importFrom(S4Vectors,values)
importFrom(SummarizedExperiment,"assay<-")
importFrom(SummarizedExperiment,"assays<-")
importFrom(SummarizedExperiment,"colData<-")
@@ -236,6 +264,7 @@ importFrom(VGAM,rbetabinom)
importFrom(VGAM,vglm)
importFrom(biomaRt,getBM)
importFrom(biomaRt,useEnsembl)
+importFrom(cowplot,background_grid)
importFrom(cowplot,theme_cowplot)
importFrom(extraDistr,dbbinom)
importFrom(extraDistr,pbbinom)
@@ -245,6 +274,8 @@ importFrom(ggplot2,aes)
importFrom(ggplot2,annotate)
importFrom(ggplot2,annotation_logticks)
importFrom(ggplot2,element_blank)
+importFrom(ggplot2,facet_grid)
+importFrom(ggplot2,facet_wrap)
importFrom(ggplot2,geom_abline)
importFrom(ggplot2,geom_histogram)
importFrom(ggplot2,geom_hline)
@@ -253,10 +284,12 @@ importFrom(ggplot2,geom_point)
importFrom(ggplot2,geom_ribbon)
importFrom(ggplot2,geom_segment)
importFrom(ggplot2,geom_smooth)
+importFrom(ggplot2,geom_text)
importFrom(ggplot2,geom_vline)
importFrom(ggplot2,ggplot)
importFrom(ggplot2,ggtitle)
importFrom(ggplot2,labs)
+importFrom(ggplot2,quo_name)
importFrom(ggplot2,scale_color_brewer)
importFrom(ggplot2,scale_color_discrete)
importFrom(ggplot2,scale_color_gradientn)
@@ -271,6 +304,7 @@ importFrom(ggplot2,theme_bw)
importFrom(ggplot2,xlab)
importFrom(ggplot2,xlim)
importFrom(ggplot2,ylab)
+importFrom(ggplot2,ylim)
importFrom(ggrepel,geom_text_repel)
importFrom(grDevices,colorRampPalette)
importFrom(matrixStats,colAnys)
@@ -324,9 +358,11 @@ importFrom(stats,rnbinom)
importFrom(stats,rnorm)
importFrom(stats,runif)
importFrom(stats,sd)
+importFrom(tibble,"%>%")
importFrom(tibble,as_tibble)
importFrom(tools,file_path_as_absolute)
importFrom(utils,capture.output)
importFrom(utils,packageVersion)
+importFrom(utils,tail)
importMethodsFrom(OUTRIDER,results)
useDynLib(FRASER)
diff --git a/NEWS b/NEWS
index 15ca2c35..6728c4a5 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,24 @@
+CHANGES IN VERSION 2.0.0
+-------------------------
+ o Major update to FRASER2:
+ o Introduction of new & more robust splice metric Intron Jaccard Index
+ o Only Intron Jaccard Index metric used by default
+ o Improved gene level pvalue calculation and internal storage
+ o Introduction of option to limit FDR correction to user-defined
+ subsets of genes per sample (e.g. OMIM genes with rare variant)
+ o Updated internal pseudocount parameter and default delta Jaccard
+ cutoff
+ o Junction filtering adapted to usage of Intron Jaccard Index metric
+ o Require min expression of N >= 10 in 25% of the samples
+ o Results table:
+ o Functionality to flag outliers in blacklist regions of the genome
+      o Functionality to annotate the predicted type of aberrant splicing
+ (e.g. exon skipping, intron retention etc.)
+ o Several updates in the plotting functions
+      o Introduction of Manhattan plot functionality
+      o Possibility to create sashimi plots to visualize read coverage in
+        the bam files for outliers
+
CHANGES IN VERSION 1.8.1
-------------------------
o Bugfix in merging splicing counts (#41)
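The feature list above maps onto a small user-facing API. Below is a minimal
sketch of the FRASER2-style workflow on the bundled test data, using only calls
whose usage appears in the updated roxygen examples later in this diff; the new
annotation and plotting helpers (annotatePotentialImpact, flagBlacklistRegions,
plotManhattan, plotBamCoverage*) are exported in the NAMESPACE above but their
signatures are not shown here, so they are left out of the sketch.

library(FRASER)

# small FraserDataSet shipped with the package for the examples
fds <- createTestFraserDataSet()

# compute PSI values, including the new Intron Jaccard Index
fds <- calculatePSIValues(fds)

# fit the default metric (jaccard) and compute nominal and adjusted p-values
fds <- FRASER(fds, q=2, type="jaccard")

# annotate gene symbols and extract gene-level results
fds <- annotateRangesWithTxDb(fds)
res <- results(fds, aggregate=TRUE, padjCutoff=NA, deltaPsiCutoff=0.1)

# number of aberrant events per sample based on the Jaccard metric
aberrant(fds, type="jaccard", by="sample")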
diff --git a/R/AllGenerics-definitions.R b/R/AllGenerics-definitions.R
index ced4142a..0015b143 100644
--- a/R/AllGenerics-definitions.R
+++ b/R/AllGenerics-definitions.R
@@ -146,3 +146,14 @@ setGeneric("nonSplicedReads",
#' @export
setGeneric("nonSplicedReads<-", signature = "object",
function(object, value) standardGeneric("nonSplicedReads<-"))
+
+#' @rdname plotFunctions
+#' @export
+setGeneric("plotManhattan", function(object, ...)
+ standardGeneric("plotManhattan"))
+
+#' @rdname filtering
+#' @export
+setGeneric("filterVariability", function(object, ...)
+ standardGeneric("filterVariability"))
+
diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index cfb73078..05eb80d1 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -12,8 +12,8 @@ asFDS <- function(x){
#'
#' @title Getter/Setter methods for the FraserDataSet
#'
-#' The following methods are getter and setter methods to extract or set
-#' certain values of a FraserDataSet object.
+#' @description The following methods are getter and setter methods to extract
+#' or set certain values of a FraserDataSet object.
#'
#' \code{samples} sets or gets the sample IDs; \code{condition} ;
#' \code{}
@@ -66,6 +66,9 @@ NULL
#' @rdname fds-methods
#' @export
setMethod("samples", "FraserDataSet", function(object) {
+ if(!is.null(colnames(object))){
+ return(colnames(object))
+ }
return(as.character(colData(object)[,"sampleID"]))
})
@@ -74,6 +77,7 @@ setMethod("samples", "FraserDataSet", function(object) {
setReplaceMethod("samples", "FraserDataSet", function(object, value) {
colData(object)[,"sampleID"] <- as.character(value)
rownames(colData(object)) <- colData(object)[,"sampleID"]
+ colnames(object) <- as.character(value)
validObject(object)
return(object)
})
@@ -515,12 +519,14 @@ setReplaceMethod("rowRanges", "FraserDataSet", FRASER.rowRanges.replace)
#' @examples
#' fds <- createTestFraserDataSet()
#'
-#' counts(fds, type="psi5", side="ofInterest")
-#' counts(fds, type="psi5", side="other")
+#' counts(fds, side="ofInterest")
+#' counts(fds, type="jaccard", side="other")
+#' head(K(fds))
+#' head(K(fds, type="psi5"))
#' head(K(fds, type="psi3"))
#' head(N(fds, type="theta"))
#'
-setMethod("counts", "FraserDataSet", function(object, type=NULL,
+setMethod("counts", "FraserDataSet", function(object, type=currentType(object),
side=c("ofInterest", "otherSide")){
side <- match.arg(side)
if(side=="ofInterest"){
@@ -536,8 +542,9 @@ setMethod("counts", "FraserDataSet", function(object, type=NULL,
# extract psi value from type
type <- whichPSIType(type)
if(length(type) == 0 | length(type) > 1){
- stop(paste0("Please provide a correct psi type: psi5, psi3, or ",
- "theta. Not the given one: '", type, "'."))
+ stop(paste0("Please provide a correct psi type: psi5, psi3, ",
+ "theta or jaccard. Not the given one: '",
+ type, "'."))
}
aname <- paste0("rawOtherCounts_", type)
if(!aname %in% assayNames(object)){
@@ -551,7 +558,8 @@ setMethod("counts", "FraserDataSet", function(object, type=NULL,
#' setter for count data
#'
#' @rdname counts
-setReplaceMethod("counts", "FraserDataSet", function(object, type=NULL,
+setReplaceMethod("counts", "FraserDataSet", function(object,
+ type=currentType(object),
side=c("ofInterest", "otherSide"), ..., value){
side <- match.arg(side)
@@ -578,150 +586,289 @@ setAs("DataFrame", "matrix", function(from){
as.matrix(as(from, "data.table")) })
#'
-#' retrieve a single sample result object
-#' @noRd
-resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores, psivals,
- rawCts, rawTotalCts, deltaPsiVals, muPsi, psiType, fdrCut,
- zscoreCut, dPsiCut, rowMeansK, rowMeansN, minCount,
- additionalColumns){
-
- zscore <- zscores[,sampleID]
- dpsi <- deltaPsiVals[,sampleID]
- pval <- pvals[,sampleID]
- padj <- padjs[,sampleID]
-
- goodCut <- !logical(length(zscore))
- if(!is.na(zscoreCut)){
- goodCut <- goodCut & na2default(abs(zscore) >= zscoreCut, TRUE)
- }
- if(!is.na(dPsiCut)){
- goodCut <- goodCut & na2default(abs(dpsi) >= dPsiCut, TRUE)
- }
- if(!is.na(fdrCut)){
- goodCut <- goodCut & na2false(padj <= fdrCut)
+#' Mapping of chromosome names
+#'
+#' @param fds FraserDataSet
+#' @param style The style of the chromosome names.
+#' @param ... Further parameters. For mapSeqlevels: further parameters
+#' passed to GenomeInfoDb::mapSeqlevels().
+#'
+#' @rdname fds-methods
+#' @export
+mapSeqlevels <- function(fds, style="UCSC", ...){
+
+ mappings <- na.omit(GenomeInfoDb::mapSeqlevels(seqlevels(fds), style, ...))
+ # fix missing names() when fds has only a single chromosome
+ if(is.null(names(mappings))){
+ names(mappings) <- seqlevels(fds)
}
- if(!is.na(minCount)){
- goodCut <- goodCut & rawTotalCts[,sampleID] >= minCount
+
+ if(length(mappings) != length(seqlevels(fds))){
+ message(date(), ": Drop non standard chromosomes for compatibility.")
+ fds <- keepStandardChromosomes(fds)
+ nonSplicedReads(fds) <- keepStandardChromosomes(nonSplicedReads(fds))
+ validObject(fds)
}
+ fds <- fds[as.vector(seqnames(fds)) %in% names(mappings)]
+
+ seqlevels(fds) <- as.vector(mappings)
+ seqlevels(nonSplicedReads(fds)) <- as.vector(mappings)
+
+ return(fds)
+}
+#'
+#' retrieve a single sample result object
+#' @noRd
+resultsSingleSample <- function(sampleID, gr, pvals, padjs,
+ psivals, rawCts, rawTotalCts, rawNonsplitCts,
+ rawNsProportion, nsProportion_99quantile,
+ deltaPsiVals, psiType, rowMeansK, rowMeansN,
+ aberrant, aggregate, rho,
+ pvalsGene=NULL, padjsGene=NULL,
+ aberrantGene, additionalColumns,
+ geneColumn="hgnc_symbol"){
+ mcols(gr)$idx <- seq_along(gr)
+ # if gene level results, find the most aberrant junction per gene first
+ if(isTRUE(aggregate)){
+ goodGenes <- rownames(aberrantGene)[aberrantGene[,sampleID] &
+ !is.na(aberrantGene[,sampleID])]
+ geneJunctions <- findJunctionsForAberrantGenes(gr=gr,
+ aberrantGenes=goodGenes,
+ pvals=pvals[,sampleID],
+ dpsi=deltaPsiVals[,sampleID],
+ geneColumn=geneColumn,
+ aberrantJunctions=aberrant[,sampleID])
+ goodCut <- rep(FALSE, nrow(pvals))
+ goodCut[geneJunctions] <- TRUE
+ } else{
+ goodCut <- aberrant[,sampleID]
+ }
+
ans <- granges(gr[goodCut])
-
+
if(!any(goodCut)){
return(ans)
}
-
- if(!"hgnc_symbol" %in% colnames(mcols(gr))){
- mcols(gr)$hgnc_symbol <- NA_character_
+ mcols(ans)$idx <- mcols(gr)$idx[goodCut]
+
+ if(!geneColumn %in% colnames(mcols(gr))){
+ mcols(gr)[,geneColumn] <- NA_character_
}
-
+
# extract data
mcols(ans)$sampleID <- Rle(sampleID)
if("hgnc_symbol" %in% colnames(mcols(gr))){
- mcols(ans)$hgncSymbol <- Rle(mcols(gr[goodCut])$hgnc_symbol)
- }
- if("other_hgnc_symbol" %in% colnames(mcols(gr))){
- mcols(ans)$addHgncSymbols <- Rle(mcols(gr[goodCut])$other_hgnc_symbol)
+ mcols(ans)$hgncSymbol <- Rle(mcols(gr[goodCut])[,geneColumn])
}
+
mcols(ans)$type <- Rle(psiType)
- mcols(ans)$pValue <- signif(pval[goodCut], 5)
- mcols(ans)$padjust <- signif(padj[goodCut], 5)
- mcols(ans)$zScore <- Rle(round(zscore[goodCut], 2))
+ mcols(ans)$pValue <- signif(pvals[goodCut,sampleID], 5)
+ mcols(ans)$padjust <- signif(padjs[goodCut,sampleID], 5)
mcols(ans)$psiValue <- Rle(round(psivals[goodCut,sampleID], 2))
- mcols(ans)$deltaPsi <- Rle(round(dpsi[goodCut], 2))
- mcols(ans)$meanCounts <- Rle(round(rowMeansK[goodCut], 2))
- mcols(ans)$meanTotalCounts <- Rle(round(rowMeansN[goodCut], 2))
+ mcols(ans)$deltaPsi <- round(deltaPsiVals[goodCut,sampleID], 2)
mcols(ans)$counts <- Rle(rawCts[goodCut, sampleID])
mcols(ans)$totalCounts <- Rle(rawTotalCts[goodCut, sampleID])
+ mcols(ans)$meanCounts <- Rle(round(rowMeansK[goodCut], 2))
+ mcols(ans)$meanTotalCounts <- Rle(round(rowMeansN[goodCut], 2))
+
+ if(psiType == "jaccard"){
+ mcols(ans)$nonsplitCounts <-
+ Rle(round(rawNonsplitCts[goodCut, sampleID], 2))
+ mcols(ans)$nonsplitProportion <-
+ Rle(round(rawNsProportion[goodCut, sampleID], 2))
+ mcols(ans)$nonsplitProportion_99quantile <-
+ Rle(round(nsProportion_99quantile[goodCut], 2))
+ }
if(!is.null(additionalColumns)){
for(column in additionalColumns){
mcols(ans)[,column] <- Rle(mcols(gr[goodCut])[,column])
}
}
-
- return(ans[order(mcols(ans)$pValue)])
+
+ if(isTRUE(aggregate)){
+ # report junction more than once if it is significant for several genes
+ nrGenesPerJunction <- table(geneJunctions)
+ ans <- rep(ans, nrGenesPerJunction[as.character(mcols(ans)$idx)])
+ mcols(ans)$hgncSymbol <-
+ as.data.table(ans)[, names(geneJunctions)[geneJunctions == idx],
+ by = eval(colnames(mcols(ans)))][,V1]
+
+ # add gene level pvalue
+ mcols(ans)$pValueGene <-
+ signif(pvalsGene[mcols(ans)$hgncSymbol,sampleID], 5)
+ mcols(ans)$padjustGene <-
+ signif(padjsGene[mcols(ans)$hgncSymbol,sampleID], 5)
+ mcols(ans)$hgncSymbol <- Rle(mcols(ans)$hgncSymbol)
+ }
+
+ # remove helper column
+ mcols(ans)$idx <- NULL
+
+
+ return(ans[order(mcols(ans)$pValue, -abs(mcols(ans)$deltaPsi))])
}
-FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
- dPsiCutoff, psiType, BPPARAM=bpparam(), maxCols=20,
- minCount, additionalColumns=NULL){
-
- # check input
- checkNaAndRange(fdrCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE)
- checkNaAndRange(dPsiCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE)
- checkNaAndRange(zscoreCutoff, min=0, max=100, scalar=TRUE, na.ok=TRUE)
- checkNaAndRange(minCount, min=0, max=Inf, scalar=TRUE, na.ok=TRUE)
-
+FRASER.results <- function(object, sampleIDs, fdrCutoff,
+ dPsiCutoff, minCount, rhoCutoff, psiType,
+ maxCols=20, aggregate=FALSE, collapse=FALSE,
+ geneColumn="hgnc_symbol", BPPARAM=bpparam(),
+                        subsetName=NULL, all=FALSE, additionalColumns=NULL){
+
stopifnot(is(object, "FraserDataSet"))
stopifnot(all(sampleIDs %in% samples(object)))
-
+
+ if("annotatedJunction" %in% colnames(mcols(object, type="j")) &&
+ !("annotatedJunction" %in% additionalColumns)){
+ additionalColumns <- c(additionalColumns, "annotatedJunction")
+ }
+
+ # only extract results for requested psiTypes if pvals exist for them
+ stopifnot(all(psiType %in% psiTypes))
+ if(is.na(rhoCutoff)){
+ rhoCutoff <- 1
+ }
+ pvalsAvailable <- checkPadjAvailableForFilters(object, type=psiType,
+ filters=list(rho=rhoCutoff),
+ aggregate=aggregate,
+ subsetName=subsetName)
+ psiType <- psiType[pvalsAvailable]
+    if(!any(pvalsAvailable)){
+ stop("For the splice metric(s), pvalues are not yet computed. \n",
+ "Please compute them first by running the ",
+ "calculatePadjValues function.")
+ }
+
resultsls <- bplapply(psiType, BPPARAM=BPPARAM, function(type){
- message(date(), ": Collecting results for: ", type)
+ message(date(), ": Collecting results for: ", type,
+ ifelse(is.null(subsetName), " (transcriptome-wide)",
+ paste0(" (", subsetName, ")")))
currentType(object) <- type
gr <- rowRanges(object, type=type)
-
+
# first get row means
rowMeansK <- rowMeans(K(object, type=type))
rowMeansN <- rowMeans(N(object, type=type))
-
+
+ # get proportion of nonsplitCounts among all counts (N) for each intron
+ if(type == "jaccard"){
+ rawNonsplitCts <- as.matrix(assay(object, "rawCountsJnonsplit"))
+ rawNsProportion <- rawNonsplitCts / as.matrix(N(object))
+ nsProportion_99quantile <-
+ rowQuantiles(rawNsProportion, probs=0.99)
+ } else{
+ rawNonsplitCts <- NULL
+ rawNsProportion <- NULL
+ nsProportion_99quantile <- NULL
+ }
+
# then iterate by chunk
chunkCols <- getMaxChunks2Read(fds=object, assayName=type, max=maxCols)
sampleChunks <- getSamplesByChunk(fds=object, sampleIDs=sampleIDs,
- chunkSize=chunkCols)
-
+ chunkSize=chunkCols)
+
ans <- lapply(seq_along(sampleChunks), function(idx){
message(date(), ": Process chunk: ", idx, " for: ", type)
sc <- sampleChunks[[idx]]
tmp_x <- object[,sc]
-
+
# extract values
rawCts <- as.matrix(K(tmp_x))
rawTotalCts <- as.matrix(N(tmp_x))
- pvals <- as.matrix(pVals(tmp_x))
- padjs <- as.matrix(padjVals(tmp_x))
- zscores <- as.matrix(zScores(tmp_x))
+ pvals <- as.matrix(pVals(tmp_x,
+ filters=list(rho=rhoCutoff)))
+ padjs <- as.matrix(padjVals(tmp_x,
+ subsetName=subsetName,
+ filters=list(rho=rhoCutoff)))
psivals <- as.matrix(assay(tmp_x, type))
muPsi <- as.matrix(predictedMeans(tmp_x))
psivals_pc <- (rawCts + pseudocount()) /
- (rawTotalCts + 2*pseudocount())
- deltaPsiVals <- psivals_pc - muPsi
-
+ (rawTotalCts + 2*pseudocount())
+ deltaPsiVals <- deltaPsiValue(tmp_x, type)
+ rho <- rho(tmp_x, type)
+ aberrant <- aberrant.FRASER(tmp_x, type=type,
+ padjCutoff=fdrCutoff,
+ deltaPsiCutoff=dPsiCutoff,
+ minCount=minCount,
+ rhoCutoff=rhoCutoff,
+ aggregate=FALSE,
+ all=all,
+ geneColumn=geneColumn,
+ subsetName=subsetName)
+ if(isTRUE(aggregate)){
+ pvalsGene <- as.matrix(pVals(tmp_x, level="gene",
+ filters=list(rho=rhoCutoff)))
+ padjsGene <- as.matrix(padjVals(tmp_x, level="gene",
+ subsetName=subsetName,
+ filters=list(rho=rhoCutoff)))
+ aberrantGene <- aberrant.FRASER(tmp_x, type=type,
+ padjCutoff=fdrCutoff,
+ deltaPsiCutoff=dPsiCutoff,
+ minCount=minCount,
+ rhoCutoff=rhoCutoff,
+ aggregate=TRUE,
+ all=all,
+ geneColumn=geneColumn,
+ subsetName=subsetName)
+ } else{
+ pvalsGene <- NULL
+ padjsGene <- NULL
+ aberrantGene <- NULL
+ }
+
if(length(sc) == 1){
colnames(pvals) <- sc
colnames(padjs) <- sc
- colnames(zscores) <- sc
colnames(deltaPsiVals) <- sc
}
# create result table
sampleRes <- lapply(sc,
- resultsSingleSample, gr=gr, pvals=pvals, padjs=padjs,
- zscores=zscores, psiType=type, psivals=psivals,
- deltaPsiVals=deltaPsiVals, muPsi=muPsi, rawCts=rawCts,
- rawTotalCts=rawTotalCts, fdrCut=fdrCutoff,
- zscoreCut=zscoreCutoff, dPsiCut=dPsiCutoff,
- rowMeansK=rowMeansK, rowMeansN=rowMeansN,
- minCount=minCount, additionalColumns=additionalColumns)
-
+ resultsSingleSample, gr=gr, pvals=pvals,
+ padjs=padjs, psiType=type,
+ psivals=psivals, deltaPsiVals=deltaPsiVals,
+ rawCts=rawCts, rawTotalCts=rawTotalCts,
+ rawNonsplitCts=rawNonsplitCts[,sc,drop=FALSE],
+ rawNsProportion=rawNsProportion[,sc,drop=FALSE],
+ nsProportion_99quantile=nsProportion_99quantile,
+ rowMeansK=rowMeansK, rowMeansN=rowMeansN,
+ aberrant=aberrant, aggregate=aggregate,
+ rho=rho, geneColumn=geneColumn,
+ pvalsGene=pvalsGene, padjsGene=padjsGene,
+ aberrantGene=aberrantGene,
+ additionalColumns=additionalColumns)
+
# return combined result
return(unlist(GRangesList(sampleRes)))
})
-
+
unlist(GRangesList(ans))
})
-
+
# merge results
ans <- unlist(GRangesList(resultsls))
-
+
# sort it if existing
if(length(ans) > 0){
ans <- ans[order(ans$pValue)]
+ if(is.null(subsetName)){
+ mcols(ans)[["FDR_set"]] <- "transcriptome-wide"
+ } else{
+ mcols(ans)[["FDR_set"]] <- subsetName
+ }
}
-
+
+ # collapse into one row per gene if requested
+ if(isTRUE(aggregate) && isTRUE(collapse)){
+ ans <- collapseResTablePerGene(ans)
+ }
+
# return only the results
return(ans)
}
+
#'
#' Extracting results and aberrant splicing events
#'
@@ -733,31 +880,41 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
#' @param sampleIDs A vector of sample IDs for which results should be
#' retrieved
#' @param padjCutoff The FDR cutoff to be applied or NA if not requested.
-#' @param zScoreCutoff The z-score cutoff to be applied or NA if not requested.
#' @param deltaPsiCutoff The cutoff on delta psi or NA if not requested.
#' @param minCount The minimum count value of the total coverage of an intron
#' to be considered as significant.
#' result
+#' @param rhoCutoff The cutoff value on the fitted rho value
+#' (overdispersion parameter of the betabinomial) above which
+#' junctions are filtered
#' @param psiType The psi types for which the results should be retrieved.
#' @param additionalColumns Character vector containing the names of additional
#' columns from mcols(fds) that should appear in the result table
#' (e.g. ensembl_gene_id). Default is \code{NULL}, so no additional columns
#' are included.
#' @param BPPARAM The BiocParallel parameter.
-#' @param res Result as created with \code{results()}
-#' @param geneColumn The name of the column in \code{mcols(res)} that contains
-#' the gene symbols.
-#' @param method The p.adjust method that is being used to adjust p values per
-#' sample.
#' @param type Splicing type (psi5, psi3 or theta)
#' @param by By default \code{none} which means no grouping. But if
#' \code{sample} or \code{feature} is specified the sum by
#' sample or feature is returned
-#' @param aggregate If TRUE the returned object is based on the grouped
-#' features
-#' @param ... Further arguments can be passed to the method. If "zscores",
-#' "padjVals" or "dPsi" is given, the values of those arguments
-#' are used to define the aberrant events.
+#' @param aggregate If TRUE the returned object is aggregated to the feature
+#' level (i.e. gene level).
+#' @param collapse Only takes effect if \code{aggregate=TRUE}.
+#' If TRUE, collapses results across the different psi
+#' types to return only one row per feature (gene) and sample.
+#' @param geneColumn The column name of the column that has the gene annotation
+#' that will be used for gene-level pvalue computation.
+#' @param all By default (FALSE), only significant introns (or genes) are listed
+#' in the results. If TRUE, results are assembled for all
+#' samples and introns/genes regardless of significance.
+#' @param returnTranscriptomewideResults If FDR corrected pvalues for subsets
+#' of genes of interest have been calculated, this parameter
+#' indicates whether additionally the transcriptome-wide results
+#' should be returned as well (default), or whether only results
+#' for those subsets should be retrieved.
+#' @param ... Further arguments can be passed to the method. If "n",
+#' "padjVals", "dPsi" or "rhoVals" are given, the values of those
+#' arguments are used to define the aberrant events.
#'
#' @return For \code{results}: GRanges object containing significant results.
#' For \code{aberrant}: Either a of logical values of size
@@ -770,186 +927,212 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
#' # get data, fit and compute p-values and z-scores
#' fds <- createTestFraserDataSet()
#'
-#' # extract results: for this example dataset, z score cutoff of 2 is used to
-#' # get at least one result and show the output
-#' res <- results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05)
+#' # extract results: for this example dataset, no cutoffs are used to
+#' # show the output of the results function
+#' res <- results(fds, all=TRUE)
#' res
#'
#' # aggregate the results by genes (gene symbols need to be annotated first
#' # using annotateRanges() function)
-#' resultsByGenes(res)
+#' results(fds, padjCutoff=NA, deltaPsiCutoff=0.1, aggregate=TRUE)
+#'
+#' # aggregate the results by genes and collapse over all psi types to obtain
+#' # only one row per gene in the results table
+#' results(fds, padjCutoff=NA, deltaPsiCutoff=0.1, aggregate=TRUE,
+#' collapse=TRUE)
#'
#' # get aberrant events per sample: on the example data, nothing is aberrant
#' # based on the adjusted p-value
-#' aberrant(fds, type="psi5", by="sample")
+#' aberrant(fds, type="jaccard", by="sample")
#'
#' # get aberrant events per gene (first annotate gene symbols)
#' fds <- annotateRangesWithTxDb(fds)
-#' aberrant(fds, type="psi5", by="feature", zScoreCutoff=2, padjCutoff=NA,
-#' aggregate=TRUE)
+#' aberrant(fds, type="jaccard", by="feature", padjCutoff=NA, aggregate=TRUE)
#'
#' # find aberrant junctions/splice sites
-#' aberrant(fds, type="psi5")
-#' @export
-setMethod("results", "FraserDataSet", function(object,
- sampleIDs=samples(object), padjCutoff=0.05,
- zScoreCutoff=NA, deltaPsiCutoff=0.3,
- minCount=5, psiType=c("psi3", "psi5", "theta"),
- additionalColumns=NULL, BPPARAM=bpparam(), ...){
- FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff,
- zscoreCutoff=zScoreCutoff, dPsiCutoff=deltaPsiCutoff,
- minCount=minCount, psiType=match.arg(psiType, several.ok=TRUE),
- additionalColumns=additionalColumns, BPPARAM=BPPARAM)
-})
-
-#' @rdname results
-#' @export
-resultsByGenes <- function(res, geneColumn="hgncSymbol", method="BY"){
- # sort by pvalue
- res <- res[order(res$pValue)]
-
- # extract subset
- if(is(res, "GRanges")){
- ans <- as.data.table(mcols(res)[,c(geneColumn, "pValue", "sampleID")])
- colnames(ans) <- c("features", "pval", "sampleID")
- } else {
- ans <- featureNames <- res[,.(
- features=get(geneColumn), pval=pValue, sampleID=sampleID)]
- }
-
- # remove NAs
- naIdx <- ans[,is.na(features)]
- ansNoNA <- ans[!is.na(features)]
-
- # compute pvalues by gene
- ansNoNA[,pByFeature:=min(p.adjust(pval, method="holm")),
- by="sampleID,features"]
-
- # subset to lowest pvalue by gene
- dupIdx <- duplicated(ansNoNA[,.(features,sampleID)])
- ansGenes <- ansNoNA[!dupIdx]
-
- # compute FDR
- ansGenes[,fdrByFeature:=p.adjust(pByFeature, method=method),
- by="sampleID"]
-
- # get final result table
- finalAns <- res[!naIdx][!dupIdx]
- finalAns$pValueGene <- ansGenes$pByFeature
- finalAns$padjustGene <- ansGenes$fdrByFeature
- finalAns
-}
-
-#'
-#' Mapping of chromosome names
-#'
-#' @param fds FraserDataSet
-#' @param style The style of the chromosome names.
-#' @param ... Further parameters. For mapSeqLevels: further parameters
-#' passed to GenomeInfoDb::mapSeqlevels().
+#' aberrant(fds, type="jaccard")
+#'
+#' # retrieve results limiting FDR correction to only a subset of genes
+#' # first, we need to create a list of genes per sample that will be tested
+#' geneList <- list('sample1'=c("TIMMDC1"), 'sample2'=c("MCOLN1"))
+#' fds <- calculatePadjValues(fds, type="jaccard",
+#' subsets=list("exampleSubset"=geneList))
+#' results(fds, all=TRUE, returnTranscriptomewideResults=FALSE)
#'
-#' @rdname fds-methods
#' @export
-mapSeqlevels <- function(fds, style="UCSC", ...){
-
- mappings <- na.omit(GenomeInfoDb::mapSeqlevels(seqlevels(fds), style, ...))
- # fix missing names() when fds has only a single chromosome
- if(is.null(names(mappings))){
- names(mappings) <- seqlevels(fds)
+setMethod("results", "FraserDataSet", function(object,
+ sampleIDs=samples(object), padjCutoff=0.1,
+ deltaPsiCutoff=0.1,
+ rhoCutoff=NA, aggregate=FALSE, collapse=FALSE,
+ minCount=5, psiType=psiTypes,
+ geneColumn="hgnc_symbol", all=FALSE,
+ returnTranscriptomewideResults=TRUE,
+ additionalColumns=NULL, BPPARAM=bpparam()){
+ psiType <- match.arg(psiType, several.ok=TRUE)
+ FDRsets <- availableFDRsubsets(object)
+
+ if(isFALSE(returnTranscriptomewideResults) && is.null(FDRsets)){
+ warning("Retrieving transcriptome-wide results as no other ",
+ "FDR subsets are available in the fds object.")
+ returnTranscriptomewideResults <- TRUE
+ }
+ if(isTRUE(returnTranscriptomewideResults)){
+ res <- FRASER.results(object=object, sampleIDs=sampleIDs,
+ fdrCutoff=padjCutoff, dPsiCutoff=deltaPsiCutoff,
+ rhoCutoff=rhoCutoff, minCount=minCount,
+ psiType=psiType, all=all,
+ aggregate=aggregate, collapse=collapse, geneColumn=geneColumn,
+ subsetName=NULL, additionalColumns=additionalColumns,
+ BPPARAM=BPPARAM)
}
-
- if(length(mappings) != length(seqlevels(fds))){
- message(date(), ": Drop non standard chromosomes for compatibility.")
- fds <- keepStandardChromosomes(fds)
- nonSplicedReads(fds) <- keepStandardChromosomes(nonSplicedReads(fds))
- validObject(fds)
+
+ # add results for FDR_subsets if requested
+ if(!is.null(FDRsets)){
+ resls_subsets <- lapply(FDRsets, function(setName){
+ res_sub <- FRASER.results(object=object, sampleIDs=sampleIDs,
+ fdrCutoff=padjCutoff, dPsiCutoff=deltaPsiCutoff,
+ rhoCutoff=rhoCutoff, minCount=minCount,
+ psiType=psiType, all=all,
+ aggregate=aggregate, collapse=collapse, geneColumn=geneColumn,
+ subsetName=setName, additionalColumns=additionalColumns,
+ BPPARAM=BPPARAM)
+ })
+
+ if(isTRUE(returnTranscriptomewideResults)){
+ res <- unlist(GRangesList(unlist(list(res, resls_subsets))))
+ } else{
+ res <- unlist(GRangesList(unlist(resls_subsets)))
+ }
+
+ # sort it if existing
+ if(length(res) > 0){
+ res <- res[order(res$pValue)]
+ if(isTRUE(aggregate)){
+ res <- res[!is.na(res$pValueGene)]
+ }
+ }
}
- fds <- fds[as.vector(seqnames(fds)) %in% names(mappings)]
-
- seqlevels(fds) <- as.vector(mappings)
- seqlevels(nonSplicedReads(fds)) <- as.vector(mappings)
-
- return(fds)
-}
-
-
-aberrant.FRASER <- function(object, type=currentType(object), padjCutoff=0.05,
- deltaPsiCutoff=0.3, zScoreCutoff=NA, minCount=5,
- by=c("none", "sample", "feature"), aggregate=FALSE, ...){
+ return(res)
+})
- checkNaAndRange(zScoreCutoff, min=0, max=Inf, na.ok=TRUE)
- checkNaAndRange(padjCutoff, min=0, max=1, na.ok=TRUE)
- checkNaAndRange(deltaPsiCutoff, min=0, max=1, na.ok=TRUE)
+aberrant.FRASER <- function(object, type=fitMetrics(object),
+ padjCutoff=0.1, deltaPsiCutoff=0.1,
+ minCount=5, rhoCutoff=NA,
+ by=c("none", "sample", "feature"),
+ aggregate=FALSE, geneColumn="hgnc_symbol",
+ subsetName=NULL, all=FALSE, ...){
+
+ checkNaAndRange(padjCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE)
+ checkNaAndRange(deltaPsiCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE)
+ checkNaAndRange(rhoCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE)
+ checkNaAndRange(minCount, min=0, max=Inf, scalar=TRUE, na.ok=TRUE)
by <- match.arg(by)
-
+ type <- match.arg(type)
+
+ if(is.na(rhoCutoff)){
+ rhoCutoff <- 1
+ }
+
dots <- list(...)
if("n" %in% names(dots)){
n <- dots[['n']]
} else {
n <- N(object, type=type)
}
- if("zscores" %in% names(dots)){
- zscores <- dots[['zscores']]
- } else {
- zscores <- zScores(object, type=type)
- }
if("padjVals" %in% names(dots)){
padj <- dots[['padjVals']]
} else {
- padj <- padjVals(object, type=type)
+ # check if padj values are available for the given filters
+ pvalsAvailable <- checkPadjAvailableForFilters(object, type=type,
+ filters=list(rho=rhoCutoff),
+ aggregate=aggregate,
+ subsetName=subsetName)
+ if(isFALSE(pvalsAvailable)){
+ stop("For the given filters, pvalues are not yet computed. \n",
+ "Please compute them first by running the ",
+ "calculatePadjValues function with the requested filters.")
+ }
+ padj <- padjVals(object, type=type, level="site", subsetName=subsetName,
+ filters=list(rho=rhoCutoff))
}
if("dPsi" %in% names(dots)){
dpsi <- dots[['dPsi']]
} else {
dpsi <- deltaPsiValue(object, type=type)
}
-
-
- # create cutoff matrix
- goodCutoff <- matrix(TRUE, nrow=nrow(zscores), ncol=ncol(zscores),
- dimnames=dimnames(zscores))
- if("hgnc_symbol" %in% colnames(mcols(object, type=type)) &
- nrow(mcols(object, type=type)) == nrow(goodCutoff)){
- rownames(goodCutoff) <- mcols(object, type=type)[,"hgnc_symbol"]
- } else if(isTRUE(aggregate)){
- stop("Please provide hgnc symbols to compute gene p values!")
+ if("rhoVals" %in% names(dots)){
+ rho <- dots[['rhoVals']]
+ } else {
+ rho <- matrix(rho(object, type=type),
+ nrow=nrow(dpsi), ncol=ncol(dpsi))
+ }
+ if(isTRUE(aggregate)){
+ if("padjGeneVals" %in% names(dots)){
+ padj_gene <- dots[['padjGeneVals']]
+ } else{
+ padj_gene <- padjVals(object, type=type, level="gene",
+ subsetName=subsetName,
+ filters=list(rho=rhoCutoff))
+ }
+
}
- # check each cutoff if in use (not NA)
- if(!is.na(minCount)){
- goodCutoff <- goodCutoff & as.matrix(n >= minCount)
- }
- if(!is.na(zScoreCutoff)){
- goodCutoff <- goodCutoff & as.matrix(abs(zscores) >= zScoreCutoff)
- }
- if(!is.na(deltaPsiCutoff)){
- goodCutoff <- goodCutoff & as.matrix(abs(dpsi) >= deltaPsiCutoff)
+ if(is.na(padjCutoff)){
+ padjCutoff <- 1
}
- if(!is.na(padjCutoff)){
- goodCutoff <- goodCutoff & as.matrix(padj <= padjCutoff)
- }
- goodCutoff[is.na(goodCutoff)] <- FALSE
- # check if we should go for aggregation
- # TODO to speed it up we only use any hit within a feature
- # but should do a holm's + BY correction per gene and genome wide
+ if(isTRUE(all)){
+ aberrantEvents <- matrix(TRUE, nrow=nrow(object), ncol=ncol(object))
+        colnames(aberrantEvents) <- colnames(object)
+ } else{
+ aberrantEvents <- as.matrix(padj) <= padjCutoff
+
+ # check each cutoff if in use (not NA)
+ if(!is.na(minCount)){
+ aberrantEvents <- aberrantEvents & as.matrix(n >= minCount)
+ }
+ if(!is.na(deltaPsiCutoff)){
+ aberrantEvents <- aberrantEvents &
+ as.matrix(abs(dpsi) >= deltaPsiCutoff)
+ }
+ if(!is.na(rhoCutoff)){
+ aberrantEvents <- aberrantEvents & as.matrix(rho <= rhoCutoff)
+ }
+ aberrantEvents[is.na(aberrantEvents)] <- FALSE
+ }
+
if(isTRUE(aggregate)){
- goodCutoff <- as.matrix(data.table(goodCutoff, keep.rownames=TRUE)[,
- as.data.table(t(colAnys(as.matrix(.SD)))), by=rn][,-1])
- rownames(goodCutoff) <- unique(mcols(object, type=type)[,"hgnc_symbol"])
- colnames(goodCutoff) <- colnames(zscores)
+ if(is.null(rownames(padj_gene))){
+ stop("Missing rownames for gene-level padj values.")
+ }
+ # reduce aberrant matrix to one row per gene
+ # (TRUE if any junction is aberrant for each sample)
+ ab_dt <- data.table(geneID=getGeneIDs(object, type=type, unique=FALSE,
+ geneColumn=geneColumn),
+ aberrantEvents)
+ ab_dt[, dt_idx:=seq_len(.N)]
+ dt_tmp <- ab_dt[!is.na(geneID), splitGenes(geneID), by="dt_idx"]
+ ab_dt <- ab_dt[dt_tmp$dt_idx]
+ ab_dt[,`:=`(geneID=dt_tmp$V1, dt_idx=NULL)]
+ ab_dt <- ab_dt[,lapply(.SD, any), by="geneID"]
+ aberrantEvents <- as.matrix(ab_dt[,-1])
+ rownames(aberrantEvents) <- ab_dt[,geneID]
+
+ if(isFALSE(all)){
+ aberrantEvents <- aberrantEvents & as.matrix(
+ padj_gene[rownames(aberrantEvents),colnames(aberrantEvents)]
+ ) <= padjCutoff
+ }
}
- # return results
- if(by == "feature"){
- return(rowSums(goodCutoff))
- }
- if(by == "sample"){
- return(colSums(goodCutoff))
- }
- return(goodCutoff)
+ return(switch(match.arg(by),
+ none = aberrantEvents,
+ sample = colSums(aberrantEvents, na.rm=TRUE),
+ feature = rowSums(aberrantEvents, na.rm=TRUE)
+ ))
}
#' @rdname results
#' @export
setMethod("aberrant", "FraserDataSet", aberrant.FRASER)
+
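Condensing the subset-FDR examples from the roxygen block above into one sketch
(sample IDs and gene symbols are taken from those examples and assumed to be
present in an annotated, fitted FraserDataSet fds):

# per-sample lists of genes to which FDR correction should be restricted
geneList <- list('sample1'=c("TIMMDC1"), 'sample2'=c("MCOLN1"))
fds <- calculatePadjValues(fds, type="jaccard",
                           subsets=list("exampleSubset"=geneList))

# subset-corrected padj values at the intron and gene level
head(padjVals(fds, type="jaccard", subsetName="exampleSubset"))
head(padjVals(fds, type="jaccard", level="gene", subsetName="exampleSubset"))

# results restricted to the subset FDR correction only
results(fds, all=TRUE, returnTranscriptomewideResults=FALSE)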
diff --git a/R/FRASER-package.R b/R/FRASER-package.R
index 87e5ad96..a175a0bd 100644
--- a/R/FRASER-package.R
+++ b/R/FRASER-package.R
@@ -22,15 +22,18 @@
#'
### GRange/Experiment/bamFile packages
#' @importFrom BiocGenerics updateObject counts counts<- strand strand<- which
-#' @importFrom GenomicFeatures makeTxDbFromGFF intronsByTranscript genes
+#' @importFrom GenomicFeatures makeTxDbFromGFF intronsByTranscript genes exons
+#' fiveUTRsByTranscript threeUTRsByTranscript seqlevels0
#' @importFrom GenomicAlignments junctions readGAlignments summarizeJunctions
#' readGAlignmentPairs
#' @importFrom SummarizedExperiment assay assay<- assays assays<- assayNames
#' colData colData<- rowData rowRanges rowRanges<- SummarizedExperiment
#' rbind Assays
#' @importFrom GenomicRanges findOverlaps granges GRanges GRangesList
-#' makeGRangesFromDataFrame invertStrand
-#' @importFrom IRanges subsetByOverlaps from to IRanges ranges
+#' makeGRangesFromDataFrame invertStrand start end start<- end<-
+#' seqinfo<-
+#' @importFrom IRanges subsetByOverlaps from to IRanges ranges nearest distance
+#' %over%
#' @importFrom Rsamtools ScanBamParam scanBamHeader bamMapqFilter
#' bamWhich bamWhich<- BamFile idxstatsBam
#' @importFrom Rsubread featureCounts
@@ -41,16 +44,14 @@
#' @importFrom biomaRt useEnsembl getBM
#' @importFrom AnnotationDbi select
#'
-#'
### Plotting
#'
#' @importFrom plotly plot_ly subplot layout add_trace ggplotly
#' @importFrom pheatmap pheatmap
#' @importFrom RColorBrewer brewer.pal
-#' @importFrom cowplot theme_cowplot
+#' @importFrom cowplot theme_cowplot background_grid
#' @importFrom ggrepel geom_text_repel
#'
-#'
### Data handling
#'
#' @importFrom HDF5Array writeHDF5Array path HDF5Array
@@ -74,18 +75,20 @@
#' @importFrom R.utils renameFile withTimeout
#' @importFrom tools file_path_as_absolute
#' @importFrom methods as callNextMethod is new show slot slot<- validObject
-#' @importFrom utils capture.output packageVersion
+#' @importFrom utils capture.output packageVersion tail
#'
#'
#'
### To be added into the functions above
#'
#' @importFrom S4Vectors DataFrame metadata Rle SimpleList mcols mcols<-
-#' start end metadata metadata<- subjectHits queryHits
+#' start end metadata metadata<- subjectHits queryHits elementMetadata
+#' values values<-
#' @importFrom grDevices colorRampPalette
#' @importFrom GenomeInfoDb keepStandardChromosomes seqlevels<- seqlevels
#' seqlengths seqlengths<- seqlevelsStyle<- seqlevelsStyle seqnames
-#' seqinfo standardChromosomes dropSeqlevels keepSeqlevels
+#' seqinfo standardChromosomes dropSeqlevels keepSeqlevels
+#' sortSeqlevels
#' @importFrom DelayedArray rowMaxs rowMeans path<- cbind plogis qlogis
#' DelayedArray
#' @importFrom DelayedMatrixStats colSds rowMedians rowSds colMeans2 rowMeans2
@@ -100,9 +103,10 @@
#' scale_y_log10 scale_color_gradientn labs theme_bw theme
#' scale_color_brewer scale_color_discrete scale_linetype_manual
#' annotate geom_histogram scale_fill_manual xlim scale_colour_manual
-#' element_blank annotation_logticks
+#' element_blank annotation_logticks ylim quo_name facet_grid
+#' facet_wrap geom_text
#'
-#' @importFrom tibble as_tibble
+#' @importFrom tibble as_tibble %>%
#'
#' @useDynLib FRASER
#'
@@ -128,5 +132,12 @@ globalVariables(c(".", "J", ".N", ".asDataFrame", "End", "first_feature",
"model", "mu", "n", ",nsubset", "o3", "o5", "obsPsi", "os", "pa",
"padj", "passed", "pByFeature", "pointNr", "predPsi", "psi3", "psi5",
"psiType", "psiValue", "seqlength", "seqlevel", "Step", "traceNr",
- "uniqueID", "V1", "value", "zscore", "maxDTheta"),
+ "uniqueID", "V1", "value", "zscore", "maxDTheta", "par", "genes_donor",
+ "genes_acceptor", "gene_pval", "gene_padj", "dt_idx",
+ "blacklist", "potentialImpact", "causesFrameshift", "annotatedJunction",
+ "distNearestGene", "UTR_overlap", "meanCount", "medianCount",
+ "potentialImpact2", "nonsplitProportion", "nonsplitCounts",
+ "nonsplitProportion_99quantile", "startID", "endID", "j_idx", "jidx",
+ "start_idx", "end_idx", "pval_gene", "FDR_subset_gene", "gene_id",
+ "pvalue"),
package="FRASER")
diff --git a/R/Fraser-pipeline.R b/R/Fraser-pipeline.R
index c3b9b967..92d0b492 100644
--- a/R/Fraser-pipeline.R
+++ b/R/Fraser-pipeline.R
@@ -31,7 +31,7 @@
#' splicing types.
#' @param implementation The method that should be used to correct for
#' confounders.
-#' @param type The type of PSI (psi5, psi3 or theta for theta/splicing
+#' @param type The type of PSI (jaccard, psi5, psi3 or theta for theta/splicing
#' efficiency)
#' @param iterations The maximal number of iterations. When the autoencoder has
#' not yet converged after these number of iterations, the fit stops anyway.
@@ -61,16 +61,32 @@
#' # The functions run inside the FRASER function can also be directly
#' # run themselves.
#' # To directly run the fit function:
-#' fds <- fit(fds, implementation="PCA", q=2, type="psi5")
+#' fds <- fit(fds, implementation="PCA", q=2, type="jaccard")
#'
#' # To directly run the nomial and adjusted p value and z score
#' # calculation, the following functions can be used:
-#' fds <- calculatePvalues(fds, type="psi5")
-#' head(pVals(fds, type="psi5"))
-#' fds <- calculatePadjValues(fds, type="psi5", method="BY")
-#' head(padjVals(fds, type="psi5"))
-#' fds <- calculateZscore(fds, type="psi5")
-#' head(zScores(fds, type="psi5"))
+#' fds <- calculatePvalues(fds, type="jaccard")
+#' head(pVals(fds, type="jaccard"))
+#' fds <- calculatePadjValues(fds, type="jaccard", method="BY")
+#' head(padjVals(fds, type="jaccard"))
+#' fds <- calculateZscore(fds, type="jaccard")
+#' head(zScores(fds, type="jaccard"))
+#'
+#' # example of restricting FDR correction to subsets of genes of interest
+#' genesOfInterest <- list("sample1"=c("TIMMDC1"), "sample2"=c("MCOLN1"))
+#' fds <- calculatePadjValues(fds, type="jaccard",
+#' subsets=list("exampleSubset"=genesOfInterest))
+#' padjVals(fds, type="jaccard", subsetName="exampleSubset")
+#' padjVals(fds, type="jaccard", level="gene", subsetName="exampleSubset")
+#' fds <- calculatePadjValues(fds, type="jaccard",
+#' subsets=list("anotherExampleSubset"=c("TIMMDC1")))
+#' padjVals(fds, type="jaccard", subsetName="anotherExampleSubset")
+#'
+#' # only adding FDR corrected pvalues on a subset without calculating
+#' # transcriptome-wide FDR again:
+#' fds <- calculatePadjValuesOnSubset(fds, genesToTest=genesOfInterest,
+#' subsetName="setOfInterest", type="jaccard")
+#' padjVals(fds, type="jaccard", subsetName="setOfInterest")
#'
#' @seealso \code{\link[FRASER]{fit}}
#'
@@ -85,8 +101,9 @@ NULL
#' the beta-binomial fit, the computation of Z scores and p values as well as
#' the computation of delta-PSI values.
#' @export
-FRASER <- function(fds, q, implementation=c("PCA", "PCA-BB-Decoder",
- "AE-weighted", "AE", "BB"),
+FRASER <- function(fds, q, type=fitMetrics(fds),
+ implementation=c("PCA", "PCA-BB-Decoder", "AE-weighted",
+ "AE", "BB"),
iterations=15, BPPARAM=bpparam(), correction, ...){
# Check input
implementation <- match.arg(implementation)
@@ -103,7 +120,7 @@ FRASER <- function(fds, q, implementation=c("PCA", "PCA-BB-Decoder",
}
# fit each splicing type separately
- for(i in psiTypes){
+ for(i in type){
# get type specific q
if(missing(q)){
@@ -128,8 +145,8 @@ FRASER <- function(fds, q, implementation=c("PCA", "PCA-BB-Decoder",
message(date(), ": Adjust p values for: '", i, "'.")
fds <- calculatePadjValues(fds, type=i)
- message(date(), ": Compute Z scores for: '", i, "'.")
- fds <- calculateZscore(fds, type=i)
+ # message(date(), ": Compute Z scores for: '", i, "'.")
+ # fds <- calculateZscore(fds, type=i)
}
# return final analysis
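With the z-score step commented out, FRASER() now only fits the metrics
returned by fitMetrics(fds) and computes nominal and adjusted p-values for
them; z-scores can still be added afterwards with the standalone call shown in
the examples above:

# z-scores are no longer computed inside FRASER(); add them on demand
fds <- calculateZscore(fds, type="jaccard")
head(zScores(fds, type="jaccard"))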
diff --git a/R/FraserDataSet-class.R b/R/FraserDataSet-class.R
index 630361d4..62eee8e8 100644
--- a/R/FraserDataSet-class.R
+++ b/R/FraserDataSet-class.R
@@ -218,13 +218,13 @@ showFraserDataSet <- function(object) {
cat("\n")
cat("-------------------- BAM parameters --------------------\n")
- if(identical(scanBamParam(FraserDataSet()), scanBamParam(object))){
- cat(paste0("Default used with: ",
- "bamMapqFilter=", bamMapqFilter(scanBamParam(object))
- ))
- } else {
+ # if(identical(scanBamParam(FraserDataSet()), scanBamParam(object))){
+ # cat(paste0("Default used with: ",
+ # "bamMapqFilter=", bamMapqFilter(scanBamParam(object))
+ # ))
+ # } else {
show(scanBamParam(object))
- }
+ # }
cat("\n\n")
}
diff --git a/R/RcppExports.R b/R/RcppExports.R
index 659f3b4c..42ec8fbc 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -41,6 +41,10 @@ truncNLL_rho <- function(rho, yi, ki, ni) {
.Call('_FRASER_truncNLL_rho', PACKAGE = 'FRASER', rho, yi, ki, ni)
}
+truncNLL_rho_penalized <- function(logit_rho, yi, ki, ni, lambda) {
+ .Call('_FRASER_truncNLL_rho_penalized', PACKAGE = 'FRASER', logit_rho, yi, ki, ni, lambda)
+}
+
fullNLL <- function(y, rho, k, n, D, lambda, byRows = FALSE) {
.Call('_FRASER_fullNLL', PACKAGE = 'FRASER', y, rho, k, n, D, lambda, byRows)
}
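truncNLL_rho_penalized() fits the overdispersion on the logit scale and adds a
penalty term controlled by lambda. The exact penalty used by the C++ routine is
not visible here; the following is only a rough R reference of the idea
(beta-binomial negative log-likelihood plus an assumed L2 penalty on
logit(rho), ignoring the truncation handling of the C++ implementation):

# yi: predicted means on the logit scale; ki/ni: junction and total counts
nll_rho_penalized_sketch <- function(logit_rho, yi, ki, ni, lambda){
    rho <- plogis(logit_rho)
    mu  <- plogis(yi)
    # beta-binomial parametrized by mean mu and intra-class correlation rho
    alpha <- mu * (1 - rho) / rho
    beta  <- (1 - mu) * (1 - rho) / rho
    nll <- -mean(lchoose(ni, ki) +
                     lbeta(ki + alpha, ni - ki + beta) -
                     lbeta(alpha, beta))
    nll + lambda * logit_rho^2    # assumed form of the penalty
}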
diff --git a/R/annotationOfRanges.R b/R/annotationOfRanges.R
index 62f6c377..8796cd0b 100644
--- a/R/annotationOfRanges.R
+++ b/R/annotationOfRanges.R
@@ -17,6 +17,10 @@
#' \code{TxDb.Hsapiens.UCSC.hg19.knownGene}.
#' @param orgDb An \code{orgDb} object or a data table to map the feature names.
#' If this is NULL, then \code{org.Hs.eg.db} is used as the default.
+#' @param filter A named list specifying the filters which should be applied to
+#' subset to e.g. only protein-coding genes for annotation.
+#' \code{names(filter)} needs to be column names in the given
+#' orgDb object (default: no filtering).
#' @param keytype The keytype or column name of gene IDs in the \code{TxDb}
#' object (see
#' \code{\link[AnnotationDbi:AnnotationDb-class]{keytypes}}
@@ -32,13 +36,13 @@
#' # either using biomart with GRCh38
#' try({
#' fds <- annotateRanges(fds, GRCh=38)
-#' rowRanges(fds, type="psi5")[,c("hgnc_symbol")]
+#' rowRanges(fds, type="j")[,c("hgnc_symbol")]
#' })
#'
#' # either using biomart with GRCh37
#' try({
#' fds <- annotateRanges(fds, featureName="hgnc_symbol_37", GRCh=37)
-#' rowRanges(fds, type="psi5")[,c("hgnc_symbol_37")]
+#' rowRanges(fds, type="j")[,c("hgnc_symbol_37")]
#' })
#'
#' # or with a provided TxDb object
@@ -47,7 +51,7 @@
#' require(org.Hs.eg.db)
#' orgDb <- org.Hs.eg.db
#' fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb)
-#' rowRanges(fds, type="psi5")[,"hgnc_symbol"]
+#' rowRanges(fds, type="j")[,"hgnc_symbol"]
#'
#' @rdname annotateRanges
#' @export
@@ -59,9 +63,6 @@ annotateRanges <- function(fds, feature="hgnc_symbol", featureName=feature,
if(length(fds) == 0) return(fds)
# useEnsembl only understands GRCh=37 or GRCh=NULL (uses 38 then)
- if(is.null(GRCh)){
- GRCh <- 38
- }
if(GRCh == 38){
GRCh <- NULL
}
@@ -89,17 +90,16 @@ annotateRanges <- function(fds, feature="hgnc_symbol", featureName=feature,
annotation <- getFeatureAsGRange(ensembl, feature, featureName,
biotype, useUSCS)
- # annotate split reads
- for(i in c("psi3", "theta")){
- gr <- rowRanges(fds, type=i)
- if(any(strand(gr) == "*")){
- strand(annotation) <- "*"
- }
- annos <- getAnnotationFeature(data=gr, featureName, annotation)
- mcols(fds, type=i)[[featureName]] <- annos[["feature"]]
- mcols(fds, type=i)[[paste0("other_", featureName)]] <-
- annos[["other_features"]]
+ # annotate splice sites first
+ gr <- rowRanges(fds, type="theta")
+ if(any(strand(gr) == "*")){
+ strand(annotation) <- "*"
}
+ annos <- getAnnotationFeature(data=gr, featureName, annotation)
+ mcols(fds, type="theta")[[featureName]] <- annos
+
+ # annotate junctions with genes at donor and acceptor sites
+ fds <- annotateFeatureFromSpliceSite(fds, featureName)
return(fds)
}
@@ -108,7 +108,7 @@ annotateRanges <- function(fds, feature="hgnc_symbol", featureName=feature,
#' @export
annotateRangesWithTxDb <- function(fds, feature="SYMBOL",
featureName="hgnc_symbol", keytype="ENTREZID",
- txdb=NULL, orgDb=NULL){
+ txdb=NULL, orgDb=NULL, filter=list()){
gene_id <- NULL
# check input
@@ -132,41 +132,49 @@ annotateRangesWithTxDb <- function(fds, feature="SYMBOL",
}
}
- for(i in c("psi3", "theta")){
- # get GRanges object with the split reads which should be annotated
- gr <- rowRanges(fds, type=i)
-
- # get the annotation to compare to
- anno <- genes(txdb)
- if(is.data.table(orgDb)){
- tmp <- merge(x=as.data.table(anno)[,.(gene_id)], y=orgDb,
- by.y=keytype, by.x="gene_id", all.x=TRUE, sort=FALSE)[,
- .(gene_id, feature=get(feature))]
- setnames(tmp, "feature", feature)
- } else {
- tmp <- as.data.table(select(orgDb, keys=mcols(anno)[,"gene_id"],
- columns=feature, keytype=keytype))
- }
-
- # add the new feature to the annotation
- tmp[, uniqueID := .GRP, by=keytype]
- anno <- anno[tmp[,uniqueID]]
- mcols(anno)[[featureName]] <- tmp[,get(feature)]
+ # get GRanges object with the splice sites which should be annotated
+ gr <- rowRanges(fds, type="theta")
+
+ # get the annotation to compare to
+ anno <- genes(txdb)
+ if(is.data.table(orgDb)){
+ tmp <- merge(x=as.data.table(anno)[,.(gene_id)], y=orgDb,
+ by.y=keytype, by.x="gene_id", all.x=TRUE, sort=FALSE)[,
+ c("gene_id", feature, names(filter)), with=FALSE]
+ } else {
+ tmp <- as.data.table(select(orgDb, keys=mcols(anno)[,"gene_id"],
+ columns=c(feature, names(filter)), keytype=keytype))
+ }
- # clean up of NA and "" ids
- anno <- anno[!is.na(mcols(anno)[,featureName]),]
- anno <- anno[mcols(anno)[,featureName] != "",]
- if(any(strand(gr) == "*")){
- strand(anno) <- "*"
+ # filter genes as specified by user (e.g. only protein_coding)
+ tmp[, include:=TRUE]
+ if(!is.null(filter) & length(filter) > 0 & !is.null(names(filter))){
+ for(n in names(filter)){
+ stopifnot(n %in% colnames(tmp))
+ tmp[!(get(n) %in% filter[[n]]), include:=FALSE]
}
+ }
+
+ # add the new feature to the annotation
+ tmp[, uniqueID := .GRP, by=keytype]
+ tmp <- tmp[include == TRUE,]
+ anno <- anno[tmp[,uniqueID]]
+ mcols(anno)[[featureName]] <- tmp[,get(feature)]
- # retrieve the feature of interest for the split reads
- annos <- getAnnotationFeature(data=gr, featureName, anno)
- mcols(fds, type=i)[[featureName]] <- annos[["feature"]]
- mcols(fds, type=i)[[paste0("other_", featureName)]] <-
- annos[["other_features"]]
+ # clean up of NA and "" ids
+ anno <- anno[!is.na(mcols(anno)[,featureName]),]
+ anno <- anno[mcols(anno)[,featureName] != "",]
+ if(any(strand(gr) == "*")){
+ strand(anno) <- "*"
}
+ # retrieve the feature of interest for the splice sites
+ annos <- getAnnotationFeature(data=gr, featureName, anno)
+ mcols(fds, type="theta")[[featureName]] <- annos
+
+    # transfer annotated features from splice sites to junctions
+ fds <- annotateFeatureFromSpliceSite(fds, featureName)
+
return(fds)
}
@@ -228,14 +236,12 @@ getAnnotationFeature <- function(data, feature, annotation){
}
# extract only the feature and group them with a ";"
- featureDT <- featureDT[,
- list(first_feature=unique(feature)[1],
- other_features=paste(unique(feature)[-1], collapse = ";")),
- by="from"
- ]
+ featureDT <- featureDT[,feature:=paste(unique(feature), collapse = ";"),
+ by="from"]
+ featureDT <- featureDT[!duplicated(featureDT),]
+ featureDT[feature == "NA", feature:=NA]
- return(list(feature=featureDT[order(from),first_feature],
- other_features=featureDT[order(from),other_features]))
+ return(featureDT[order(from),feature])
}
@@ -314,4 +320,27 @@ findAnnotatedJunction <- function(fds, annotation, annotateNames=TRUE,
fds
}
-
+#' annotate junctions with genes at donor and acceptor sites
+#' @noRd
+annotateFeatureFromSpliceSite <- function(fds, featureName){
+ ssdt <- data.table(spliceSiteID=mcols(fds, type="theta")$spliceSiteID,
+ genes=mcols(fds, type="theta")[[featureName]]
+ )
+ junction_dt <- data.table(startID=mcols(fds, type="psi3")$startID,
+ endID=mcols(fds, type="psi3")$endID
+ )
+ junction_dt <- merge(junction_dt, ssdt, all.x=TRUE,
+ by.x="startID", by.y="spliceSiteID", sort=FALSE)
+ setnames(junction_dt, "genes", "genes_donor")
+ junction_dt <- merge(junction_dt, ssdt, all.x=TRUE,
+ by.x="endID", by.y="spliceSiteID", sort=FALSE)
+ setnames(junction_dt, "genes", "genes_acceptor")
+
+ junction_dt[,genes:=paste(uniqueIgnoreNA(
+ c(splitGenes(genes_donor), splitGenes(genes_acceptor))),
+ collapse=";"),
+ by="startID,endID"]
+ junction_dt[genes == "NA", genes:=NA]
+ mcols(fds, type="j")[[featureName]] <- junction_dt[,genes]
+ return(fds)
+}
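annotateFeatureFromSpliceSite() transfers the gene annotation from a junction's
donor and acceptor splice sites back to the junction, collapsing all genes seen
at either site into one ';'-separated string. A toy illustration of that
collapsing step with made-up identifiers (splitGenes() and uniqueIgnoreNA() are
internal helpers; plain-R equivalents are used below):

library(data.table)

junction_dt <- data.table(startID=1L, endID=2L,
                          genes_donor="GENE_A;GENE_B", genes_acceptor="GENE_B")
splitGenes_toy <- function(x) unlist(strsplit(x, ";", fixed=TRUE))
junction_dt[, genes:=paste(unique(na.omit(c(
        splitGenes_toy(genes_donor), splitGenes_toy(genes_acceptor)))),
        collapse=";"), by="startID,endID"]
junction_dt$genes
# [1] "GENE_A;GENE_B"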
diff --git a/R/autoencoder.R b/R/autoencoder.R
index e0e066a3..d9d0e7a2 100644
--- a/R/autoencoder.R
+++ b/R/autoencoder.R
@@ -2,8 +2,9 @@
#' Main autoencoder fit function
#'
#' @noRd
-fitAutoencoder <- function(fds, q, type="psi3", noiseAlpha=1, minDeltaPsi=0.1,
- rhoRange=c(1e-5, 1-1e-5), lambda=0, convergence=1e-5,
+fitAutoencoder <- function(fds, q, type=currentType(fds), noiseAlpha=1,
+ minDeltaPsi=0.1,
+ rhoRange=c(-30, 30), lambda=0, convergence=1e-5,
iterations=15, initialize=TRUE, control=list(),
BPPARAM=bpparam(), verbose=FALSE, nrDecoderBatches=5,
weighted=FALSE, nSubset=15000, multiRho=FALSE,
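The default rhoRange changes from c(1e-5, 1-1e-5) to c(-30, 30), consistent
with the penalized fit above optimizing rho on the logit scale (an assumed
interpretation, not stated in this hunk); mapped back through the logistic
function the new bounds are numerically 0 and 1:

plogis(c(-30, 30))
# [1] 9.357623e-14 1.000000e+00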
diff --git a/R/calculatePSIValue.R b/R/calculatePSIValue.R
index e605ba67..f9f3f991 100644
--- a/R/calculatePSIValue.R
+++ b/R/calculatePSIValue.R
@@ -14,14 +14,14 @@
#'
#' @inheritParams countRNA
#' @param types A vector with the psi types which should be calculated. Default
-#' is all of psi5, psi3 and theta.
+#' is all of jaccard, psi5, psi3 and theta.
#' @param overwriteCts FALSE or TRUE (the default) the total counts (aka N) will
#' be recalculated based on the existing junction counts (aka K)
#' @return FraserDataSet
#' @export
#' @examples
#' fds <- createTestFraserDataSet()
-#' fds <- calculatePSIValues(fds, types="psi5")
+#' fds <- calculatePSIValues(fds, types="jaccard")
#'
#' ### usually one would run this function for all psi types by using:
#' # fds <- calculatePSIValues(fds)
@@ -36,6 +36,10 @@ calculatePSIValues <- function(fds, types=psiTypes, overwriteCts=FALSE,
overwriteCts=overwriteCts, BPPARAM=BPPARAM)
}
+ # calculate intron jaccard index
+ fds <- calculateIntronNonsplitSum(fds, overwriteCts=overwriteCts)
+ fds <- calculateJaccardIntronIndex(fds, overwriteCts=overwriteCts)
+
# calculate the delta psi value
for(psiType in types){
assayName <- paste0("delta_", psiType)
@@ -183,7 +187,7 @@ calculateSitePSIValue <- function(fds, overwriteCts, BPPARAM){
# check input
stopifnot(is(fds, "FraserDataSet"))
- message(date(), ": Calculate the PSI site values ...")
+ message(date(), ": Calculate the theta values ...")
psiName <- "theta"
psiROCName <- "rawOtherCounts_theta"
@@ -319,3 +323,101 @@ getOtherCountsCacheFolder <- function(fds){
# return it
return(cachedir)
}
+
+#'
+#' calculates the jaccard intron value for the given junctions
+#'
+#' @noRd
+calculateJaccardIntronIndex <- function(fds, overwriteCts){
+ stopifnot(is(fds, "FraserDataSet"))
+
+ message(date(), ": Calculate the Jaccard Intron values ...")
+
+ # check if we have computed N_psi3, N_psi5 and K_nonsplit already
+ if(!all(c(paste0("rawOtherCounts_psi", c(5, 3)), "rawCountsJnonsplit") %in%
+ assayNames(fds))){
+ stop("Please calculate N_psi3, N_psi5 and K_nonsplit first before ",
+ "calling this function.")
+ }
+
+ # calculate intron jaccard value
+ jaccard_denom <- N(fds, "psi3") + N(fds, "psi5") +
+ assay(fds, "rawCountsJnonsplit") - K(fds, type="j")
+ jaccardValues <- K(fds, type="j") / jaccard_denom
+ otherCounts_jaccard <- jaccard_denom - K(fds, type="j")
+
+ # assign it to our object
+ assay(fds, type="j", "jaccard", withDimnames=FALSE) <- jaccardValues
+
+ if(isTRUE(overwriteCts) ||
+ !("rawOtherCounts_jaccard" %in% assayNames(fds))){
+ assay(fds, type="j", "rawOtherCounts_jaccard",
+ withDimnames=FALSE) <- otherCounts_jaccard
+ }
+
+ return(fds)
+}
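
For a single junction and sample, the computation above reduces to the following plain-R sketch (made-up counts):

    # split reads supporting the intron and all reads sharing its splice sites
    k_junction <- 20   # K(fds, type="j")
    n_psi5     <- 35   # N(fds, "psi5"): split reads at the donor site
    n_psi3     <- 30   # N(fds, "psi3"): split reads at the acceptor site
    k_nonsplit <- 5    # rawCountsJnonsplit: nonsplit reads at donor or acceptor

    # union of all reads touching either splice site (jaccard_denom above)
    denom   <- n_psi5 + n_psi3 + k_nonsplit - k_junction   # 50
    jaccard <- k_junction / denom                          # 0.4
    other   <- denom - k_junction                          # rawOtherCounts_jaccard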
+
+#' Calculates the sum of nonsplit reads overlapping either the donor or
+#' acceptor splice site and stores it as a new assay (one value for each
+#' junction and sample).
+#'
+#' @noRd
+calculateIntronNonsplitSum <- function(fds, overwriteCts){
+ stopifnot(is(fds, "FraserDataSet"))
+
+ message(date(), ": Calculate the total nonsplit counts for each intron ",
+ "...")
+
+
+ # get splice site nonsplit counts
+ nsr_ss <- K(fds, "theta")
+
+ # retrieve junction and splice site annotation
+ junction_dt <- as.data.table(rowRanges(fds, type="j"))[,
+ .(seqnames, start, end,
+ strand, startID, endID)]
+ junction_dt[, j_idx:=seq_len(.N)]
+ ss_map <- data.table(spliceSiteID=rowRanges(fds, type="ss")$spliceSiteID,
+ nsr_idx=seq_len(nrow(nonSplicedReads(fds))))
+
+ junction_dt <- merge(junction_dt, ss_map,
+ by.x="startID", by.y="spliceSiteID",
+ all.x=TRUE)
+ setnames(junction_dt, "nsr_idx", "start_idx")
+ junction_dt <- merge(junction_dt, ss_map,
+ by.x="endID", by.y="spliceSiteID",
+ all.x=TRUE)
+ setnames(junction_dt, "nsr_idx", "end_idx")
+
+ # for each junction, find the two rows in K_theta corresponding to its
+ # donor and acceptor splice site
+ donor_sites <- junction_dt[!is.na(start_idx),]
+ acc_sites <- junction_dt[!is.na(end_idx),]
+
+ # set nsr counts to 0 for junctions for which no mapping by spliceSiteID
+ # could be found
+ nsr_donor <- matrix(0, nrow=nrow(fds), ncol=ncol(fds))
+ nsr_acc <- matrix(0, nrow=nrow(fds), ncol=ncol(fds))
+
+ nsr_donor[donor_sites[,j_idx],] <-
+ as.matrix(nsr_ss[donor_sites[,start_idx],])
+ nsr_acc[acc_sites[,j_idx],] <-
+ as.matrix(nsr_ss[acc_sites[,end_idx],])
+
+ # sum them
+ nsr_j <- nsr_donor + nsr_acc
+
+ if(nrow(nsr_j) != nrow(fds)){
+ warning("Unequal number of junctions in fds and junctions with ",
+ "computed nonsplit count sum!")
+ }
+
+ # assign it to our object
+ if(isTRUE(overwriteCts) ||
+ !("rawCountsJnonsplit" %in% assayNames(fds))){
+ assay(fds, type="j", "rawCountsJnonsplit", withDimnames=FALSE) <- nsr_j
+ }
+
+ return(fds)
+}
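
Conceptually, the donor/acceptor lookup above indexes the splice-site count matrix twice and sums the rows; a toy sketch:

    # toy nonsplit counts: rows are splice sites, columns are samples
    nsr_ss <- matrix(c(3, 1,
                       0, 2,
                       4, 0),
                     nrow=3, byrow=TRUE,
                     dimnames=list(paste0("ss", 1:3), c("sample1", "sample2")))

    # a junction whose donor is ss1 and whose acceptor is ss3: its per-sample
    # nonsplit sum is the sum of the two corresponding splice-site rows
    nsr_j <- nsr_ss["ss1", ] + nsr_ss["ss3", ]
    #> sample1: 7, sample2: 1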
diff --git a/R/countRNAseqData.R b/R/countRNAseqData.R
index f68c4bdc..9ef6ef5f 100644
--- a/R/countRNAseqData.R
+++ b/R/countRNAseqData.R
@@ -361,7 +361,7 @@ getNonSplitReadCountsForAllSamples <- function(fds, splitCountRanges,
" splice junctions are found.")
# extract donor and acceptor sites
- spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges, fds)
+ spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges)
message(date(), ": In total ", length(spliceSiteCoords),
" splice sites (acceptor/donor) will be counted ...")
@@ -569,6 +569,10 @@ countSplitReadsPerChromosome <- function(chromosome, bamFile,
bamFile, param=param, strandMode=strandMode)
}
+ # remove read pairs with NA seqnames
+ # (occurs if reads of a pair align to different chromosomes)
+ galignment <- galignment[!is.na(seqnames(galignment))]
+
# remove the strand information if unstranded data
if(isFALSE(as.logical(strandMode))){
strand(galignment) <- "*"
@@ -852,7 +856,7 @@ countNonSplicedReads <- function(sampleID, splitCountRanges, fds,
}
# extract donor and acceptor sites
- spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges, fds)
+ spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges)
}
@@ -895,7 +899,8 @@ countNonSplicedReads <- function(sampleID, splitCountRanges, fds,
# extract the counts with Rsubread
tmp_ssc <- checkSeqLevelStyle(spliceSiteCoords, fds, sampleID, TRUE)
- anno <- GRanges2SAF(tmp_ssc, minAnchor=minAnchor)
+ # use minAnchor+1 here to allow for small variants in the anchor region
+ anno <- GRanges2SAF(tmp_ssc, minAnchor=(minAnchor+1))
rsubreadCounts <- featureCounts(files=bamFile, annot.ext=anno,
minOverlap=minAnchor*2,
allowMultiOverlap=TRUE,
@@ -974,17 +979,12 @@ readJunctionMap <- function(junctionMap){
#' extracts the splice site coordinates from a junctions GRange object (
#' @noRd
-extractSpliceSiteCoordinates <- function(junctions, fds){
+extractSpliceSiteCoordinates <- function(junctions){
- if(strandSpecific(fds) >= 1L){
- spliceSiteCoords <- unlist(GRangesList(
- extractSpliceSiteCoordsPerStrand(junctions, "+"),
- extractSpliceSiteCoordsPerStrand(junctions, "-")
- ))
- } else {
- strand(junctions) <- "*"
- spliceSiteCoords <- extractSpliceSiteCoordsPerStrand(junctions, "*")
- }
+ spliceSiteCoords <- unlist(GRangesList(
+ lapply(unique(strand(junctions)), extractSpliceSiteCoordsPerStrand,
+ junctions=junctions)
+ ))
return(unique(sort(spliceSiteCoords)))
}
@@ -1049,15 +1049,21 @@ annotateSpliceSite <- function(gr){
dt <- GRanges2SAF(gr)
# extract donor/acceptor annotation
- startSideDT <- dt[,.(End=Start, type="start"),by="Chr,Start,Strand"]
- endSideDT <- dt[,.(Start=End, type="end" ),by="Chr,End,Strand"]
+ startSiteDT <- dt[,.(End=Start, type="start"),by="Chr,Start,Strand"]
+ endSiteDT <- dt[,.(Start=End, type="end" ),by="Chr,End,Strand"]
+ startSiteDT[,Start:=Start-1]
+ endSiteDT[,End:=End+1]
# annotate and enumerate donor/acceptor
- annotadedDT <- rbind(startSideDT, endSideDT)
- annotadedDT[,id:=seq_len(nrow(annotadedDT))]
+ annotatedDT <- rbind(startSiteDT, endSiteDT)
+ annotatedDT[,id:=.GRP, by="Chr,Start,End,Strand"]
+
+ # set back start / end positions for merging with junction ranges
+ annotatedDT[type == "start", Start:=End]
+ annotatedDT[type == "end", End:=Start]
# convert back to granges
- annogr <- makeGRangesFromDataFrame(annotadedDT, keep.extra.columns=TRUE)
+ annogr <- makeGRangesFromDataFrame(annotatedDT, keep.extra.columns=TRUE)
ids <- lapply(c("start", "end"), function(type){
# reduce annogr to only the specific type to prevent overlap
diff --git a/R/example_functions.R b/R/example_functions.R
index 620b5ffc..c4b992be 100644
--- a/R/example_functions.R
+++ b/R/example_functions.R
@@ -61,7 +61,7 @@ createTestFraserDataSet <- function(workingDir="FRASER_output", rerun=FALSE){
if(all(file.exists(hdf5Files))){
if(isFALSE(rerun)){
fds <- loadFraserDataSet(workingDir, name="Data_Analysis")
- if(all(paste0(c("zScores", "padjBetaBinomial", "predictedMeans"),
+ if(all(paste0(c("padjBetaBinomial", "predictedMeans"),
"_", rep(psiTypes, 3)) %in% assayNames(fds))){
message(date(), ": Use existing cache data.")
return(fds)
@@ -80,12 +80,12 @@ createTestFraserDataSet <- function(workingDir="FRASER_output", rerun=FALSE){
fds <- filterExpressionAndVariability(fds, minExpressionInOneSample=5,
minDeltaPsi=0, quantileMinExpression=0)
- # run FRASER pipeline
- fds <- FRASER(fds, q=c(psi5=2, psi3=2, theta=2), iterations=2)
-
# annotate it
suppressMessages({ fds <- annotateRangesWithTxDb(fds) })
+ # run FRASER pipeline
+ fds <- FRASER(fds, q=c(jaccard=2, psi5=2, psi3=2, theta=2), iterations=2)
+
# save data for later
fds <- saveFraserDataSet(fds)
diff --git a/R/filterExpression.R b/R/filterExpression.R
index dde3b29a..6c7eab8c 100644
--- a/R/filterExpression.R
+++ b/R/filterExpression.R
@@ -17,6 +17,9 @@
#' passed all filters is returned. If FALSE, no subsetting is done and the
#' information of whether an intron passed the filters is only stored in the
#' mcols.
+#' @param filterOnJaccard If TRUE, the Intron Jaccard Metric is used to define
+#' expressed introns during filtering. Otherwise, the psi5, psi3 and theta
+#' metrics are used (default: TRUE).
#' @param delayed If FALSE, count matrices will be loaded into memory,
#' otherwise the function works on the delayedMatrix representations. The
#' default value depends on the number of samples in the fds-object.
@@ -27,8 +30,8 @@
#' @examples
#' fds <- createTestFraserDataSet()
#' fds <- filterExpressionAndVariability(fds, minDeltaPsi=0.1, filter=FALSE)
-#' mcols(fds, type="psi5")[, c(
-#' "maxCount", "passedExpression", "maxDPsi3", "passedVariability")]
+#' mcols(fds, type="jaccard")[, c(
+#' "maxCount", "passedExpression", "maxDJaccard", "passedVariability")]
#'
#' plotFilterExpression(fds)
#' plotFilterVariability(fds)
@@ -42,21 +45,25 @@ NULL
#' read support and introns that are not variable across samples.
#' @export
filterExpressionAndVariability <- function(object, minExpressionInOneSample=20,
- quantile=0.95, quantileMinExpression=10, minDeltaPsi=0.05,
+ quantile=0.75, quantileMinExpression=10, minDeltaPsi=0.0,
filter=TRUE,
delayed=ifelse(ncol(object) <= 300, FALSE, TRUE),
+ filterOnJaccard=TRUE,
BPPARAM=bpparam()){
+
# filter introns with low read support and corresponding splice sites
object <- filterExpression(object,
minExpressionInOneSample=minExpressionInOneSample,
quantile=quantile,
quantileMinExpression=quantileMinExpression,
filter=filter, delayed=delayed,
+ filterOnJaccard=filterOnJaccard,
BPPARAM=BPPARAM)
# filter introns that are not variable across samples
object <- filterVariability(object, minDeltaPsi=minDeltaPsi, filter=filter,
- delayed=delayed, BPPARAM=BPPARAM)
+ delayed=delayed, filterOnJaccard=filterOnJaccard,
+ BPPARAM=BPPARAM)
# return fds
message(date(), ": Filtering done!")
@@ -64,11 +71,307 @@ filterExpressionAndVariability <- function(object, minExpressionInOneSample=20,
}
-filterExpression.FRASER <- function(object, minExpressionInOneSample=20,
- quantile=0.95, quantileMinExpression=10, filter=TRUE,
+#' @noRd
+filterExpression.FRASER2 <- function(object, minExpressionInOneSample=20,
+ quantile=0.75, quantileMinExpression=10, filter=TRUE,
+ delayed=ifelse(ncol(object) <= 300, FALSE, TRUE),
+ filterOnJaccard=TRUE, BPPARAM=bpparam()){
+ if(isTRUE(filterOnJaccard)){
+ return(filterExpression_jaccard(object,
+ minExpressionInOneSample=minExpressionInOneSample,
+ quantile=quantile,
+ quantileMinExpression=quantileMinExpression,
+ filter=filter, delayed=delayed,
+ BPPARAM=BPPARAM))
+ } else{
+ return(filterExpression.FRASER(object,
+ minExpressionInOneSample=minExpressionInOneSample,
+ quantile=quantile,
+ quantileMinExpression=quantileMinExpression,
+ filter=filter, delayed=delayed,
+ BPPARAM=BPPARAM))
+ }
+}
+
+#' @describeIn filtering This function filters out introns and corresponding
+#' splice sites that have low read support in all samples.
+#' @export
+setMethod("filterExpression", signature="FraserDataSet",
+ filterExpression.FRASER2)
+
+#' This function filters out introns and corresponding
+#' splice sites which are expressed at very low levels across samples.
+#' @noRd
+filterExpression_jaccard <- function(object, minExpressionInOneSample=20,
+ quantile=0.75, quantileMinExpression=10, filter=TRUE,
delayed=ifelse(ncol(object) <= 300, FALSE, TRUE),
BPPARAM=bpparam()){
+
+ stopifnot(is(object, "FraserDataSet"))
+
+ message(date(), ": Filtering out introns with low read support ...")
+
+ # extract counts
+ cts <- K(object, type="j")
+ ctsN <- N(object, type="jaccard")
+
+ if(isFALSE(delayed)){
+ cts <- as.matrix(cts)
+ ctsN <- as.matrix(ctsN)
+ }
+
+ # cutoff functions
+ f1 <- function(cts, ...){
+ rowMaxs(cts) }
+ f2 <- function(cts, ctsN, quantile, ...){
+ rowQuantiles(ctsN, probs=quantile, drop=FALSE)[,1] }
+
+ funs <- c(maxCount=f1, quantileValueN=f2)
+
+ # run it in parallel
+ cutoffs <- bplapply(funs, function(f, ...) f(...), BPPARAM=BPPARAM,
+ cts=cts, ctsN=ctsN, quantile=quantile)
+
+ # add annotation to object
+ for(n in names(cutoffs)){
+ mcols(object, type="j")[n] <- cutoffs[[n]]
+ }
+
+ mcols(object, type="j")[['passedExpression']] <-
+ cutoffs$maxCount >= minExpressionInOneSample &
+ cutoffs$quantileValueN >= quantileMinExpression
+ if("passedVariability" %in% colnames(mcols(object, type="j"))){
+ mcols(object, type="j")[['passed']] <-
+ mcols(object, type="j")[['passedExpression']] &
+ mcols(object, type="j")[['passedVariability']]
+ } else{
+ mcols(object, type="j")[['passed']] <-
+ mcols(object, type="j")[['passedExpression']]
+ }
+
+ # filter if requested
+ if(isTRUE(filter)){
+ object <- applyExpressionFilters_jaccard(object,
+ minExpressionInOneSample,
+ quantileMinExpression)
+ }
+
+ validObject(object)
+ return(object)
+}
+#' @noRd
+filterVariability.FRASER2 <- function(object, minDeltaPsi=0, filter=TRUE,
+ delayed=ifelse(ncol(object) <= 300, FALSE, TRUE),
+ filterOnJaccard=TRUE, BPPARAM=bpparam()){
+ if(isTRUE(filterOnJaccard)){
+ object <- filterVariability_jaccard(object, minDeltaPsi=minDeltaPsi,
+ filter=filter, delayed=delayed, BPPARAM=BPPARAM)
+ } else{
+ object <- filterVariability.FRASER(object, minDeltaPsi=minDeltaPsi,
+ filter=filter, delayed=delayed, BPPARAM=BPPARAM)
+ }
+}
+
+#' @describeIn filtering This function filters out introns and corresponding
+#' splice sites which do not show variability across samples.
+#' @export
+setMethod("filterVariability", signature="FraserDataSet",
+ filterVariability.FRASER2)
+
+
+#' This function filters out introns and corresponding
+#' splice sites which do not show variability across samples.
+#' @noRd
+filterVariability_jaccard <- function(object, minDeltaPsi=0, filter=TRUE,
+ delayed=ifelse(ncol(object) <= 300, FALSE, TRUE),
+ BPPARAM=bpparam()){
+
+ message(date(), ": Filtering out non-variable introns ...")
+
+ # extract counts
+ cts <- K(object, type="j")
+ ctsN <- N(object, type="jaccard")
+
+ if(isFALSE(delayed)){
+ cts <- as.matrix(cts)
+ ctsN <- as.matrix(ctsN)
+ }
+
+ # cutoff functions
+ f1 <- function(cts, ctsN, ...) {
+ jaccard <- cts/ctsN
+ rowMaxs(abs(jaccard - rowMeans2(jaccard, na.rm=TRUE)),
+ na.rm=TRUE) }
+
+ funs <- c(maxDJaccard=f1)
+
+ # run it in parallel
+ cutoffs <- bplapply(funs, function(f, ...) f(...), BPPARAM=BPPARAM,
+ cts=cts, ctsN=ctsN)
+
+ # add annotation to object
+ for(n in names(cutoffs)){
+ mcols(object, type="j")[n] <- cutoffs[[n]]
+ }
+
+    # retrieve intron ranges
+ intron_dt <- as.data.table(rowRanges(object, type="j"))
+
+ # check which introns pass the filter
+ mcols(object, type="j")[['passedVariability']] <- pmax(na.rm=TRUE,
+ cutoffs$maxDJaccard,
+ 0) >= minDeltaPsi
+ if("passedExpression" %in% colnames(mcols(object, type="j"))){
+ mcols(object, type="j")[['passed']] <-
+ mcols(object, type="j")[['passedExpression']] &
+ mcols(object, type="j")[['passedVariability']]
+ } else{
+ mcols(object, type="j")[['passed']] <-
+ mcols(object, type="j")[['passedVariability']]
+ }
+
+ # filter if requested
+ if(isTRUE(filter)){
+ object <- applyVariabilityFilters_jaccard(object, minDeltaPsi)
+ }
+
+ validObject(object)
+ return(object)
+}
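
The variability score used above is the largest absolute deviation of an intron's Jaccard values from its per-intron mean; a base-R sketch with toy values (apply() stands in for matrixStats::rowMaxs()):

    # toy Intron Jaccard values: rows are introns, columns are samples
    jaccard <- matrix(c(0.90, 0.88, 0.30,
                        0.50, 0.52, 0.49),
                      nrow=2, byrow=TRUE)

    # maximal absolute deviation from the per-intron mean (maxDJaccard above)
    maxDJaccard <- apply(abs(jaccard - rowMeans(jaccard, na.rm=TRUE)), 1, max)
    #> intron 1: ~0.39 (variable), intron 2: ~0.02 (dropped if minDeltaPsi > 0.02)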
+
+#' Applies previously calculated filters for expression filters
+#' @noRd
+applyExpressionFilters_jaccard <- function(fds, minExpressionInOneSample,
+ quantileMinExpression){
+
+ maxCount <- mcols(fds, type="j")[['maxCount']]
+ quantileValueN <- mcols(fds, type="j")[['quantileValueN']]
+
+ # report rare junctions that passed minExpression filter but not
+ # quantileFilter as SE obj
+ junctionsToReport <- maxCount >= minExpressionInOneSample &
+ !(quantileValueN >= quantileMinExpression)
+ outputDir <- file.path(workingDir(fds), "savedObjects", nameNoSpace(fds))
+
+ if(any(junctionsToReport)){
+ # get SE object of junctions to report
+ rareJunctions <- asSE(fds[junctionsToReport, by="j"])
+ for(aname in assayNames(rareJunctions)){
+ if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5",
+ "rawOtherCounts_psi3", "psi5", "psi3",
+ "delta_psi5", "delta_psi3", "jaccard",
+ "rawOtherCounts_intron_jaccard"))){
+ assay(rareJunctions, aname) <- NULL
+ }
+ }
+ rareJunctions <- saveHDF5SummarizedExperiment(rareJunctions,
+ dir=file.path(tempdir(), "tmp_rJ"),
+ replace=TRUE)
+
+ # check if folder already exists from previous filtering
+ rareJctsDir <- file.path(outputDir, "rareJunctions")
+ if(dir.exists(rareJctsDir)){
+ warning("Filtering has already been applied previously. Introns ",
+ "that were already filtered out but should be kept now ",
+ "cannot be restored.")
+ rJ_stored <- loadHDF5SummarizedExperiment(dir=rareJctsDir)
+ toReport <- mcols(rJ_stored)$maxCount >= minExpressionInOneSample &
+ !(mcols(rJ_stored)$quantileValueN >= quantileMinExpression)
+
+ rJ_tmp <- rbind(rJ_stored[toReport,], rareJunctions)
+
+ for(aname in assayNames(rJ_tmp)){
+ assay(rJ_tmp, aname) <-
+ rbind(as.matrix(assay(rareJunctions, aname)),
+ as.matrix(assay(rJ_stored[toReport,], aname)) )
+ }
+ rareJunctions <- rJ_tmp
+ rm(rJ_tmp)
+ }
+
+ rareJunctions <- saveHDF5SummarizedExperiment(rareJunctions,
+ dir=rareJctsDir, replace=TRUE)
+ }
+
+ # apply filter
+ numFilt <- sum(mcols(fds, type="j")[['passedExpression']])
+ message(paste0("Keeping ", numFilt, " junctions out of ", length(fds),
+ ". This is ", signif(numFilt/length(fds)*100, 3),
+ "% of the junctions"))
+ fds <- fds[mcols(fds, type="j")[['passedExpression']], by="psi5"]
+
+ return(fds)
+}
+
+
+#' Applies previously calculated variability filters
+#' @noRd
+applyVariabilityFilters_jaccard <- function(fds, minDeltaPsi){
+
+    # retrieve the precomputed variability filter flags
+ passedVariability <- mcols(fds, type="j")[['passedVariability']]
+
+ # store information of non-variable junctions
+ filtered <- !passedVariability
+
+ outputDir <- file.path(workingDir(fds), "savedObjects", nameNoSpace(fds))
+ if(any(filtered)){
+ # get SE object of junctions to report
+ nonVariableJunctions <- asSE(fds[filtered, by="j"])
+ for(aname in assayNames(nonVariableJunctions)){
+ if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5",
+ "rawOtherCounts_psi3", "psi5", "psi3",
+ "delta_psi5", "delta_psi3", "jaccard",
+ "rawOtherCounts_intron_jaccard"))){
+ assay(nonVariableJunctions, aname) <- NULL
+ }
+ }
+ nonVariableJunctions <- saveHDF5SummarizedExperiment(replace=TRUE,
+ nonVariableJunctions,
+ dir=file.path(tempdir(), "tmp_nvJ"))
+
+ # check if folder already exists from previous filtering
+ nonVarJctsDir <- file.path(outputDir, "nonVariableJunctions")
+ if(dir.exists(nonVarJctsDir)){
+ warning("Filtering has already been applied previously. Introns ",
+ "that were already filtered out but should be kept now ",
+ "cannot be restored.")
+ nV_stored <- loadHDF5SummarizedExperiment(dir=nonVarJctsDir)
+ toReport <- mcols(nV_stored)$maxDJaccard < minDeltaPsi
+
+ nVJunctions <- rbind(nonVariableJunctions, nV_stored[toReport,])
+ for(aname in assayNames(nVJunctions)){
+ assay(nVJunctions, aname) <-
+ rbind(as.matrix(assay(nonVariableJunctions, aname)),
+ as.matrix(assay(nV_stored[toReport,], aname)) )
+ }
+ nonVariableJunctions <- nVJunctions
+ rm(nVJunctions)
+ }
+
+ nonVariableJunctions <- saveHDF5SummarizedExperiment(dir=nonVarJctsDir,
+ x=nonVariableJunctions, replace=TRUE)
+
+ }
+
+ # apply filtering
+ numFilt <- sum(passedVariability)
+ message(paste0("Keeping ", numFilt, " junctions out of ", length(fds),
+ ". This is ", signif(numFilt/length(fds)*100, 3),
+ "% of the junctions"))
+ fds <- fds[mcols(fds, type="j")[['passedVariability']], by="psi5"]
+ return(fds)
+}
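
As a usage sketch, the new Jaccard-based filters are reached through the exported wrappers rather than these internal helpers, mirroring the roxygen example above:

    library(FRASER)

    fds <- createTestFraserDataSet()

    # compute the filter columns without subsetting, then inspect them
    fds <- filterExpressionAndVariability(fds, minExpressionInOneSample=20,
                                          quantile=0.75, quantileMinExpression=10,
                                          minDeltaPsi=0, filterOnJaccard=TRUE,
                                          filter=FALSE)

    mcols(fds, type="jaccard")[, c("maxCount", "passedExpression",
                                   "maxDJaccard", "passedVariability", "passed")]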
+
+
+#' Old FRASER1 expression filtering function
+#' @noRd
+filterExpression.FRASER <- function(object, minExpressionInOneSample=20,
+ quantile=0.95, quantileMinExpression=10, filter=TRUE,
+ delayed=ifelse(ncol(object) <= 300, FALSE, TRUE),
+ BPPARAM=bpparam()){
+
stopifnot(is(object, "FraserDataSet"))
message(date(), ": Filtering out introns with low read support ...")
@@ -83,30 +386,30 @@ filterExpression.FRASER <- function(object, minExpressionInOneSample=20,
ctsN5 <- as.matrix(ctsN5)
ctsN3 <- as.matrix(ctsN3)
}
-
+
# cutoff functions
f1 <- function(cts, ...){
- rowMaxs(cts) }
+ rowMaxs(cts) }
f2 <- function(cts, ctsN5, quantile, ...){
- rowQuantiles(ctsN5, probs=quantile, drop=FALSE)[,1] }
+ rowQuantiles(ctsN5, probs=quantile, drop=FALSE)[,1] }
f3 <- function(cts, ctsN3, quantile, ...) {
- rowQuantiles(ctsN3, probs=quantile, drop=FALSE)[,1] }
-
+ rowQuantiles(ctsN3, probs=quantile, drop=FALSE)[,1] }
+
funs <- c(maxCount=f1, quantileValue5=f2, quantileValue3=f3)
-
+
# run it in parallel
cutoffs <- bplapply(funs, function(f, ...) f(...), BPPARAM=BPPARAM,
- cts=cts, ctsN3=ctsN3, ctsN5=ctsN5, quantile=quantile)
-
+ cts=cts, ctsN3=ctsN3, ctsN5=ctsN5, quantile=quantile)
+
# add annotation to object
for(n in names(cutoffs)){
mcols(object, type="j")[n] <- cutoffs[[n]]
}
mcols(object, type="j")[['passedExpression']] <-
- cutoffs$maxCount >= minExpressionInOneSample &
- (cutoffs$quantileValue5 >= quantileMinExpression &
- cutoffs$quantileValue3 >= quantileMinExpression)
+ cutoffs$maxCount >= minExpressionInOneSample &
+ (cutoffs$quantileValue5 >= quantileMinExpression &
+ cutoffs$quantileValue3 >= quantileMinExpression)
if("passedVariability" %in% colnames(mcols(object, type="j"))){
mcols(object, type="j")[['passed']] <-
mcols(object, type="j")[['passedExpression']] &
@@ -119,25 +422,18 @@ filterExpression.FRASER <- function(object, minExpressionInOneSample=20,
# filter if requested
if(isTRUE(filter)){
object <- applyExpressionFilters(object, minExpressionInOneSample,
- quantileMinExpression)
+ quantileMinExpression)
}
-
+
validObject(object)
return(object)
}
-#' @describeIn filtering This function filters out introns and corresponding
-#' splice sites that have low read support in all samples.
-#' @export
-setMethod("filterExpression", signature="FraserDataSet",
- filterExpression.FRASER)
-
-#' @describeIn filtering This function filters out introns and corresponding
-#' splice sites which do not show variablity across samples.
-#' @export
-filterVariability <- function(object, minDeltaPsi=0.05, filter=TRUE,
- delayed=ifelse(ncol(object) <= 300, FALSE, TRUE),
- BPPARAM=bpparam()){
+#' Old FRASER1 variability filtering function
+#' @noRd
+filterVariability.FRASER <- function(object, minDeltaPsi=0.05, filter=TRUE,
+ delayed=ifelse(ncol(object) <= 300, FALSE, TRUE),
+ BPPARAM=bpparam()){
message(date(), ": Filtering out non-variable introns ...")
@@ -167,7 +463,7 @@ filterVariability <- function(object, minDeltaPsi=0.05, filter=TRUE,
theta <- ctsSE/ctsNSE
dTheta <- rowMaxs(abs(theta - rowMeans2(theta, na.rm=TRUE)),
na.rm=TRUE) }
-
+
funs <- c(maxDPsi3=f1, maxDPsi5=f2, maxDTheta=f3)
@@ -194,14 +490,14 @@ filterVariability <- function(object, minDeltaPsi=0.05, filter=TRUE,
mcols(object, type="j")["maxDThetaAcceptor"] <-
merge(intron_dt, ss_dt, by.x="endID", by.y="spliceSiteID",
all.x=TRUE, sort=FALSE)[,maxDTheta]
-
+
# check which introns pass the filter
mcols(object, type="j")[['passedVariability']] <- pmax(na.rm=TRUE,
- cutoffs$maxDPsi3,
- cutoffs$maxDPsi5,
- mcols(object, type="j")$maxDThetaDonor,
- mcols(object, type="j")$maxDThetaAcceptor,
- 0) >= minDeltaPsi
+ cutoffs$maxDPsi3,
+ cutoffs$maxDPsi5,
+ mcols(object, type="j")$maxDThetaDonor,
+ mcols(object, type="j")$maxDThetaAcceptor,
+ 0) >= minDeltaPsi
if("passedExpression" %in% colnames(mcols(object, type="j"))){
mcols(object, type="j")[['passed']] <-
mcols(object, type="j")[['passedExpression']] &
@@ -232,8 +528,8 @@ applyExpressionFilters <- function(fds, minExpressionInOneSample,
# report rare junctions that passed minExpression filter but not
# quantileFilter as SE obj
junctionsToReport <- maxCount >= minExpressionInOneSample &
- !(quantileValue5 >= quantileMinExpression &
- quantileValue3 >= quantileMinExpression)
+ !(quantileValue5 >= quantileMinExpression &
+ quantileValue3 >= quantileMinExpression)
outputDir <- file.path(workingDir(fds), "savedObjects", nameNoSpace(fds))
if(any(junctionsToReport)){
@@ -247,8 +543,8 @@ applyExpressionFilters <- function(fds, minExpressionInOneSample,
}
}
rareJunctions <- saveHDF5SummarizedExperiment(rareJunctions,
- dir=file.path(tempdir(), "tmp_rJ"),
- replace=TRUE)
+ dir=file.path(tempdir(), "tmp_rJ"),
+ replace=TRUE)
# check if folder already exists from previous filtering
rareJctsDir <- file.path(outputDir, "rareJunctions")
@@ -284,7 +580,6 @@ applyExpressionFilters <- function(fds, minExpressionInOneSample,
fds <- fds[mcols(fds, type="j")[['passedExpression']], by="psi5"]
return(fds)
-
}
#' Applies previously calculated variablilty filters
@@ -308,8 +603,8 @@ applyVariabilityFilters <- function(fds, minDeltaPsi){
nonVariableJunctions <- asSE(fds[filtered, by="j"])
for(aname in assayNames(nonVariableJunctions)){
if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5",
- "rawOtherCounts_psi3", "psi5", "psi3",
- "delta_psi5", "delta_psi3"))){
+ "rawOtherCounts_psi3", "psi5", "psi3",
+ "delta_psi5", "delta_psi3"))){
assay(nonVariableJunctions, aname) <- NULL
}
}
@@ -325,15 +620,15 @@ applyVariabilityFilters <- function(fds, minDeltaPsi){
"cannot be restored.")
nV_stored <- loadHDF5SummarizedExperiment(dir=nonVarJctsDir)
toReport <- mcols(nV_stored)$maxDPsi5 < minDeltaPsi &
- mcols(nV_stored)$maxDPsi3 < minDeltaPsi &
- mcols(nV_stored)$maxDThetaDonor < minDeltaPsi &
- mcols(nV_stored)$maxDThetaAcceptor < minDeltaPsi
+ mcols(nV_stored)$maxDPsi3 < minDeltaPsi &
+ mcols(nV_stored)$maxDThetaDonor < minDeltaPsi &
+ mcols(nV_stored)$maxDThetaAcceptor < minDeltaPsi
nVJunctions <- rbind(nonVariableJunctions, nV_stored[toReport,])
for(aname in assayNames(nVJunctions)){
assay(nVJunctions, aname) <-
- rbind(as.matrix(assay(nonVariableJunctions, aname)),
- as.matrix(assay(nV_stored[toReport,], aname)) )
+ rbind(as.matrix(assay(nonVariableJunctions, aname)),
+ as.matrix(assay(nV_stored[toReport,], aname)) )
}
nonVariableJunctions <- nVJunctions
rm(nVJunctions)
@@ -351,5 +646,5 @@ applyVariabilityFilters <- function(fds, minDeltaPsi){
"% of the junctions"))
fds <- fds[mcols(fds, type="j")[['passedVariability']], by="psi5"]
return(fds)
-
+
}
diff --git a/R/find_encoding_dimensions.R b/R/find_encoding_dimensions.R
index b01a4d1a..ad4873ec 100644
--- a/R/find_encoding_dimensions.R
+++ b/R/find_encoding_dimensions.R
@@ -27,6 +27,8 @@ predict_outliers <- function(fds, type, implementation, BPPARAM){
fds <- calculatePvalues(fds, type=type, implementation=implementation,
BPPARAM=BPPARAM)
+ fds <- calculatePadjValues(fds, type=type, geneLevel=FALSE,
+ BPPARAM=BPPARAM)
return(fds)
}
@@ -50,7 +52,7 @@ eval_prot <- function(fds, type){
}, FUN.VALUE=logical(length(unique(index))) ) ) + 0
if(any(is.na(scores))){
- warning(sum(is.na(scores)), " P-values where NAs.")
+ # warning(sum(is.na(scores)), " P-values where NAs.")
scores[is.na(scores)] <- min(scores, na.rm=TRUE)-1
}
pr <- pr.curve(scores, weights.class0=labels)
@@ -111,17 +113,18 @@ findEncodingDim <- function(i, fds, type, params, implementation,
#' @examples
#' # generate data
#' fds <- makeSimulatedFraserDataSet(m=15, j=20)
+#' fds <- calculatePSIValues(fds)
#'
#' # run hyperparameter optimization
-#' fds <- optimHyperParams(fds, type="psi5", q_param=c(2, 5))
+#' fds <- optimHyperParams(fds, type="jaccard", q_param=c(2, 5))
#'
#' # get estimated optimal dimension of the latent space
-#' bestQ(fds, type="psi5")
-#' hyperParams(fds, type="psi5")
+#' bestQ(fds, type="jaccard")
+#' hyperParams(fds, type="jaccard")
#'
#' @export
-optimHyperParams <- function(fds, type, implementation="PCA",
- q_param=seq(2, min(40, ncol(fds)), by=3),
+optimHyperParams <- function(fds, type=psiTypes, implementation="PCA",
+ q_param=getEncDimRange(fds),
noise_param=0, minDeltaPsi=0.1,
iterations=5, setSubset=50000, injectFreq=1e-2,
BPPARAM=bpparam(), internalThreads=1, plot=TRUE,
@@ -227,3 +230,19 @@ optimHyperParams <- function(fds, type, implementation="PCA",
return(fds)
}
+#' Get the default range of latent space dimensions to test during
+#' hyperparameter optimization
+#' @noRd
+getEncDimRange <- function(fds, mp=3){
+ # Get range for latent space dimension
+ a <- 2
+ b <- min(ncol(fds), nrow(fds)) / mp # N/mp
+
+ maxSteps <- 12
+ if(mp < 6){
+ maxSteps <- 15
+ }
+
+ Nsteps <- min(maxSteps, b)
+ pars_q <- round(exp(seq(log(a),log(b),length.out = Nsteps))) %>% unique
+ return(pars_q)
+}
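
For a hypothetical dataset with 100 samples (and many more introns), the resulting grid is log-spaced between 2 and N/3, roughly as in this sketch:

    a      <- 2
    b      <- 100 / 3                    # min(ncol, nrow) / mp with mp = 3
    Nsteps <- min(15, b)                 # mp < 6, so up to 15 grid points
    unique(round(exp(seq(log(a), log(b), length.out=Nsteps))))
    #> roughly 2 3 4 5 7 8 10 12 15 18 22 27 33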
diff --git a/R/fitCorrectionMethods.R b/R/fitCorrectionMethods.R
index f9b67599..1b726403 100644
--- a/R/fitCorrectionMethods.R
+++ b/R/fitCorrectionMethods.R
@@ -40,7 +40,7 @@ fit.FraserDataSet <- function(object, implementation=c("PCA", "PCA-BB-Decoder",
"AE", "AE-weighted", "PCA-BB-full", "fullAE",
"PCA-regression", "PCA-reg-full",
"PCA-BB-Decoder-no-weights", "BB"),
- q, type="psi3", rhoRange=c(1e-8, 1-1e-8),
+ q, type=psiTypes, rhoRange=c(-30, 30),
weighted=FALSE, noiseAlpha=1, convergence=1e-5,
iterations=15, initialize=TRUE, control=list(),
BPPARAM=bpparam(), nSubset=15000,
@@ -51,6 +51,7 @@ fit.FraserDataSet <- function(object, implementation=c("PCA", "PCA-BB-Decoder",
paste(names(list(...)), collapse=", "))
}
method <- match.arg(implementation)
+ type <- match.arg(type)
verbose <- verbose(object) > 0
diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R
index 94e8ca5e..00fb0055 100644
--- a/R/getNSetterFuns.R
+++ b/R/getNSetterFuns.R
@@ -10,11 +10,14 @@
#' @param level Indicates if the retrieved p values should be adjusted on the
#' donor/acceptor site-level (default) or if unadjusted junction-level
#' p values should be returned.
+#' @param filters A named list giving the filters that were applied for masking
+#' during p value correction. Used for storing and retrieving the
+#' correct set of requested p values.
#' @param value The new value to be assigned.
#' @param all Logical value indicating whether \code{hyperParams(fds)} should
#' return the results of all evaluated parameter combinations or only
#' for the optimal parameter combination.
-#' @param ... Internally used parameteres.
+#' @param ... Internally used parameters.
#' @return A (delayed) matrix or vector dependent on the type of data retrieved.
#'
#' @name getter_setter_functions
@@ -29,7 +32,7 @@
#' dontWriteHDF5 <- TRUE
#'
#' # get/set the splice metric for which results should be retrieved
-#' currentType(fds) <- "psi5"
+#' currentType(fds) <- "jaccard"
#' currentType(fds)
#'
#' # get fitted parameters
@@ -40,6 +43,9 @@
#' # get statistics
#' pVals(fds)
#' padjVals(fds)
+#'
+#' # z-scores are not calculated by default
+#' fds <- calculateZscore(fds, type="jaccard")
#' zScores(fds)
#'
#' # set and get pseudocount
@@ -47,9 +53,9 @@
#' pseudocount()
#'
#' # retrieve or set a mask to exclude certain junctions in the fitting step
-#' featureExclusionMask(fds, type="theta") <- sample(
-#' c(FALSE, TRUE), nrow(mcols(fds, type="theta")), replace=TRUE)
-#' featureExclusionMask(fds, type="theta")
+#' featureExclusionMask(fds, type="jaccard") <- sample(
+#' c(FALSE, TRUE), nrow(mcols(fds, type="jaccard")), replace=TRUE)
+#' featureExclusionMask(fds, type="jaccard")
#'
#' # controlling the verbosity level of the output of some algorithms
#' verbose(fds) <- 2
@@ -198,7 +204,7 @@ predictY <- function(fds, type=currentType(fds), noiseAlpha=NULL){
}
-`setAssayMatrix<-` <- function(fds, name, type, ..., value){
+`setAssayMatrix<-` <- function(fds, name, type=currentType(fds), ..., value){
if(!is.matrix(value)){
value <- matrix(value, ncol=ncol(fds), nrow=nrow(mcols(fds, type=type)))
}
@@ -217,7 +223,7 @@ predictY <- function(fds, type=currentType(fds), noiseAlpha=NULL){
fds
}
-getAssayMatrix <- function(fds, name, type, byGroup=FALSE){
+getAssayMatrix <- function(fds, name, type=currentType(fds), byGroup=FALSE){
if(missing(name)){
name <- type
} else {
@@ -248,8 +254,8 @@ zScores <- function(fds, type=currentType(fds), byGroup=FALSE, ...){
#' @describeIn getter_setter_functions This returns the calculated p-values.
#' @export
pVals <- function(fds, type=currentType(fds), level="site",
- dist="BetaBinomial", ...){
- level <- match.arg(level, choices=c("site", "junction"))
+ filters=list(), dist="BetaBinomial", ...){
+ level <- match.arg(level, choices=c("site", "junction", "gene"))
dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal"))
aname <- paste0("pvalues", dist)
if(level == "junction"){
@@ -260,33 +266,148 @@ pVals <- function(fds, type=currentType(fds), level="site",
warning("Did not find junction-level p values. ",
"Using site-level p values instead.")
}
+ } else{
+ aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname)
+ # add information on used filters
+ if(is.null(names(filters))){
+ filters <- list(rho=1)
+ }
+ for(n in sort(names(filters))){
+ aname_new <- paste0(aname, "_", n, filters[[n]])
+ if(n == "rho" && filters[[n]] == 1){
+ if(any(grepl(aname_new, assayNames(fds))) ||
+ any(grepl(aname_new, names(metadata(fds))))){
+ aname <- aname_new
+ }
+ }else{
+ aname <- aname_new
+ }
+ }
+ if(level == "gene"){
+ if(!paste(aname, type, sep="_") %in% names(metadata(fds))){
+ stop("Did not find gene-level p values. ",
+ "Please compute them first.")
+ }
+ return(metadata(fds)[[paste(aname, type, sep="_")]])
+ }
}
+
getAssayMatrix(fds, aname, type=type, ...)
}
`pVals<-` <- function(fds, type=currentType(fds), level="site",
+ filters=list(),
dist="BetaBinomial", ..., value){
- level <- match.arg(level, choices=c("site", "junction"))
+ level <- match.arg(level, choices=c("site", "junction", "gene"))
dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal"))
aname <- paste0("pvalues", dist)
if(level == "junction"){
aname <- paste0(aname, "_junction")
+ setAssayMatrix(fds, name=aname, type=type, ...) <- value
+ return(fds)
+ } else if(level == "gene"){
+ aname <- paste0(aname, "_gene")
+ }
+ # add information on used filters
+ for(n in sort(names(filters))){
+ aname <- paste0(aname, "_", n, filters[[n]])
+ }
+
+ if(level == "gene"){
+ if(is.null(rownames(value))){
+ stop("Missing rownames when storing gene-level pvalues.")
+ }
+ metadata(fds)[[paste(aname, type, sep="_")]] <- value
+ } else{
+ setAssayMatrix(fds, name=aname, type=type, ...) <- value
}
- setAssayMatrix(fds, name=aname, type=type, ...) <- value
return(fds)
}
#' @describeIn getter_setter_functions This returns the adjusted p-values.
#' @export
-padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"), ...){
+padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"),
+ level="site", subsetName=NULL, filters=list(), ...){
+ level <- match.arg(level, choices=c("site", "gene"))
dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal"))
- return(getAssayMatrix(fds, paste0("padj", dist), type=type, ...))
+ aname <- paste0("padj", dist)
+ aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname)
+ if(!is.null(subsetName)){
+ aname <- paste0(aname, "_", subsetName)
+ }
+ # add information on used filters
+ if(is.null(names(filters))){
+ filters <- list(rho=1)
+ }
+ for(n in sort(names(filters))){
+ aname_new <- paste0(aname, "_", n, filters[[n]])
+ if(n == "rho" && filters[[n]] == 1){
+ if(any(grepl(aname_new, assayNames(fds))) ||
+ any(grepl(aname_new, names(metadata(fds))))){
+ aname <- aname_new
+ }
+ }else{
+ aname <- aname_new
+ }
+ }
+ if(level == "gene"){
+ if(!paste(aname, type, sep="_") %in% names(metadata(fds))){
+ stop("Did not find gene-level padj values. ",
+ "Please compute them first.")
+ }
+ return(metadata(fds)[[paste(aname, type, sep="_")]])
+ }
+ return(getAssayMatrix(fds, aname, type=type, ...))
}
-`padjVals<-` <- function(fds, type=currentType(fds),
- dist="BetaBinomial", ..., value){
+`padjVals<-` <- function(fds, type=currentType(fds), level="site",
+ dist="BetaBinomial", subsetName=NULL, filters=list(), ...,
+ value){
+ level <- match.arg(level, choices=c("site", "gene"))
dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal"))
- setAssayMatrix(fds, name=paste0("padj", dist), type=type, ...) <- value
+ aname <- paste0("padj", dist)
+ aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname)
+ if(!is.null(subsetName)){
+ aname <- paste0(aname, "_", subsetName)
+ }
+ # add information on used filters
+ for(n in sort(names(filters))){
+ aname <- paste0(aname, "_", n, filters[[n]])
+ }
+ if(level == "gene"){
+ if(is.null(rownames(value))){
+            stop("Missing rownames when storing gene-level padj values.")
+ }
+ metadata(fds)[[paste(aname, type, sep="_")]] <- value
+ } else{
+ setAssayMatrix(fds, name=aname, type=type, ...) <- value
+ }
+ return(fds)
+}
+
+#' @describeIn getter_setter_functions This returns the names of FDR subsets
+#' for which adjusted p values have been calculated.
+#' @export
+availableFDRsubsets <- function(fds){
+ ans <- metadata(fds)[["FDRsubsets"]]
+ return(ans)
+}
+
+`availableFDRsubsets<-` <- function(fds, value){
+ metadata(fds)[["FDRsubsets"]] <- value
+ return(fds)
+}
+
+`addToAvailableFDRsubsets<-` <- function(fds, value){
+ if(!isScalarCharacter(value)){
+ stop("The assigned value needs to be a scalar character.")
+ }
+ ans <- metadata(fds)[["FDRsubsets"]]
+ if(is.null(ans)){
+ metadata(fds)[["FDRsubsets"]] <- value
+ } else{
+ metadata(fds)[["FDRsubsets"]] <- unique(c(ans, value))
+ }
return(fds)
}
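
To illustrate the naming scheme these getters and setters rely on, here is a sketch of the metadata key that padjVals() would look up for gene-level values of the jaccard metric, a hypothetical FDR subset named "OMIM" and a rho filter of 0.1:

    dist       <- "BetaBinomial"
    subsetName <- "OMIM"                 # hypothetical subset label
    filters    <- list(rho=0.1)

    aname <- paste0("padj", dist, "_gene", "_", subsetName)
    for(n in sort(names(filters))){
        aname <- paste0(aname, "_", n, filters[[n]])
    }
    paste(aname, "jaccard", sep="_")
    #> "padjBetaBinomial_gene_OMIM_rho0.1_jaccard"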
@@ -311,10 +432,14 @@ deltaPsiValue <- function(fds, type=currentType(fds)){
#' @describeIn getter_setter_functions Returns the psi type that is used
-#' within several methods in the FRASER package.
+#' within several methods in the FRASER package (defaults to jaccard).
#' @export
currentType <- function(fds){
- return(metadata(fds)[['currentType']])
+ curType <- metadata(fds)[['currentType']]
+ if(is.null(curType)){
+ curType <- "jaccard"
+ }
+ return(curType)
}
#' @describeIn getter_setter_functions Sets the psi type that is to be used
@@ -326,6 +451,27 @@ currentType <- function(fds){
return(fds)
}
+#' @describeIn getter_setter_functions Returns the splice metrics that will be
+#' fitted (defaults to jaccard, used within several methods in the
+#' FRASER package).
+#' @export
+fitMetrics <- function(fds){
+ metrics <- metadata(fds)[['fit_metrics']]
+ if(is.null(metrics)){
+ metrics <- "jaccard"
+ }
+ return(metrics)
+}
+
+#' @describeIn getter_setter_functions Sets the splice metrics that will be
+#' fitted (used within several methods in the FRASER package).
+#' @export
+`fitMetrics<-` <- function(fds, value){
+ stopifnot(is.character(whichPSIType(value)))
+ metadata(fds)[['fit_metrics']] <- whichPSIType(value)
+ return(fds)
+}
+
#' @describeIn getter_setter_functions Sets and returns the pseudo count used
#' within the FRASER fitting procedure.
#' @export
@@ -342,7 +488,7 @@ pseudocount <- function(value=NULL){
# set pseudo count if provided
stopifnot(isScalarNumeric(value))
stopifnot(value >= 0)
- value <- as.integer(value)
+ value <- as.numeric(value)
options('FRASER.pseudoCount'=value)
devNULL <- .setPseudoCount(value)
stopifnot(value == devNULL)
@@ -433,7 +579,7 @@ dontWriteHDF5 <- function(fds){
return(fds)
}
-getTrueOutliers <- function(fds, type, byGroup=FALSE, ...){
+getTrueOutliers <- function(fds, type=currentType(fds), byGroup=FALSE, ...){
ans <- getAssayMatrix(fds, "trueOutliers", type)
if(isTRUE(byGroup)){
ans <- getAbsMaxByGroup(fds, type, ans, ...)
@@ -443,7 +589,7 @@ getTrueOutliers <- function(fds, type, byGroup=FALSE, ...){
pmin(pmax(ans, -1), 1)
}
-getTrueDeltaPsi <- function(fds, type, byGroup=FALSE, ...){
+getTrueDeltaPsi <- function(fds, type=currentType(fds), byGroup=FALSE, ...){
ans <- getAssayMatrix(fds, "trueDeltaPSI", type)
if(isTRUE(byGroup)){
ans <- getAbsMaxByGroup(fds, type, ans, ...)
@@ -451,7 +597,8 @@ getTrueDeltaPsi <- function(fds, type, byGroup=FALSE, ...){
ans
}
-getAbsMaxByGroup <- function(fds, type, mat, index=NULL, BPPARAM=bpparam()){
+getAbsMaxByGroup <- function(fds, type=currentType(fds), mat, index=NULL,
+ BPPARAM=bpparam()){
if(is.null(index)){
index <- getSiteIndex(fds, type)
}
@@ -470,13 +617,13 @@ getAbsMaxByGroup <- function(fds, type, mat, index=NULL, BPPARAM=bpparam()){
return(values)
}
-getByGroup <- function(fds, type, value){
+getByGroup <- function(fds, type=currentType(fds), value){
index <- getSiteIndex(fds, type)
idx <- !duplicated(index)
return(value[idx,])
}
-getDeltaPsi <- function(fds, type, byGroup=FALSE, ...){
+getDeltaPsi <- function(fds, type=currentType(fds), byGroup=FALSE, ...){
mu <- predictedMeans(fds, type)
dataPsi <- (K(fds, type) + pseudocount())/(N(fds, type) + 2*pseudocount())
deltaPSI <- dataPsi - mu
@@ -488,13 +635,14 @@ getDeltaPsi <- function(fds, type, byGroup=FALSE, ...){
# calculate FRASER weights
-calcFraserWeights <- function(fds, psiType){
+calcFraserWeights <- function(fds, psiType=currentType(fds)){
k <- as.matrix(K(fds, psiType))
n <- as.matrix(N(fds, psiType))
mu <- t(predictMu(fds, psiType))
rho <- rho(fds, psiType)
- dataPsi <- plogis(t(
- x(fds, type=psiType, all=TRUE, center=FALSE, noiseAlpha=NULL)))
+ # dataPsi <- plogis(t(
+ # x(fds, type=psiType, all=TRUE, center=FALSE, noiseAlpha=NULL)))
+ dataPsi <- k / n
# pearson residuals for BB
# on counts of success k
@@ -503,18 +651,23 @@ calcFraserWeights <- function(fds, psiType){
# (1+((n+2*pseudocount())-1)*rho))
# on probability of success mu
r <- (dataPsi - mu) / sqrt(
- mu * (1-mu) * (1+((n+2*pseudocount())-1)*rho) /
- (n+2*pseudocount()))
+ # mu * (1-mu) * (1+((n+2*pseudocount())-1)*rho) /
+ # (n+2*pseudocount()))
+ mu * (1-mu) * (1+(n-1)*rho) / n
+ )
# weights according to Huber function (as in edgeR)
c <- 1.345; # constant, as suggested in edgeR paper
w <- ifelse(abs(r) > c, c/abs(r) , 1)
+ # set weights to 0 if NA (i.e. N=0)
+ w[is.na(w)] <- 0
+
return(w)
}
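
For one observation, the robust weighting above is a Pearson residual followed by a Huber cut-off; a sketch with toy numbers:

    # toy counts and fitted parameters for one junction in one sample
    k   <- 15; n <- 20        # junction and total counts
    mu  <- 0.5; rho <- 0.01   # fitted mean and overdispersion
    psi <- k / n              # observed ratio (dataPsi above)

    # Pearson residual on the probability scale, as in calcFraserWeights()
    r <- (psi - mu) / sqrt(mu * (1 - mu) * (1 + (n - 1) * rho) / n)

    # observations with |r| beyond c = 1.345 are down-weighted
    c_huber <- 1.345
    w <- ifelse(abs(r) > c_huber, c_huber / abs(r), 1)
    #> r is about 2.05, so w is about 0.66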
# get FRASER weights
-weights <- function(fds, type){
+weights <- function(fds, type=currentType(fds)){
return(getAssayMatrix(fds, "weights", type))
}
@@ -524,7 +677,7 @@ weights <- function(fds, type){
return(fds)
}
-getIndexFromResultTable <- function(fds, resultTable, padj.method="holm"){
+getIndexFromResultTable <- function(fds, resultTable){
type <- as.character(resultTable$type)
target <- makeGRangesFromDataFrame(resultTable)
if(type == "theta"){
@@ -541,8 +694,9 @@ getIndexFromResultTable <- function(fds, resultTable, padj.method="holm"){
ov
}
-getPlottingDT <- function(fds, axis=c("row", "col"), type=NULL, result=NULL,
- idx=NULL, aggregate=FALSE, pvalLevel="site", Ncpus=3, ...){
+getPlottingDT <- function(fds, axis=c("row", "col"), type=currentType(fds),
+ result=NULL, idx=NULL, aggregate=FALSE, pvalLevel="site",
+ Ncpus=3, geneColumn="hgnc_symbol", ...){
if(!is.null(result)){
type <- as.character(result$type)
idx <- getIndexFromResultTable(fds, result)
@@ -564,8 +718,8 @@ getPlottingDT <- function(fds, axis=c("row", "col"), type=NULL, result=NULL,
spliceID <- getSiteIndex(fds, type=type)[idxrow]
feature_names <- rownames(mcols(fds, type=type))[idxrow]
- if("hgnc_symbol" %in% colnames(mcols(fds, type=type))){
- feature_names <- mcols(fds, type=type)[idxrow,"hgnc_symbol"]
+ if(geneColumn %in% colnames(mcols(fds, type=type))){
+ feature_names <- mcols(fds, type=type)[idxrow, geneColumn]
}
if(is.null(feature_names)){
feature_names <- as.character(seq_row(mcols(fds, type=type)))[idxrow]
@@ -587,42 +741,62 @@ getPlottingDT <- function(fds, axis=c("row", "col"), type=NULL, result=NULL,
pval = c(pVals(fds, type=type,
level=pvalLevel)[idxrow, idxcol]),
padj = c(padjVals(fds, type=type)[idxrow, idxcol]),
- zscore = c(zScores(fds, type=type)[idxrow, idxcol]),
- obsPsi = c((k + pseudocount())/(n + 2*pseudocount())),
- predPsi = c(predictedMeans(fds, type)[idxrow, idxcol]))
+ obsPsi = c(k/n),
+ predPsi = c(predictedMeans(fds, type)[idxrow, idxcol]),
+ rho = rep(rho(fds, type=type)[idxrow],
+ ifelse(isTRUE(idxcol), ncol(fds), sum(idxcol)))
+ )
dt[, deltaPsi:=obsPsi - predPsi]
# add aberrant information to it
aberrantVec <- aberrant(fds, ..., padjVals=dt[,.(padj)],
- dPsi=dt[,.(deltaPsi)], zscores=dt[,.(zscore)], n=dt[,.(n)])
+ dPsi=dt[,.(deltaPsi)], n=dt[,.(n)],
+ rhoVals=dt[,.(rho)], aggregate=FALSE)
dt[,aberrant:=aberrantVec]
- # if requested return gene p values (correct for multiple testing again)
+ # if requested return gene p values
if(isTRUE(aggregate)){
dt <- dt[!is.na(featureID)]
-
- # correct by gene and take the smallest p value
- dt <- rbindlist(bplapply(unique(dt[,sampleID]),
- BPPARAM=getBPParam(Ncpus, length(unique(dt[,sampleID]))),
- FUN=function(x){
- dttmp <- dt[sampleID == x]
- dttmp[, pval:=p.adjust(pval, method="holm"),
- by="sampleID,featureID,type"]
- dttmp <- dttmp[order(sampleID, featureID, type, -aberrant,
- pval, -abs(deltaPsi))][
- !duplicated(data.table(sampleID, featureID, type))]
- dttmp <- dttmp[, padj:=p.adjust(pval, method="BY"),
- by="sampleID,type"]
- dttmp
- }))
+ # split featureID into several rows if more than one
+ dt[, dt_idx:=seq_len(.N)]
+ dt_tmp <- dt[, splitGenes(featureID), by="dt_idx"]
+ dt <- dt[dt_tmp$dt_idx,]
+ dt[,`:=`(featureID=dt_tmp$V1, dt_idx=NULL)]
+
+ # get gene-level pvalue matrices
+ pvalsGene <- lapply(c("pval", "padj"), function(x){
+ if(x == "pval"){
+ pvalsGene <- pVals(fds, type=type,
+ level="gene")[,idxcol,drop=FALSE]
+ } else {
+ pvalsGene <- padjVals(fds, type=type,
+ level="gene")[,idxcol,drop=FALSE]
+ }
+ pvalsGene <- data.table(featureID=rownames(pvalsGene), pvalsGene)
+ pvalsGene <- melt(pvalsGene, value.name=paste0("gene_", x),
+ id.vars="featureID", variable.name="sampleID")
+ return(pvalsGene)
+ })
+ pvalsGene <- merge(pvalsGene[[1]], pvalsGene[[2]],
+ by=c("featureID", "sampleID"))
+
+ # merge with gene pval matrix
+ dt <- merge(dt, pvalsGene, by=c("featureID", "sampleID"))
+ dt[,`:=`(pval=gene_pval, padj=gene_padj,
+ gene_pval=NULL, gene_padj=NULL)]
+
+ # sort
+ dt <- dt[order(sampleID, featureID, type, -aberrant,
+ padj, -abs(deltaPsi))][
+ !duplicated(data.table(sampleID, featureID, type))]
}
-
+
# return object
dt
}
-#' @describeIn getter_setter_functions Dependend on the level of verbosity
+#' @describeIn getter_setter_functions Dependent on the level of verbosity
#' the algorithm reports more or less to the user. 0 means being quiet
#' and 10 means everything.
#' @export
diff --git a/R/helper-functions.R b/R/helper-functions.R
index 15b91fde..03d8de3b 100644
--- a/R/helper-functions.R
+++ b/R/helper-functions.R
@@ -71,7 +71,7 @@ checkReadType <- function(fds, type){
# check if type is null or missing
if(missing(type) | is.null(type)){
- if(verbose(fds) > 0){
+ if(verbose(fds) > 3){
warning("Read type was not specified!",
"We will assume the default: 'j'")
}
@@ -79,7 +79,7 @@ checkReadType <- function(fds, type){
}
type <- unique(type)
stopifnot(isScalarCharacter(type))
- correctTypes <- c(psi3="j", psi5="j", theta="ss")
+ correctTypes <- c(psi3="j", psi5="j", theta="ss", jaccard="j")
# check if it is already the correct type
if(type %in% correctTypes) return(type)
@@ -109,7 +109,7 @@ checkReadType <- function(fds, type){
#'
#' @noRd
whichPSIType <- function(type){
- unlist(regmatches(type, gregexpr("psi(3|5)|theta", type, perl=TRUE)))
+ unlist(regmatches(type, gregexpr("psi(3|5)|theta|jaccard", type, perl=TRUE)))
}
#'
@@ -122,7 +122,8 @@ whichReadType <- function(fds, name){
# check writing
if(name == "ss" | endsWith(name, "theta"))
return("ss")
- if(name == "j" | endsWith(name, "psi5") | endsWith(name, "psi3"))
+ if(name == "j" | endsWith(name, "psi5") | endsWith(name, "psi3") |
+ endsWith(name, "jaccard"))
return("j")
# check assay names
@@ -355,18 +356,18 @@ assayExists <- function(fds, assayName){
return(aexists)
}
-getAssayAsVector <- function(fds, prefix, psiType, sampleID){
+getAssayAsVector <- function(fds, prefix, psiType=currentType(fds), sampleID){
as.vector(assay(fds, paste0(prefix, psiType))[,sampleID])
}
-variableJunctions <- function(fds, type, minDeltaPsi=0.1){
+variableJunctions <- function(fds, type=currentType(fds), minDeltaPsi=0.1){
psi <- K(fds, type=type)/N(fds, type=type)
j2keep <- rowMaxs(abs(psi - rowMeans(psi, na.rm=TRUE)), na.rm=TRUE)
j2keep >= minDeltaPsi
}
-subsetKMostVariableJunctions <- function(fds, type, n){
+subsetKMostVariableJunctions <- function(fds, type=currentType(fds), n){
curX <- x(fds, type=type, all=TRUE, center=FALSE, noiseAlpha=NULL)
xsd <- colSds(curX)
nMostVarJuncs <- which(xsd >= sort(xsd, TRUE)[min(length(xsd), n*2)])
@@ -375,7 +376,8 @@ subsetKMostVariableJunctions <- function(fds, type, n){
ans
}
-getSubsetVector <- function(fds, type, minDeltaPsi=0.1, nSubset=15000){
+getSubsetVector <- function(fds, type=currentType(fds), minDeltaPsi=0.1,
+ nSubset=15000){
# get any variable intron
ans <- variableJunctions(fds, type, minDeltaPsi=minDeltaPsi)
@@ -549,6 +551,130 @@ getStrandString <- function(fds){
return(strand)
}
+
+#'
+#' Check if adjusted pvalues have been computed for a given set of filters.
+#' @noRd
+checkPadjAvailableForFilters <- function(fds, type=currentType(fds),
+ filters=list(), dist="BetaBinomial", aggregate=FALSE,
+ subsetName=NULL){
+ dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal"))
+ aname <- paste0("padj", dist)
+ aname <- ifelse(isTRUE(aggregate), paste0(aname, "_gene"), aname)
+ aname <- ifelse(!is.null(subsetName), paste0(aname, "_", subsetName), aname)
+
+ # add information on used filters
+ for(n in sort(names(filters))){
+ aname_new <- paste0(aname, "_", n, filters[[n]])
+ if(n == "rho" && filters[[n]] == 1){
+ if(any(grepl(aname_new, assayNames(fds))) ||
+ any(grepl(aname_new, names(metadata(fds))))){
+ aname <- aname_new
+ }
+ }else{
+ aname <- aname_new
+ }
+ }
+ aname <- paste(aname, type, sep="_")
+ if(isTRUE(aggregate)){
+ pvalsAvailable <- aname %in% names(metadata(fds))
+ } else{
+ pvalsAvailable <- aname %in% assayNames(fds)
+ }
+ return(pvalsAvailable)
+}
+
+#'
+#' Find most aberrant junction for each aberrant gene
+#'
+#' @param gr GRanges object with information about junctions.
+#' @param aberrantGenes Significant genes for which the corresponding junction
+#' should be extracted.
+#' @param pvals Vector of pvalues (for one sample).
+#' @param dpsi Vector of delta psi values (for one sample).
+#' @param aberrantJunctions Vector indicating which junctions are considered
+#' aberrant.
+#' @param geneColumn Name of the column in mcols(fds) that has gene annotation.
+#' @noRd
+findJunctionsForAberrantGenes <- function(gr, aberrantGenes, pvals, dpsi,
+ aberrantJunctions, geneColumn="hgnc_symbol"){
+ dt <- data.table(idx=mcols(gr)$idx,
+ geneID=mcols(gr)[,geneColumn],
+ pval=pvals,
+ dpsi=abs(dpsi),
+ aberrant=aberrantJunctions)
+ dt[, dt_idx:=seq_len(.N)]
+ dt_tmp <- dt[, splitGenes(geneID), by="dt_idx"]
+ dt <- dt[dt_tmp$dt_idx,]
+ dt[,`:=`(geneID=dt_tmp$V1, dt_idx=NULL)]
+ dt <- dt[geneID %in% aberrantGenes,]
+ dt <- dt[!is.na(aberrant) & aberrant == TRUE,]
+
+ # sort per gene by lowest pvalue / highest deltaPsi and return index
+ dt <- dt[order(geneID, -aberrant, pval, -dpsi)]
+ dt <- dt[!duplicated(dt, by="geneID"),]
+
+ # remove gene-level significant result if no junction in that gene passed
+ # the filters
+ dt <- dt[!is.na(pval),]
+
+ junctionsToReport <- dt[,idx]
+ names(junctionsToReport) <- dt[,geneID]
+ junctionsToReport <- sort(junctionsToReport)
+ return(junctionsToReport)
+}
+
+collapseResTablePerGene <- function(res, geneColumn="hgncSymbol"){
+ if(length(res) == 0){
+ return(res)
+ }
+ if(!is.data.table(res)){
+ res <- as.data.table(res)
+ }
+
+ if(any(!c("pValue", "pValueGene", geneColumn) %in% colnames(res))){
+ stop("For collapsing per gene, the results table needs to contain ",
+ "the columns pValue, pValueGene and ", geneColumn, ".")
+ }
+
+ res <- res[order(res$pValueGene, res$pValue)]
+ naIdx <- is.na(res[, get(geneColumn)])
+ ansNoNA <- res[!is.na(res[, get(geneColumn)]),]
+
+ # get final result table
+ dupIdx <- duplicated(data.table(as.vector(ansNoNA[, get(geneColumn)]),
+ as.vector(ansNoNA$sampleID)))
+ ans <- res[!naIdx,][!dupIdx,]
+ return(ans)
+}
+
+#' Ignores NA in unique() if values other than NA are present
+#' @noRd
+uniqueIgnoreNA <- function(x){
+ uniq <- unique(x)
+ if(length(uniq) > 1) uniq <- uniq[!is.na(uniq)]
+ return(uniq)
+}
+
+#' Splits a string of gene names into a character vector
+#' @noRd
+splitGenes <- function(x, sep=";"){
+ return(unlist(strsplit(as.character(x), sep, fixed=TRUE)))
+}
+
+#' Caps a string of gene names to show at most maxLength gene names
+#' @noRd
+limitGeneNamesList <- function(gene_names, maxLength=3){
+ gene_names <- as.character(gene_names)
+ numFeatures <- unlist(lapply(gene_names, function(x) length(splitGenes(x))))
+ gene_names[numFeatures > maxLength] <-
+ unlist(lapply(gene_names[numFeatures > maxLength], function(x){
+ paste(c(splitGenes(x)[seq_len(maxLength)], "..."),
+ collapse=";")
+ } ))
+ return(gene_names)
+}
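
Quick usage sketch of the three gene-name helpers defined above:

    splitGenes("GENE_A;GENE_B;GENE_C")
    #> "GENE_A" "GENE_B" "GENE_C"

    uniqueIgnoreNA(c("GENE_A", NA, "GENE_A"))
    #> "GENE_A"

    limitGeneNamesList("GENE_A;GENE_B;GENE_C;GENE_D", maxLength=3)
    #> "GENE_A;GENE_B;GENE_C;..."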
+
checkForAndCreateDir <- function(object, dir){
verbose <- 0
if(is(object, "FraserDataSet")){
@@ -569,4 +695,3 @@ checkForAndCreateDir <- function(object, dir){
}
return(TRUE)
}
-
diff --git a/R/makeSimulatedDataset.R b/R/makeSimulatedDataset.R
index 6e29adf8..7a1fb6f9 100644
--- a/R/makeSimulatedDataset.R
+++ b/R/makeSimulatedDataset.R
@@ -435,9 +435,10 @@ makeSimulatedFraserDataSet_Multinomial <- function(m=200, j=1000, q=10,
#' @examples
#' # A generic dataset
#' fds <- makeSimulatedFraserDataSet()
+#' fds <- calculatePSIValues(fds)
#' fds <- injectOutliers(fds, minDpsi=0.2, freq=1E-3)
#' @export
-injectOutliers <- function(fds, type=c("psi5", "psi3", "theta"),
+injectOutliers <- function(fds, type=psiTypes,
freq=1E-3, minDpsi=0.2, minCoverage=2,
deltaDistr="uniformDistr", verbose=FALSE,
method=c('samplePSI', 'meanPSI', 'simulatedPSI'),
@@ -472,6 +473,9 @@ injectOutliers <- function(fds, type=c("psi5", "psi3", "theta"),
setAssayMatrix(fds, type="psi3", "originalOtherCounts",
withDimnames=FALSE) <-
counts(fds, type="psi3", side="other")
+ setAssayMatrix(fds, type="jaccard", "originalOtherCounts",
+ withDimnames=FALSE) <-
+ counts(fds, type="jaccard", side="other")
}
# get infos from the fds
@@ -500,7 +504,9 @@ injectOutliers <- function(fds, type=c("psi5", "psi3", "theta"),
dt[,groupSize:=.N, by=groupID]
# Get groups where outlier can be injected
- available_groups <- dt[groupSize > ifelse(type == "theta", 0, 1), unique(groupID)]
+ available_groups <- dt[groupSize > ifelse(type == "theta" |
+ type == "jaccard", 0, 1),
+ unique(groupID)]
# e.g. for psi3/5: no donor/acceptor
# groups with at least 2 junctions (e.g in simulationBB)
diff --git a/R/plotMethods.R b/R/plotMethods.R
index 6d824392..6f7a9a22 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -15,6 +15,10 @@
#' \item plotFilterExpression()
#' \item plotFilterVariability()
#' \item plotEncDimSearch()
+#' \item plotBamCoverage()
+#' \item plotBamCoverageFromResultTable()
+#' \item plotManhattan()
+#' \item plotSpliceMetricRank()
#' }
#'
#' For a detailed description of each plot function please see the details.
@@ -25,10 +29,10 @@
#' @param type The psi type: either psi5, psi3 or theta (for SE).
#' @param sampleID A sample ID which should be plotted. Can also be a vector.
#' Integers are treated as indices.
-#' @param idx,site A junction site ID or gene ID or one of both, which
+#' @param idx A junction site ID or a gene ID specifying what
#' should be plotted. Can also be a vector. Integers are treated
#' as indices.
-#' @param padjCutoff,zScoreCutoff,deltaPsiCutoff Significance, Z-score or delta
+#' @param padjCutoff,deltaPsiCutoff Significance or delta
#' psi cutoff to mark outliers
#' @param global Flag to plot a global Q-Q plot, default FALSE
#' @param normalized If TRUE, the normalized psi values are used, the default,
@@ -52,11 +56,61 @@
#' sample-sample correlation heatmap or \code{"junctionSample"}
#' for a junction-sample correlation heatmap.
#' @param onlyVariableIntrons Logical value indicating whether to show only
-#' introns that also pass the variability filter. Defaults to
-#' FALSE.
+#' introns that also pass the variability filter. Defaults to
+#' FALSE.
#' @param onlyExpressedIntrons Logical value indicating whether to show only
-#' introns that also pass the expression filter. Defaults to
-#' FALSE.
+#' introns that also pass the expression filter. Defaults to
+#' FALSE.
+#' @param gr A GRanges object indicating the genomic range that should be shown
+#' in \code{plotBamCoverage}.
+#' @param control_samples The sampleIDs of the samples used as control in
+#' \code{plotBamCoverage}.
+#' @param min_junction_count The minimal junction count across samples required
+#' for a junction to appear in the splicegraph and coverage tracks
+#' of \code{plotBamCoverage}.
+#' @param txdb A TxDb object giving the gene/transcript annotation to use.
+#' @param orgDb An OrgDb object giving the mapping of gene IDs and symbols.
+#' @param show_full_gene Should the full genomic range of the gene be shown in
+#' \code{plotBamCoverageFromResultTable} (default: FALSE)?
+#' If FALSE, only a certain region (see parameters left_extension
+#' and right_extension) around the outlier junction is shown.
+#' @param left_extension Indicates how far the plotted range around the outlier
+#' junction should be extended to the left in
+#' \code{plotBamCoverageFromResultTable}.
+#' @param right_extension Indicates how far the plotted range around the
+#' outlier junction should be extended to the right in
+#' \code{plotBamCoverageFromResultTable}.
+#' @param res_gene_col The column name in the given results table that
+#' contains the gene annotation.
+#' @param res_geneid_type The type of gene annotation in the results table in
+#' \code{res_gene_col} (e.g. SYMBOL or ENTREZID etc.). This
+#' information is needed for mapping between the results table and
+#' the provided annotation in the txdb object.
+#' @param txdb_geneid_type The type of gene_id present in \code{genes(txdb)}
+#' (e.g. ENTREZID). This information is needed for
+#' mapping between the results table and the provided annotation
+#' in the txdb object.
+#' @param highlight_range A \code{GenomicRanges} or \code{GenomicRangesList}
+#' object of ranges to be highlighted in the splicegraph of
+#' \code{plotBamCoverage}.
+#' @param highlight_range_color The color of highlighted ranges in
+#' the splicegraph of \code{plotBamCoverage}.
+#' @param toscale In \code{plotBamCoverage}, indicates which part of the
+#' plotted region should be drawn to scale. Possible values are
+#' 'exon' (exonic regions are drawn to scale),
+#' 'gene' (both exonic and intronic regions are drawn to scale) or
+#' 'none' (exonic and intronic regions have constant length)
+#' (see SGSeq package).
+#' @param splicegraph_labels Indicates the format of exon/splice junction
+#' labels in the splicegraph of \code{plotBamCoverage}.
+#' Possible values are 'genomic_range' (gives the start position
+#' of the first exon and the end position of the last exon that
+#' are shown), 'id' (format E1,... J1,...), 'name' (format
+#' type:chromosome:start-end:strand for each feature),
+#' 'none' for no labels (see SGSeq package).
+#' @param splicegraph_position The position of the splicegraph relative to the
+#' coverage tracks in \code{plotBamCoverage}. Possible values
+#' are 'top' (default) and 'bottom'.
#'
#### Graphical parameters
#' @param main Title for the plot, if missing a default title will be used.
@@ -89,6 +143,31 @@
#' @param bins Set the number of bins to be used in the histogram.
#' @param legend.position Set legend position (x and y coordinate), defaults to
#' the top right corner.
+#' @param color_annotated The color for exons and junctions present in
+#' the given annotation (in the splicegraph of
+#' \code{plotBamCoverage}).
+#' @param color_novel The color for novel exons and junctions not present in
+#' the given annotation (in the splicegraph of
+#' \code{plotBamCoverage}).
+#' @param color_sample_interest The color in \code{plotBamCoverage} for the
+#' sample of interest.
+#' @param color_control_samples The color in \code{plotBamCoverage} for the
+#' samples used as controls.
+#' @param curvature_splicegraph The curvature of the junction arcs in the
+#' splicegraph in \code{plotBamCoverage}. Decrease this value
+#' for flatter arcs and increase it for steeper arcs.
+#' @param curvature_coverage The curvature of the junction arcs in the
+#' coverage tracks of \code{plotBamCoverage}. Decrease this
+#' value for flatter arcs and increase it for steeper arcs.
+#' @param mar The margin of the plot area for \code{plotBamCoverage}
+#' (b,l,t,r).
+#' @param cex For controlling the size of text and numbers in
+#' \code{plotBamCoverage}.
+#' @param chr Vector of chromosome names to show in \code{plotManhattan}. The
+#' default is to show all chromosomes.
+#' @param value Indicates which assay is shown in the manhattan plot. Defaults
+#' to 'pvalue'. Other options are 'deltaPsi' and 'zScore'.
+#' @param chrColor Alternating colors used for adjacent chromosomes in
+#'              \code{plotManhattan}.
#'
#### Additional ... parameter
#' @param ... Additional parameters passed to plot() or plot_ly() if not stated
@@ -116,6 +195,9 @@
#'
#' \code{plotExpectedVsObservedPsi}: A scatter plot of the observed psi
#' against the predicted psi for a given site.
+#'
+#' \code{plotSpliceMetricRank}: This function plots for a given intron the
+#' observed values of the selected splice metric against the sample rank.
#'
#' \code{plotCountCorHeatmap}: The correlation heatmap of the count data either
#' of the full data set (i.e. sample-sample correlations) or of the top x most
@@ -137,6 +219,17 @@
#' It plots the encoding dimension against the achieved loss (area under the
#' precision-recall curve). From this plot the optimum should be choosen for
#' the \code{q} in fitting process.
+#'
+#' \code{plotManhattan}: A Manhattan plot showing the junction pvalues by
+#' genomic position. Useful to identify if outliers cluster by genomic position.
+#'
+#' \code{plotBamCoverage}: A sashimi plot showing the read coverage from
+#' the underlying bam files for a given genomic range and sampleIDs.
+#'
+#' \code{plotBamCoverageFromResultTable}: A sashimi plot showing the read
+#' coverage from the underlying bam files for a row in the results table. Can
+#' either show the full range of the gene with the outlier junction or only a
+#' certain region around the outlier.
#'
#' @return If base R graphics are used nothing is returned else the plotly or
#' the gplot object is returned.
@@ -145,42 +238,80 @@
#' @rdname plotFunctions
#' @aliases plotFunctions plotAberrantPerSample plotVolcano plotQQ
#' plotExpression plotCountCorHeatmap plotFilterExpression
-#' plotExpectedVsObservedPsi plotEncDimSearch
+#' plotExpectedVsObservedPsi plotEncDimSearch plotManhattan
+#' plotBamCoverage plotBamCoverageFromResultTable
#' @examples
#' # create full FRASER object
#' fds <- makeSimulatedFraserDataSet(m=40, j=200)
#' fds <- calculatePSIValues(fds)
#' fds <- filterExpressionAndVariability(fds, filter=FALSE)
-#' # this step should be done for all splicing metrics and more dimensions
-#' fds <- optimHyperParams(fds, "psi5", q_param=c(2,5,10,25))
+#' # this step should be done for more dimensions in practice
+#' fds <- optimHyperParams(fds, "jaccard", q_param=c(2,5,10,25))
#' fds <- FRASER(fds)
#'
#' # QC plotting
#' plotFilterExpression(fds)
#' plotFilterVariability(fds)
-#' plotCountCorHeatmap(fds, "theta")
-#' plotCountCorHeatmap(fds, "theta", normalized=TRUE)
-#' plotEncDimSearch(fds, type="psi5")
+#' plotCountCorHeatmap(fds, "jaccard")
+#' plotCountCorHeatmap(fds, "jaccard", normalized=TRUE)
+#' plotEncDimSearch(fds, type="jaccard")
#'
#' # extract results
#' plotAberrantPerSample(fds, aggregate=FALSE)
-#' plotVolcano(fds, "sample1", "psi5")
+#' plotVolcano(fds, "sample1", "jaccard")
#'
#' # dive into gene/sample level results
#' res <- results(fds)
#' res
#' plotExpression(fds, result=res[1])
#' plotQQ(fds, result=res[1])
-#' plotExpectedVsObservedPsi(fds, type="psi5", res=res[1])
+#' plotExpectedVsObservedPsi(fds, res=res[1])
+#'
+#' # create manhattan plot of pvalues by genomic position
+#' if(require(ggbio)){
+#' plotManhattan(fds, type="jaccard", sampleID="sample10")
+#' }
+#'
+#' # plot splice graph and coverage from bam files in a given region
+#' if(require(SGSeq)){
+#' fds <- createTestFraserSettings()
+#' gr <- GRanges(seqnames="chr19",
+#' IRanges(start=7587496, end=7598895),
+#' strand="+")
+#' plotBamCoverage(fds, gr=gr, sampleID="sample3",
+#' control_samples="sample2", min_junction_count=5,
+#' curvature_splicegraph=1, curvature_coverage=1,
+#' mar=c(1, 7, 0.1, 3))
+#'
+#' # plot coverage from bam file for a row in the result table
+#' fds <- createTestFraserDataSet()
+#' require(TxDb.Hsapiens.UCSC.hg19.knownGene)
+#' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
+#' require(org.Hs.eg.db)
+#' orgDb <- org.Hs.eg.db
+#'
+#' res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA)
+#' res_dt <- as.data.table(res)
+#' res_dt <- res_dt[sampleID == "sample2",]
+#'
+#' # plot full range of gene containing outlier junction
+#' plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=TRUE,
+#' txdb=txdb, orgDb=orgDb, control_samples="sample3")
+#'
+#' # plot only certain range around outlier junction
+#' plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE,
+#' control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb,
+#' curvature_coverage=0.5, right_extension=5000, left_extension=5000,
+#' splicegraph_labels="id")
+#' }
#'
-#'
NULL
plotVolcano.FRASER <- function(object, sampleID,
- type=c("psi3", "psi5", "theta"), basePlot=TRUE,
+ type=fitMetrics(object), basePlot=TRUE,
aggregate=FALSE, main=NULL, label=NULL,
- deltaPsiCutoff=0.3, padjCutoff=0.1, ...){
+ deltaPsiCutoff=0.1, padjCutoff=0.1, ...){
type <- match.arg(type)
@@ -213,7 +344,8 @@ plotVolcano.FRASER <- function(object, sampleID,
}
if(!is.na(padjCutoff)){
- if(dt[,any(padj <= padjCutoff)]){
+ if(dt[padj <= padjCutoff, .N] > 0){
padj_line <- min(dt[padj <= padjCutoff, -log10(pval)])
}
if(!"padj_line" %in% ls() || padj_line > 10 || is.na(padj_line)){
@@ -278,8 +410,8 @@ setMethod("plotVolcano", signature="FraserDataSet", plotVolcano.FRASER)
plotAberrantPerSample.FRASER <- function(object, main,
- type=c("psi3", "psi5", "theta"),
- padjCutoff=0.1, zScoreCutoff=NA, deltaPsiCutoff=0.3,
+ type=fitMetrics(object),
+ padjCutoff=0.1, deltaPsiCutoff=0.1,
aggregate=TRUE, BPPARAM=bpparam(), ...){
type <- match.arg(type, several.ok=TRUE)
@@ -293,7 +425,7 @@ plotAberrantPerSample.FRASER <- function(object, main,
# extract outliers
outliers <- bplapply(type, aberrant, object=object, by="sample",
- padjCutoff=padjCutoff, zScoreCutoff=zScoreCutoff,
+ padjCutoff=padjCutoff,
deltaPsiCutoff=deltaPsiCutoff, aggregate=aggregate, ...,
BPPARAM=BPPARAM)
dt2p <- rbindlist(lapply(seq_along(outliers), function(idx){
@@ -307,7 +439,7 @@ plotAberrantPerSample.FRASER <- function(object, main,
geom_line() +
geom_hline(aes(yintercept=median, color=type, lty="Median")) +
theme_bw() +
- theme_cowplot() +
+ theme_cowplot() + background_grid(major="xy", minor="xy") +
ggtitle(main) +
xlab("Sample rank") +
ylab("Number of outliers") +
@@ -316,7 +448,8 @@ plotAberrantPerSample.FRASER <- function(object, main,
scale_linetype_manual(name="", values=2, labels="Median")
if(!all(dt2p[,value] == 0)){
- g <- g + scale_y_log10()
+ g <- g + scale_y_log10(limits=c(1, max(unlist(outliers)))) +
+ annotation_logticks(sides="l")
}
g
@@ -341,17 +474,19 @@ setMethod("plotAberrantPerSample", signature="FraserDataSet",
#'
#' @rdname plotFunctions
#' @export
-plotExpression <- function(fds, type=c("psi5", "psi3", "theta"),
- site=NULL, result=NULL, colGroup=NULL,
+plotExpression <- function(fds, type=fitMetrics(fds),
+ idx=NULL, result=NULL, colGroup=NULL,
basePlot=TRUE, main=NULL, label="aberrant", ...){
if(!is.null(result)){
type <- as.character(result$type)
- site <- getIndexFromResultTable(fds, result)
+ idx <- getIndexFromResultTable(fds, result)
} else {
type <- match.arg(type)
}
- dt <- getPlottingDT(fds, axis="row", type=type, idx=site, ...)
+ dt <- getPlottingDT(fds, axis="row", type=type, idx=idx, ...)
+ dt[,featureID:=limitGeneNamesList(featureID, maxLength=3)]
+
if(!is.null(colGroup)){
if(all(colGroup %in% samples(fds))){
colGroup <- samples(fds) %in% colGroup
@@ -422,6 +557,116 @@ plotExpression <- function(fds, type=c("psi5", "psi3", "theta"),
plotBasePlot(g, basePlot)
}
+#'
+#' Junction splice metric plot
+#'
+#' Plots the observed values of the splice metric across samples for a
+#' junction of interest.
+#'
+#' @rdname plotFunctions
+#' @export
+plotSpliceMetricRank <- function(fds, type=fitMetrics(fds),
+ idx=NULL, result=NULL, colGroup=NULL,
+ basePlot=TRUE, main=NULL, label="aberrant", ...){
+ if(!is.null(result)){
+ type <- as.character(result$type)
+ idx <- getIndexFromResultTable(fds, result)
+ } else {
+ type <- match.arg(type)
+ }
+
+ dt <- getPlottingDT(fds, axis="row", type=type, idx=idx, ...)
+ dt[,featureID:=limitGeneNamesList(featureID, maxLength=3)]
+
+ # rank on observed value of splice metric of interest
+ dt[, rank := rank(obsPsi, ties.method="random", na.last=FALSE)]
+
+ if(!is.null(colGroup)){
+ if(all(colGroup %in% samples(fds))){
+ colGroup <- samples(fds) %in% colGroup
+ }
+ dt[colGroup,aberrant:=TRUE]
+ }
+ dt[,aberrant:=factor(aberrant, levels=c("TRUE", "FALSE"))]
+
+ gr <- granges(rowRanges(fds,type=type)[idx,])
+ genomic_pos_label <- paste0(seqnames(gr), ":", start(gr), "-", end(gr),
+ ":", strand(gr))
+
+ if(is.null(main)){
+ if(isTRUE(basePlot)){
+ # main <- as.expression(bquote(bold(paste(
+ # .(ggplotLabelPsi(type)[[1]]), " rank plot: ",
+ # .(genomic_pos_label),
+ # " (", bolditalic(.(as.character(dt[,unique(featureID)]))),
+ # ")"))))
+ main <- as.expression(bquote(bold(paste(
+ .(genomic_pos_label),
+ " (", bolditalic(.(as.character(dt[,unique(featureID)]))),
+ ")"))))
+ } else{
+ # main <- paste0(ggplotLabelPsi(type, asCharacter=TRUE)[[1]],
+ # " rank plot: ", dt[,unique(featureID)],
+ # " (site ", dt[,unique(idx)], ")")
+ main <- paste0(genomic_pos_label,
+ " (", dt[,unique(featureID)], ")")
+ }
+ }
+
+ if(isTRUE(basePlot)){
+ ylab <- bquote("Observed " ~ .(ggplotLabelPsi(type)[[1]]))
+ } else{
+ ylab <- paste("Observed", ggplotLabelPsi(type, asCharacter=TRUE)[[1]])
+ }
+
+ g <- ggplot(dt, aes(x=rank, y=obsPsi, color=aberrant, label=sampleID,
+ text=paste0(
+ "Sample: ", sampleID, "
",
+ "Counts (K): ", k, "
",
+ "Total counts (N): ", n, "
",
+ "p value: ", signif(pval, 5), "
",
+ "padjust: ", signif(padj, 5), "
",
+ "Observed Psi: ", round(obsPsi, 2), "
",
+ "Predicted mu: ", round(predPsi, 2), "
"))) +
+ geom_point(alpha=ifelse(as.character(dt$aberrant) == "TRUE", 1, 0.7)) +
+ theme_bw() +
+        theme(legend.position="none") +
+ xlab("Sample rank") +
+ ylab(ylab) +
+ ggtitle(main, subtitle=paste0("fds row index: ", dt[,unique(idx)])) +
+ ylim(0,1)
+
+
+ if(isTRUE(basePlot) && !is.null(label)){
+ if(isScalarCharacter(label) && label == "aberrant"){
+ if(nrow(dt[aberrant == TRUE,]) > 0){
+ g <- g + geom_text_repel(data=dt[aberrant == TRUE,],
+ aes(col=aberrant),
+ fontface='bold', hjust=-.2, vjust=-.2)
+ }
+ }
+ else{
+ if(nrow(dt[sampleID %in% label]) > 0){
+ g <- g + geom_text_repel(data=subset(dt, sampleID %in% label),
+ aes(col=aberrant),
+ fontface='bold', hjust=-.2, vjust=-.2)
+ }
+ if(any(!(label %in% dt[,sampleID]))){
+ warning("Did not find sample(s) ",
+ paste(label[!(label %in% dt[,sampleID])],
+ collapse=", "), " to label.")
+ }
+ }
+ }
+
+ if(is.null(colGroup)){
+ g <- g + scale_colour_manual(
+ values=c("FALSE"="gray70", "TRUE"="firebrick"))
+ }
+
+ plotBasePlot(g, basePlot)
+}
+
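plotSpliceMetricRank() is a new export that is not covered by the @examples block above; the following is only a hedged usage sketch, assuming an fds object fitted with the default jaccard metric and a results table `res` as created in those examples:

    # rank plot for the intron of the first result row
    plotSpliceMetricRank(fds, result=res[1])
    # or address an intron directly via its row index and splice metric
    plotSpliceMetricRank(fds, type="jaccard", idx=10)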
#'
#' Expected over Observed plot
@@ -431,7 +676,7 @@ plotExpression <- function(fds, type=c("psi5", "psi3", "theta"),
#'
#' @rdname plotFunctions
#' @export
-plotExpectedVsObservedPsi <- function(fds, type=c("psi5", "psi3", "theta"),
+plotExpectedVsObservedPsi <- function(fds, type=fitMetrics(fds),
idx=NULL, result=NULL, colGroup=NULL, main=NULL,
basePlot=TRUE, label="aberrant", ...){
type <- match.arg(type)
@@ -441,6 +686,7 @@ plotExpectedVsObservedPsi <- function(fds, type=c("psi5", "psi3", "theta"),
idx=idx, ...)
type <- as.character(unique(dt$type))
idx <- unique(dt$idx)
+ dt[,featureID:=limitGeneNamesList(featureID, maxLength=3)]
if(is.null(main)){
if(isTRUE(basePlot)){
@@ -484,6 +730,7 @@ plotExpectedVsObservedPsi <- function(fds, type=c("psi5", "psi3", "theta"),
geom_point(alpha=ifelse(dt$aberrant, 1, 0.5),
color=c("gray70", "firebrick")[dt$aberrant + 1]) +
geom_abline(intercept = 0, slope=1) +
+ xlim(c(0,1)) + ylim(c(0,1)) +
theme_bw() +
theme(legend.position="none") +
xlab(xlab) +
@@ -537,7 +784,7 @@ plotQQ.FRASER <- function(object, type=NULL, idx=NULL, result=NULL,
if(isTRUE(global)){
if(is.null(type)){
- type <- psiTypes
+ type <- fitMetrics(object)
}
dt <- rbindlist(bplapply(type, getPlottingDT, fds=object, axis="col",
idx=TRUE, aggregate=aggregate, Ncpus=Ncpus, ...))
@@ -562,6 +809,7 @@ plotQQ.FRASER <- function(object, type=NULL, idx=NULL, result=NULL,
} else {
type <- as.character(dt[,unique(type)])
featureID <- as.character(dt[,unique(featureID)])
+ featureID <- limitGeneNamesList(featureID, maxLength=3)
if(isTRUE(basePlot)){
main <- as.expression(bquote(bold(paste(
.(ggplotLabelPsi(type)[[1]]),
@@ -692,7 +940,7 @@ plotQQ.FRASER <- function(object, type=NULL, idx=NULL, result=NULL,
setMethod("plotQQ", signature="FraserDataSet", plotQQ.FRASER)
-plotEncDimSearch.FRASER <- function(object, type=c("psi3", "psi5", "theta"),
+plotEncDimSearch.FRASER <- function(object, type=psiTypes,
plotType=c("auc", "loss")){
type <- match.arg(type)
plotType <- match.arg(plotType)
@@ -717,6 +965,8 @@ plotEncDimSearch.FRASER <- function(object, type=c("psi3", "psi5", "theta"),
geom_smooth(method="loess", formula=y~x) +
geom_vline(data=data[isOptimalQ == TRUE,],
mapping=aes(xintercept=q, col=nsubset, linetype=noise)) +
+ geom_text(data=data[isOptimalQ == TRUE,],
+ aes(y=0.0, q+1, label=q)) +
ggtitle(as.expression(bquote(bold(paste(
"Q estimation for ", .(ggplotLabelPsi(type)[[1]])))))) +
xlab("Estimated q") +
@@ -860,8 +1110,8 @@ plotFilterVariability <- function(fds, bins=200, legend.position=c(0.8, 0.8),
plotCountCorHeatmap.FRASER <- function(object,
- type=c("psi5", "psi3", "theta"), logit=FALSE, topN=50000,
- topJ=5000, minMedian=1, minCount=10,
+ type=psiTypes, logit=FALSE,
+ topN=50000, topJ=5000, minMedian=1, minCount=10,
main=NULL, normalized=FALSE, show_rownames=FALSE,
show_colnames=FALSE, minDeltaPsi=0.1, annotation_col=NA,
annotation_row=NA, border_color=NA, nClust=5,
@@ -903,7 +1153,7 @@ plotCountCorHeatmap.FRASER <- function(object,
object <- object[,ids2plot]
}
- xmat <- (skmat + 1)/(snmat + 2)
+ xmat <- (skmat + 1*pseudocount())/(snmat + 2*pseudocount())
if(isTRUE(logit)){
xmat <- qlogisWithCap(xmat)
}
@@ -1056,6 +1306,355 @@ plotCountCorHeatmap.FRASER <- function(object,
setMethod("plotCountCorHeatmap", signature="FraserDataSet",
plotCountCorHeatmap.FRASER)
+#'
+#' Plot coverage from bam files for given genomic range and sample ids
+#'
+#' @rdname plotFunctions
+#' @export
+plotBamCoverage <- function(fds, gr, sampleID,
+ control_samples=sample(
+ samples(fds[, which(samples(fds) != sampleID)]),
+ min(3, ncol(fds)-length(sampleID))),
+ txdb=NULL, min_junction_count=20,
+ highlight_range=NULL, highlight_range_color="firebrick",
+ color_annotated="gray", color_novel="goldenrod3",
+ color_sample_interest="firebrick", color_control_samples="dodgerblue4",
+ toscale=c("exon", "gene", "none"), mar=c(2, 10, 0.1, 5),
+ curvature_splicegraph=1, curvature_coverage=1, cex=1,
+ splicegraph_labels=c("genomic_range", "id", "name", "none"),
+ splicegraph_position=c("top", "bottom"), ...){
+
+ if(missing(fds)){
+ stop("Missing input: fds (FraserDataSet object)")
+ } else{
+ stopifnot(is(fds, "FraserDataSet"))
+ }
+ if(missing(gr)){
+ stop("Missing input gr (genomic range to plot).")
+ } else{
+ stopifnot(is(gr, "GenomicRanges"))
+ stopifnot(length(gr) > 0)
+ }
+ if(missing(sampleID)){
+ stop("Missing input: sample_of_interest")
+ }
+ toscale <- match.arg(toscale)
+ splicegraph_labels <- match.arg(splicegraph_labels)
+ splicegraph_position <- match.arg(splicegraph_position)
+
+ # extract bam info for sample ids to plot
+ all_sids <- c(sampleID, control_samples)
+ si_out <- getSGSeqSI(fds, all_sids)
+ sgseq_si <- si_out[[1]]
+ fds <- si_out[[2]]
+
+ # collapse input ranges if several
+ if(any(strand(gr) == "*")){
+        # SGSeq errors on * strand, so guess the strand instead
+ if(all(strand(gr) == "*")){
+ guessStrand <- "+"
+ } else{
+ guessStrand <- strand(gr[strand(gr) != "*"])[1]
+ }
+ strand(gr)[strand(gr) == "*"] <- guessStrand
+ warning("Input genomic ranges contained unstranded ranges.\n",
+ "This function needs strand information, guessing strand to ",
+ "be ", guessStrand, ".")
+ }
+ if(!all(strand(gr) == strand(gr[1,]))){
+ warning("Input genomic ranges contained ranges on different strands,\n",
+ "only showing coverage for the ", strand(gr[1,]), " strand.")
+ strand(gr) <- rep(strand(gr[1,]), length(gr))
+ }
+ gr <- range(gr)
+ gr <- keepSeqlevels(gr, unique(as.character(seqnames(gr))))
+
+ # convert highlight_range to GRangesList if not
+ if(!is.null(highlight_range) && !is(highlight_range, "GRangesList")){
+ stopifnot(is(highlight_range, "GRanges"))
+ highlight_range <- GRangesList(highlight_range)
+ }
+
+ # extract splice graph
+ sgfc_pred <- SGSeq::analyzeFeatures(sgseq_si, which = gr,
+ min_junction_count=min_junction_count, psi=0,
+ ...)
+
+ # overlap detected junctions with annotation
+ if(!is.null(txdb)){
+ # subset to chr of interest
+ seqlevels(txdb) <- unique(as.character(seqnames(gr)))
+
+ # extract transcript features with SGSeq package
+ txf <- SGSeq::convertToTxFeatures(txdb)
+ txf <- txf[txf %over% gr]
+
+ # restore seqlevels of txdb object
+ seqlevels(txdb) <- seqlevels0(txdb)
+
+ # annotate splice junctions with annotation features
+ sgfc_pred <- SGSeq::annotate(sgfc_pred, txf)
+ } else{
+ # when no annotation is given, show everything in the same color
+ color_novel <- color_annotated
+ }
+
+ # get genomic positions for first and last exon in given range
+ if(splicegraph_labels == "genomic_range"){
+ # tell plotSpliceGraph function to use custom labels
+ splicegraph_labels <- "label"
+ # create custom labels (only for first and last exon for readability)
+ mcols(sgfc_pred)$label <- ""
+ exons <- which(SGSeq::type(sgfc_pred) == "E" &
+ rowRanges(sgfc_pred) %over% gr)
+ exons <- unique(c(exons[1], tail(exons, n=1)))
+ if(length(exons) == 1){
+ mcols(sgfc_pred)$label[exons] <-
+ paste(seqnames(sgfc_pred),
+ paste(start(sgfc_pred), end(sgfc_pred), sep="-"),
+ strand(sgfc_pred), sep=":")[exons]
+ }
+ if(length(exons) == 2){
+ mcols(sgfc_pred)$label[exons[1]] <-
+ paste(seqnames(sgfc_pred),
+ start(sgfc_pred),
+ strand(sgfc_pred), sep=":")[exons[1]]
+ mcols(sgfc_pred)$label[exons[2]] <-
+ paste(seqnames(sgfc_pred),
+ end(sgfc_pred),
+ strand(sgfc_pred), sep=":")[exons[2]]
+ }
+ }
+
+ # plot splice graph and coverage of junctions from bam
+ nr_sa2p <- length(all_sids)
+ par(mfrow = c(nr_sa2p+1, 1), mar=mar, cex=cex)
+ if(splicegraph_position == "top"){
+ SGSeq::plotSpliceGraph(rowRanges(sgfc_pred),
+ which=gr,
+ toscale=toscale,
+ color=color_annotated,
+ color_novel=color_novel,
+ ypos=c(0.25, 0.1),
+ ranges=highlight_range,
+ ranges_color=highlight_range_color,
+ ranges_ypos=c(0.01, 0.02),
+ curvature=curvature_splicegraph,
+ label=splicegraph_labels)
+ }
+ for (j in seq_along(sampleID)) {
+ SGSeq::plotCoverage(
+ sgfc_pred[, which(colnames(sgfc_pred) == sampleID[j])],
+ which = gr,
+ toscale = toscale,
+ label=sampleID[j],
+ color=color_sample_interest,
+ curvature=curvature_coverage)
+ }
+ for (j in seq_along(control_samples)) {
+ SGSeq::plotCoverage(
+ sgfc_pred[, which(colnames(sgfc_pred) == control_samples[j])],
+ which = gr,
+ toscale = toscale,
+ label=control_samples[j],
+ color=color_control_samples,
+ curvature=curvature_coverage)
+ }
+ if(splicegraph_position == "bottom"){
+ SGSeq::plotSpliceGraph(rowRanges(sgfc_pred),
+ which=gr,
+ toscale=toscale,
+ color_novel=color_novel,
+ ypos=c(0.25, 0.1),
+ ranges=highlight_range,
+ ranges_color=highlight_range_color,
+ ranges_ypos=c(0.01, 0.02),
+ curvature=curvature_splicegraph,
+ label=splicegraph_labels)
+ }
+
+ return(invisible(fds))
+}
+
+#'
+#' Plot coverage from bam files for given row of results table
+#'
+#' @rdname plotFunctions
+#' @export
+plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE,
+ txdb=NULL, orgDb=NULL, res_gene_col="hgncSymbol",
+ res_geneid_type="SYMBOL", txdb_geneid_type="ENTREZID",
+ left_extension=1000, right_extension=1000, ...){
+ stopifnot(is(fds, "FraserDataSet"))
+
+ if(is(result, "GenomicRanges")){
+ result <- as.data.table(result)
+ }
+
+ stopifnot(is.data.table(result))
+ stopifnot(result[,.N] == 1)
+
+ sid <- result[,sampleID]
+ jidx <- getIndexFromResultTable(fds, result)
+ outlier_range <- rowRanges(fds, type=result[,type])[jidx,]
+
+    # showing either the full range of the gene in which the outlier occurred
+ if(show_full_gene == TRUE){
+ if(missing(txdb)){
+ stop("Missing input: txdb (for extracting gene range)")
+ }
+ if(missing(orgDb)){
+ stop("Missing input: orgDb (for mapping of IDs to txdb)")
+ }
+ result_gene <- result[,get(res_gene_col)]
+ result_gene <- strsplit(result_gene, ";", fixed=TRUE)[[1]]
+ if(is.data.table(orgDb)){
+ tmp <- merge(x=as.data.table(genes(txdb))[,.(gene_id)], y=orgDb,
+ by.y=txdb_geneid_type, by.x="gene_id", all.x=TRUE,
+ sort=FALSE)[,.(gene_id, feature=get(res_geneid_type))]
+ setnames(tmp, "feature", res_geneid_type)
+ txdb_geneid <- tmp[get(res_geneid_type) %in% result_gene, gene_id]
+ } else {
+ tmp <- as.data.table(
+ select(orgDb,
+ keys=result_gene,
+ columns=txdb_geneid_type,
+ keytype=res_geneid_type)
+ )
+ txdb_geneid <- tmp[, get(txdb_geneid_type)]
+ }
+ gr <- genes(txdb, filter=list("gene_id"=txdb_geneid))
+ if(length(gr) == 0){
+ stop("Could not extract genomic coordinates for input gene.")
+ }
+ } else{
+ # or just showing a certain region around the outlier junction
+ gr <- outlier_range
+ start(gr) <- start(gr) - left_extension
+ end(gr) <- end(gr) + right_extension
+ }
+
+ # if several genes overlap, only show those on same strand as outlier
+ if(as.character(strand(outlier_range)) != "*" &
+ length(gr[strand(gr) == strand(outlier_range),]) > 0){
+ gr <- gr[strand(gr) == strand(outlier_range),]
+ }
+
+ # create the coverage plot for the given outlier
+ fds <- plotBamCoverage(fds,
+ gr=gr,
+ sampleID=sid,
+ txdb=txdb,
+ highlight_range=outlier_range,
+ ...)
+ return(invisible(fds))
+}
+
+plotManhattan.FRASER <- function(object, sampleID, value="pvalue",
+ type=fitMetrics(object), chr=NULL,
+ main=paste0("sample: ", sampleID),
+ chrColor=c("black", "darkgrey"),
+ ...){
+ # check necessary packages
+ if (!requireNamespace('ggbio')){
+ stop("For this function, the ggbio package is required.")
+ }
+ if (!requireNamespace('biovizBase')){
+ stop("For this function, the biovizBase package is required.")
+ }
+
+ # check arguments
+ stopifnot(is(object, "FraserDataSet"))
+ stopifnot(sampleID %in% samples(object))
+ type <- match.arg(type)
+ additional_args <- list(...)
+ padjCutoff <- 0.1
+ if("padjCutoff" %in% names(additional_args)){
+ padjCutoff <- additional_args$padjCutoff
+ }
+ deltaPsiCutoff <- ifelse(type == "jaccard", 0.1, 0.3)
+ if("deltaPsiCutoff" %in% names(additional_args)){
+ deltaPsiCutoff <- additional_args$deltaPsiCutoff
+ }
+
+    # extract necessary information
+ gr_sample <- rowRanges(object, type=type)
+ seqlevelsStyle(gr_sample) <- seqlevelsStyle(object)
+ mcols(gr_sample)[,"pvalue"] <- -log10(
+ pVals(object, type=type, level="junction")[,sampleID])
+ mcols(gr_sample)[,"padjust"] <- -log10(
+ padjVals(object, type=type, level="site")[,sampleID])
+ mcols(gr_sample)[,"delta"] <- deltaPsiValue(object, type=type)[,sampleID]
+
+ # Add values to granges
+ if(value %in% c('pvalue', 'pValue', 'pv')){
+ gr_sample$value <- mcols(gr_sample)[, "pvalue"]
+ ylabel <- expression(paste(-log[10], "(P-value)"))
+ }
+ if(value %in% c('zscore', 'zScore')){
+ gr_sample$value <- zScores(object, type=type)[, sampleID]
+ ylabel <- value
+ }
+ if(value %in% c('delta', 'deltaPsi', 'deltaJaccard')){
+ gr_sample$value <- mcols(gr_sample)[, "delta"]
+ ylabel <- bquote(Delta ~ .(ggplotLabelPsi(type)[[1]]))
+ }
+
+ # only one point per donor/acceptor site (relevant only for psi5 and psi3)
+ index <- getSiteIndex(object, type=type)
+ nonDup <- !duplicated(index)
+ gr_sample <- gr_sample[nonDup,]
+
+ # Sort granges for plot
+ gr_sample <- sortSeqlevels(gr_sample)
+ gr_sample <- sort(gr_sample)
+
+ # subset to chromosomes in chrSubset if requested
+ if(!is.null(chr)){
+ # check input
+ if(!all(chr %in% unique(seqnames(gr_sample)))){
+ stop("Not all chromosomes selected for subsetting are present ",
+ "in the GRanges object.")
+ }
+
+ # subset
+ gr_sample <- gr_sample[as.character(seqnames(gr_sample)) %in% chr]
+
+ # add chr to plot title if only one chr given
+ if(length(chr) == 1){
+ main <- paste0(main, "; ",
+ paste(chr, collapse=", ", sep=""))
+ }
+ }
+
+ # find outlier indices
+ if(!type %in% c("psi3", "psi5")){
+ outlier_idx <- which(gr_sample$padjust >= -log10(padjCutoff) &
+ abs(gr_sample$delta) >= deltaPsiCutoff)
+ } else{
+ outlier_idx <- which(gr_sample$padjust >= -log10(padjCutoff))
+ }
+ message("highlighting ", length(gr_sample[outlier_idx,]), " outliers ...")
+
+ # plot manhattan plot
+ plotGrandLinear.adapted(gr_sample, aes(y=value),
+ color=chrColor,
+ highlight.gr=gr_sample[outlier_idx,],
+ highlight.overlap="equal",
+ use.genome.coords=is.null(chr)) +
+ labs(title=main, x="", y=ylabel)
+
+}
+
+#'
+#' Plot manhattan plot of junction pvalues
+#'
+#' @rdname plotFunctions
+#' @export
+setMethod("plotManhattan", signature="FraserDataSet",
+ plotManhattan.FRASER)
+
+
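A hedged sketch of the chr and value arguments documented above (requires the ggbio package; the chromosome name is a placeholder and must be present in the fds):

    # Manhattan plot of delta values, restricted to a single chromosome
    plotManhattan(fds, sampleID="sample10", type="jaccard",
                  value="deltaPsi", chr="chr19")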
#'
#' helper function to get the annotation as data frame from the col data object
#'
@@ -1099,6 +1698,7 @@ ggplotLabelPsi <- function(type, asCharacter=FALSE){
if(isFALSE(asCharacter)){
vapply(type, FUN=function(x)
switch (x,
+ jaccard = c(bquote(Intron~Jaccard~Index)),
psi5 = c(bquote(psi[5])),
psi3 = c(bquote(psi[3])),
theta = c(bquote(theta))),
@@ -1106,9 +1706,180 @@ ggplotLabelPsi <- function(type, asCharacter=FALSE){
} else{
vapply(type, FUN=function(x)
switch (x,
+ jaccard = "Intron-Jaccard-Index",
psi5 = "psi[5]",
psi3 = "psi[3]",
theta = "theta"),
FUN.VALUE=character(1))
}
}
+
+#'
+#' Extract info from bam files needed for SGSeq functions to work
+#'
+#' @noRd
+getSGSeqSI <- function(fds, sample_ids){
+
+ # check if bam info is already stored in fds for given samples
+ if("SGSeq_sampleinfo" %in% names(metadata(fds))){
+ si <- metadata(fds)[["SGSeq_sampleinfo"]]
+ si <- si[si$sample_name %in% sample_ids,]
+ if(nrow(si) != length(sample_ids)){
+ # add bam info for missing sample_ids
+ missing_ids <- sample_ids[!sample_ids %in% si$sample_name]
+ message("Extracting SGSeq sample info from BAM files for samples ",
+ paste(missing_ids, collapse=", "), " ...")
+ df_missing <- data.frame(
+ sample_name=samples(fds)[samples(fds) %in% missing_ids],
+ file_bam=bamFile(fds)[samples(fds) %in% missing_ids])
+ si_new <- SGSeq::getBamInfo(df_missing, yieldSize=1e6)
+ si_new$lib_size <- 50e6 # dummy value to speed up this part
+ si <- rbind(si, si_new)
+ metadata(fds)[["SGSeq_sampleinfo"]] <-
+ rbind(metadata(fds)[["SGSeq_sampleinfo"]], si_new)
+ }
+ return(list(si, fds))
+ } else{
+ message("Extracting SGSeq sample info from BAM files for samples ",
+ paste(sample_ids, collapse=", "), " ...")
+ df <- data.frame(
+ sample_name=samples(fds)[samples(fds) %in% sample_ids],
+ file_bam=bamFile(fds)[samples(fds) %in% sample_ids])
+ si <- SGSeq::getBamInfo(df, yieldSize=1e6)
+ si$lib_size <- 50e6 # dummy value to speed up this part
+ metadata(fds)[["SGSeq_sampleinfo"]] <- si
+ return(list(si, fds))
+ }
+}
+
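getSGSeqSI() caches the extracted BAM info in metadata(fds), and plotBamCoverage() returns the updated object invisibly. Reassigning the result therefore avoids re-scanning BAM files in later calls; a small sketch, assuming the gr object and sample IDs from the @examples block above:

    fds <- plotBamCoverage(fds, gr=gr, sampleID="sample3",
                           control_samples="sample2")
    # cached info in metadata(fds)$SGSeq_sampleinfo is reused; only the newly
    # requested control sample is scanned in this second call
    fds <- plotBamCoverage(fds, gr=gr, sampleID="sample3",
                           control_samples=c("sample1", "sample2"))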
+#'
+#' Adapted function from ggbio package to create manhattan plot.
+#' Adapted to allow highlighting only ranges that exactly match. Uses functions
+#' from package biovizBase.
+#'
+#' @noRd
+plotGrandLinear.adapted <- function (obj, ..., facets, space.skip = 0.01,
+ geom = NULL, cutoff = NULL, cutoff.color = "red", cutoff.size = 1,
+ legend = FALSE, xlim, ylim, xlab, ylab, main, highlight.gr = NULL,
+ highlight.name = NULL, highlight.col = "red", highlight.label = TRUE,
+ highlight.label.size = 5, highlight.label.offset = 0.05,
+ highlight.label.col = "black",
+ highlight.overlap = c("any", "start", "end", "within", "equal"),
+ spaceline = FALSE, use.genome.coords=TRUE){
+ if (is.null(geom))
+ geom <- "point"
+ args <- list(...)
+ args.aes <- biovizBase::parseArgsForAes(args)
+ args.non <- biovizBase::parseArgsForNonAes(args)
+ two.color <- c("#0080FF", "#4CC4FF")
+ .is.seq <- FALSE
+ if (!"colour" %in% names(args.aes)) {
+ if (!any(c("color", "colour") %in% names(args.non))) {
+ .color <- two.color
+ args.aes$color <- as.name("seqnames")
+ .is.seq <- TRUE
+ }
+ else {
+ if (length(args.non$color) > 1) {
+ .color <- args.non$color
+ args.aes$color <- as.name("seqnames")
+ .is.seq <- TRUE
+ args.non <- args.non[!names(args.non) %in% c("colour",
+ "color")]
+ }
+ }
+ }
+ else {
+ if (quo_name(args.aes$colour) == "seqnames")
+ args.aes$colour <- as.name("seqnames")
+ }
+ if (!"y" %in% names(args.aes))
+ stop("need to provide y")
+ if(isTRUE(use.genome.coords)){
+ args.non$coord <- "genome"
+ }
+ args.non$space.skip <- space.skip
+ args.non$geom <- geom
+ args.non$object <- obj
+ aes.res <- do.call(aes, args.aes)
+ p <- do.call(ggbio::autoplot, c(list(aes.res), args.non))
+ if (!legend)
+ p <- p + theme(legend.position = "none")
+ if (!missing(ylab))
+ p <- p + ylab(ylab)
+ if (!is.null(cutoff))
+ p <- p + geom_hline(yintercept = cutoff, color = cutoff.color,
+ size = cutoff.size)
+ chrs <- names(seqlengths(obj))
+ if (.is.seq) {
+ N <- length(chrs)
+ cols <- rep(.color, round(N/length(.color)) + 1)[1:N]
+ names(cols) <- chrs
+ p <- p + scale_color_manual(values = cols)
+ }
+ if (!missing(facets)) {
+ args$facets <- facets
+ args.facets <- biovizBase::subsetArgsByFormals(args, facet_grid,
+ facet_wrap)
+ facet <- ggbio:::.buildFacetsFromArgs(obj, args.facets)
+ p <- p + facet
+ }
+ p <- p + theme(panel.grid.minor = element_blank())
+ if (!is.null(highlight.gr)) {
+ highlight.overlap <- match.arg(highlight.overlap)
+ idx <- findOverlaps(obj, highlight.gr, type=highlight.overlap)
+ .h.pos <- lapply(split(queryHits(idx), subjectHits(idx)),
+ function(id) {
+ gr <- GRanges(as.character(seqnames(p@data))[id][1],
+ IRanges(start = min(start(p@data[id])), end = max(end(p@data[id]))))
+ val <- max(as.numeric(values(p@data[id])[, quo_name(args.aes$y)]))
+ val <- val * (1 + highlight.label.offset)
+ values(gr)$val <- val
+ gr
+ })
+ .h.pos <- suppressWarnings(do.call("c", unname(.h.pos)))
+ if (length(.h.pos)) {
+ if (is.null(highlight.name)) {
+ highlight.name <- names(highlight.gr)
+ }
+ else {
+ highlight.name <- values(highlight.gr)[, highlight.name]
+ }
+ p <- p + geom_point(data = biovizBase::mold(p@data[queryHits(idx)]),
+ do.call(aes, list(x = substitute(midpoint), y = args.aes$y)),
+ color = highlight.col)
+ if (!is.null(highlight.name)) {
+ seqlevels(.h.pos, pruning.mode = "coarse") <- seqlevels(obj)
+ suppressWarnings(seqinfo(.h.pos) <- seqinfo(obj))
+ .trans <- biovizBase::transformToGenome(.h.pos, space.skip = space.skip)
+ values(.trans)$mean <- (start(.trans) + end(.trans))/2
+ values(.trans)$names <- highlight.name
+ p <- p + geom_text(data = biovizBase::mold(.trans),
+ size = highlight.label.size,
+ vjust = 0, color = highlight.label.col, do.call(aes,
+ list(x = substitute(mean), y = as.name("val"),
+ label = as.name("names"))))
+ }
+ }
+ }
+ if (spaceline) {
+ vline.df <- p@ggplot$data
+ vline.df <- do.call(rbind, by(vline.df, vline.df$seqnames,
+ function(dd) {
+ data.frame(start = min(dd$start), end = max(dd$end))
+ }))
+ gap <- (vline.df$start[-1] + vline.df$end[-nrow(vline.df)])/2
+ p <- p + geom_vline(xintercept = gap, alpha = 0.5, color = "gray70") +
+ theme(panel.grid = element_blank())
+ }
+ if (!missing(main))
+ p <- p + labs(title = main)
+ if (!missing(xlim))
+ p <- p + xlim(xlim)
+ if (!missing(ylim))
+ p <- p + ylim(ylim)
+ if (missing(xlab))
+ xlab <- ""
+ p <- p + ggplot2::xlab(xlab)
+ p
+}
diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R
index deac8b59..be256009 100644
--- a/R/pvalsNzscore.R
+++ b/R/pvalsNzscore.R
@@ -33,7 +33,7 @@ calculateZscore <- function(fds, type=currentType(fds), logit=TRUE){
#' @describeIn FRASER This function calculates two-sided p-values based on
#' the beta-binomial distribution (or binomial or normal if desired). The
-#' returned p values are already adjusted with Holm's method per donor or
+#' returned p values are not yet adjusted with Holm's method per donor or
#' acceptor site, respectively.
#'
#' @param distributions The distribution based on which the p-values are
@@ -109,11 +109,6 @@ calculatePvalues <- function(fds, type=currentType(fds),
pvals <- 2 * pmin(pval, 1 - pval + dval, 0.5)
pVals(fds, dist="BetaBinomial", level="junction",
withDimnames=FALSE) <- pvals
- fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues,
- pvals=pvals, index, BPPARAM=BPPARAM)
- fwer_pvals <- do.call(cbind, fwer_pval)
- pVals(fds, dist="BetaBinomial", level="site",
- withDimnames=FALSE) <- fwer_pvals
}
if("binomial" %in% distributions){
@@ -125,11 +120,6 @@ calculatePvalues <- function(fds, type=currentType(fds),
pvals <- 2 * pmin(pval, 1 - pval + dval, 0.5)
pVals(fds, dist="Binomial", level="junction",
withDimnames=FALSE) <- pvals
- fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues,
- pvals=pvals, index, BPPARAM=BPPARAM)
- fwer_pvals <- do.call(cbind, fwer_pval)
- pVals(fds, dist="Binomial", level="site",
- withDimnames=FALSE) <- fwer_pvals
}
if("normal" %in% distributions){
@@ -142,22 +132,57 @@ calculatePvalues <- function(fds, type=currentType(fds),
pvals <- 2 * pmin(pval, 1 - pval, 0.5)
pVals(fds, dist="Normal", level="junction",
withDimnames=FALSE) <- pvals
- fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues,
- pvals=pvals, index, BPPARAM=BPPARAM)
- fwer_pvals <- do.call(cbind, fwer_pval)
- pVals(fds, dist="Normal", level="site",
- withDimnames=FALSE) <- fwer_pvals
}
fds
}
-adjust_FWER_PValues <- function(i, pvals=pvals, index=index){
- dt <- data.table(p=pvals[,i], idx=index)
- dt2 <- dt[,.(pa=min(p.adjust(p, method="holm"), na.rm=TRUE)),by=idx]
+adjust_FWER_PValues <- function(i, pvals, index, rho, rhoCutoff,
+ method="holm"){
+ dt <- data.table(p=pvals[,i], idx=index, rho=rho)
+ dt[rho > rhoCutoff, p:=NA]
+ suppressWarnings(dt2 <- dt[,.(pa=min(p.adjust(p, method=method),
+ na.rm=TRUE)),by=idx])
+ dt2[is.infinite(pa), pa:=NA]
setkey(dt2, "idx")[J(index)][,pa]
}
+adjust_FWER_PValues_per_idx <- function(i, pvals, index, rho, rhoCutoff,
+ method="holm"){
+ pvals[rho > rhoCutoff,] <- NA
+ dttmp <- data.table(idx=index, rho=rho,
+ apply(pvals, 2, as.numeric))[idx == i,]
+ suppressWarnings(
+ pa <- apply(as.matrix(dttmp[,-c("idx", "rho")]), 2,
+ function(x) min(p.adjust(x, method=method),
+ na.rm = TRUE) )
+ )
+ pa[is.infinite(pa)] <- NA
+ return(pa)
+}
+
+getFWERpvals_bySample <- function(pvals, index, rho, method="holm",
+ rhoCutoff, BPPARAM=bpparam()){
+ fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues,
+ pvals=pvals, index, BPPARAM=BPPARAM,
+ method=method, rho=rho, rhoCutoff=rhoCutoff)
+ fwer_pvals <- do.call(cbind, fwer_pval)
+ return(fwer_pvals)
+}
+
+getFWERpvals_byIdx <- function(pvals, index, rho, method="holm",
+ rhoCutoff, BPPARAM=bpparam()){
+ unique_idx <- unique(index)
+ fwer_pval <- bplapply(unique_idx, adjust_FWER_PValues_per_idx,
+ pvals=pvals, index, BPPARAM=BPPARAM,
+ method=method, rho=rho, rhoCutoff=rhoCutoff)
+ fwer_pvals <- do.call(rbind, fwer_pval)
+ fwer_pvals <- as.matrix(
+ setkey(data.table(idx=unique_idx, fwer_pvals),
+ "idx")[J(index)][,-c("idx")])
+ return(fwer_pvals)
+}
+
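The rho-aware FWER step implemented by adjust_FWER_PValues() above is easiest to see on toy numbers: junctions sharing a site index are Holm-adjusted together, the site keeps the minimum adjusted p value, and junctions whose fitted rho exceeds the cutoff are masked beforehand. An illustrative sketch for one sample (all values are made up):

    library(data.table)
    p     <- c(0.01, 0.04, 0.20, 0.50)   # junction-level p values of one sample
    index <- c(1L,   1L,   2L,   2L)     # donor/acceptor site index per junction
    rho   <- c(0.05, 0.05, 0.05, 0.90)   # fitted overdispersion per junction

    dt <- data.table(p=p, idx=index, rho=rho)
    dt[rho > 0.1, p := NA_real_]          # mask high-rho junctions
    dt[, .(pa=min(p.adjust(p, method="holm"), na.rm=TRUE)), by=idx]
    #    idx   pa
    # 1:   1 0.02   (Holm on 0.01, 0.04 -> 0.02, 0.04; site keeps the minimum)
    # 2:   2 0.20   (only 0.20 remains after masking the rho=0.90 junction)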
singlePvalueBetaBinomial <- function(idx, k, n, mu, rho){
ki <- k[idx,]
@@ -171,7 +196,7 @@ singlePvalueBetaBinomial <- function(idx, k, n, mu, rho){
pvals <- pmin(1, pbbinom(ki, ni, alphai, betai))
if(any(is.na(pvals))){
- message(date(), " : ", idx)
+ message(date(), ": obtained NA pvalues for junction ", idx)
}
return (pvals)
@@ -188,37 +213,159 @@ singlePvalueBinomial <- function(idx, k, n, mu){
}
#' @describeIn FRASER This function adjusts the previously calculated
-#' p-values per sample for multiple testing.
+#' p-values per sample for multiple testing. First, the previously calculated
+#' junction-level p values are adjusted with Holm's method per donor or
+#' acceptor site, respectively. Then, if gene symbols have been annotated to
+#' junctions (and unless disabled via \code{geneLevel=FALSE}), gene-level
+#' p values are computed.
#'
-#' @param method The p.adjust method that should be used.
+#' @param method The p.adjust method that should be used for genome-wide
+#' multiple testing correction.
+#' @param rhoCutoff The cutoff value on the fitted rho value
+#' (overdispersion parameter of the betabinomial) above which junctions are
+#' masked with NA during p value adjustment (default: NA, no masking).
+#' @param geneLevel Logical value indicating whether gene-level p values
+#' should be calculated. Defaults to TRUE.
+#' @param geneColumn The column name of the column that has the gene annotation
+#' that will be used for gene-level pvalue computation.
+#' @param subsets A named list of named lists specifying any number of gene
+#' subsets (can differ per sample). For each subset, FDR correction
+#' will be limited to genes in the subset, and the FDR corrected
+#' pvalues stored as an assay in the fds object in addition to the
+#' transcriptome-wide FDR corrected pvalues. See the examples for
+#' how to use this argument.
#'
#' @export
-calculatePadjValues <- function(fds, type=currentType(fds), method="BY"){
+calculatePadjValues <- function(fds, type=currentType(fds), method="BY",
+ rhoCutoff=NA, geneLevel=TRUE,
+ geneColumn="hgnc_symbol", subsets=NULL,
+ BPPARAM=bpparam()){
currentType(fds) <- type
index <- getSiteIndex(fds, type=type)
idx <- !duplicated(index)
for(i in c("BetaBinomial", "Binomial", "Normal")){
# only do it if it exists
- if(!paste0("pvalues", i, "_", type) %in% assayNames(fds)){
+ if(!paste0("pvalues", i, "_junction_", type) %in% assayNames(fds)){
next
}
- pvals <- pVals(fds, dist=i)
- padj <- apply(pvals[idx,], 2, p.adjust, method=method)
+ pvals <- pVals(fds, dist=i, level="junction")
+ rho <- rho(fds, type=type)
+
+ # splice site-level pval correction
+ message(date(), ": adjusting junction-level pvalues ...")
+ fwer_pvals <- getFWERpvals_bySample(pvals, index, rho, method="holm",
+ rhoCutoff=ifelse(is.na(rhoCutoff), 1, rhoCutoff),
+ BPPARAM=BPPARAM)
+ if(!is.na(rhoCutoff)){
+ filters <- list(rho=rhoCutoff)
+ } else{
+ filters <- list()
+ }
+ pVals(fds, dist=i, level="site", filters=filters,
+ withDimnames=FALSE) <- fwer_pvals
+
+ # junction-level FDR correction
+ message(date(), ": genome-wide FDR for junction-level pvalues ...")
+ padj <- apply(fwer_pvals[idx,], 2, p.adjust, method=method)
padjDT <- data.table(cbind(i=unique(index), padj), key="i")[J(index)]
padjDT[,i:=NULL]
- padjVals(fds, dist=i, withDimnames=FALSE) <- as.matrix(padjDT)
+ padjVals(fds, dist=i, level="site", filters=filters,
+ withDimnames=FALSE) <- as.matrix(padjDT)
+
+ # gene-level pval correction and FDR
+ if(isTRUE(geneLevel) &&
+ geneColumn %in% colnames(mcols(fds, type=type))){
+ message(date(), ": calculating gene-level pvalues ...")
+ gene_pvals <- getPvalsPerGene(fds=fds, type=type, pvals=fwer_pvals,
+ method="holm", FDRmethod=method,
+ geneColumn=geneColumn,
+ BPPARAM=BPPARAM)
+ pVals(fds, dist=i, level="gene", filters=filters,
+ withDimnames=FALSE) <- gene_pvals[["pvals"]]
+ padjVals(fds, dist=i, level="gene", filters=filters,
+ withDimnames=FALSE) <- gene_pvals[["padj"]]
+ } else if(isTRUE(geneLevel)){
+ warning("Gene-level pvalues could not be calculated as column ",
+ geneColumn, "\nwas not found for the given fds object. ",
+ "Please annotate gene symbols \nfirst using the ",
+ "annotateRanges function.")
+ }
+
+ # calculate FDR for each provided subset and assign to fds
+ if(!is.null(subsets)){
+ stopifnot(is.list(subsets))
+ stopifnot(!is.null(names(subsets)))
+ for(setName in names(subsets)){
+ geneListSubset <- subsets[[setName]]
+ fds <- calculatePadjValuesOnSubset(fds=fds,
+ genesToTest=geneListSubset,
+ subsetName=setName,
+ type=type, method=method,
+ geneColumn=geneColumn,
+ BPPARAM=BPPARAM)
+ }
+ }
}
return(fds)
}
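A hedged sketch of the new subsets argument (subset names and gene symbols are placeholders; assumes gene symbols have already been annotated, e.g. with annotateRangesWithTxDb): transcriptome-wide FDR is computed as before, and FDR restricted to each subset is stored in the fds in addition.

    omim_genes <- c("TIMMDC1", "MCOLN1")           # same genes for every sample
    candidates <- list(sample1="TIMMDC1",          # or per-sample gene lists
                       sample2=c("MCOLN1", "CLPP"))
    fds <- calculatePadjValues(fds, type="jaccard",
                               subsets=list(OMIM=omim_genes,
                                            candidates=candidates))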
-getSiteIndex <- function(fds, type){
+getPvalsPerGene <- function(fds, type=currentType(fds),
+ pvals=pVals(fds, type=type, level="site"),
+ sampleID=NULL, method="holm", FDRmethod="BY",
+ geneColumn="hgnc_symbol", BPPARAM=bpparam()){
+ # extract data and take only the first index of per site
+ message(date(), ": starting gene-level pval computation for type ", type)
+ samples <- samples(fds)
+ if(is.null(colnames(pvals))){
+ colnames(pvals) <- samples
+ }
+ dt <- data.table(
+ idx=getSiteIndex(fds, type=type),
+ geneID=getGeneIDs(fds, type=type, unique=FALSE,
+ geneColumn=geneColumn),
+ as.data.table(pvals))
+ dt <- dt[!is.na(geneID)]
+ geneIDs <- getGeneIDs(fds, type=type, unique=TRUE,
+ geneColumn=geneColumn)
+
+ # separate geneIDs into rows
+ dt[, dt_idx:=seq_len(.N)]
+ dt_tmp <- dt[, splitGenes(geneID), by="dt_idx"]
+ dt <- dt[dt_tmp$dt_idx,]
+ dt[,`:=`(geneID=dt_tmp$V1, dt_idx=NULL)]
+ setkey(dt, geneID)
+
+ # extract samples
+ if(!is.null(sampleID)){
+ samples <- sampleID
+ }
+
+ # aggregate pvalues to gene level per sample
+ message(date(), ": gene-level pval computation per gene (n=",
+ length(geneIDs), ")")
+ pvalsPerGene <- genePvalsByGeneID(dt, samples=samples, geneIDs=geneIDs,
+ method=method, BPPARAM=BPPARAM)
+
+ # compute FDR
+ message(date(), ": genome-wide FDR for gene-level pvals for type ", type)
+ padjPerGene <- apply(pvalsPerGene, 2, p.adjust, method=FDRmethod)
+
+ message(date(), ": finished gene-level pval computation for type ", type)
+ return(list(pvals=pvalsPerGene, padj=padjPerGene))
+
+}
+
+getSiteIndex <- function(fds, type=currentType(fds)){
if(type == "theta"){
return(mcols(fds, type=type)[['spliceSiteID']])
}
+ if(type == "jaccard"){
+ return(seq_len(nrow(fds)))
+ }
+
startId <- mcols(fds, type=type)[,"startID"]
endId <- mcols(fds, type=type)[,"endID"]
strand <- strand(rowRanges(fds, type=type))
@@ -233,48 +380,178 @@ getSiteIndex <- function(fds, type){
ans[selectionMat]
}
-getGeneIDs <- function(fds, type, unique=TRUE){
- geneIDs <- mcols(fds, type=type)$hgnc_symbol
+getGeneIDs <- function(fds, type=currentType(fds), unique=TRUE,
+ geneColumn="hgnc_symbol"){
+ if(!geneColumn %in% colnames(mcols(fds, type=type))){
+ stop("Did not find column '", geneColumn, "' in mcols(fds, type='",
+ type, "'). Please assign introns\nto genes first using the ",
+ "annotateRanges(fds, ...) or annotateRangesWithTxDb(fds, ...) ",
+ "function.")
+ }
+
+ geneIDs <- mcols(fds, type=type)[[geneColumn]]
if(isTRUE(unique)){
- geneIDs <- unique(geneIDs)
+ geneIDs <- unique(unlist(lapply(geneIDs, FUN=function(g){
+ unlist(strsplit(g, ";"))}) ))
geneIDs <- geneIDs[!is.na(geneIDs)]
}
geneIDs
}
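getGeneIDs() now splits multi-gene annotations on ';' before deduplicating, so a junction assigned to several genes contributes to each of them. A minimal illustration (gene symbols are placeholders):

    geneIDs <- c("TIMMDC1", "TIMMDC1;POGLUT1", NA)
    unique(unlist(strsplit(geneIDs[!is.na(geneIDs)], ";")))
    # [1] "TIMMDC1" "POGLUT1"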
-getPvalsPerGene <- function(fds, type, pvals=pVals(fds, type=type),
- sampleID=NULL, method="holm", BPPARAM=bpparam()){
- # extract data and take only the first index of per site
- dt <- data.table(
- idx=getSiteIndex(fds, type=type),
- geneID=getGeneIDs(fds, type=type, unique=FALSE),
- as.data.table(pvals))
- dt <- dt[!duplicated(idx) & !is.na(geneID)]
- setkey(dt, geneID)
+genePvalsByGeneID <- function(dt, samples, geneIDs, method, BPPARAM){
+ pvalsPerGene <- bplapply(geneIDs, BPPARAM=BPPARAM,
+ FUN=function(g) {
+ dttmp <- dt[geneID == g][!duplicated(idx)]
+ suppressWarnings(
+ pval_g <- apply(as.matrix(dttmp[,-c("idx", "geneID")]), 2,
+ function(x) min(p.adjust(x, method=method), na.rm = TRUE) )
+ )
+ pval_g[is.infinite(pval_g)] <- NA
+ pval_g
+ })
+ pvalsPerGene <- do.call(rbind, pvalsPerGene)
+ rownames(pvalsPerGene) <- geneIDs
+ return(pvalsPerGene)
+}
+
+#' @describeIn FRASER This function does FDR correction only for all junctions
+#' in a certain subset of genes which can differ per sample. Requires gene
+#' symbols to have been annotated to junctions. As with the full FDR
+#' correction across all junctions, first the previously calculated
+#' junction-level p values are adjusted with Holm's method per donor or
+#' acceptor site, respectively. Then, gene-level p values are computed.
+#'
+#' @param genesToTest A named list with the subset of genes to test per sample.
+#' The names must correspond to the sampleIDs in the given fds object.
+#' @param subsetName The name under which the resulting FDR corrected pvalues
+#' will be stored in metadata(fds).
+#'
+#' @export
+calculatePadjValuesOnSubset <- function(fds, genesToTest, subsetName,
+ type=currentType(fds), method='BY',
+ geneColumn="hgnc_symbol", BPPARAM=bpparam()){
- samples <- samples(fds)
- if(!is.null(sampleID)){
- samples <- sampleID
+ # check input
+ stopifnot(!is.null(genesToTest))
+ stopifnot(is.list(genesToTest) || is.vector(genesToTest))
+
+ # replicate subset genes for each sample if given as vector
+ if(!is.list(genesToTest)){
+ genesToTest <- rep(list(genesToTest), ncol(fds))
+ names(genesToTest) <- colnames(fds)
}
- pvalsPerGene <- matrix(unlist(bplapply(samples, BPPARAM=BPPARAM,
- function(i){
- dttmp <- dt[,min(p.adjust(get(i), method=method)),by=geneID]
- setkey(dttmp, geneID)
- dttmp[J(getGeneIDs(fds, type=type)), V1]
- })), ncol=length(samples))
+ # check that names are present and correspond to samples in ods
+ stopifnot(!is.null(names(genesToTest)))
+ if(!all(names(genesToTest) %in% colnames(fds))){
+ stop("names(genesToTest) need to be sampleIDs in the given fds object.")
+ }
- colnames(pvalsPerGene) <- samples
- rownames(pvalsPerGene) <- getGeneIDs(fds, type=type)
+ # get genes from fds object
+ fds_genes <- getGeneIDs(fds, unique=TRUE, type=type, geneColumn=geneColumn)
+ ngenes <- length(fds_genes)
- return(pvalsPerGene)
-
+ # site index (for psi3/5)
+ site_idx <- getSiteIndex(fds, type=type)
+
+ # compute FDR on the given subsets of genes
+ message(date(), ": starting FDR calculation on subset of genes...")
+
+ fdrSubset <- bplapply(colnames(fds), FUN=function(sampleId){
+
+ # get genes to test for this sample
+ genesToTestSample <- genesToTest[[sampleId]]
+ padj <- rep(NA, nrow(mcols(fds, type=type)))
+ padj_gene <- rep(NA, ngenes)
+
+ # if no genes present in the subset for this sample, return NAs
+ if(is.null(genesToTestSample)){
+ return(list(padj=padj, padj_gene=padj_gene))
+ }
+
+ # get idx of junctions corresponding to genes to test
+ if(is.character(genesToTestSample)){
+ rowIdx <- sort(which(fds_genes %in% genesToTestSample))
+ rowIdx <- unlist(lapply(genesToTestSample, function(gene){
+ idx <- which(grepl(paste0("(^|;)", gene, "(;|$)"),
+ mcols(fds, type=type)[, geneColumn]))
+ names(idx) <- rep(gene, length(idx))
+ if(length(idx) == 0 && verbose(fds) > 0){
+ warning("No introns found in fds object for gene: ", gene,
+ " and sample: ", sampleId, ". Skipping this gene.")
+ }
+ return(idx)
+ }))
+ rowIdx <- sort(rowIdx[!duplicated(rowIdx)])
+ } else{
+ stop("Genes in the list to test must be a character vector ",
+ "of geneIDs.")
+ }
+
+ # check that rowIdx is not empty vector
+ if(length(rowIdx) == 0){
+ warning("No genes from the given subset found in the fds ",
+ "object for sample: ", sampleId)
+ return(list(padj=padj, padj_gene=padj_gene))
+ }
+
+ # retrieve pvalues of introns to test
+ p <- as.matrix(pVals(fds, type=type))
+ if(ncol(p) == 1){
+ colnames(p) <- colnames(fds)
+ }
+ p <- p[rowIdx, sampleId]
+
+ # FDR correction on subset
+ non_dup_site_idx <- !duplicated(site_idx[rowIdx])
+ padjSub <- p.adjust(p[non_dup_site_idx], method=method)
+
+ # set intron FDR on subset (filled with NA for all other genes)
+ padj[rowIdx] <- padjSub
+
+ # gene level pvals
+ dt <- data.table(sampleID=sampleId, type=type, pval=p,
+ gene=names(rowIdx), jidx=rowIdx, site_idx=site_idx[rowIdx])
+ dt <- merge(dt,
+ data.table(site_idx=site_idx[rowIdx][non_dup_site_idx],
+ FDR_subset=padjSub),
+ by="site_idx")
+ dt[!duplicated(dt$site_idx),
+ pval_gene:=min(p.adjust(pval, method="holm")), by="gene"]
+ dt[, pval_gene := .SD[!is.na(pval_gene), unique(pval_gene)], by="gene"]
+
+ # gene level FDR
+ dt2 <- dt[, unique(pval_gene), by="gene"]
+ dt2[, FDR_subset_gene := p.adjust(V1, method=method)]
+ dt2[, gene_rowIdx := sapply(gene, function(g) which(fds_genes == g))]
+
+ # set intron FDR on subset (filled with NA for all other genes)
+ padj_gene[dt2[,gene_rowIdx]] <- dt2[, FDR_subset_gene]
+
+ # return new FDR
+ return(list(padj=padj, padj_gene=padj_gene))
+
+ }, BPPARAM=BPPARAM)
+ padjSub <- vapply(fdrSubset, '[[',
+ double(nrow(mcols(fds, type=type))), 'padj')
+ padjSub_gene <- vapply(fdrSubset, '[[', double(ngenes), 'padj_gene')
+
+ colnames(padjSub) <- colnames(fds)
+ rownames(padjSub_gene) <- fds_genes
+ colnames(padjSub_gene) <- colnames(fds)
+
+ # add FDR subset info to ods object and return
+ padjVals(fds, type=type, level="site", subsetName=subsetName,
+ withDimnames=FALSE) <- padjSub
+ padjVals(fds, type=type, level="gene", subsetName=subsetName,
+ withDimnames=FALSE) <- padjSub_gene
+ addToAvailableFDRsubsets(fds) <- subsetName
+
+ message(date(), ": finished FDR calculation on subset of genes.")
+ validObject(fds)
+ return(fds)
}
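calculatePadjValuesOnSubset() can also be called directly instead of via the subsets argument of calculatePadjValues(); a hedged sketch with placeholder sample IDs and gene symbols (the names of genesToTest must be sampleIDs of the fds):

    genesToTest <- list(sample1="TIMMDC1",
                        sample2=c("MCOLN1", "CLPP"))
    fds <- calculatePadjValuesOnSubset(fds, genesToTest=genesToTest,
                                       subsetName="candidate_genes",
                                       type="jaccard")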
-getPadjPerGene <- function(pvals, method="BY"){
-
- padjPerGene <- apply(pvals, 2, p.adjust, method=method)
-
- return(padjPerGene)
-
-}
diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R
new file mode 100644
index 00000000..7ab9b61d
--- /dev/null
+++ b/R/resultAnnotations.R
@@ -0,0 +1,974 @@
+#'
+#' @title Additional result annotations
+#'
+#' @description These functions work on the result table and add additional
+#' annotations to the reported introns: the type of potential impact on
+#'   splicing (e.g. exon skipping, exon truncation, ...), potential occurrence
+#' of frameshift, overlap with UTR regions as well as a flag for introns
+#' that are located in blacklist regions of the genome.
+#'
+#' \code{\link{annotateIntronReferenceOverlap}} adds basic annotations to the
+#' fds for each intron based on the overlap of the intron's location with
+#' the reference annotation. Has to be run before the result table is
+#' created so that the new column can be included in it (see examples).
+#'
+#' \code{\link{annotatePotentialImpact}} annotates each intron in the results
+#' table with the type of potential impact on splicing and potential
+#'   occurrence of frameshift (likely, unlikely, inconclusive). Can also
+#' calculate overlap with annotated UTR regions. Potential impact can be:
+#' annotatedIntron_increasedUsage, annotatedIntron_reducedUsage,
+#' exonTruncation, exonElongation, exonTruncation&Elongation,
+#' exonSkipping, splicingBeyondGene,
+#' multigenicSplicing, downstreamOfNearestGene, upstreamOfNearestGene,
+#' complex (everything else).
+#' Splice sites (theta metric) annotations indicate how the splice site is
+#' located with respect to the reference annotation. The annotated types
+#' are: annotatedSpliceSite, exonicRegion, intronicRegion.
+#'
+#' \code{\link{flagBlacklistRegions}} flags introns in the results table on
+#' whether or not they are located in a blacklist region of the genome. By
+#' default, the blacklist regions as reported in
+#' \cite{Amemiya, Kundaje & Boyle (2019)} and downloaded from
+#' \href{https://www.encodeproject.org/annotations/ENCSR636HFF/}{here}
+#' are used.
+#'
+#' @param fds A FraserDataSet
+#' @param txdb A txdb object providing the reference annotation.
+#' @param result A result table as generated by FRASER, including the column
+#' \code{annotatedJunction} as generated by the function
+#' \code{annotateIntronReferenceOverlap}.
+#' @param addPotentialImpact Logical, indicating if the type of the potential
+#' impact should be added to the results table. Defaults to \code{TRUE}.
+#' @param addUTRoverlap Logical, indicating if the overlap with UTR regions
+#'   should be checked and added to the results table. Defaults to \code{TRUE}.
+#' @param minoverlap Integer value defining the number of base pairs around the
+#' splice site that need to overlap with UTR or blacklist region,
+#'   respectively, to be considered matching. Defaults to 5 bp.
+#' @param blacklist_regions A BED file that contains the blacklist regions.
+#' If \code{NULL} (default), the BED files that are packaged with FRASER
+#' are used (see Details for more information).
+#' @param assemblyVersion Indicates the genome assembly version of the intron
+#' coordinates. Only used if blacklist_regions is NULL. For other versions,
+#' please provide the BED file containing the blacklist regions directly.
+#' @param BPPARAM For controlling parallelization behavior. Defaults to
+#' \code{bpparam()}.
+#' @return An annotated FraserDataSet or results table, respectively
+#'
+#' @name potentialImpactAnnotations
+#' @rdname potentialImpactAnnotations
+#'
+#' @examples
+#' # get data, fit and compute p-values and z-scores
+#' fds <- createTestFraserDataSet()
+#'
+#' # load reference annotation
+#' library(TxDb.Hsapiens.UCSC.hg19.knownGene)
+#' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
+#'
+#' # add basic annotations for overlap with the reference annotation
+#' # run this function before creating the results table
+#' fds <- annotateIntronReferenceOverlap(fds, txdb)
+#'
+#' # extract results: for this small example dataset, no cutoffs used
+#' # to get some results
+#' res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA)
+#'
+#' # annotate the type of potential impact on splicing and UTR overlap
+#' res <- annotatePotentialImpact(result=res, txdb=txdb, fds=fds)
+#'
+#' # annotate overlap with blacklist regions
+#' res <- flagBlacklistRegions(result=res, assemblyVersion="hg19")
+#'
+#' # show results table containing additional annotations
+#' res
+#'
+NULL
+
+#' @describeIn potentialImpactAnnotations This method calculates basic annotations
+#' based on overlap with the reference annotation (start, end, none, both)
+#' for the full fds. The overlap type is added as a new column
+#' \code{annotatedJunction} in \code{mcols(fds)}.
+#' @export
+annotateIntronReferenceOverlap <- function(fds, txdb, BPPARAM=bpparam()){
+ message("loading introns ...")
+ #seqlevelsStyle(fds) <- seqlevelsStyle(txdb)[1]
+ introns <- unique(unlist(intronsByTranscript(txdb)))
+ # reduce the introns to only the actually expressed introns
+ fds_known <- fds[unique(to(findOverlaps(introns,
+ rowRanges(fds, type = "j"), type = "equal"))),]
+ anno_introns <- as.data.table(rowRanges(fds_known,
+ type="j"))[,.(seqnames, start, end, strand)]
+
+ # calculate extra columns with mean/median intron expression count
+ # add the new columns
+ sampleCounts <- as.matrix(K(fds_known, type = "j"))
+ anno_introns[, meanCount := rowMeans(sampleCounts)]
+ anno_introns[, medianCount := rowMedians(sampleCounts)]
+ # order by medianCount (highest first)
+ setorderv(anno_introns, "medianCount", order=-1)
+ anno_introns_ranges <- makeGRangesFromDataFrame(anno_introns,
+ keep.extra.columns = TRUE)
+
+ # get all fds junctions
+ fds_junctions <- rowRanges(fds, type = "j")
+
+ # Do the annotation just for the intron with highest median expression
+ message("start calculating basic annotations ...")
+ overlaps <- findOverlaps(fds_junctions, anno_introns_ranges, select="first")
+    annotations <- bplapply(seq_along(fds_junctions),
+ function(i, overlaps, fds_junctions, anno_introns_ranges){
+ # only select first intron as already ordered by medianCount beforehand
+ overlap <- overlaps[i]
+ if(is.na(overlap)) return("none") #no overlap with any intron
+
+ hit_equal <- from(findOverlaps(fds_junctions[i],
+ anno_introns_ranges[overlap],
+ type="equal"))
+ if(length(hit_equal) > 0) return("both")
+
+ hit_start <- from(findOverlaps(fds_junctions[i],
+ anno_introns_ranges[overlap],
+ type="start"))
+ if(length(hit_start) > 0) return("start")
+ hit_end <- from(findOverlaps(fds_junctions[i],
+ anno_introns_ranges[overlap],
+ type="end"))
+ if(length(hit_end) > 0) return("end")
+
+ # overlaps but no start/end match
+ return("none")
+ }, overlaps=overlaps, fds_junctions=fds_junctions,
+ anno_introns_ranges=anno_introns_ranges, BPPARAM=BPPARAM)
+ annotations <- unlist(annotations)
+
+ rowRanges(fds)$annotatedJunction <- annotations
+ mcols(fds, type="ss")$annotatedJunction <- "not computed"
+ message("basic annotations done")
+ return(fds)
+}
+
+#' @describeIn potentialImpactAnnotations This method annotates the type of the
+#' potential splice event for each junction in the given results table.
+#' @export
+annotatePotentialImpact <- function(result, txdb, fds, addPotentialImpact=TRUE,
+ addUTRoverlap=TRUE, minoverlap=5,
+ BPPARAM=bpparam()){
+
+ # convert to data.table if not already
+ if(!is.data.table(result)){
+ annoResult <- as.data.table(result)
+ } else{
+ annoResult <- result
+ }
+
+ # Create basic annotation of overlap with reference
+ if(!("annotatedJunction" %in% colnames(annoResult))){
+ stop("Column 'annotatedJunction' not found in the results table!\n",
+ "Please run 'fds <- annotateIntronReferenceOverlap(fds, txdb)' ",
+ "first and then extract \nthe results table using the ",
+ "'results(fds, ...)' function before calling this function.")
+ }
+
+ # Calculate splice types and frameshift
+ if(isTRUE(addPotentialImpact)){
+ annoResult <- addPotentialImpactLabels(annoResult, fds, txdb)
+ annoResult[potentialImpact == "singleExonSkipping",
+ potentialImpact := "exonSkipping"]
+ }
+
+ # Add UTR labels
+ if(isTRUE(addUTRoverlap)){
+        annoResult <- addUTRLabels(annoResult, txdb, minoverlap=minoverlap)
+ }
+
+ if(is(result, "GenomicRanges")){
+ annoResult <- makeGRangesFromDataFrame(annoResult,
+ keep.extra.columns=TRUE)
+ }
+
+ return(annoResult)
+}
+
+#' @describeIn potentialImpactAnnotations This method flags all introns and
+#' splice sites in the given results table for which at least one splice
+#' site (donor or acceptor) is located in a blacklist region. Blacklist
+#' regions of the genome are determined from the provided BED file.
+#' @export
+flagBlacklistRegions <- function(result, blacklist_regions=NULL,
+ assemblyVersion=c('hg19', 'hg38'),
+ minoverlap=5){
+
+ # convert to data.table if not already
+ if(!is.data.table(result)){
+ annoResult <- as.data.table(result)
+ } else{
+ annoResult <- result
+ }
+
+ assemblyVersion <- match.arg(assemblyVersion)
+ if(is.null(blacklist_regions)){
+ blacklist_regions <-
+ system.file("extdata", "blacklist_regions",
+ paste0(assemblyVersion, "-blacklist.v2.bed.gz"),
+ package = "FRASER")
+ }
+ if(!file.exists(blacklist_regions)){
+ stop("BED file with blacklist regions does not exist: ",
+ blacklist_regions)
+ }
+ message("Importing blacklist regions ...")
+ blacklist_gr <- rtracklayer::import(blacklist_regions, format = "BED")
+    annoResult <- addBlacklistLabels(annoResult, blacklist_gr,
+                                     minoverlap=minoverlap)
+
+ if(is(result, "GenomicRanges")){
+ annoResult <- makeGRangesFromDataFrame(annoResult,
+ keep.extra.columns=TRUE)
+ }
+
+ return(annoResult)
+}
+
+############# helper functions ##############################
+
+#' blacklist annotation for aberrant splicing events
+#' @noRd
+addBlacklistLabels <- function(junctions_dt, blacklist_gr, minoverlap=5){
+ # add the blacklist information
+ colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
+ junctions_gr <- makeGRangesFromDataFrame(junctions_dt)
+
+ # get gr with start/end positions of each intron
+ gr_start_ss <- junctions_gr
+ end(gr_start_ss) <- start(gr_start_ss) + minoverlap - 1
+ start(gr_start_ss) <- start(gr_start_ss) - minoverlap
+ gr_end_ss <- junctions_gr
+ start(gr_end_ss) <- end(gr_end_ss) - minoverlap + 1
+ end(gr_end_ss) <- end(gr_end_ss) + minoverlap
+
+ # set to the same seqlevelsstyle
+ seqlevelsStyle(blacklist_gr) <- seqlevelsStyle(junctions_gr)
+
+ ## create overlap with blacklist and annotate extra column
+ message("finding blacklist overlap ...")
+ black_hits_start_ss <- unique(from(findOverlaps(gr_start_ss, blacklist_gr)))
+ black_hits_end_ss <- unique(from(findOverlaps(gr_end_ss, blacklist_gr)))
+ junctions_dt[, blacklist := FALSE]
+
+ junctions_dt[black_hits_start_ss, blacklist := TRUE]
+ junctions_dt[black_hits_end_ss, blacklist := TRUE]
+ colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
+
+ message("blacklist labels done")
+ return(junctions_dt)
+}
+
+#' adds UTR overlap annotation to results table
+#' @noRd
+addUTRLabels <- function(junctions_dt, txdb, minoverlap=5){
+ colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
+ junctions_gr <- makeGRangesFromDataFrame(junctions_dt)
+ seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr)
+
+ # get gr with start/end positions of each intron
+ gr_start_ss <- junctions_gr
+ end(gr_start_ss) <- start(gr_start_ss) + minoverlap - 1
+ start(gr_start_ss) <- start(gr_start_ss) - minoverlap
+ gr_end_ss <- junctions_gr
+ start(gr_end_ss) <- end(gr_end_ss) - minoverlap + 1
+ end(gr_end_ss) <- end(gr_end_ss) + minoverlap
+
+ ### UTR labels based on txdb file
+ ### add 5' 3' UTR labels
+ message("finding UTR overlap ...")
+ threes_start <- unique(from(findOverlaps(gr_start_ss,
+ threeUTRsByTranscript(txdb, use.names = TRUE))))
+ threes_end <- unique(from(findOverlaps(gr_end_ss,
+ threeUTRsByTranscript(txdb, use.names = TRUE))))
+ fives_start <- unique(from(findOverlaps(gr_start_ss,
+ fiveUTRsByTranscript(txdb, use.names = TRUE))))
+ fives_end <- unique(from(findOverlaps(gr_end_ss,
+ fiveUTRsByTranscript(txdb, use.names = TRUE))))
+ junctions_dt[, UTR_overlap := "no"]
+ junctions_dt[threes_start, UTR_overlap := "3'-UTR"]
+ junctions_dt[threes_end, UTR_overlap := "3'-UTR"]
+ junctions_dt[fives_start, UTR_overlap := "5'-UTR"]
+ junctions_dt[fives_end, UTR_overlap := "5'-UTR"]
+ colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
+ message("UTR labels done")
+ return(junctions_dt)
+}
+
+
+
+#' adds type of splicing to each intron in the results table
+#' @noRd
+addPotentialImpactLabels <- function(junctions_dt, fds, txdb){
+ message("preparing ...")
+ psi_positions <- which(junctions_dt$type != "theta")
+ colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
+ junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions],
+ keep.extra.columns = TRUE)
+ seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr)
+
+ introns_tmp <- unique(unlist(intronsByTranscript(txdb)))
+ exons <- exons(txdb)
+
+ # seqlevelsStyle(fds) <- seqlevelsStyle(txdb)[1]
+ fds_known <- fds[unique(to(findOverlaps(introns_tmp,
+ rowRanges(fds, type = "j"),
+ type = "equal"))),]
+ grIntrons <- rowRanges(fds_known, type="j")
+ introns <- as.data.table(grIntrons)
+ introns <- introns[,.(seqnames, start, end, strand)]
+
+ sampleCounts <- K(fds_known, type = "j")
+ introns[, "meanCount" := rowMeans(sampleCounts)]
+ introns[, "medianCount" := rowMedians(as.matrix(sampleCounts))]
+ intron_ranges <- makeGRangesFromDataFrame(introns,
+ keep.extra.columns = TRUE)
+
+ # prepare the results column
+ junctions_dt[, potentialImpact := "complex"]
+ junctions_dt[, causesFrameshift := "inconclusive"]
+ junctions_dt[annotatedJunction == "both" & deltaPsi >= 0,
+ potentialImpact := "annotatedIntron_increasedUsage"]
+ junctions_dt[annotatedJunction == "both" & deltaPsi < 0,
+ potentialImpact := "annotatedIntron_reducedUsage"]
+ junctions_dt[annotatedJunction == "both", causesFrameshift := "unlikely"]
+
+ if(all(c("nonsplitProportion", "nonsplitProportion_99quantile") %in%
+ colnames(junctions_dt))){
+ junctions_dt[potentialImpact == "annotatedIntron_reducedUsage" &
+ type == "jaccard" &
+ nonsplitProportion >= nonsplitProportion_99quantile + 0.05 &
+ nonsplitCounts >= 10,
+ potentialImpact := "(partial)intronRetention"]
+
+ # TODO check frameshift for intron retention
+ junctions_dt[potentialImpact == "(partial)intronRetention",
+ causesFrameshift := "inconclusive"]
+ }
+
+ starts <- which(junctions_dt[psi_positions]$annotatedJunction=="start")
+ ends <- which(junctions_dt[psi_positions]$annotatedJunction=="end")
+ nones <- which(junctions_dt[psi_positions]$annotatedJunction=="none")
+
+ message("calculating splice types ...")
+ # start junctions
+ start_results <- sapply(starts, function(i){
+ # find the most freq intron that overlaps again
+ overlap <- to(findOverlaps(junctions_gr[i], intron_ranges,
+ type = "start"))
+ expre <- sapply(overlap, function(j){
+ elementMetadata(intron_ranges[j])$medianCount
+ })
+ maxExpr <- which.max(expre)
+ return(compareEnds(junctions_gr, i, overlap[maxExpr], FALSE,
+ intron_ranges, exons))
+ })
+ junctions_dt[psi_positions[starts],
+ causesFrameshift:=start_results[2,]]
+ junctions_dt[psi_positions[starts],
+ potentialImpact := start_results[1,]]
+
+ # end junctions
+ end_results <- sapply(ends, function(i){
+ # find the most freq intron that overlaps again
+ overlap <- to(findOverlaps(junctions_gr[i], intron_ranges,
+ type = "end"))
+ expre <- sapply(overlap, function(j){
+ elementMetadata(intron_ranges[j])$medianCount
+ })
+ maxExpr <- which.max(expre)
+ return(compareStarts(junctions_gr, i, overlap[maxExpr], FALSE,
+ intron_ranges, exons))
+
+ })
+ junctions_dt[psi_positions[ends], causesFrameshift:=end_results[2,]]
+ junctions_dt[psi_positions[ends], potentialImpact := end_results[1,]]
+
+ # none junctions pt1
+ none_results <- sapply(nones, function(i){
+ # find most freq intron
+ # check start and end
+
+ # find the most freq intron that overlaps again
+ overlap <- to(findOverlaps(junctions_gr[i], intron_ranges))
+ if(length(overlap) == 0) return(c("noOverlap", "inconclusive"))
+ expre <- sapply(overlap, function(j){
+ elementMetadata(intron_ranges[j])$medianCount
+ })
+ maxExpr <- which.max(expre)
+
+ # returns type of exon splicing, frameshift TRUE/FALSE, amount of shift
+ st = compareStarts(junctions_gr, i, overlap[maxExpr], TRUE,
+ intron_ranges, exons)
+ en = compareEnds(junctions_gr, i, overlap[maxExpr], TRUE,
+ intron_ranges, exons)
+
+ # merge, start and end results
+ # merge exon elongation/truncation
+ # if both likely/unlikely fine
+ # if one is likely -> return likely
+ # if one is notYet -> return notYet
+ if((st[1] == "singleExonSkipping" & !(en[1] %in%
+ c("singleExonSkipping", "exonSkipping"))) ||
+ (en[1] == "singleExonSkipping" & !(st[1] %in%
+ c("singleExonSkipping", "exonSkipping")))){
+ ## only one is single exonSkipping, the other is trunc/elong
+ if((as.integer(st[3])+as.integer(en[3])) %% 3 != 0){
+ frs = "likely"
+ }else{ frs = "unlikely"}
+ return(c("singleExonSkipping", frs))
+ }
+
+ if(st[1] %in% c("exonSkipping", "singleExonSkipping") || en[1] %in%
+ c("exonSkipping", "singleExonSkipping")){
+ return(c("exonSkipping", "inconclusive"))
+ }
+
+ if((as.integer(st[3])+as.integer(en[3]))%%3 != 0){
+ frs = "likely"
+ }else{ frs = "unlikely"}
+ if( st[1] != en[1]){
+ combined = "exonTruncation&Elongation"
+ }else{combined = st[1]}
+ return(c(combined,frs))
+
+ })
+ junctions_dt[psi_positions[nones], causesFrameshift:=none_results[2,]]
+ junctions_dt[psi_positions[nones], potentialImpact := none_results[1,]]
+
+ noLaps <-which(junctions_dt[psi_positions]$potentialImpact=="noOverlap")
+ refseq.genes<- genes(txdb)
+
+ # none junctions pt2
+ noLaps_results <- sapply(noLaps, function(i){
+ overlap <- to(findOverlaps(junctions_gr[i], exons))
+ # no overlap with an intron or an exon
+ if(length(overlap) == 0){
+ return(checkIntergenic(junctions_gr, i, refseq.genes))
+ }
+
+ # for the exons, check if splice site is contained in the exon
+ for(j in overlap){
+ exon_start = start(exons[j])
+ exon_end = end(exons[j])
+ if(exon_start <= start(junctions_gr[i]) &
+ exon_end >= end(junctions_gr[i])){
+ if((end(junctions_gr[i]) -
+ start(junctions_gr[i]) + 1) %% 3 != 0){
+ frs = "likely"
+ }else{ frs = "unlikely"}
+ return(c("exonTruncation", frs))
+ }
+ }
+
+ return(c("complex","inconclusive"))
+ })
+ junctions_dt[psi_positions[noLaps],
+ causesFrameshift:=noLaps_results[2,]]
+ junctions_dt[psi_positions[noLaps],
+ potentialImpact := noLaps_results[1,]]
+
+ # theta annotations
+ thetas <- which(junctions_dt$type == "theta")
+ junctions_gr <- makeGRangesFromDataFrame(junctions_dt[thetas,],
+ keep.extra.columns = TRUE)
+
+ # specify default type for theta results as NA
+ junctions_dt[thetas, potentialImpact := NA]
+ junctions_dt[thetas, causesFrameshift := NA]
+
+ # label all as intronic first if they have any intron overlap
+ intronic <- unique(from(findOverlaps(junctions_gr, introns_tmp)))
+ junctions_dt[thetas[intronic], potentialImpact := "intronicRegion"]
+
+ # for exonic, check if theta is fully contained in an exon
+ # if one end is in an intron and the other in an exon it is a splice site
+ exonic <- unique(from(findOverlaps(junctions_gr, exons)))
+ within <- findOverlaps(junctions_gr, exons, type = "within")
+ all <- findOverlaps(junctions_gr, exons)
+ exonic_results <- sapply(exonic, function(i){
+ w <- unique(to(within)[which(from(within) == i)])
+ a <- unique(to(all)[which(from(all) == i)])
+ if(length(a) == length(w)) return("exonicRegion")
+ return("annotatedSpliceSite")
+ })
+ junctions_dt[thetas[exonic], potentialImpact := exonic_results]
+
+ # check cases that don't overlap with an exon/intron
+ nones <- which(is.na(junctions_dt[thetas,]$potentialImpact))
+ none_results <- sapply(nones, function(i){
+ if(length(findOverlaps(junctions_gr[i], refseq.genes)) > 0) return(NA)
+ #return("intergenic")
+ if(start(refseq.genes[nearest(junctions_gr[i],
+ refseq.genes)]) > start(junctions_gr[i])){
+ ifelse(strand(junctions_gr[i]) == "+",
+ return("upstreamOfNearestGene"),
+ return("downstreamOfNearestGene"))
+ }else{
+ ifelse(strand(junctions_gr[i]) == "+",
+ return("downstreamOfNearestGene"),
+ return("upstreamOfNearestGene"))
+ }
+ })
+ junctions_dt[thetas[nones], potentialImpact := none_results]
+
+ # add distance to closest neighbour gene for intergenic results
+ # (both psi and theta)
+ message("adding distances to nearest gene ...")
+ up <- which(junctions_dt$potentialImpact == "upstreamOfNearestGene")
+ down <- which(junctions_dt$potentialImpact == "downstreamOfNearestGene")
+
+ # create full grange object containing psi and theta
+ junctions_gr <- makeGRangesFromDataFrame(junctions_dt,
+ keep.extra.columns = TRUE)
+
+ # Calculate distances
+ if(length(up) > 0){
+ distanceNearestGene_up <- sapply(up, function(i){
+ min(distance(junctions_gr[i], refseq.genes), na.rm = TRUE)})
+        if(length(distanceNearestGene_up) > 0){
+ junctions_dt[psi_positions[up],
+ distNearestGene := distanceNearestGene_up]
+ } else{
+ junctions_dt[psi_positions[up], distNearestGene := NA]
+ message("No distances found for upstream")
+ }
+ }else{message("No upstream targets")}
+
+ if(length(down) > 0){
+ distanceNearestGene_down <- sapply(down, function(i){
+ min(distance(junctions_gr[i], refseq.genes), na.rm = TRUE)})
+        if(length(distanceNearestGene_down) > 0){
+ junctions_dt[psi_positions[down],
+ distNearestGene := distanceNearestGene_down]
+ }else{
+ junctions_dt[psi_positions[down], distNearestGene := NA]
+ message("No distances found for downstream")
+ }
+ }else{message("No downstream targets")}
+
+ colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
+ message("done calculating splice types")
+
+ # Add the subtypes for exonSkipping and inconclusive
+ junctions_dt <- checkExonSkipping(junctions_dt, txdb)
+ junctions_dt <- checkInconclusive(junctions_dt, txdb)
+
+ return(junctions_dt)
+}
+
+#'
+#' @noRd
+compareStarts <- function(junctions_gr, i, max_lap, shift_needed,
+ intron_ranges, exons){
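+    # compare the start of the aberrant junction i with the start of the most
+    # frequently used annotated intron it overlaps (max_lap); returns
+    # c(potentialImpact, causesFrameshift) and, if shift_needed is TRUE, the
+    # shift in bp as a third element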
+ intron_start = start(intron_ranges[max_lap])
+ ss_start = start(junctions_gr[i])
+
+ # found the most freq intron with same end again
+ # check if intron starts before splice site -> exon elongation -> FRS
+ if(intron_start < ss_start){
+ if(((ss_start - intron_start) %% 3) != 0){
+ frs = "likely"
+ }else{ frs = "unlikely"}
+
+ ifelse(shift_needed,
+ return(c("exonElongation", frs,
+ (ss_start - intron_start))),
+ return(c("exonElongation", frs)))
+ }
+
+    # check if splice site starts in preceding exon -> exon truncation -> FRS
+ if(intron_start > ss_start){
+
+        # create dummy exon to find all exons ending at that intron start
+ dummy_exon <- GRanges(
+ seqnames = toString(seqnames(intron_ranges[max_lap])),
+ ranges = IRanges(intron_start-2, end = intron_start -1),
+ strand = toString(strand(intron_ranges[max_lap]))
+ )
+ exonChoices <- to(findOverlaps(dummy_exon, exons, type = "end"))
+ for(j in exonChoices){
+ exon_start = start(exons[j])
+ if(exon_start < ss_start){
+ if((end(exons[j]) - ss_start + 1)%%3 != 0){
+ frs = "likely"
+ }else{frs = "unlikely"}
+ ifelse(shift_needed,
+ return(c("exonTruncation", frs,
+ (-1)*(end(exons[j]) - ss_start + 1))),
+ return(c("exonTruncation", frs)))
+ }
+ }
+
+ # check for single exon skipping
+ if(length(exonChoices) == 1){
+
+ # check if there is no other exon within the first intron:
+ # splice site end until exon end
+ dummyFirstItr <- GRanges(
+ seqnames = toString(seqnames(intron_ranges[max_lap])),
+ ranges = IRanges(end(exons[exonChoices[1]]) + 1,
+ end(junctions_gr[i])),
+ strand = toString(strand(intron_ranges[max_lap]))
+ )
+
+ if(length(findOverlaps(exons, dummyFirstItr,
+ type = "within")) > 0){
+ # another exon is contained within the most freq used intron
+ ifelse(shift_needed,
+ return(c("exonSkipping", "inconclusive", 0)),
+ return(c("exonSkipping", "inconclusive")))
+ }
+
+
+ secondItr <- GRanges(
+ seqnames = toString(intron_ranges[max_lap]@seqnames@values),
+ strand = toString(intron_ranges[max_lap]@strand@values),
+ ranges = IRanges(ss_start, start(exons[exonChoices[1]]) - 1)
+                # start of aberrant junction until start of exon - 1
+ )
+ secItrChoices <- to(findOverlaps(secondItr, intron_ranges,
+ type = "end"))
+ # only look at most used one
+ expre <- sapply(secItrChoices, function(j){
+ elementMetadata(intron_ranges[j])$medianCount
+ })
+ maxExpr <- which.max(expre)
+
+ if(length(secItrChoices) == 0){
+ ifelse(shift_needed,
+ return(c("exonSkipping", "inconclusive", 0)),
+ return(c("exonSkipping", "inconclusive")))
+ }
+
+ if(ss_start >= start(intron_ranges[secItrChoices[maxExpr]])){
+ # check if there is no other exon in that range
+ if(length(findOverlaps(exons,
+ intron_ranges[secItrChoices[maxExpr]],
+ type = "within")) == 0){
+ # clear exon skipping, only exon is skipped
+ # calculate frameshift, skipped exon plus possible exon
+ # elongation
+
+ shift = (-1)*(end(exons[exonChoices[1]]) -
+ start(exons[exonChoices[1]]) + 1) +
+ ss_start - start(intron_ranges[secItrChoices[maxExpr]])
+
+ frs = ifelse(shift %% 3 == 0,"unlikely","likely")
+ ifelse(shift_needed,
+ return(c("singleExonSkipping", "inconclusive",
+ shift)),
+ return(c("singleExonSkipping", frs)))
+ }
+ }
+ } # single exon skipping end
+
+ }
+
+ # splice site longer than one intron + exon -> not defined for now
+ ifelse(shift_needed,
+ return(c("exonSkipping", "inconclusive", 0)),
+ return(c("exonSkipping", "inconclusive")))
+}
+
+#'
+#' @noRd
+compareEnds <- function(junctions_gr, i, max_lap, shift_needed,
+ intron_ranges, exons){
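+    # compare the end of the aberrant junction i with the end of the most
+    # frequently used annotated intron it overlaps (max_lap); returns
+    # c(potentialImpact, causesFrameshift) and, if shift_needed is TRUE, the
+    # shift in bp as a third element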
+ intron_end = end(intron_ranges[max_lap])
+ ss_end = end(junctions_gr[i])
+
+ # found the most freq intron with same start again
+ # check if intron ends after splice site -> exon elongation -> FRS -> done
+ if(intron_end > ss_end){
+ if(((intron_end - ss_end) %% 3) != 0){
+ frs = "likely"
+ }else{ frs = "unlikely"}
+
+ ifelse(shift_needed,
+ return(c("exonElongation", frs, (intron_end - ss_end))),
+ return(c("exonElongation", frs)))
+ }
+
+ # check if splice site ends in following exon -> exon truncation -> FRS
+ if(intron_end < ss_end){
+
+        # create dummy exon to find all exons starting at that intron end
+ dummy_exon <- GRanges(
+ seqnames = toString(intron_ranges[max_lap]@seqnames@values),
+ ranges = IRanges(intron_end + 1, end = intron_end + 2),
+ strand = toString(intron_ranges[max_lap]@strand@values)
+ )
+ exonChoices <- to(findOverlaps(dummy_exon, exons, type = "start"))
+ for(j in exonChoices){
+ exon_end = end(exons[j])
+ if(exon_end > ss_end){
+ if((ss_end - start(exons[j]) + 1)%%3 != 0){
+ frs = "likely"
+ }else{frs = "unlikely"}
+ ifelse(shift_needed,
+ return(c("exonTruncation",frs,
+ (-1)*(ss_end - start(exons[j]) + 1))),
+ return(c("exonTruncation",frs)))
+ }
+ }
+
+ # check for single exon skipping
+ if(length(exonChoices) == 1){
+
+ # check if there is no other exon within the first intron:
+ # splice site end until exon end
+ dummyFirstItr <- GRanges(
+ seqnames = toString(seqnames(intron_ranges[max_lap])),
+ ranges = IRanges(start(junctions_gr[i]),
+ start(exons[exonChoices[1]]) - 1),
+ strand = toString(strand(intron_ranges[max_lap]))
+ )
+
+ if(length(findOverlaps(exons, dummyFirstItr,
+ type = "within")) > 0){
+ # another exon is contained within the most freq used intron
+ ifelse(shift_needed,
+ return(c("exonSkipping", "inconclusive", 0)),
+ return(c("exonSkipping", "inconclusive")))
+ }
+
+
+ secondItr <- GRanges(
+ seqnames = toString(intron_ranges[max_lap]@seqnames@values),
+ strand = toString(intron_ranges[max_lap]@strand@values),
+ ranges = IRanges(end(exons[exonChoices[1]]) + 1, ss_end)
+ # end of exon + 1, end of aberrant junction
+ )
+ secItrChoices <- to(findOverlaps(secondItr, intron_ranges,
+ type = "start"))
+ # only look at most used one
+ expre <- sapply(secItrChoices, function(j){
+ elementMetadata(intron_ranges[j])$medianCount
+ })
+ maxExpr <- which.max(expre)
+
+ if(length(secItrChoices) == 0){
+ ifelse(shift_needed,
+ return(c("exonSkipping", "inconclusive", 0)),
+ return(c("exonSkipping", "inconclusive")))
+ }
+
+ if(ss_end <= end(intron_ranges[secItrChoices[maxExpr]])){
+ # check if there is no other exon in that range
+ if(length(findOverlaps(exons,
+ intron_ranges[secItrChoices[maxExpr]],
+ type = "within")) == 0){
+ # clear exon skipping, only exon is skipped
+ # calculate frameshift, skipped exon plus possible exon
+ # elongation at end
+ shift = (-1)*(end(exons[exonChoices[1]]) -
+ start(exons[exonChoices[1]]) + 1) +
+ end(intron_ranges[secItrChoices[maxExpr]]) - ss_end
+ frs = ifelse(shift%%3 == 0,"unlikely","likely")
+ ifelse(shift_needed,
+ return(c("singleExonSkipping", "inconclusive",
+ shift)),
+ return(c("singleExonSkipping", frs)))
+ }
+ }
+ } # single exon skipping end
+
+
+ }
+
+ # splice site longer than one intron + exon -> not defined for now
+ ifelse(shift_needed,
+ return(c("exonSkipping", "inconclusive", 0)),
+ return(c("exonSkipping", "inconclusive")))
+}
+
+#' Classify a junction without intron/exon overlap relative to the nearest gene
+#' @noRd
+checkIntergenic <- function(junctions_gr, i, refseq.genes){
+ # check if start > 1000
+ # start - 1000, end + 1000
+ start = start(junctions_gr[i])
+ # ifelse(start > 1000, start = start - 1000, start = 1)
+ # if(start > 1000){
+ # start = start - 1000
+ # }else{start = 1}
+
+ end = end(junctions_gr[i]) #+ 1000
+ if(start + 2 < end){
+ start = start + 1
+ end = end - 1
+ }
+
+ test_junction <- GRanges(
+ seqnames = seqnames(junctions_gr[i]),
+ ranges = IRanges(start, end),
+ strand = strand(junctions_gr[i])
+ )
+
+ # overlap with introns and exon
+ # IGNORE STRANDS? -> decided its not necessary
+
+ # check if distance to nearest is > 1000 -> intergenic
+ # otherwise up/downstream
+ dist = min(distance(test_junction, refseq.genes), na.rm = TRUE)
+ if(dist > 0){
+ # find nearest and compare starts
+ if(start(refseq.genes[nearest(junctions_gr[i],
+ refseq.genes)]) > start){
+ ifelse(strand(junctions_gr[i]) == "+",
+ return(c("upstreamOfNearestGene", "unlikely")),
+ return(c("downstreamOfNearestGene", "unlikely")))
+ }else{
+ ifelse(strand(junctions_gr[i]) == "+",
+ return(c("downstreamOfNearestGene", "unlikely")),
+ return(c("upstreamOfNearestGene", "unlikely")))
+ }
+ }
+ return(c("complex", "inconclusive"))
+}
+
+#' Refine exonSkipping annotations into splicingBeyondGene or multigenicSplicing
+#' @noRd
+checkExonSkipping <- function(junctions_dt, txdb){
+ psi_positions <- which(junctions_dt$type != "theta")
+ colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
+ junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions],
+ keep.extra.columns = TRUE)
+ seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr)
+
+ refseq.genes<- genes(txdb)
+
+ exonSkip <- which(junctions_dt[psi_positions]$potentialImpact %in%
+ c("exonSkipping", "singleExonSkipping"))
+
+ message("start checking exonSkipping")
+ newSkip_results <- sapply(exonSkip, function(i){
+ start = start(junctions_gr[i])
+ end = end(junctions_gr[i])
+
+ # reduce the junction width so adjacent genes have a distance of 1
+ if(start + 2 < end){
+ start = start + 1
+ end = end - 1
+ }
+
+ test_start <- GRanges(
+ seqnames = seqnames(junctions_gr[i]),
+ strand = strand(junctions_gr[i]),
+ ranges = IRanges(start, start + 1)
+ )
+
+ test_end <- GRanges(
+ seqnames = seqnames(junctions_gr[i]),
+ strand = strand(junctions_gr[i]),
+ ranges = IRanges(end - 1, end)
+ )
+
+ # check for which genes distance to start is 0
+ start_genes <- which(distance(test_start, refseq.genes) == 0)
+ # start is not in a gene
+ if(length(start_genes) == 0) return("splicingBeyondGene")
+
+ # start is in a gene -> is end in same gene
+ for(to in start_genes){
+ # end is in same gene
+ if(distance(test_end, refseq.genes[to]) == 0){
+ return("exonSkipping")
+ }
+ }
+
+ end_genes <- which(distance(test_end, refseq.genes) == 0)
+ # end is not in a gene
+ if(length(end_genes) == 0) return("splicingBeyondGene")
+ # end is in a different gene
+ return("multigenicSplicing")
+ })
+
+ # checking exonSkipping done
+ if(length(exonSkip) > 0){
+ junctions_dt[psi_positions[exonSkip],
+ potentialImpact2 := newSkip_results]
+ junctions_dt[potentialImpact2 == "splicingBeyondGene",
+ potentialImpact := "splicingBeyondGene"]
+ junctions_dt[potentialImpact2 == "splicingBeyondGene",
+ causesFrameshift := "inconclusive"]
+ junctions_dt[potentialImpact2 == "multigenicSplicing",
+ potentialImpact := "multigenicSplicing"]
+ junctions_dt[potentialImpact2 == "multigenicSplicing",
+ causesFrameshift := "inconclusive"]
+ junctions_dt[, potentialImpact2 := NULL]
+ }
+
+ colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
+ return(junctions_dt)
+}
+
+#' Refine complex annotations into splicingBeyondGene or multigenicSplicing
+#' @noRd
+checkInconclusive <- function(junctions_dt, txdb){
+ psi_positions <- which(junctions_dt$type != "theta")
+ colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
+ junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions],
+ keep.extra.columns = TRUE)
+ seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr)
+
+ refseq.genes<- genes(txdb)
+
+ inconclusive <- which(junctions_dt[psi_positions
+ ]$potentialImpact == "complex")
+
+ inconclusive_results <- sapply(inconclusive, function(i){
+ start = start(junctions_gr[i])
+ end = end(junctions_gr[i])
+
+ # reduce the junction width so adjacent genes have a distance of 1
+ if(start + 2 < end){
+ start = start + 1
+ end = end - 1
+ }
+
+ test_start <- GRanges(
+ seqnames = seqnames(junctions_gr[i]),
+ strand = strand(junctions_gr[i]),
+ ranges = IRanges(start, start + 1)
+ )
+
+ test_end <- GRanges(
+ seqnames = seqnames(junctions_gr[i]),
+ strand = strand(junctions_gr[i]),
+ ranges = IRanges(end - 1, end)
+ )
+
+ # check for which genes distance to start is 0
+ start_genes <- which(distance(test_start, refseq.genes) == 0)
+ # start is not in a gene
+ if(length(start_genes) == 0) return("splicingBeyondGene")
+
+ # start is in a gene -> is end in same gene
+ for(to in start_genes){
+ # end is in same gene
+ if(distance(test_end, refseq.genes[to]) == 0){
+ return("complex")
+ }
+ }
+
+ end_genes <- which(distance(test_end, refseq.genes) == 0)
+ # end is not in a gene
+ if(length(end_genes) == 0) return("splicingBeyondGene")
+ # end is in a different gene
+ return("multigenicSplicing")
+ })
+
+ colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
+
+ if(length(inconclusive) > 0){
+ junctions_dt[psi_positions[inconclusive],
+ potentialImpact := inconclusive_results]
+ }
+
+ return(junctions_dt)
+}
diff --git a/R/updateRho.R b/R/updateRho.R
index f5a0d429..628b21fd 100644
--- a/R/updateRho.R
+++ b/R/updateRho.R
@@ -8,11 +8,15 @@ updateRho <- function(fds, type, rhoRange, BPPARAM, verbose){
n <- N(fds)
y <- predictY(fds, noiseAlpha=currentNoiseAlpha(fds))
- fitparameters <- bplapply(seq_len(nrow(k)), estRho, nll=truncNLL_rho,
- k=k, n=n, y=y, rhoRange=rhoRange, BPPARAM=BPPARAM)
+ # fitparameters <- bplapply(seq_len(nrow(k)), estRho, nll=truncNLL_rho,
+ # k=k, n=n, y=y, rhoRange=rhoRange, BPPARAM=BPPARAM)
+ fitparameters <- bplapply(seq_len(nrow(k)), estRho,
+ nll=fullNLLRho_penalized,
+ k=k, n=n, y=y, rhoRange=rhoRange, lambda=1e-4,
+ BPPARAM=BPPARAM)
- rho(fds) <- vapply(fitparameters, "[[",
- double(1), "minimum")
+ rho(fds) <- plogis(vapply(fitparameters, "[[",
+ double(1), "minimum"))
if(isTRUE(verbose)){
stxt <- capture.output(summary(rho(fds)))
@@ -23,16 +27,27 @@ updateRho <- function(fds, type, rhoRange, BPPARAM, verbose){
return(fds)
}
-estRho <- function(idx, k, n, y, rhoRange, nll, control=list()){
+estRho <- function(idx, k, n, y, rhoRange, nll, control=list(), lambda=1e-4){
ki <- k[idx,]
ni <- n[idx,]
yi <- y[idx,]
- est <- optimize(f=nll, interval=rhoRange, yi=yi, ki=ki, ni=ni,
+ # est <- optimize(f=nll, interval=rhoRange, yi=yi, ki=ki, ni=ni,
+ # maximum=FALSE, tol=0.0000001)
+ # est
+ est <- optimize(f=nll, interval=rhoRange,
+ mui=plogis(yi), ki=ki, ni=ni, lambda=lambda,
maximum=FALSE, tol=0.0000001)
est
}
+fullNLLRho_penalized <- function(logit_rho, ki, ni, mui, lambda=1e-4){
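+    # mean negative beta-binomial log likelihood with rho optimized on the
+    # logit scale; the L2 penalty (lambda * logit_rho^2) keeps the estimate
+    # away from the 0/1 boundaries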
+ rho <- plogis(logit_rho)
+ nll <- -mean(dbetabinom(ki, ni, mui, rho, log=TRUE))
+ nll <- nll + lambda * (logit_rho^2)
+ return(nll)
+}
+
negLogLikelihoodRho <- function(rho, ki, ni, mui){
#-mean(dbetabinom(ki + 0.5, ni + 1, mu, rho, log=TRUE))
@@ -63,6 +78,20 @@ trunc_negLogLikelihoodRho <- function(rho, ki, ni, mui){
mean(alpha + beta - alphaK - betaNK )
}
+trunc_negLogLikelihoodRho_penalized <- function(logit_rho, ki, ni, mui, lambda){
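+    # truncated negative log likelihood as in trunc_negLogLikelihoodRho above,
+    # but with rho passed on the logit scale and an L2 penalty
+    # (lambda * logit_rho^2) added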
+ #-mean(dbetabinom(ki, ni, mui, rho, log=TRUE))
+
+ rho <- plogis(logit_rho)
+ r <- (1-rho)/rho
+ alpha <- lgamma(mui*r)
+ alphaK <- lgamma(mui*r + ki)
+ beta <- lgamma((mui-1)*(-r))
+ betaNK <- lgamma((mui-1)*(-r) + (ni - ki))
+
+ #mean negative log likelihood with pseudocounts
+ mean(alpha + beta - alphaK - betaNK ) + lambda * (logit_rho*logit_rho)
+}
+
methodOfMomentsRho <- function(k, n, rhoRange=c(1e-5, 1 - 1e-5)){
# taken from wiki:
diff --git a/R/variables.R b/R/variables.R
index ccba65f5..d097201c 100644
--- a/R/variables.R
+++ b/R/variables.R
@@ -1,11 +1,11 @@
#'
-#' Available psi types
+#' Available splice metrics
#'
#' @examples
-#' # to show available psi types:
+#' # to show all available splice metrics:
#' psiTypes
#'
+#' @rdname psiTypes
#' @export
-psiTypes <- c("psi5", "psi3", "theta")
-names(psiTypes) <- psiTypes
-
+psiTypes <- c("jaccard", "psi5", "psi3", "theta")
+names(psiTypes) <- c("Intron Jaccard Index", "psi5", "psi3", "theta")
diff --git a/R/zzz.R b/R/zzz.R
index 8f087a08..7f0b67d5 100644
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -8,7 +8,7 @@
op.fraser <- list(
`FRASER-hdf5-chunk-nrow` = 30000,
`FRASER-hdf5-chunk-ncol` = 20,
- `FRASER.pseudoCount` = 1,
+ `FRASER.pseudoCount` = 0.1,
`FRASER.minSamplesForDelayed` = 1000,
`FRASER.maxSamplesNoHDF5` = 20,
`FRASER.maxJunctionsNoHDF5` = 1000)
diff --git a/README.md b/README.md
index 8e9443aa..ef17f68a 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,21 @@ Please cite our method paper if you use it in a publication:
> Mertes, C., Scheller, I.F., Yépez, V.A. *et al.* Detection of aberrant splicing events in RNA-seq data using FRASER. *Nat Commun* **12**, 529 (2021). https://doi.org/10.1038/s41467-020-20573-7
+## What's new
+
+FRASER 2.0, an improved version of FRASER, is now available and used by default (version 1.99.0 and above).
+FRASER 2.0 uses the Intron Jaccard Index as its splice metric instead of the
+three previous metrics, together with further optimizations of the pseudocount,
+the filtering settings and the default delta cutoff.
+
+To change the splice metric, set `fitMetrics(fds)` to one or more of the metrics
+specified in `FRASER::psiTypes`. For FRASER 2.0 and the Intron Jaccard Index, the
+new default delta cutoff is 0.1 instead of the previous value of 0.3. When using
+the three previous metrics, the delta cutoff should be set manually to 0.3
+during results extraction, e.g. `results(fds, deltaPsiCutoff=0.3,...)`.
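+
+A minimal sketch of switching back to the previous metrics and extracting
+results with the old delta cutoff (assuming an already counted and filtered
+`fds` object and an encoding dimension of `q=2`):
+
+```r
+fitMetrics(fds) <- c("psi5", "psi3", "theta")
+fds <- FRASER(fds, q=2)
+res <- results(fds, deltaPsiCutoff=0.3)
+```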
+
+The manuscript describing these changes in more detail will be available soon.
+
## Installation
`FRASER` is an R/Bioconductor software package requiring a running
diff --git a/inst/extdata/blacklist_regions/hg19-blacklist.v2.bed.gz b/inst/extdata/blacklist_regions/hg19-blacklist.v2.bed.gz
new file mode 100644
index 00000000..5d87eb13
Binary files /dev/null and b/inst/extdata/blacklist_regions/hg19-blacklist.v2.bed.gz differ
diff --git a/inst/extdata/blacklist_regions/hg38-blacklist.v2.bed.gz b/inst/extdata/blacklist_regions/hg38-blacklist.v2.bed.gz
new file mode 100644
index 00000000..a4ec8581
Binary files /dev/null and b/inst/extdata/blacklist_regions/hg38-blacklist.v2.bed.gz differ
diff --git a/man/FRASER.Rd b/man/FRASER.Rd
index 22f3fff5..c4848052 100644
--- a/man/FRASER.Rd
+++ b/man/FRASER.Rd
@@ -5,11 +5,13 @@
\alias{calculateZscore}
\alias{calculatePvalues}
\alias{calculatePadjValues}
+\alias{calculatePadjValuesOnSubset}
\title{FRASER: Find RAre Splicing Events in RNA-seq data}
\usage{
FRASER(
fds,
q,
+ type = fitMetrics(fds),
implementation = c("PCA", "PCA-BB-Decoder", "AE-weighted", "AE", "BB"),
iterations = 15,
BPPARAM = bpparam(),
@@ -28,7 +30,26 @@ calculatePvalues(
capN = 5 * 1e+05
)
-calculatePadjValues(fds, type = currentType(fds), method = "BY")
+calculatePadjValues(
+ fds,
+ type = currentType(fds),
+ method = "BY",
+ rhoCutoff = NA,
+ geneLevel = TRUE,
+ geneColumn = "hgnc_symbol",
+ subsets = NULL,
+ BPPARAM = bpparam()
+)
+
+calculatePadjValuesOnSubset(
+ fds,
+ genesToTest,
+ subsetName,
+ type = currentType(fds),
+ method = "BY",
+ geneColumn = "hgnc_symbol",
+ BPPARAM = bpparam()
+)
}
\arguments{
\item{fds}{A \code{\link{FraserDataSet}} object}
@@ -38,6 +59,9 @@ Should be fitted using \code{\link{optimHyperParams}} if unknown.
If a named vector is provided it is used for the different
splicing types.}
+\item{type}{The type of PSI (jaccard, psi5, psi3 or theta for theta/splicing
+efficiency)}
+
\item{implementation}{The method that should be used to correct for
confounders.}
@@ -50,9 +74,6 @@ not yet converged after these number of iterations, the fit stops anyway.}
\item{...}{Additional parameters passed on to the internal fit function}
-\item{type}{The type of PSI (psi5, psi3 or theta for theta/splicing
-efficiency)}
-
\item{logit}{Indicates if z scores are computed on the logit scale (default)
or in the natural (psi) scale.}
@@ -62,7 +83,31 @@ calculated. Possible are beta-binomial, binomial and normal.}
\item{capN}{Counts are capped at this value to speed up the p-value
calculation}
-\item{method}{The p.adjust method that should be used.}
+\item{method}{The p.adjust method that should be used for genome-wide
+multiple testing correction.}
+
+\item{rhoCutoff}{The cutoff value on the fitted rho value
+(overdispersion parameter of the betabinomial) above which junctions are
+masked with NA during p value adjustment (default: NA, no masking).}
+
+\item{geneLevel}{Logical value indicating whether gene-level p values
+should be calculated. Defaults to TRUE.}
+
+\item{geneColumn}{The column name of the column that has the gene annotation
+that will be used for gene-level pvalue computation.}
+
+\item{subsets}{A named list of named lists specifying any number of gene
+subsets (can differ per sample). For each subset, FDR correction
+will be limited to genes in the subset, and the FDR corrected
+pvalues stored as an assay in the fds object in addition to the
+transcriptome-wide FDR corrected pvalues. See the examples for
+how to use this argument.}
+
+\item{genesToTest}{A named list with the subset of genes to test per sample.
+The names must correspond to the sampleIDs in the given fds object.}
+
+\item{subsetName}{The name under which the resulting FDR corrected pvalues
+will be stored in metadata(fds).}
}
\value{
FraserDataSet
@@ -104,11 +149,21 @@ psi.
\item \code{calculatePvalues}: This function calculates two-sided p-values based on
the beta-binomial distribution (or binomial or normal if desired). The
-returned p values are already adjusted with Holm's method per donor or
+returned p values are not yet adjusted with Holm's method per donor or
acceptor site, respectively.
\item \code{calculatePadjValues}: This function adjusts the previously calculated
-p-values per sample for multiple testing.
+p-values per sample for multiple testing. First, the previously calculated
+junction-level p values are adjusted with Holm's method per donor or
+acceptor site, respectively. Then, if gene symbols have been annotated to
+junctions (and not otherwise requested), gene-level p values are computed.
+
+\item \code{calculatePadjValuesOnSubset}: This function does FDR correction only for the junctions
+in a certain subset of genes which can differ per sample. Requires gene
+symbols to have been annotated to junctions. As with the full FDR
+correction across all junctions, first the previously calculated
+junction-level p values are adjusted with Holm's method per donor or
+acceptor site, respectively. Then, gene-level p values are computed.
}}
\examples{
@@ -132,16 +187,32 @@ fds <- FRASER(fds, q=2, implementation="PCA")
# The functions run inside the FRASER function can also be directly
# run themselves.
# To directly run the fit function:
-fds <- fit(fds, implementation="PCA", q=2, type="psi5")
+fds <- fit(fds, implementation="PCA", q=2, type="jaccard")
# To directly run the nomial and adjusted p value and z score
# calculation, the following functions can be used:
-fds <- calculatePvalues(fds, type="psi5")
-head(pVals(fds, type="psi5"))
-fds <- calculatePadjValues(fds, type="psi5", method="BY")
-head(padjVals(fds, type="psi5"))
-fds <- calculateZscore(fds, type="psi5")
-head(zScores(fds, type="psi5"))
+fds <- calculatePvalues(fds, type="jaccard")
+head(pVals(fds, type="jaccard"))
+fds <- calculatePadjValues(fds, type="jaccard", method="BY")
+head(padjVals(fds, type="jaccard"))
+fds <- calculateZscore(fds, type="jaccard")
+head(zScores(fds, type="jaccard"))
+
+# example of restricting FDR correction to subsets of genes of interest
+genesOfInterest <- list("sample1"=c("TIMMDC1"), "sample2"=c("MCOLN1"))
+fds <- calculatePadjValues(fds, type="jaccard",
+ subsets=list("exampleSubset"=genesOfInterest))
+padjVals(fds, type="jaccard", subsetName="exampleSubset")
+padjVals(fds, type="jaccard", level="gene", subsetName="exampleSubset")
+fds <- calculatePadjValues(fds, type="jaccard",
+ subsets=list("anotherExampleSubset"=c("TIMMDC1")))
+padjVals(fds, type="jaccard", subsetName="anotherExampleSubset")
+
+# only adding FDR corrected pvalues on a subset without calculating
+# transcriptome-wide FDR again:
+fds <- calculatePadjValuesOnSubset(fds, genesToTest=genesOfInterest,
+ subsetName="setOfInterest", type="jaccard")
+padjVals(fds, type="jaccard", subsetName="setOfInterest")
}
\seealso{
diff --git a/man/annotateRanges.Rd b/man/annotateRanges.Rd
index c82a3211..e74728b2 100644
--- a/man/annotateRanges.Rd
+++ b/man/annotateRanges.Rd
@@ -20,7 +20,8 @@ annotateRangesWithTxDb(
featureName = "hgnc_symbol",
keytype = "ENTREZID",
txdb = NULL,
- orgDb = NULL
+ orgDb = NULL,
+ filter = list()
)
}
\arguments{
@@ -52,6 +53,11 @@ one is used, currently this is
\item{orgDb}{An \code{orgDb} object or a data table to map the feature names.
If this is NULL, then \code{org.Hs.eg.db} is used as the default.}
+
+\item{filter}{A named list specifying the filters which should be applied to
+subset to e.g. only protein-coding genes for annotation.
+\code{names(filter)} needs to be column names in the given
+orgDb object (default: no filtering).}
}
\value{
FraserDataSet
@@ -67,13 +73,13 @@ fds <- createTestFraserDataSet()
# either using biomart with GRCh38
try({
fds <- annotateRanges(fds, GRCh=38)
- rowRanges(fds, type="psi5")[,c("hgnc_symbol")]
+ rowRanges(fds, type="j")[,c("hgnc_symbol")]
})
# either using biomart with GRCh37
try({
fds <- annotateRanges(fds, featureName="hgnc_symbol_37", GRCh=37)
- rowRanges(fds, type="psi5")[,c("hgnc_symbol_37")]
+ rowRanges(fds, type="j")[,c("hgnc_symbol_37")]
})
# or with a provided TxDb object
@@ -82,6 +88,6 @@ txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
require(org.Hs.eg.db)
orgDb <- org.Hs.eg.db
fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb)
-rowRanges(fds, type="psi5")[,"hgnc_symbol"]
+rowRanges(fds, type="j")[,"hgnc_symbol"]
}
diff --git a/man/calculatePSIValues.Rd b/man/calculatePSIValues.Rd
index 0128221d..9e0c14fa 100644
--- a/man/calculatePSIValues.Rd
+++ b/man/calculatePSIValues.Rd
@@ -15,7 +15,7 @@ calculatePSIValues(
\item{fds}{A \code{\link{FraserDataSet}} object}
\item{types}{A vector with the psi types which should be calculated. Default
-is all of psi5, psi3 and theta.}
+is all of jaccard, psi5, psi3 and theta.}
\item{overwriteCts}{FALSE or TRUE (the default) the total counts (aka N) will
be recalculated based on the existing junction counts (aka K)}
@@ -31,7 +31,7 @@ based on the FraserDataSet object
}
\examples{
fds <- createTestFraserDataSet()
- fds <- calculatePSIValues(fds, types="psi5")
+ fds <- calculatePSIValues(fds, types="jaccard")
### usually one would run this function for all psi types by using:
# fds <- calculatePSIValues(fds)
diff --git a/man/counts.Rd b/man/counts.Rd
index 1ddac4d5..79beb47f 100644
--- a/man/counts.Rd
+++ b/man/counts.Rd
@@ -11,9 +11,14 @@ K(fds, type = currentType(fds))
N(fds, type = currentType(fds))
-\S4method{counts}{FraserDataSet}(object, type = NULL, side = c("ofInterest", "otherSide"))
-
-\S4method{counts}{FraserDataSet,ANY}(object, type = NULL, side = c("ofInterest", "otherSide"), ...) <- value
+\S4method{counts}{FraserDataSet}(object, type = currentType(object), side = c("ofInterest", "otherSide"))
+
+\S4method{counts}{FraserDataSet,ANY}(
+ object,
+ type = currentType(object),
+ side = c("ofInterest", "otherSide"),
+ ...
+) <- value
}
\arguments{
\item{fds, object}{FraserDataSet}
@@ -39,8 +44,10 @@ setter for count data
\examples{
fds <- createTestFraserDataSet()
- counts(fds, type="psi5", side="ofInterest")
- counts(fds, type="psi5", side="other")
+ counts(fds, side="ofInterest")
+ counts(fds, type="jaccard", side="other")
+ head(K(fds))
+ head(K(fds, type="psi5"))
head(K(fds, type="psi3"))
head(N(fds, type="theta"))
diff --git a/man/fds-methods.Rd b/man/fds-methods.Rd
index 3e11e4e4..d6caf3fd 100644
--- a/man/fds-methods.Rd
+++ b/man/fds-methods.Rd
@@ -41,17 +41,7 @@
\alias{FRASER.mcols.get}
\alias{FRASER.rowRanges.get}
\alias{mapSeqlevels}
-\title{Getter/Setter methods for the FraserDataSet
-
-The following methods are getter and setter methods to extract or set
-certain values of a FraserDataSet object.
-
-\code{samples} sets or gets the sample IDs; \code{condition} ;
-\code{}
-\code{nonSplicedReads} return a RangedSummarizedExperiment object
-containing the counts for the non spliced reads overlapping splice
-sites in the fds.
-\code{}}
+\title{Getter/Setter methods for the FraserDataSet}
\usage{
samples(object)
@@ -151,10 +141,8 @@ passed to GenomeInfoDb::mapSeqlevels().}
Getter method return the respective current value.
}
\description{
-Getter/Setter methods for the FraserDataSet
-
-The following methods are getter and setter methods to extract or set
-certain values of a FraserDataSet object.
+The following methods are getter and setter methods to extract
+or set certain values of a FraserDataSet object.
\code{samples} sets or gets the sample IDs; \code{condition} ;
\code{}
@@ -162,8 +150,6 @@ certain values of a FraserDataSet object.
containing the counts for the non spliced reads overlapping splice
sites in the fds.
\code{}
-
-Mapping of chromosome names
}
\examples{
fds <- createTestFraserDataSet()
diff --git a/man/filtering.Rd b/man/filtering.Rd
index b40ad786..b99936dc 100644
--- a/man/filtering.Rd
+++ b/man/filtering.Rd
@@ -1,44 +1,52 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/filterExpression.R
-\name{filtering}
+% Please edit documentation in R/AllGenerics-definitions.R, R/filterExpression.R
+\name{filterVariability}
+\alias{filterVariability}
\alias{filtering}
\alias{filterExpressionAndVariability}
\alias{filterExpression,FraserDataSet-method}
-\alias{filterVariability}
+\alias{filterVariability,FraserDataSet-method}
\title{Filtering FraserDataSets}
\usage{
+filterVariability(object, ...)
+
filterExpressionAndVariability(
object,
minExpressionInOneSample = 20,
- quantile = 0.95,
+ quantile = 0.75,
quantileMinExpression = 10,
- minDeltaPsi = 0.05,
+ minDeltaPsi = 0,
filter = TRUE,
delayed = ifelse(ncol(object) <= 300, FALSE, TRUE),
+ filterOnJaccard = TRUE,
BPPARAM = bpparam()
)
\S4method{filterExpression}{FraserDataSet}(
object,
minExpressionInOneSample = 20,
- quantile = 0.95,
+ quantile = 0.75,
quantileMinExpression = 10,
filter = TRUE,
delayed = ifelse(ncol(object) <= 300, FALSE, TRUE),
+ filterOnJaccard = TRUE,
BPPARAM = bpparam()
)
-filterVariability(
+\S4method{filterVariability}{FraserDataSet}(
object,
- minDeltaPsi = 0.05,
+ minDeltaPsi = 0,
filter = TRUE,
delayed = ifelse(ncol(object) <= 300, FALSE, TRUE),
+ filterOnJaccard = TRUE,
BPPARAM = bpparam()
)
}
\arguments{
\item{object}{A \code{\link{FraserDataSet}} object}
+\item{...}{Further parameters passed on to Rsubread::featureCounts.}
+
\item{minExpressionInOneSample}{The minimal read count in at least one
sample that is required for an intron to pass the filter.}
@@ -60,6 +68,10 @@ mcols.}
otherwise the function works on the delayedMatrix representations. The
default value depends on the number of samples in the fds-object.}
+\item{filterOnJaccard}{If TRUE, the Intron Jaccard Index is used to define
+expressed introns during filtering. Otherwise, the psi5, psi3 and theta
+metrics are used (default: TRUE).}
+
\item{BPPARAM}{the BiocParallel parameters for the parallelization}
}
\value{
@@ -78,15 +90,15 @@ read support and introns that are not variable across samples.
\item \code{filterExpression,FraserDataSet-method}: This function filters out introns and corresponding
splice sites that have low read support in all samples.
-\item \code{filterVariability}: This function filters out introns and corresponding
-splice sites which do not show variablity across samples.
+\item \code{filterVariability,FraserDataSet-method}: This function filters out introns and corresponding
+splice sites which do not show variability across samples.
}}
\examples{
fds <- createTestFraserDataSet()
fds <- filterExpressionAndVariability(fds, minDeltaPsi=0.1, filter=FALSE)
-mcols(fds, type="psi5")[, c(
- "maxCount", "passedExpression", "maxDPsi3", "passedVariability")]
+mcols(fds, type="jaccard")[, c(
+ "maxCount", "passedExpression", "maxDJaccard", "passedVariability")]
plotFilterExpression(fds)
plotFilterVariability(fds)
diff --git a/man/fit.Rd b/man/fit.Rd
index eb68296c..dcbee2dc 100644
--- a/man/fit.Rd
+++ b/man/fit.Rd
@@ -10,8 +10,8 @@
implementation = c("PCA", "PCA-BB-Decoder", "AE", "AE-weighted", "PCA-BB-full",
"fullAE", "PCA-regression", "PCA-reg-full", "PCA-BB-Decoder-no-weights", "BB"),
q,
- type = "psi3",
- rhoRange = c(1e-08, 1 - 1e-08),
+ type = psiTypes,
+ rhoRange = c(-30, 30),
weighted = FALSE,
noiseAlpha = 1,
convergence = 1e-05,
@@ -35,7 +35,7 @@ Should be fitted using \code{\link{optimHyperParams}} if unknown.
If a named vector is provided it is used for the different
splicing types.}
-\item{type}{The type of PSI (psi5, psi3 or theta for theta/splicing
+\item{type}{The type of PSI (jaccard, psi5, psi3 or theta for theta/splicing
efficiency)}
\item{rhoRange}{Defines the range of values that rho parameter from the
diff --git a/man/getter_setter_functions.Rd b/man/getter_setter_functions.Rd
index bf56e32c..9b10f832 100644
--- a/man/getter_setter_functions.Rd
+++ b/man/getter_setter_functions.Rd
@@ -15,10 +15,13 @@
\alias{zScores}
\alias{pVals}
\alias{padjVals}
+\alias{availableFDRsubsets}
\alias{predictedMeans}
\alias{deltaPsiValue}
\alias{currentType}
\alias{currentType<-}
+\alias{fitMetrics}
+\alias{fitMetrics<-}
\alias{pseudocount}
\alias{hyperParams}
\alias{dontWriteHDF5}
@@ -35,9 +38,26 @@ rho(fds, type = currentType(fds))
zScores(fds, type = currentType(fds), byGroup = FALSE, ...)
-pVals(fds, type = currentType(fds), level = "site", dist = "BetaBinomial", ...)
-
-padjVals(fds, type = currentType(fds), dist = c("BetaBinomial"), ...)
+pVals(
+ fds,
+ type = currentType(fds),
+ level = "site",
+ filters = list(),
+ dist = "BetaBinomial",
+ ...
+)
+
+padjVals(
+ fds,
+ type = currentType(fds),
+ dist = c("BetaBinomial"),
+ level = "site",
+ subsetName = NULL,
+ filters = list(),
+ ...
+)
+
+availableFDRsubsets(fds)
predictedMeans(fds, type = currentType(fds))
@@ -47,6 +67,10 @@ currentType(fds)
currentType(fds) <- value
+fitMetrics(fds)
+
+fitMetrics(fds) <- value
+
pseudocount(value = NULL)
hyperParams(fds, type = currentType(fds), all = FALSE)
@@ -70,12 +94,16 @@ verbose(fds) <- value
\item{byGroup}{If TRUE, aggregation by donor/acceptor site will be done.}
-\item{...}{Internally used parameteres.}
+\item{...}{Internally used parameters.}
\item{level}{Indicates if the retrieved p values should be adjusted on the
donor/acceptor site-level (default) or if unadjusted junction-level
p values should be returned.}
+\item{filters}{A named list giving the filters that were applied for masking
+during p value correction. Used for storing and retrieving the
+correct set of requested p values.}
+
\item{dist}{Distribution for which the p-values should be extracted.}
\item{all}{Logical value indicating whether \code{hyperParams(fds)} should
@@ -110,6 +138,9 @@ beta-binomial distribution
\item \code{padjVals}: This returns the adjusted p-values.
+\item \code{availableFDRsubsets}: This returns the names of FDR subsets
+for which adjusted p values have been calculated.
+
\item \code{predictedMeans}: This returns the fitted mu (i.e. psi)
values.
@@ -117,11 +148,18 @@ values.
observed and the fitted psi values.
\item \code{currentType}: Returns the psi type that is used
-within several methods in the FRASER package.
+within several methods in the FRASER package (defaults to jaccard).
\item \code{currentType<-}: Sets the psi type that is to be used
within several methods in the FRASER package.
+\item \code{fitMetrics}: Returns the splice metrics that will be
+fitted (defaults to jaccard, used within several methods in the
+FRASER package).
+
+\item \code{fitMetrics<-}: Sets the splice metrics that will be
+fitted (used within several methods in the FRASER package).
+
\item \code{pseudocount}: Sets and returns the pseudo count used
within the FRASER fitting procedure.
@@ -140,7 +178,7 @@ assays should be stored as hdf5 files.
\item \code{dontWriteHDF5<-}: Sets whether the assays should be stored
as hdf5 files.
-\item \code{verbose}: Dependend on the level of verbosity
+\item \code{verbose}: Dependent on the level of verbosity
the algorithm reports more or less to the user. 0 means being quiet
and 10 means everything.
@@ -156,7 +194,7 @@ dontWriteHDF5(fds)
dontWriteHDF5 <- TRUE
# get/set the splice metric for which results should be retrieved
-currentType(fds) <- "psi5"
+currentType(fds) <- "jaccard"
currentType(fds)
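+
+# get/set the splice metrics for which the model will be fitted
+# (defaults to jaccard; one or more of the metrics in psiTypes can be used)
+fitMetrics(fds) <- "jaccard"
+fitMetrics(fds)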
# get fitted parameters
@@ -167,6 +205,9 @@ rho(fds)
# get statistics
pVals(fds)
padjVals(fds)
+
+# zscore not calculated by default
+fds <- calculateZscore(fds, type="jaccard")
zScores(fds)
# set and get pseudocount
@@ -174,9 +215,9 @@ pseudocount(4L)
pseudocount()
# retrieve or set a mask to exclude certain junctions in the fitting step
-featureExclusionMask(fds, type="theta") <- sample(
- c(FALSE, TRUE), nrow(mcols(fds, type="theta")), replace=TRUE)
-featureExclusionMask(fds, type="theta")
+featureExclusionMask(fds, type="jaccard") <- sample(
+ c(FALSE, TRUE), nrow(mcols(fds, type="jaccard")), replace=TRUE)
+featureExclusionMask(fds, type="jaccard")
# controlling the verbosity level of the output of some algorithms
verbose(fds) <- 2
diff --git a/man/injectOutliers.Rd b/man/injectOutliers.Rd
index 1f85338e..5132c6e4 100644
--- a/man/injectOutliers.Rd
+++ b/man/injectOutliers.Rd
@@ -6,7 +6,7 @@
\usage{
injectOutliers(
fds,
- type = c("psi5", "psi3", "theta"),
+ type = psiTypes,
freq = 0.001,
minDpsi = 0.2,
minCoverage = 2,
@@ -50,5 +50,6 @@ Inject artificial outliers in an existing fds
\examples{
# A generic dataset
fds <- makeSimulatedFraserDataSet()
+fds <- calculatePSIValues(fds)
fds <- injectOutliers(fds, minDpsi=0.2, freq=1E-3)
}
diff --git a/man/optimHyperParams.Rd b/man/optimHyperParams.Rd
index f3d10622..e1333b4a 100644
--- a/man/optimHyperParams.Rd
+++ b/man/optimHyperParams.Rd
@@ -6,9 +6,9 @@
\usage{
optimHyperParams(
fds,
- type,
+ type = psiTypes,
implementation = "PCA",
- q_param = seq(2, min(40, ncol(fds)), by = 3),
+ q_param = getEncDimRange(fds),
noise_param = 0,
minDeltaPsi = 0.1,
iterations = 5,
@@ -24,7 +24,7 @@ optimHyperParams(
\arguments{
\item{fds}{A \code{\link{FraserDataSet}} object}
-\item{type}{The type of PSI (psi5, psi3 or theta for theta/splicing
+\item{type}{The type of PSI (jaccard, psi5, psi3 or theta for theta/splicing
efficiency)}
\item{implementation}{The method that should be used to correct for
@@ -71,13 +71,14 @@ ratios while maximizing the precision-recall curve.
\examples{
# generate data
fds <- makeSimulatedFraserDataSet(m=15, j=20)
+ fds <- calculatePSIValues(fds)
# run hyperparameter optimization
- fds <- optimHyperParams(fds, type="psi5", q_param=c(2, 5))
+ fds <- optimHyperParams(fds, type="jaccard", q_param=c(2, 5))
# get estimated optimal dimension of the latent space
- bestQ(fds, type="psi5")
- hyperParams(fds, type="psi5")
+ bestQ(fds, type="jaccard")
+ hyperParams(fds, type="jaccard")
}
\seealso{
diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd
index cbeddee6..db8025a0 100644
--- a/man/plotFunctions.Rd
+++ b/man/plotFunctions.Rd
@@ -1,6 +1,7 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/plotMethods.R
-\name{plotFunctions}
+% Please edit documentation in R/AllGenerics-definitions.R, R/plotMethods.R
+\name{plotManhattan}
+\alias{plotManhattan}
\alias{plotFunctions}
\alias{plotAberrantPerSample}
\alias{plotVolcano}
@@ -10,23 +11,29 @@
\alias{plotFilterExpression}
\alias{plotExpectedVsObservedPsi}
\alias{plotEncDimSearch}
+\alias{plotBamCoverage}
+\alias{plotBamCoverageFromResultTable}
\alias{plotVolcano,FraserDataSet-method}
\alias{plotAberrantPerSample,FraserDataSet-method}
+\alias{plotSpliceMetricRank}
\alias{plotQQ,FraserDataSet-method}
\alias{plotEncDimSearch,FraserDataSet-method}
\alias{plotFilterVariability}
\alias{plotCountCorHeatmap,FraserDataSet-method}
+\alias{plotManhattan,FraserDataSet-method}
\title{Visualization functions for FRASER}
\usage{
+plotManhattan(object, ...)
+
\S4method{plotVolcano}{FraserDataSet}(
object,
sampleID,
- type = c("psi3", "psi5", "theta"),
+ type = fitMetrics(object),
basePlot = TRUE,
aggregate = FALSE,
main = NULL,
label = NULL,
- deltaPsiCutoff = 0.3,
+ deltaPsiCutoff = 0.1,
padjCutoff = 0.1,
...
)
@@ -34,10 +41,9 @@
\S4method{plotAberrantPerSample}{FraserDataSet}(
object,
main,
- type = c("psi3", "psi5", "theta"),
+ type = fitMetrics(object),
padjCutoff = 0.1,
- zScoreCutoff = NA,
- deltaPsiCutoff = 0.3,
+ deltaPsiCutoff = 0.1,
aggregate = TRUE,
BPPARAM = bpparam(),
...
@@ -45,8 +51,20 @@
plotExpression(
fds,
- type = c("psi5", "psi3", "theta"),
- site = NULL,
+ type = fitMetrics(fds),
+ idx = NULL,
+ result = NULL,
+ colGroup = NULL,
+ basePlot = TRUE,
+ main = NULL,
+ label = "aberrant",
+ ...
+)
+
+plotSpliceMetricRank(
+ fds,
+ type = fitMetrics(fds),
+ idx = NULL,
result = NULL,
colGroup = NULL,
basePlot = TRUE,
@@ -57,7 +75,7 @@ plotExpression(
plotExpectedVsObservedPsi(
fds,
- type = c("psi5", "psi3", "theta"),
+ type = fitMetrics(fds),
idx = NULL,
result = NULL,
colGroup = NULL,
@@ -83,11 +101,7 @@ plotExpectedVsObservedPsi(
...
)
-\S4method{plotEncDimSearch}{FraserDataSet}(
- object,
- type = c("psi3", "psi5", "theta"),
- plotType = c("auc", "loss")
-)
+\S4method{plotEncDimSearch}{FraserDataSet}(object, type = psiTypes, plotType = c("auc", "loss"))
plotFilterExpression(
fds,
@@ -105,7 +119,7 @@ plotFilterVariability(
\S4method{plotCountCorHeatmap}{FraserDataSet}(
object,
- type = c("psi5", "psi3", "theta"),
+ type = psiTypes,
logit = FALSE,
topN = 50000,
topJ = 5000,
@@ -126,10 +140,62 @@ plotFilterVariability(
plotCov = TRUE,
...
)
+
+plotBamCoverage(
+ fds,
+ gr,
+ sampleID,
+ control_samples = sample(samples(fds[, which(samples(fds) != sampleID)]), min(3,
+ ncol(fds) - length(sampleID))),
+ txdb = NULL,
+ min_junction_count = 20,
+ highlight_range = NULL,
+ highlight_range_color = "firebrick",
+ color_annotated = "gray",
+ color_novel = "goldenrod3",
+ color_sample_interest = "firebrick",
+ color_control_samples = "dodgerblue4",
+ toscale = c("exon", "gene", "none"),
+ mar = c(2, 10, 0.1, 5),
+ curvature_splicegraph = 1,
+ curvature_coverage = 1,
+ cex = 1,
+ splicegraph_labels = c("genomic_range", "id", "name", "none"),
+ splicegraph_position = c("top", "bottom"),
+ ...
+)
+
+plotBamCoverageFromResultTable(
+ fds,
+ result,
+ show_full_gene = FALSE,
+ txdb = NULL,
+ orgDb = NULL,
+ res_gene_col = "hgncSymbol",
+ res_geneid_type = "SYMBOL",
+ txdb_geneid_type = "ENTREZID",
+ left_extension = 1000,
+ right_extension = 1000,
+ ...
+)
+
+\S4method{plotManhattan}{FraserDataSet}(
+ object,
+ sampleID,
+ value = "pvalue",
+ type = fitMetrics(object),
+ chr = NULL,
+ main = paste0("sample: ", sampleID),
+ chrColor = c("black", "darkgrey"),
+ ...
+)
}
\arguments{
\item{object, fds}{An \code{\link{FraserDataSet}} object.}
+\item{...}{Additional parameters passed to plot() or plot_ly() if not stated
+otherwise in the details for each plot function}
+
\item{sampleID}{A sample ID which should be plotted. Can also be a vector.
Integers are treated as indices.}
@@ -150,22 +216,19 @@ samples. Labelling can be turned off by setting
\code{label=NULL}. The user can also provide a custom
list of gene symbols or sampleIDs.}
-\item{padjCutoff, zScoreCutoff, deltaPsiCutoff}{Significance, Z-score or delta
+\item{padjCutoff, deltaPsiCutoff}{Significance or delta
psi cutoff to mark outliers}
-\item{...}{Additional parameters passed to plot() or plot_ly() if not stated
-otherwise in the details for each plot function}
-
\item{BPPARAM}{BiocParallel parameter to use.}
+\item{idx}{A junction site ID or gene ID, or both, which
+should be plotted. Can also be a vector. Integers are treated
+as indices.}
+
\item{result}{The result table to be used by the method.}
\item{colGroup}{Group of samples that should be colored.}
-\item{idx, site}{A junction site ID or gene ID or one of both, which
-should be plotted. Can also be a vector. Integers are treated
-as indices.}
-
\item{global}{Flag to plot a global Q-Q plot, default FALSE}
\item{conf.alpha}{If set, a confidence interval is plotted, defaults to 0.05}
@@ -228,6 +291,108 @@ annotation of the heatmap.}
\item{plotMeanPsi, plotCov}{If \code{TRUE}, then the heatmap is annotated with
the mean psi values or the junction coverage.}
+
+\item{gr}{A GRanges object indicating the genomic range that should be shown
+in \code{plotBamCoverage}.}
+
+\item{control_samples}{The sampleIDs of the samples used as control in
+\code{plotBamCoverage}.}
+
+\item{txdb}{A TxDb object giving the gene/transcript annotation to use.}
+
+\item{min_junction_count}{The minimal junction count across samples required
+for a junction to appear in the splicegraph and coverage tracks
+of \code{plotBamCoverage}.}
+
+\item{highlight_range}{A \code{GenomicRanges} or \code{GenomicRangesList}
+object of ranges to be highlighted in the splicegraph of
+\code{plotBamCoverage}.}
+
+\item{highlight_range_color}{The color of highlighted ranges in
+the splicegraph of \code{plotBamCoverage}.}
+
+\item{color_annotated}{The color for exons and junctions present in
+the given annotation (in the splicegraph of
+\code{plotBamCoverage}).}
+
+\item{color_novel}{The color for novel exons and junctions not present in
+the given annotation (in the splicegraph of
+\code{plotBamCoverage}).}
+
+\item{color_sample_interest}{The color in \code{plotBamCoverage} for the
+sample of interest.}
+
+\item{color_control_samples}{The color in \code{plotBamCoverage} for the
+samples used as controls.}
+
+\item{toscale}{In \code{plotBamCoverage}, indicates which part of the
+plotted region should be drawn to scale. Possible values are
+'exon' (exonic regions are drawn to scale),
+'gene' (both exonic and intronic regions are drawn to scale) or
+'none' (exonic and intronic regions have constant length)
+(see SGSeq package).}
+
+\item{mar}{The margin of the plot area for \code{plotBamCoverage}
+(b,l,t,r).}
+
+\item{curvature_splicegraph}{The curvature of the junction arcs in the
+splicegraph in \code{plotBamCoverage}. Decrease this value
+for flatter arcs and increase it for steeper arcs.}
+
+\item{curvature_coverage}{The curvature of the junction arcs in the
+coverage tracks of \code{plotBamCoverage}. Decrease this
+value for flatter arcs and increase it for steeper arcs.}
+
+\item{cex}{For controlling the size of text and numbers in
+\code{plotBamCoverage}.}
+
+\item{splicegraph_labels}{Indicates the format of exon/splice junction
+labels in the splicegraph of \code{plotBamCoverage}.
+Possible values are 'genomic_range' (gives the start position
+of the first exon and the end position of the last exon that
+are shown), 'id' (format E1,... J1,...), 'name' (format
+type:chromosome:start-end:strand for each feature),
+'none' for no labels (see SGSeq package).}
+
+\item{splicegraph_position}{The position of the splicegraph relative to the
+coverage tracks in \code{plotBamCoverage}. Possible values
+are 'top' (default) and 'bottom'.}
+
+\item{show_full_gene}{Should the full genomic range of the gene be shown in
+\code{plotBamCoverageFromResultTable} (default: FALSE)?
+If FALSE, only a certain region (see parameters left_extension
+and right_extension) around the outlier junction is shown.}
+
+\item{orgDb}{An OrgDb object giving the mapping of gene ids and symbols.}
+
+\item{res_gene_col}{The column name in the given results table that
+contains the gene annotation.}
+
+\item{res_geneid_type}{The type of gene annotation in the results table in
+\code{res_gene_col} (e.g. SYMBOL or ENTREZID etc.). This
+information is needed for mapping between the results table and
+the provided annotation in the txdb object.}
+
+\item{txdb_geneid_type}{The type of gene_id present in \code{genes(txdb)}
+(e.g. ENTREZID). This information is needed for
+mapping between the results table and the provided annotation
+in the txdb object.}
+
+\item{left_extension}{Indicating how far the plotted range around the outlier
+junction should be extended to the left in
+\code{plotBamCoverageFromResultTable}.}
+
+\item{right_extension}{Indicating how far the plotted range around the
+outlier junction should be extended to the right in
+\code{plotBamCoverageFromResultTable}.}
+
+\item{value}{Indicates which assay is shown in the Manhattan plot. Defaults
+to 'pvalue'. Other options are 'deltaPsi' and 'zScore'.}
+
+\item{chr}{Vector of chromosome names to show in \code{plotManhattan}. The
+default is to show all chromosomes.}
+
+\item{chrColor}{Alternating colors by chromosome for \code{plotManhattan}.}
}
\value{
If base R graphics are used nothing is returned else the plotly or
@@ -246,6 +411,9 @@ Plot the number of aberrant events per samples
Plots the observed split reads of the junction of interest over all reads
coming from the given donor/acceptor.
+Plots the observed values of the splice metric across samples for a
+junction of interest.
+
Plots the expected psi value over the observed psi value of the given
junction.
@@ -269,6 +437,10 @@ This is the list of all plotting function provided by FRASER:
\item plotFilterExpression()
\item plotFilterVariability()
\item plotEncDimSearch()
+ \item plotBamCoverage()
+ \item plotBamCoverageFromResultTable()
+ \item plotManhattan()
+ \item plotSpliceMetricRank()
}
For a detailed description of each plot function please see the details.
@@ -297,6 +469,9 @@ log10 space.
\code{plotExpectedVsObservedPsi}: A scatter plot of the observed psi
against the predicted psi for a given site.
+\code{plotSpliceMetricRank}: This function plots for a given intron the
+observed values of the selected splice metric against the sample rank.
+
\code{plotCountCorHeatmap}: The correlation heatmap of the count data either
of the full data set (i.e. sample-sample correlations) or of the top x most
variable junctions (i.e. junction-sample correlations). By default the values
@@ -317,33 +492,81 @@ introns and for the filtered (i.e. non-variable) introns.
It plots the encoding dimension against the achieved loss (area under the
precision-recall curve). From this plot the optimum should be choosen for
the \code{q} in fitting process.
+
+\code{plotManhattan}: A Manhattan plot showing the junction p-values by
+genomic position. Useful to check whether outliers cluster within certain
+genomic regions.
+
+\code{plotBamCoverage}: A sashimi plot showing the read coverage from
+the underlying bam files for a given genomic range and sampleIDs.
+
+\code{plotBamCoverageFromResultTable}: A sashimi plot showing the read
+coverage from the underlying bam files for a row in the results table. Can
+either show the full range of the gene with the outlier junction or only a
+certain region around the outlier.
}
\examples{
# create full FRASER object
fds <- makeSimulatedFraserDataSet(m=40, j=200)
fds <- calculatePSIValues(fds)
fds <- filterExpressionAndVariability(fds, filter=FALSE)
-# this step should be done for all splicing metrics and more dimensions
-fds <- optimHyperParams(fds, "psi5", q_param=c(2,5,10,25))
+# this step should be done for more dimensions in practice
+fds <- optimHyperParams(fds, "jaccard", q_param=c(2,5,10,25))
fds <- FRASER(fds)
# QC plotting
plotFilterExpression(fds)
plotFilterVariability(fds)
-plotCountCorHeatmap(fds, "theta")
-plotCountCorHeatmap(fds, "theta", normalized=TRUE)
-plotEncDimSearch(fds, type="psi5")
+plotCountCorHeatmap(fds, "jaccard")
+plotCountCorHeatmap(fds, "jaccard", normalized=TRUE)
+plotEncDimSearch(fds, type="jaccard")
# extract results
plotAberrantPerSample(fds, aggregate=FALSE)
-plotVolcano(fds, "sample1", "psi5")
+plotVolcano(fds, "sample1", "jaccard")
# dive into gene/sample level results
res <- results(fds)
res
plotExpression(fds, result=res[1])
plotQQ(fds, result=res[1])
-plotExpectedVsObservedPsi(fds, type="psi5", res=res[1])
+plotExpectedVsObservedPsi(fds, res=res[1])
+
+# create manhattan plot of pvalues by genomic position
+if(require(ggbio)){
+ plotManhattan(fds, type="jaccard", sampleID="sample10")
+}
+# plot splice graph and coverage from bam files in a given region
+if(require(SGSeq)){
+ fds <- createTestFraserSettings()
+ gr <- GRanges(seqnames="chr19",
+ IRanges(start=7587496, end=7598895),
+ strand="+")
+ plotBamCoverage(fds, gr=gr, sampleID="sample3",
+ control_samples="sample2", min_junction_count=5,
+ curvature_splicegraph=1, curvature_coverage=1,
+ mar=c(1, 7, 0.1, 3))
+
+ # plot coverage from bam file for a row in the result table
+ fds <- createTestFraserDataSet()
+ require(TxDb.Hsapiens.UCSC.hg19.knownGene)
+ txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
+ require(org.Hs.eg.db)
+ orgDb <- org.Hs.eg.db
+
+ res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA)
+ res_dt <- as.data.table(res)
+ res_dt <- res_dt[sampleID == "sample2",]
+
+ # plot full range of gene containing outlier junction
+ plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=TRUE,
+ txdb=txdb, orgDb=orgDb, control_samples="sample3")
+
+ # plot only certain range around outlier junction
+ plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE,
+ control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb,
+ curvature_coverage=0.5, right_extension=5000, left_extension=5000,
+ splicegraph_labels="id")
+}
}
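Note that the example block above does not exercise the newly documented plotSpliceMetricRank() entry point. A minimal hedged sketch of such a call, reusing the fds and res objects created in that example, could look as follows (illustration only, not part of the generated Rd file):

# sketch: rank plot of the splice metric for the first result entry,
# reusing `fds` and `res` from the example block above
plotSpliceMetricRank(fds, type="jaccard", result=res[1])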
diff --git a/man/potentialImpactAnnotations.Rd b/man/potentialImpactAnnotations.Rd
new file mode 100644
index 00000000..282a6d09
--- /dev/null
+++ b/man/potentialImpactAnnotations.Rd
@@ -0,0 +1,135 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/resultAnnotations.R
+\name{potentialImpactAnnotations}
+\alias{potentialImpactAnnotations}
+\alias{annotateIntronReferenceOverlap}
+\alias{annotatePotentialImpact}
+\alias{flagBlacklistRegions}
+\title{Additional result annotations}
+\usage{
+annotateIntronReferenceOverlap(fds, txdb, BPPARAM = bpparam())
+
+annotatePotentialImpact(
+ result,
+ txdb,
+ fds,
+ addPotentialImpact = TRUE,
+ addUTRoverlap = TRUE,
+ minoverlap = 5,
+ BPPARAM = bpparam()
+)
+
+flagBlacklistRegions(
+ result,
+ blacklist_regions = NULL,
+ assemblyVersion = c("hg19", "hg38"),
+ minoverlap = 5
+)
+}
+\arguments{
+\item{fds}{A FraserDataSet}
+
+\item{txdb}{A txdb object providing the reference annotation.}
+
+\item{BPPARAM}{For controlling parallelization behavior. Defaults to
+\code{bpparam()}.}
+
+\item{result}{A result table as generated by FRASER, including the column
+\code{annotatedJunction} as generated by the function
+\code{annotateIntronReferenceOverlap}.}
+
+\item{addPotentialImpact}{Logical, indicating if the type of the potential
+impact should be added to the results table. Defaults to \code{TRUE}.}
+
+\item{addUTRoverlap}{Logical, indicating if the overlap with UTR regions
+should be checked and added to the results table. Defaults to \code{TRUE}.}
+
+\item{minoverlap}{Integer value defining the number of base pairs around the
+splice site that need to overlap with UTR or blacklist region,
+respectively, to be considered matching. Defaults to 5 bp.}
+
+\item{blacklist_regions}{A BED file that contains the blacklist regions.
+If \code{NULL} (default), the BED files that are packaged with FRASER
+are used (see Details for more information).}
+
+\item{assemblyVersion}{Indicates the genome assembly version of the intron
+coordinates. Only used if blacklist_regions is NULL. For other versions,
+please provide the BED file containing the blacklist regions directly.}
+}
+\value{
+An annotated FraserDataSet or results table, respectively
+}
+\description{
+These functions work on the result table and add additional
+ annotations to the reported introns: the type of potential impact on
+ splicing (e.g. exon skipping, exon truncation, ...), potential occurence
+ of frameshift, overlap with UTR regions as well as a flag for introns
+ that are located in blacklist regions of the genome.
+
+\code{\link{annotateIntronReferenceOverlap}} adds basic annotations to the
+ fds for each intron based on the overlap of the intron's location with
+ the reference annotation. Has to be run before the result table is
+ created so that the new column can be included in it (see examples).
+
+\code{\link{annotatePotentialImpact}} annotates each intron in the results
+ table with the type of potential impact on splicing and potential
+ occurrence of frameshift (likely, unlikely, inconclusive). Can also
+ calculate overlap with annotated UTR regions. Potential impact can be:
+ annotatedIntron_increasedUsage, annotatedIntron_reducedUsage,
+ exonTruncation, exonElongation, exonTruncation&Elongation,
+ exonSkipping, splicingBeyondGene,
+ multigenicSplicing, downstreamOfNearestGene, upstreamOfNearestGene,
+ complex (everything else).
+ Splice sites (theta metric) annotations indicate how the splice site is
+ located with respect to the reference annotation. The annotated types
+ are: annotatedSpliceSite, exonicRegion, intronicRegion.
+
+\code{\link{flagBlacklistRegions}} flags introns in the results table on
+ whether or not they are located in a blacklist region of the genome. By
+ default, the blacklist regions as reported in
+ \cite{Amemiya, Kundaje & Boyle (2019)} and downloaded from
+ \href{https://www.encodeproject.org/annotations/ENCSR636HFF/}{here}
+ are used.
+}
+\section{Functions}{
+\itemize{
+\item \code{annotateIntronReferenceOverlap}: This method calculates basic annotations
+based on overlap with the reference annotation (start, end, none, both)
+for the full fds. The overlap type is added as a new column
+\code{annotatedJunction} in \code{mcols(fds)}.
+
+\item \code{annotatePotentialImpact}: This method annotates the splice event
+type to junctions in the given results table.
+
+\item \code{flagBlacklistRegions}: This method flags all introns and
+splice sites in the given results table for which at least one splice
+site (donor or acceptor) is located in a blacklist region. Blacklist
+regions of the genome are determined from the provided BED file.
+}}
+
+\examples{
+ # get data, fit and compute p-values
+ fds <- createTestFraserDataSet()
+
+ # load reference annotation
+ library(TxDb.Hsapiens.UCSC.hg19.knownGene)
+ txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
+
+ # add basic annotations for overlap with the reference annotation
+ # run this function before creating the results table
+ fds <- annotateIntronReferenceOverlap(fds, txdb)
+
+ # extract results: for this small example dataset, no cutoffs are used
+ # in order to get some results
+ res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA)
+
+ # annotate the type of potential impact on splicing and UTR overlap
+ res <- annotatePotentialImpact(result=res, txdb=txdb, fds=fds)
+
+ # annotate overlap with blacklist regions
+ res <- flagBlacklistRegions(result=res, assemblyVersion="hg19")
+
+ # show results table containing additional annotations
+ res
+
+}
diff --git a/man/psiTypes.Rd b/man/psiTypes.Rd
index f07be433..9a4b9ca2 100644
--- a/man/psiTypes.Rd
+++ b/man/psiTypes.Rd
@@ -3,18 +3,18 @@
\docType{data}
\name{psiTypes}
\alias{psiTypes}
-\title{Available psi types}
+\title{Available splice metrics}
\format{
-An object of class \code{character} of length 3.
+An object of class \code{character} of length 4.
}
\usage{
psiTypes
}
\description{
-Available psi types
+Available splice metrics
}
\examples{
- # to show available psi types:
+ # to show all available splice metrics:
psiTypes
}
diff --git a/man/results.Rd b/man/results.Rd
index 57682616..ceeb9fc8 100644
--- a/man/results.Rd
+++ b/man/results.Rd
@@ -2,7 +2,6 @@
% Please edit documentation in R/AllGenerics.R
\name{results,FraserDataSet-method}
\alias{results,FraserDataSet-method}
-\alias{resultsByGenes}
\alias{aberrant,FraserDataSet-method}
\title{Extracting results and aberrant splicing events}
\usage{
@@ -10,26 +9,31 @@
object,
sampleIDs = samples(object),
padjCutoff = 0.05,
- zScoreCutoff = NA,
- deltaPsiCutoff = 0.3,
+ deltaPsiCutoff = 0.1,
+ rhoCutoff = NA,
+ aggregate = FALSE,
+ collapse = FALSE,
minCount = 5,
- psiType = c("psi3", "psi5", "theta"),
+ psiType = psiTypes,
+ geneColumn = "hgnc_symbol",
+ all = FALSE,
+ returnTranscriptomewideResults = TRUE,
additionalColumns = NULL,
- BPPARAM = bpparam(),
- ...
+ BPPARAM = bpparam()
)
-resultsByGenes(res, geneColumn = "hgncSymbol", method = "BY")
-
\S4method{aberrant}{FraserDataSet}(
object,
- type = currentType(object),
+ type = fitMetrics(object),
padjCutoff = 0.05,
- deltaPsiCutoff = 0.3,
- zScoreCutoff = NA,
+ deltaPsiCutoff = 0.1,
minCount = 5,
+ rhoCutoff = NA,
by = c("none", "sample", "feature"),
aggregate = FALSE,
+ geneColumn = "hgnc_symbol",
+ subsetName = NULL,
+ all = FALSE,
...
)
}
@@ -41,16 +45,38 @@ retrieved}
\item{padjCutoff}{The FDR cutoff to be applied or NA if not requested.}
-\item{zScoreCutoff}{The z-score cutoff to be applied or NA if not requested.}
-
\item{deltaPsiCutoff}{The cutoff on delta psi or NA if not requested.}
+\item{rhoCutoff}{The cutoff value on the fitted rho value
+(overdispersion parameter of the beta-binomial distribution) above which
+junctions are filtered out.}
+
+\item{aggregate}{If TRUE the returned object is aggregated to the feature
+level (i.e. gene level).}
+
+\item{collapse}{Only takes effect if \code{aggregate=TRUE}.
+If TRUE, collapses results across the different psi
+types to return only one row per feature (gene) and sample.}
+
\item{minCount}{The minimum count value of the total coverage of an intron
to be considered as significant.
result}
\item{psiType}{The psi types for which the results should be retrieved.}
+\item{geneColumn}{The column name of the column that has the gene annotation
+that will be used for gene-level pvalue computation.}
+
+\item{all}{By default (\code{FALSE}), only significant introns (or genes) are
+listed in the results. If \code{TRUE}, results are assembled for all
+samples and introns/genes regardless of significance.}
+
+\item{returnTranscriptomewideResults}{If FDR-corrected p-values for subsets
+of genes of interest have been calculated, this parameter
+indicates whether the transcriptome-wide results
+should be returned as well (default), or whether only results
+for those subsets should be retrieved.}
+
\item{additionalColumns}{Character vector containing the names of additional
columns from mcols(fds) that should appear in the result table
(e.g. ensembl_gene_id). Default is \code{NULL}, so no additional columns
@@ -58,26 +84,15 @@ are included.}
\item{BPPARAM}{The BiocParallel parameter.}
-\item{...}{Further arguments can be passed to the method. If "zscores",
-"padjVals" or "dPsi" is given, the values of those arguments
-are used to define the aberrant events.}
-
-\item{res}{Result as created with \code{results()}}
-
-\item{geneColumn}{The name of the column in \code{mcols(res)} that contains
-the gene symbols.}
-
-\item{method}{The p.adjust method that is being used to adjust p values per
-sample.}
-
\item{type}{Splicing type (psi5, psi3 or theta)}
\item{by}{By default \code{none} which means no grouping. But if
\code{sample} or \code{feature} is specified the sum by
sample or feature is returned}
-\item{aggregate}{If TRUE the returned object is based on the grouped
-features}
+\item{...}{Further arguments can be passed to the method. If "n",
+"padjVals", "dPsi" or "rhoVals" are given, the values of those
+arguments are used to define the aberrant events.}
}
\value{
For \code{results}: GRanges object containing significant results.
@@ -95,24 +110,36 @@ aberrant splicing events based on the given cutoffs.
# get data, fit and compute p-values and z-scores
fds <- createTestFraserDataSet()
-# extract results: for this example dataset, z score cutoff of 2 is used to
-# get at least one result and show the output
-res <- results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05)
+# extract results: for this example dataset, no cutoffs are used to
+# show the output of the results function
+res <- results(fds, all=TRUE)
res
# aggregate the results by genes (gene symbols need to be annotated first
# using annotateRanges() function)
-resultsByGenes(res)
+results(fds, padjCutoff=NA, deltaPsiCutoff=0.1, aggregate=TRUE)
+
+# aggregate the results by genes and collapse over all psi types to obtain
+# only one row per gene in the results table
+results(fds, padjCutoff=NA, deltaPsiCutoff=0.1, aggregate=TRUE,
+ collapse=TRUE)
# get aberrant events per sample: on the example data, nothing is aberrant
# based on the adjusted p-value
-aberrant(fds, type="psi5", by="sample")
+aberrant(fds, type="jaccard", by="sample")
# get aberrant events per gene (first annotate gene symbols)
fds <- annotateRangesWithTxDb(fds)
-aberrant(fds, type="psi5", by="feature", zScoreCutoff=2, padjCutoff=NA,
- aggregate=TRUE)
+aberrant(fds, type="jaccard", by="feature", padjCutoff=NA, aggregate=TRUE)
# find aberrant junctions/splice sites
-aberrant(fds, type="psi5")
+aberrant(fds, type="jaccard")
+
+# retrieve results limiting FDR correction to only a subset of genes
+# first, we need to create a list of genes per sample that will be tested
+geneList <- list('sample1'=c("TIMMDC1"), 'sample2'=c("MCOLN1"))
+fds <- calculatePadjValues(fds, type="jaccard",
+ subsets=list("exampleSubset"=geneList))
+results(fds, all=TRUE, returnTranscriptomewideResults=FALSE)
+
}
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 461280c0..b03ef489 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -152,6 +152,21 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
+// truncNLL_rho_penalized
+double truncNLL_rho_penalized(double logit_rho, arma::vec yi, arma::vec ki, arma::vec ni, double lambda);
+RcppExport SEXP _FRASER_truncNLL_rho_penalized(SEXP logit_rhoSEXP, SEXP yiSEXP, SEXP kiSEXP, SEXP niSEXP, SEXP lambdaSEXP) {
+BEGIN_RCPP
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ Rcpp::traits::input_parameter< double >::type logit_rho(logit_rhoSEXP);
+ Rcpp::traits::input_parameter< arma::vec >::type yi(yiSEXP);
+ Rcpp::traits::input_parameter< arma::vec >::type ki(kiSEXP);
+ Rcpp::traits::input_parameter< arma::vec >::type ni(niSEXP);
+ Rcpp::traits::input_parameter< double >::type lambda(lambdaSEXP);
+ rcpp_result_gen = Rcpp::wrap(truncNLL_rho_penalized(logit_rho, yi, ki, ni, lambda));
+ return rcpp_result_gen;
+END_RCPP
+}
// fullNLL
arma::vec fullNLL(arma::mat y, arma::mat rho, arma::mat k, arma::mat n, arma::mat D, double lambda, bool byRows);
RcppExport SEXP _FRASER_fullNLL(SEXP ySEXP, SEXP rhoSEXP, SEXP kSEXP, SEXP nSEXP, SEXP DSEXP, SEXP lambdaSEXP, SEXP byRowsSEXP) {
@@ -215,6 +230,7 @@ static const R_CallMethodDef CallEntries[] = {
{"_FRASER_truncNLL_e", (DL_FUNC) &_FRASER_truncNLL_e, 7},
{"_FRASER_truncGrad_e", (DL_FUNC) &_FRASER_truncGrad_e, 7},
{"_FRASER_truncNLL_rho", (DL_FUNC) &_FRASER_truncNLL_rho, 4},
+ {"_FRASER_truncNLL_rho_penalized", (DL_FUNC) &_FRASER_truncNLL_rho_penalized, 5},
{"_FRASER_fullNLL", (DL_FUNC) &_FRASER_fullNLL, 7},
{"_FRASER_truncWeightedNLL_db", (DL_FUNC) &_FRASER_truncWeightedNLL_db, 7},
{"_FRASER_truncWeightedGrad_db", (DL_FUNC) &_FRASER_truncWeightedGrad_db, 7},
diff --git a/src/loss_n_gradient_functions.cpp b/src/loss_n_gradient_functions.cpp
index 07eadcca..0078a270 100644
--- a/src/loss_n_gradient_functions.cpp
+++ b/src/loss_n_gradient_functions.cpp
@@ -6,7 +6,7 @@
using namespace Rcpp;
const double MAX_EXP_VALUE = 700;
-double PSEUDO_COUNT = 1;
+double PSEUDO_COUNT = 0;
// [[Rcpp::export(.setPseudoCount)]]
double setPseudoCount(double pseudoCount){
@@ -142,6 +142,12 @@ double truncNLL_db(arma::vec par, arma::mat H, arma::vec k, arma::vec n, double
infPosB = arma::find_nonfinite(beta);
// beta.elem( infPosB ) = abs.elem( infPosB );
beta.elem( infPosB ) = estLgammaBeta(y, infPosB, rhob);
+
+ arma::uvec infPosA2, infPosB2;
+ infPosA2 = arma::find_nonfinite(alphaK);
+ alphaK.elem( infPosA2 ) = estLgammaAlpha(y, infPosA2, rhoa);
+ infPosB2 = arma::find_nonfinite(betaNK);
+ betaNK.elem( infPosB2 ) = estLgammaBeta(y, infPosB2, rhob);
nll = arma::accu(alpha + beta - alphaK - betaNK)/k.n_elem;
@@ -266,6 +272,12 @@ double truncNLL_e(arma::vec par, arma::mat x, arma::mat D, arma::vec b,
beta.elem( infPosB ) = abs.elem( infPosB );
// beta.elem( infPosB ) = estLgammaBeta(y, infPosB, rhob);
+ arma::uvec infPosA2, infPosB2;
+ infPosA2 = arma::find_nonfinite(alphaK);
+ alphaK.elem( infPosA2 ) = abs.elem( infPosA2 );
+ infPosB2 = arma::find_nonfinite(betaNK);
+ betaNK.elem( infPosB2 ) = abs.elem( infPosB2 );
+
nll = arma::accu(alpha + beta - alphaK - betaNK)/k.n_elem;
return arma::as_scalar(nll);
@@ -361,11 +373,58 @@ double truncNLL_rho(double rho, arma::vec yi, arma::vec ki, arma::vec ni){
// beta.elem( infPosB ) = abs.elem( infPosB );
beta.elem( infPosB ) = estLgammaBeta(yi, infPosB, rhob);
+ arma::uvec infPosA2, infPosB2;
+ infPosA2 = arma::find_nonfinite(alphaK);
+ alphaK.elem( infPosA2 ) = estLgammaAlpha(yi, infPosA2, rhoa);
+ infPosB2 = arma::find_nonfinite(betaNK);
+ betaNK.elem( infPosB2 ) = estLgammaBeta(yi, infPosB2, rhob);
+
+
nll = arma::accu(alpha + beta - alphaK - betaNK + alphaBeta)/ki.n_elem;
return arma::as_scalar(nll);
}
+// [[Rcpp::export()]]
+double truncNLL_rho_penalized(double logit_rho, arma::vec yi, arma::vec ki, arma::vec ni, double lambda){
+ arma::vec mui, u, alpha, alphaK, beta, betaNK, alphaBeta, nll;
+ double rho, rhoa, rhob;
+
+ rho = exp(logit_rho)/(1 + exp(logit_rho));
+ rhoa = (1 - rho)/rho;
+ rhob = (rho - 1)/rho;
+ mui = predictMuCpp(yi);
+ u = (mui-1) * rhob;
+
+ alpha = arma::lgamma(mui * rhoa);
+ alphaK = arma::lgamma(mui * rhoa + ki + PSEUDO_COUNT);
+ beta = arma::lgamma(u);
+ betaNK = arma::lgamma(u + ni - ki + PSEUDO_COUNT);
+ alphaBeta = arma::lgamma(rhoa + ni + (2*PSEUDO_COUNT)) - lgamma(rhoa);
+
+ // arma::vec abs;
+ arma::uvec infPosA, infPosB;
+ // abs = arma::abs(yi);
+ infPosA = arma::find_nonfinite(alpha);
+ // alpha.elem( infPosA ) = abs.elem( infPosA );
+ alpha.elem( infPosA ) = estLgammaAlpha(yi, infPosA, rhoa);
+ infPosB = arma::find_nonfinite(beta);
+ // beta.elem( infPosB ) = abs.elem( infPosB );
+ beta.elem( infPosB ) = estLgammaBeta(yi, infPosB, rhob);
+
+ arma::uvec infPosA2, infPosB2;
+ infPosA2 = arma::find_nonfinite(alphaK);
+ alphaK.elem( infPosA2 ) = estLgammaAlpha(yi, infPosA2, rhoa);
+ infPosB2 = arma::find_nonfinite(betaNK);
+ betaNK.elem( infPosB2 ) = estLgammaBeta(yi, infPosB2, rhob);
+
+
+ nll = arma::accu(alpha + beta - alphaK - betaNK + alphaBeta)/ki.n_elem;
+ nll = nll + lambda * (logit_rho*logit_rho);
+
+ return arma::as_scalar(nll);
+}
+
// [[Rcpp::export()]]
arma::vec fullNLL(arma::mat y, arma::mat rho, arma::mat k, arma::mat n, arma::mat D, double lambda, bool byRows=false){
arma::mat rhoa, rhob;
@@ -395,6 +454,11 @@ arma::vec fullNLL(arma::mat y, arma::mat rho, arma::mat k, arma::mat n, arma::ma
infPosB = arma::find_nonfinite(beta);
beta.elem( infPosB ) = abs.elem( infPosB );
// beta.elem( infPosB ) = estLgammaBeta(y, infPosB, rhob);
+ arma::uvec infPosA2, infPosB2;
+ infPosA2 = arma::find_nonfinite(alphaK);
+ alphaK.elem( infPosA2 ) = abs.elem( infPosA2 );
+ infPosB2 = arma::find_nonfinite(betaNK);
+ betaNK.elem( infPosB2 ) = abs.elem( infPosB2 );
if(byRows){
nll = rowMeans(alpha + beta - alphaK - betaNK + nonTruncTerms);
@@ -471,6 +535,12 @@ double truncWeightedNLL_db(arma::vec par, arma::mat H, arma::vec k, arma::vec n,
// beta.elem( infPosB ) = abs.elem( infPosB );
beta.elem( infPosB ) = estLgammaBeta(y, infPosB, rhob);
+ arma::uvec infPosA2, infPosB2;
+ infPosA2 = arma::find_nonfinite(alphaK);
+ alphaK.elem( infPosA2 ) = estLgammaAlpha(y, infPosA2, rhoa);
+ infPosB2 = arma::find_nonfinite(betaNK);
+ betaNK.elem( infPosB2 ) = estLgammaBeta(y, infPosB2, rhob);
+
nll = arma::accu((alpha + beta - alphaK - betaNK)%w)/k.n_elem;
nll = nll + (lambda/k.n_elem) * arma::accu(d % d);
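For readability, here is a plain-R sketch of the objective that the new truncNLL_rho_penalized() routine computes: a beta-binomial negative log-likelihood (up to the constant binomial coefficient) plus a quadratic penalty on logit(rho). It assumes mu stands in for the predicted means returned by predictMuCpp() and PSEUDO_COUNT = 0; it is an illustration of the math only, not an exported interface of the package:

# illustrative re-implementation of the penalized NLL on the R side
truncNLLRhoPenalizedR <- function(logit_rho, mu, k, n, lambda){
    rho <- plogis(logit_rho)
    a   <- mu * (1 - rho) / rho         # alpha parameter of the beta-binomial
    b   <- (1 - mu) * (1 - rho) / rho   # beta parameter of the beta-binomial
    nll <- mean(lgamma(a) + lgamma(b) - lgamma(a + k) - lgamma(b + n - k) +
                lgamma(a + b + n) - lgamma(a + b))
    nll + lambda * logit_rho^2          # quadratic penalty on logit(rho)
}

# toy usage: minimize the penalized NLL over logit(rho) for simulated counts
set.seed(42)
n  <- rpois(50, 30)
k  <- rbinom(50, size=n, prob=0.8)
mu <- rep(0.8, 50)
optimize(truncNLLRhoPenalizedR, interval=c(-30, 10),
         mu=mu, k=k, n=n, lambda=1e-4)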
diff --git a/tests/testthat/test_counting.R b/tests/testthat/test_counting.R
index 4fcd0d61..1bdbfc37 100644
--- a/tests/testthat/test_counting.R
+++ b/tests/testthat/test_counting.R
@@ -51,7 +51,7 @@ test_that("test minAnchor", {
"sample3", features, fds, minAnchor=25, recount=TRUE)) })
expect_equivalent(c(7, 8, 0, 0, 7), ctnNS5[,1])
- expect_equivalent(c(5, 8, 0, 0, 6), ctnNS25[,1])
+ expect_equivalent(c(5, 8, 0, 0, 7), ctnNS25[,1])
})
test_that("Test psi values", {
diff --git a/tests/testthat/test_fraser_pipeline.R b/tests/testthat/test_fraser_pipeline.R
index 069a217d..45c9b2ff 100644
--- a/tests/testthat/test_fraser_pipeline.R
+++ b/tests/testthat/test_fraser_pipeline.R
@@ -4,8 +4,8 @@ test_that("FRASER function", {
fds <- createTestFraserDataSet()
expect_is(fds, "FraserDataSet")
anames <- c(psiTypes, paste0(c("delta", "predictedMeans",
- "pvaluesBetaBinomial", "padjBetaBinomial", "zScores"), "_",
- rep(psiTypes, 5)))
+ "pvaluesBetaBinomial", "padjBetaBinomial"), "_",
+ rep(fitMetrics(fds), 5)))
expect_equal(anames %in% assayNames(fds), !logical(length(anames)))
})
diff --git a/tests/testthat/test_hyperParams.R b/tests/testthat/test_hyperParams.R
index 26331409..fd8a961b 100644
--- a/tests/testthat/test_hyperParams.R
+++ b/tests/testthat/test_hyperParams.R
@@ -2,6 +2,7 @@ context("Test hyper param optimization")
test_that("Test hyper param testing", {
fds <- makeSimulatedFraserDataSet(m=15, j=20, dist="BB")
+ fds <- calculatePSIValues(fds)
# test BB no hyper params and accessors
fds <- optimHyperParams(fds, type="psi3", implementation="BB")
diff --git a/tests/testthat/test_plotJunctionDist.R b/tests/testthat/test_plotJunctionDist.R
index 85a4b9d3..43022359 100644
--- a/tests/testthat/test_plotJunctionDist.R
+++ b/tests/testthat/test_plotJunctionDist.R
@@ -3,15 +3,15 @@ context("Test distribution plots for given results/junction")
test_that("Main junction distribution plot", {
# get results
fds <- getFraser()
- res <- results(fds, padjCutoff=1, zScoreCutoff=NA, deltaPsiCutoff=NA)
+ res <- results(fds, padjCutoff=1, deltaPsiCutoff=NA)
# plot distributions
expect_silent(plotExpression(fds, result=res[1]))
- expect_silent(plotVolcano(fds, "sample1", "psi5"))
+ expect_silent(plotVolcano(fds, "sample1", "jaccard"))
expect_silent(plotExpectedVsObservedPsi(fds, result=res[2]))
- expect_is(plotCountCorHeatmap(fds, "psi5", norm=FALSE), "pheatmap")
- expect_is(plotCountCorHeatmap(fds, "psi5", norm=TRUE), "pheatmap")
- expect_is(plotCountCorHeatmap(fds, "psi5", norm=TRUE, topN=10), "pheatmap")
+ expect_is(plotCountCorHeatmap(fds, "jaccard", norm=FALSE), "pheatmap")
+ expect_is(plotCountCorHeatmap(fds, "jaccard", norm=TRUE), "pheatmap")
+ expect_is(plotCountCorHeatmap(fds, "jaccard", norm=TRUE, topN=10), "pheatmap")
})
diff --git a/tests/testthat/test_stats.R b/tests/testthat/test_stats.R
index 8e76cc19..b1eedb8c 100644
--- a/tests/testthat/test_stats.R
+++ b/tests/testthat/test_stats.R
@@ -17,16 +17,94 @@ test_that("PSI value calculation", {
expect_true(all(N(fds, "psi5")[ is.na(psiVal)] == 0))
})
-test_that("Zscore calculation", {
- fds <- getFraser(clean = TRUE)
+# test_that("Zscore calculation", {
+# fds <- getFraser(clean = TRUE)
+#
+# # prepare zScore input for logit scale
+# psiVal <- (K(fds, "jaccard") + pseudocount())/(N(fds, "jaccard") + 2*pseudocount())
+# mu <- predictedMeans(fds, "jaccard")
+# residual <- qlogis(psiVal) - qlogis(mu)
+#
+# # compute zscore
+# zscores <- (residual - rowMeans(residual)) / rowSds(residual)
+#
+# expect_equal(zscores, zScores(fds, "jaccard"))
+# })
+
+test_that("Gene p value calculation with NAs", {
+ fds <- getFraser()
+ fds <- fds[15:24,]
+ mcols(fds, type="j")$hgnc_symbol <- rep(c("geneA", "geneB", "geneC"),
+ times=c(3, 4, 3))
+ mcols(fds, type="ss")$hgnc_symbol <- rep(c("geneA", "geneB", "geneC"),
+ times=c(4, 6, 4))
+
+ # simulate junction with bad rho fit to create partly NAs
+ rho <- rho(fds, type="jaccard")
+ rho[c(1, 4:7)] <- 0.5
+ rho(fds, type="jaccard") <- rho
+
+ # calc p values
+ fds <- calculatePadjValues(fds, type="jaccard", rhoCutoff=0.1)
+
+ # check dimension of junction-, site- and gene-level pval matrices
+ expect_equal(nrow(pVals(fds, type="jaccard", level="junction")), nrow(fds))
+ expect_equal(nrow(pVals(fds, type="jaccard", level="site",
+ filters=list(rho=0.1))), nrow(fds))
+ expect_equal(nrow(pVals(fds, type="jaccard", level="gene",
+ filters=list(rho=0.1))), 3)
+
+ # check jaccard pvals are partly NAs
+ expect_true(all(is.na(pVals(fds, type="jaccard", level="site",
+ filters=list(rho=0.1))[4:7,])))
+ expect_true(all(is.na(pVals(fds, type="jaccard", level="gene",
+ filters=list(rho=0.1))["geneB",])))
+ expect_true(all(is.na(padjVals(fds, type="jaccard", level="site",
+ filters=list(rho=0.1))[4:7,])))
+ expect_true(all(is.na(padjVals(fds, type="jaccard", level="gene",
+ filters=list(rho=0.1))["geneB",])))
- # prepare zScore input for logit scale
- psiVal <- (K(fds, "psi5") + pseudocount())/(N(fds, "psi5") + 2*pseudocount())
- mu <- predictedMeans(fds, "psi5")
- residual <- qlogis(psiVal) - qlogis(mu)
+ # simulate junction with bad rho fit to create partly NAs
+ rho <- rho(fds, type="jaccard")
+ rho <- rep(0.5, length(rho))
+ rho(fds, type="jaccard") <- rho
+ fds <- calculatePadjValues(fds, type="jaccard", rhoCutoff=0.1)
+
+ # check jaccard pvals are all NAs
+ expect_true(all(is.na(pVals(fds, type="jaccard", level="site",
+ filters=list(rho=0.1)))))
+ expect_true(all(is.na(pVals(fds, type="jaccard", level="gene",
+ filters=list(rho=0.1)))))
+ expect_true(all(is.na(padjVals(fds, type="jaccard", level="site",
+ filters=list(rho=0.1)))))
+ expect_true(all(is.na(padjVals(fds, type="jaccard", level="gene",
+ filters=list(rho=0.1)))))
+})
+
+test_that("FDR on subset of genes", {
+ fds <- getFraser()
+ mcols(fds, type="j")$hgnc_symbol <-
+ rep(c("geneA", "geneB", "geneC", "geneD", "geneE"),
+ times=c(3, 7, 5, 4, 8))
- # compute zscore
- zscores <- (residual - rowMeans(residual)) / rowSds(residual)
+ # define gene subset per sample
+ genes_per_sample <- list(
+ "sample1" = c("geneE", "geneC", "geneA"),
+ "sample2" = c("geneB"),
+ "sample3" = c("geneA", "geneB", "geneC", "geneD")
+ )
- expect_equal(zscores, zScores(fds, "psi5"))
+ subsetName <- "subset_test"
+ fds <- calculatePadjValuesOnSubset(fds, genesToTest=genes_per_sample,
+ subsetName=subsetName, type="jaccard")
+ subset_padj <- padjVals(fds, type="jaccard", subsetName=subsetName)
+ expect_true(is(subset_padj, "matrix"))
+ expect_true(nrow(subset_padj) == 27)
+ expect_true(ncol(subset_padj) == 3)
+ subset_padj_gene <- padjVals(fds, type="jaccard", level="gene",
+ subsetName=subsetName)
+ expect_true(is(subset_padj_gene, "matrix"))
+ expect_true(nrow(subset_padj_gene) == 5)
+ expect_true(ncol(subset_padj_gene) == 3)
+
})
diff --git a/vignettes/FRASER.Rnw b/vignettes/FRASER.Rnw
index 11a7a599..70e80f21 100644
--- a/vignettes/FRASER.Rnw
+++ b/vignettes/FRASER.Rnw
@@ -1,4 +1,4 @@
-%\VignetteIndexEntry{FRASER: Find RAre Splicing Evens in RNA-seq Data}
+%\VignetteIndexEntry{FRASER: Find RAre Splicing Events in RNA-seq Data}
%\VignettePackage{FRASER}
%\VignetteEngine{knitr::knitr}
%\VignetteEncoding{UTF-8}
@@ -36,7 +36,7 @@ opts_chunk$set(
\newcommand{\fraser}{\Biocpkg{FRASER}}
\newcommand{\fds}{\Rclass{FraserDataSet}}
-\title{FRASER: Find RAre Splicing Events in RNA-seq}
+\title{FRASER: Find RAre Splicing Events in RNA-seq Data}
\author{
Christian Mertes$^{1}$, Ines Scheller$^{1}$, Julien Gagneur$^{1}$ \\
@@ -175,6 +175,32 @@ intron-exon boundary of acceptor A. While we calculate $\theta$ for the 5' and
between $\theta_5$ and $\theta_3$ and hence call it jointly $\theta$ in the
following.
+From \fraser{} 2.0 on, only a single metric - the Intron Jaccard Index (Figure
+\ref{IntronJaccardIndex_sketch}) - is used by default. The Intron Jaccard
+Index is more robust and focuses on functionally relevant
+aberrant splicing events. It captures all types of aberrant splicing
+previously detected with the three metrics ($\psi_5$, $\psi_3$, $\theta$)
+within a single metric.
+
+\incfig{IntronJaccardIndex_sketch}{1\textwidth}{Overview of the Intron
+Jaccard Index, the splice metric used in \fraser{}2.}{
+The Intron Jaccard Index considers both split and nonsplit reads within a
+single metric and captures all types of aberrant splicing
+previously covered by the metrics $\psi_5$, $\psi_3$ and $\theta$.
+}
+
+The Intron Jaccard Index considers both split and nonsplit reads and is
+defined as the Jaccard index of the set of donor reads (reads sharing a donor
+site with the intron of interest and nonsplit reads at that donor site) and
+acceptor reads (reads sharing an acceptor site with the intron of interest and
+nonsplit reads at that acceptor site):
+
+\begin{equation}
+ J(D,A) = \frac{n(D,A)}{\sum_{A'} n(D,A') + \sum_{D'} n(D',A) + n(D) + n(A) - n(D,A)}
+ \label{eq:jaccard}
+\end{equation}
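As a toy numeric illustration of the equation above (all counts below are invented for this sketch; n(D) and n(A) are the nonsplit read counts at the donor and acceptor site as defined in the preceding paragraph):

# Intron Jaccard Index for a single intron (D,A), toy counts only
n_DA         <- 25   # split reads supporting the intron of interest, n(D,A)
n_D_otherA   <- 5    # split reads from donor D to other acceptors A'
n_A_otherD   <- 3    # split reads from other donors D' to acceptor A
n_D_nonsplit <- 4    # nonsplit reads at the donor site, n(D)
n_A_nonsplit <- 2    # nonsplit reads at the acceptor site, n(A)

# sum_{A'} n(D,A') = n_DA + n_D_otherA and sum_{D'} n(D',A) = n_DA + n_A_otherD,
# so after subtracting n(D,A) once the denominator simplifies to:
jaccard <- n_DA / (n_DA + n_D_otherA + n_A_otherD + n_D_nonsplit + n_A_nonsplit)
jaccard   # 25 / 39, roughly 0.64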
+
+
\section{Quick guide to \fraser{}}
Here we quickly show how to do an analysis with \fraser{}, starting from a
@@ -204,10 +230,6 @@ fds <- calculatePSIValues(fds)
fds <- filterExpressionAndVariability(fds, minExpressionInOneSample=20,
minDeltaPsi=0.0, filter=TRUE)
-# fit the splicing model for each metric
-# with a specific latentsapce dimension
-fds <- FRASER(fds, q=c(psi5=2, psi3=3, theta=3))
-
# we provide two ways to anntoate introns with the corresponding gene symbols:
# the first way uses TxDb-objects provided by the user as shown here
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
@@ -216,17 +238,21 @@ txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
orgDb <- org.Hs.eg.db
fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb)
+# fit the splicing model for each metric
+# with a specific latent space dimension
+fds <- FRASER(fds, q=c(jaccard=2))
+
# alternatively, we also provide a way to use biomart for the annotation:
# fds <- annotateRanges(fds)
# get results: we recommend to use an FDR cutoff 0.05, but due to the small
# dataset size we extract all events and their associated values
-# eg: res <- results(fds, zScoreCutoff=NA, padjCutoff=0.05, deltaPsiCutoff=0.3)
-res <- results(fds, zScoreCutoff=NA, padjCutoff=NA, deltaPsiCutoff=NA)
+# eg: res <- results(fds, padjCutoff=0.05, deltaPsiCutoff=0.1)
+res <- results(fds, all=TRUE)
res
# result visualization
-plotVolcano(fds, sampleID="sample1", type="psi5", aggregate=TRUE)
+plotVolcano(fds, sampleID="sample1", type="jaccard", aggregate=TRUE)
@
@@ -386,9 +412,9 @@ be processed with the same protocol and origin from the same tissue.
\label{sec:filtering}
Before we can filter the data, we have to compute the main splicing metric:
-the $\psi$-value (Percent Spliced In).
+the $\psi$-value (Percent Spliced In) and the Intron Jaccard Index.
-<>=
+<>=
fds <- calculatePSIValues(fds)
fds
@
@@ -439,7 +465,7 @@ transformed $\psi$ values to compute the correlation.
<>=
# Heatmap of the sample correlation
-plotCountCorHeatmap(fds, type="psi5", logit=TRUE, normalized=FALSE)
+plotCountCorHeatmap(fds, type="jaccard", logit=TRUE, normalized=FALSE)
@
It is also possible to visualize the correlation structure of the logit
@@ -447,7 +473,7 @@ transformed $\psi$ values of the $topJ$ most variable introns for all samples:
<>=
# Heatmap of the intron/sample expression
-plotCountCorHeatmap(fds, type="psi5", logit=TRUE, normalized=FALSE,
+plotCountCorHeatmap(fds, type="jaccard", logit=TRUE, normalized=FALSE,
plotType="junctionSample", topJ=100, minDeltaPsi = 0.01)
@
@@ -470,14 +496,14 @@ p-values and z-scores for all $\psi$ types. For more details see section
<>=
# This is computational heavy on real size datasets and can take awhile
-fds <- FRASER(fds, q=c(psi5=3, psi3=5, theta=2))
+fds <- FRASER(fds, q=c(jaccard=3))
@
To check whether the correction worked, we can have a look at the correlation
heatmap using the normalized $\psi$ values from the fit.
<>=
-plotCountCorHeatmap(fds, type="psi5", normalized=TRUE, logit=TRUE)
+plotCountCorHeatmap(fds, type="jaccard", normalized=TRUE, logit=TRUE)
@
\subsubsection{Calling splicing outliers}
@@ -519,34 +545,58 @@ the following additional information:
\item hgncSymbol: the gene symbol of the gene that contains the splice
junction or site if available
\item type: the metric for which the aberrant event was detected (either
- psi5 for $\psi_5$, psi3 for $\psi_3$ or theta for $\theta$)
- \item pValue, padjust, zScore: the p-value, adjusted p-value and z-score of
- this event
+ jaccard for Intron Jaccard Index or psi5 for $\psi_5$, psi3 for $\psi_3$ or
+ theta for $\theta$)
+ \item pValue, padjust: the p-value and adjusted p-value (FDR) of
+ this event (at intron or splice site level depending on metric)
+ \item pValueGene, padjustGene: only present in the gene-level results table,
+ give the p-value and FDR-adjusted p-value at the gene level
\item psiValue: the value of $\psi_5$, $\psi_3$ or $\theta$ metric
(depending on the type column) of this junction or splice site for the
sample in which it is detected as aberrant
\item deltaPsi: the $\Delta\psi$-value of the event in this sample, which
is the difference between the actual observed $\psi$ and the expected $\psi$
+ \item counts, totalCounts: the count (k) and total count (n) of the splice
+ junction or site for the sample where it is detected as aberrant
\item meanCounts: the mean count (k) of reads mapping to this splice
junction or site over all samples
\item meanTotalCounts: the mean total count (n) of reads mapping to the
same donor or acceptor site as this junction or site over all samples
- \item counts, totalCounts: the count (k) and total count (n) of the splice
- junction or site for the sample where it is detected as aberrant
+ \item nonsplitCounts, nonsplitProportion: only present for the Intron
+ Jaccard Index. States the sum of nonsplit counts overlapping either the
+ donor or acceptor site of the outlier intron for the sample where it is
+ detected as aberrant; and their proportion out of the total counts (N).
+ A high nonsplitProportion indicates possible (partial) intron retention.
+ \item FDR\_set: the set of genes on which FDR correction is applied. If not
+ otherwise specified, FDR correction is transcriptome-wide.
\end{itemize}
Please refer to section \ref{sec:Introduction} for more information about the
-metrics $\psi_5$, $\psi_3$ and $\theta$ and their definition. In general, an
+Intron Jaccard Index metric (or the previous metrics $\psi_5$, $\psi_3$ and
+$\theta$) and their definition. In general, an
aberrant $\psi_5$ value might indicate aberrant acceptor site usage of the
junction where the event is detected; an aberrant $\psi_3$ value might indicate
aberrant donor site usage of the junction where the event is detected; and an
aberrant $\theta$ value might indicate partial or full intron retention, or
-exon truncation or elongation. We recommend using a genome browser to
-investigate interesting detected events in more detail.
+exon truncation or elongation. As the Intron Jaccard Index combines the
+previously described metrics, an aberrant Intron Jaccard Index value can
+indicate any of the cases described above. We recommend using a genome browser to
+investigate interesting detected events in more detail. \fraser{}2 also
+provides the function \Rfunction{plotBamCoverageFromResultTable} to create a
+sashimi plot for an outlier in the results table directly in R (if paths to
+bam files are available in the \fds{} object).
<>=
-# to show result visualization functions for this tuturial, zScore cutoff used
-res <- results(fds, zScoreCutoff=2, padjCutoff=NA, deltaPsiCutoff=0.1)
+# to show result visualization functions for this tutorial, no cutoff used
+res <- results(fds, all=TRUE)
res
+
+# for the gene level pvalues, gene symbols need to be annotated in the fds object
+# before calling the calculatePadjValues function (part of the FRASER() function)
+# as FRASER() was called before annotating genes here, we run it again
+fds <- calculatePadjValues(fds, type="jaccard", geneLevel=TRUE)
+# generate gene-level results table (if gene symbols have been annotated)
+res_gene <- results(fds, aggregate=TRUE, all=TRUE)
+res_gene
@
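As a hedged follow-up to the column description above, the nonsplitProportion column could be used to flag potential (partial) intron retention candidates; the 0.2 cutoff below is an arbitrary illustration, not a package recommendation:

# sketch: list result rows with a high proportion of nonsplit reads
library(data.table)
res_dt <- as.data.table(res)
res_dt[nonsplitProportion > 0.2]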
\subsection{Finding splicing candidates in patients}
@@ -555,7 +605,7 @@ Let's hava a look at sample 10 and check if we got some splicing
candidates for this sample.
<>=
-plotVolcano(fds, type="psi5", "sample10")
+plotVolcano(fds, type="jaccard", "sample10")
@
Which are the splicing events in detail?
@@ -568,8 +618,9 @@ sampleRes
To have a closer look at the junction level, use the following functions:
<>=
-plotExpression(fds, type="psi5", result=sampleRes[1])
-plotExpectedVsObservedPsi(fds, result=sampleRes[1])
+plotExpression(fds, type="jaccard", result=sampleRes[9])
+plotSpliceMetricRank(fds, type="jaccard", result=sampleRes[9])
+plotExpectedVsObservedPsi(fds, result=sampleRes[9])
@
\subsection{Saving and loading a \fds{}}
@@ -619,7 +670,7 @@ confounders in the data. Currently the following methods are implemented:
# Using an alternative way to correct splicing ratios
# here: only 2 iteration to speed the calculation up
# for the vignette, the default is 15 iterations
-fds <- fit(fds, q=3, type="psi5", implementation="PCA-BB-Decoder",
+fds <- fit(fds, q=3, type="jaccard", implementation="PCA-BB-Decoder",
iterations=2)
@
@@ -638,17 +689,17 @@ for a subset of the dataset:
<>=
set.seed(42)
# hyperparameter opimization
-fds <- optimHyperParams(fds, type="psi5", plot=FALSE)
+fds <- optimHyperParams(fds, type="jaccard", plot=FALSE)
# retrieve the estimated optimal dimension of the latent space
-bestQ(fds, type="psi5")
+bestQ(fds, type="jaccard")
@
The results from this hyper parameter optimization can be visualized with the
function \Rfunction{plotEncDimSearch}.
<>=
-plotEncDimSearch(fds, type="psi5")
+plotEncDimSearch(fds, type="jaccard")
@
\subsection{P-value calculation}
@@ -669,8 +720,8 @@ computed as the product of the fitted correction values from the autoencoder and
the fitted mean adjustements.
<>=
-fds <- calculatePvalues(fds, type="psi5")
-head(pVals(fds, type="psi5"))
+fds <- calculatePvalues(fds, type="jaccard")
+head(pVals(fds, type="jaccard"))
@
Afterwards, adjusted p-values can be calculated. Multiple testing correction is
@@ -680,32 +731,21 @@ methods supported by \Rfunction{p.adjust} can be used via the \Robject{method}
argument.
<>=
-fds <- calculatePadjValues(fds, type="psi5", method="BY")
-head(padjVals(fds,type="psi5"))
+fds <- calculatePadjValues(fds, type="jaccard", method="BY")
+head(padjVals(fds,type="jaccard"))
@
-\subsection{Z-score calculation}
-\label{sec:Z-score-calculation}
-
-To calculate z-scores on the logit transformed $\Delta\psi$ values and to store
-them in the \fds{} object, the function \Rfunction{calculateZScores} can be
-called. The Z-scores can be used for visualization, filtering, and ranking of
-samples. The Z-scores are calculated as follows:
-
-\begin{equation}
- z_{ij} = \frac{\delta_{ij} - \bar{\delta_j}}{sd(\delta_j)}
-\end{equation}
-\begin{equation*}
- \delta_{ij} = logit{(\frac{k_{ij} + 1}{n_{ij} + 2})} - logit{(\mu_{ij})},
-\end{equation*}
-
-where $\delta_{ij}$ is the difference on the logit scale between the measured
-counts and the counts after correction for confounders and $\bar{\delta_j}$ is
-the mean of intron $j$.
+With FRASER 2.0 we introduce the option to limit FDR correction to a subset of
+genes based on prior knowledge, e.g. genes that contain a rare variant per
+sample. To use this option, provide a list of genes per sample during FDR
+computation:
-<>=
-fds <- calculateZscore(fds, type="psi5")
-head(zScores(fds, type="psi5"))
+<>=
+genesOfInterest <- list("sample1"=c("XAB2", "PNPLA6", "STXBP2", "ARHGEF18"),
+ "sample2"=c("ARHGEF18", "TRAPPC5"))
+fds <- calculatePadjValues(fds, type="jaccard",
+ subsets=list("exampleSubset"=genesOfInterest))
+head(padjVals(fds, type="jaccard", subsetName="exampleSubset"))
@
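The subset-specific adjusted p-values can also be retrieved at the gene level once gene symbols are annotated; a short sketch following the accessor call shown above (the subsetName has to match the name used during calculatePadjValues()):

# gene-level adjusted p-values restricted to the FDR subset defined above
head(padjVals(fds, type="jaccard", level="gene", subsetName="exampleSubset"))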
\subsection{Result visualization}
@@ -713,6 +753,7 @@ head(zScores(fds, type="psi5"))
In addition to the plotting methods \Rfunction{plotVolcano},
\Rfunction{plotExpression}, \Rfunction{plotExpectedVsObservedPsi},
+\Rfunction{plotSpliceMetricRank},
\Rfunction{plotFilterExpression} and \Rfunction{plotEncDimSearch} used above,
the \fraser{} package provides two additional functions to visualize the
results:
@@ -731,6 +772,54 @@ plotQQ(fds, result=res[1])
plotQQ(fds, aggregate=TRUE, global=TRUE)
@
+The \Rfunction{plotManhattan} function can be used to visualize the p-values
+along with the genomic coordinates of the introns:
+<>=
+plotManhattan(fds, sampleID="sample10")
+plotManhattan(fds, sampleID="sample10", chr="chr19")
+@
+
+Finally, when one has access to the bam files from which the split and unsplit
+counts of FRASER were created, the \Rfunction{plotBamCoverage} and
+\Rfunction{plotBamCoverageFromResultTable} functions use the \Rpackage{SGSeq}
+package to visualize the read coverage in the bam file, either for a certain
+intron from the results table or within a given genomic region, as a sashimi plot:
+<>=
+### plot coverage from bam file for a certain genomic region
+fds <- createTestFraserSettings()
+vizRange <- GRanges(seqnames="chr19",
+ IRanges(start=7587496, end=7598895),
+ strand="+")
+plotBamCoverage(fds, gr=vizRange, sampleID="sample3",
+ control_samples="sample2", min_junction_count=5,
+ curvature_splicegraph=1, curvature_coverage=1,
+ mar=c(1, 7, 0.1, 3))
+
+### plot coverage from bam file for a row in the result table
+fds <- createTestFraserDataSet()
+
+# load gene annotation
+require(TxDb.Hsapiens.UCSC.hg19.knownGene)
+txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
+require(org.Hs.eg.db)
+orgDb <- org.Hs.eg.db
+
+# get results table
+res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA)
+res_dt <- as.data.table(res)
+res_dt <- res_dt[sampleID == "sample2",]
+
+# plot full range of gene highlighting the outlier intron
+plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=TRUE,
+ txdb=txdb, orgDb=orgDb, control_samples="sample3")
+
+# plot only certain range around the outlier intron
+plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE,
+ control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb,
+ curvature_coverage=0.5, right_extension=5000, left_extension=5000,
+ splicegraph_labels="id")
+@
+
\bibliography{bibliography}
\section{Session Info}
diff --git a/vignettes/IntronJaccardIndex_sketch.png b/vignettes/IntronJaccardIndex_sketch.png
new file mode 100644
index 00000000..79cca6a8
Binary files /dev/null and b/vignettes/IntronJaccardIndex_sketch.png differ