diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml index 946e96ef..e32789e2 100644 --- a/.github/workflows/check-bioc.yml +++ b/.github/workflows/check-bioc.yml @@ -249,7 +249,7 @@ jobs: rcmdcheck::rcmdcheck( args = c("--no-manual", "--no-vignettes", "--timings"), build_args = c("--no-manual", "--keep-empty-dirs", "--no-resave-data"), - error_on = "warning", + error_on = "error", check_dir = "check" ) shell: Rscript {0} diff --git a/DESCRIPTION b/DESCRIPTION index 77e0fc50..2dc759a1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,12 +1,13 @@ Package: FRASER Type: Package Title: Find RAre Splicing Events in RNA-Seq Data -Version: 1.11.0 -Date: 2021-04-05 +Version: 1.99.0 +Date: 2022-11-25 Authors@R: c( person("Christian", "Mertes", role=c("aut", "cre"), email="mertes@in.tum.de"), person("Ines", "Scheller", role=c("aut"), email="scheller@in.tum.de"), + person("Karoline", "Lutz", role=c("aut")), person("Vicente", "Yepez", role=c("ctb"), email="yepez@in.tum.de"), person("Julien", "Gagneur", role=c("aut"), email="gagneur@in.tum.de")) Description: Detection of rare aberrant splicing events in transcriptome @@ -28,7 +29,7 @@ biocViews: License: MIT + file LICENSE URL: https://github.com/gagneurlab/FRASER BugRepots: https://github.com/gagneurlab/FRASER/issues -RoxygenNote: 7.1.2 +RoxygenNote: 7.2.2 Encoding: UTF-8 VignetteBuilder: knitr Depends: @@ -82,9 +83,13 @@ Suggests: covr, TxDb.Hsapiens.UCSC.hg19.knownGene, org.Hs.eg.db, + rtracklayer, + SGSeq, + ggbio, + biovizBase LinkingTo: - Rcpp, - RcppArmadillo + RcppArmadillo, + Rcpp Collate: variables.R getNSetterFuns.R @@ -114,3 +119,4 @@ Collate: fitCorrectionMethods.R plotMethods.R zzz.R + resultAnnotations.R diff --git a/NAMESPACE b/NAMESPACE index 35e2d66a..907f1408 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,6 +6,7 @@ export("condition<-") export("currentType<-") export("dontWriteHDF5<-") export("featureExclusionMask<-") +export("fitMetrics<-") export("name<-") export("nonSplicedReads<-") export("pairedEnd<-") @@ -22,12 +23,16 @@ export(K) export(N) export(aberrant) export(addCountsToFraserDataSet) +export(annotateIntronReferenceOverlap) +export(annotatePotentialImpact) export(annotateRanges) export(annotateRangesWithTxDb) +export(availableFDRsubsets) export(bamFile) export(bestQ) export(calculatePSIValues) export(calculatePadjValues) +export(calculatePadjValuesOnSubset) export(calculatePvalues) export(calculateZscore) export(condition) @@ -44,6 +49,8 @@ export(filterExpression) export(filterExpressionAndVariability) export(filterVariability) export(fit) +export(fitMetrics) +export(flagBlacklistRegions) export(getNonSplitReadCountsForAllSamples) export(getSplitReadCountsForAllSamples) export(hyperParams) @@ -60,19 +67,22 @@ export(pVals) export(padjVals) export(pairedEnd) export(plotAberrantPerSample) +export(plotBamCoverage) +export(plotBamCoverageFromResultTable) export(plotCountCorHeatmap) export(plotEncDimSearch) export(plotExpectedVsObservedPsi) export(plotExpression) export(plotFilterExpression) export(plotFilterVariability) +export(plotManhattan) export(plotQQ) +export(plotSpliceMetricRank) export(plotVolcano) export(predictedMeans) export(pseudocount) export(psiTypes) export(results) -export(resultsByGenes) export(rho) export(samples) export(saveFraserDataSet) @@ -98,6 +108,7 @@ exportMethods(assays) exportMethods(bamFile) exportMethods(condition) exportMethods(filterExpression) +exportMethods(filterVariability) exportMethods(length) exportMethods(name) exportMethods(nonSplicedReads) @@ -105,6 +116,7 @@ 
exportMethods(pairedEnd) exportMethods(plotAberrantPerSample) exportMethods(plotCountCorHeatmap) exportMethods(plotEncDimSearch) +exportMethods(plotManhattan) exportMethods(plotQQ) exportMethods(plotVolcano) exportMethods(results) @@ -158,27 +170,40 @@ importFrom(GenomeInfoDb,seqlengths) importFrom(GenomeInfoDb,seqlevels) importFrom(GenomeInfoDb,seqlevelsStyle) importFrom(GenomeInfoDb,seqnames) +importFrom(GenomeInfoDb,sortSeqlevels) importFrom(GenomeInfoDb,standardChromosomes) importFrom(GenomicAlignments,junctions) importFrom(GenomicAlignments,readGAlignmentPairs) importFrom(GenomicAlignments,readGAlignments) importFrom(GenomicAlignments,summarizeJunctions) +importFrom(GenomicFeatures,exons) +importFrom(GenomicFeatures,fiveUTRsByTranscript) importFrom(GenomicFeatures,genes) importFrom(GenomicFeatures,intronsByTranscript) importFrom(GenomicFeatures,makeTxDbFromGFF) +importFrom(GenomicFeatures,seqlevels0) +importFrom(GenomicFeatures,threeUTRsByTranscript) +importFrom(GenomicRanges,"end<-") +importFrom(GenomicRanges,"seqinfo<-") +importFrom(GenomicRanges,"start<-") importFrom(GenomicRanges,GRanges) importFrom(GenomicRanges,GRangesList) +importFrom(GenomicRanges,end) importFrom(GenomicRanges,findOverlaps) importFrom(GenomicRanges,granges) importFrom(GenomicRanges,invertStrand) importFrom(GenomicRanges,makeGRangesFromDataFrame) +importFrom(GenomicRanges,start) importFrom(HDF5Array,HDF5Array) importFrom(HDF5Array,loadHDF5SummarizedExperiment) importFrom(HDF5Array,path) importFrom(HDF5Array,saveHDF5SummarizedExperiment) importFrom(HDF5Array,writeHDF5Array) +importFrom(IRanges,"%over%") importFrom(IRanges,IRanges) +importFrom(IRanges,distance) importFrom(IRanges,from) +importFrom(IRanges,nearest) importFrom(IRanges,ranges) importFrom(IRanges,subsetByOverlaps) importFrom(IRanges,to) @@ -204,15 +229,18 @@ importFrom(Rsamtools,scanBamHeader) importFrom(Rsubread,featureCounts) importFrom(S4Vectors,"mcols<-") importFrom(S4Vectors,"metadata<-") +importFrom(S4Vectors,"values<-") importFrom(S4Vectors,DataFrame) importFrom(S4Vectors,Rle) importFrom(S4Vectors,SimpleList) +importFrom(S4Vectors,elementMetadata) importFrom(S4Vectors,end) importFrom(S4Vectors,mcols) importFrom(S4Vectors,metadata) importFrom(S4Vectors,queryHits) importFrom(S4Vectors,start) importFrom(S4Vectors,subjectHits) +importFrom(S4Vectors,values) importFrom(SummarizedExperiment,"assay<-") importFrom(SummarizedExperiment,"assays<-") importFrom(SummarizedExperiment,"colData<-") @@ -236,6 +264,7 @@ importFrom(VGAM,rbetabinom) importFrom(VGAM,vglm) importFrom(biomaRt,getBM) importFrom(biomaRt,useEnsembl) +importFrom(cowplot,background_grid) importFrom(cowplot,theme_cowplot) importFrom(extraDistr,dbbinom) importFrom(extraDistr,pbbinom) @@ -245,6 +274,8 @@ importFrom(ggplot2,aes) importFrom(ggplot2,annotate) importFrom(ggplot2,annotation_logticks) importFrom(ggplot2,element_blank) +importFrom(ggplot2,facet_grid) +importFrom(ggplot2,facet_wrap) importFrom(ggplot2,geom_abline) importFrom(ggplot2,geom_histogram) importFrom(ggplot2,geom_hline) @@ -253,10 +284,12 @@ importFrom(ggplot2,geom_point) importFrom(ggplot2,geom_ribbon) importFrom(ggplot2,geom_segment) importFrom(ggplot2,geom_smooth) +importFrom(ggplot2,geom_text) importFrom(ggplot2,geom_vline) importFrom(ggplot2,ggplot) importFrom(ggplot2,ggtitle) importFrom(ggplot2,labs) +importFrom(ggplot2,quo_name) importFrom(ggplot2,scale_color_brewer) importFrom(ggplot2,scale_color_discrete) importFrom(ggplot2,scale_color_gradientn) @@ -271,6 +304,7 @@ importFrom(ggplot2,theme_bw) 
importFrom(ggplot2,xlab) importFrom(ggplot2,xlim) importFrom(ggplot2,ylab) +importFrom(ggplot2,ylim) importFrom(ggrepel,geom_text_repel) importFrom(grDevices,colorRampPalette) importFrom(matrixStats,colAnys) @@ -324,9 +358,11 @@ importFrom(stats,rnbinom) importFrom(stats,rnorm) importFrom(stats,runif) importFrom(stats,sd) +importFrom(tibble,"%>%") importFrom(tibble,as_tibble) importFrom(tools,file_path_as_absolute) importFrom(utils,capture.output) importFrom(utils,packageVersion) +importFrom(utils,tail) importMethodsFrom(OUTRIDER,results) useDynLib(FRASER) diff --git a/NEWS b/NEWS index 15ca2c35..6728c4a5 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,24 @@ +CHANGES IN VERSION 2.0.0 +------------------------- + o Major update to FRASER2: + o Introduction of new & more robust splice metric Intron Jaccard Index + o Only Intron Jaccard Index metric used by default + o Improved gene level pvalue calculation and internal storage + o Introduction of option to limit FDR correction to user-defined + subsets of genes per sample (e.g. OMIM genes with rare variant) + o Updated internal pseudocount parameter and default delta Jaccard + cutoff + o Junction filtering adapted to usage of Intron Jaccard Index metric + o Require min expression of N >= 10 in 25% of the samples + o Results table: + o Functionality to flag outliers in blacklist regions of the genome + o Functionality to annotate the predicted type of aberrantSplicing + (e.g. exon skipping, intron retention etc.) + o Several updates in the plotting functions + o introduction of manhattan plot functionality + o possibility to create sashimi plots to visualize read coverage in + the bam files for outliers + CHANGES IN VERSION 1.8.1 ------------------------- o Bugfix in merging splicing counts (#41) diff --git a/R/AllGenerics-definitions.R b/R/AllGenerics-definitions.R index ced4142a..0015b143 100644 --- a/R/AllGenerics-definitions.R +++ b/R/AllGenerics-definitions.R @@ -146,3 +146,14 @@ setGeneric("nonSplicedReads", #' @export setGeneric("nonSplicedReads<-", signature = "object", function(object, value) standardGeneric("nonSplicedReads<-")) + +#' @rdname plotFunctions +#' @export +setGeneric("plotManhattan", function(object, ...) + standardGeneric("plotManhattan")) + +#' @rdname filtering +#' @export +setGeneric("filterVariability", function(object, ...) + standardGeneric("filterVariability")) + diff --git a/R/AllGenerics.R b/R/AllGenerics.R index cfb73078..05eb80d1 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -12,8 +12,8 @@ asFDS <- function(x){ #' #' @title Getter/Setter methods for the FraserDataSet #' -#' The following methods are getter and setter methods to extract or set -#' certain values of a FraserDataSet object. +#' @description The following methods are getter and setter methods to extract +#' or set certain values of a FraserDataSet object. 
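# A minimal sketch of the FRASER2 workflow summarised in the NEWS entry above,
# run on the small test dataset shipped with the package; every call below is
# taken from the updated examples elsewhere in this patch. The new result
# annotation and plotting helpers (flagBlacklistRegions, annotatePotentialImpact,
# plotManhattan, plotBamCoverageFromResultTable) are exported above; see their
# documentation for the exact signatures.
library(FRASER)
fds <- createTestFraserDataSet()
# restrict the FDR correction to a user-defined subset of genes per sample
genesOfInterest <- list("sample1"=c("TIMMDC1"), "sample2"=c("MCOLN1"))
fds <- calculatePadjValues(fds, type="jaccard",
                           subsets=list("exampleSubset"=genesOfInterest))
padjVals(fds, type="jaccard", subsetName="exampleSubset")
# gene-level results based on the Intron Jaccard Index metric
res <- results(fds, aggregate=TRUE)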
#' #' \code{samples} sets or gets the sample IDs; \code{condition} ; #' \code{} @@ -66,6 +66,9 @@ NULL #' @rdname fds-methods #' @export setMethod("samples", "FraserDataSet", function(object) { + if(!is.null(colnames(object))){ + return(colnames(object)) + } return(as.character(colData(object)[,"sampleID"])) }) @@ -74,6 +77,7 @@ setMethod("samples", "FraserDataSet", function(object) { setReplaceMethod("samples", "FraserDataSet", function(object, value) { colData(object)[,"sampleID"] <- as.character(value) rownames(colData(object)) <- colData(object)[,"sampleID"] + colnames(object) <- as.character(value) validObject(object) return(object) }) @@ -515,12 +519,14 @@ setReplaceMethod("rowRanges", "FraserDataSet", FRASER.rowRanges.replace) #' @examples #' fds <- createTestFraserDataSet() #' -#' counts(fds, type="psi5", side="ofInterest") -#' counts(fds, type="psi5", side="other") +#' counts(fds, side="ofInterest") +#' counts(fds, type="jaccard", side="other") +#' head(K(fds)) +#' head(K(fds, type="psi5")) #' head(K(fds, type="psi3")) #' head(N(fds, type="theta")) #' -setMethod("counts", "FraserDataSet", function(object, type=NULL, +setMethod("counts", "FraserDataSet", function(object, type=currentType(object), side=c("ofInterest", "otherSide")){ side <- match.arg(side) if(side=="ofInterest"){ @@ -536,8 +542,9 @@ setMethod("counts", "FraserDataSet", function(object, type=NULL, # extract psi value from type type <- whichPSIType(type) if(length(type) == 0 | length(type) > 1){ - stop(paste0("Please provide a correct psi type: psi5, psi3, or ", - "theta. Not the given one: '", type, "'.")) + stop(paste0("Please provide a correct psi type: psi5, psi3, ", + "theta or jaccard. Not the given one: '", + type, "'.")) } aname <- paste0("rawOtherCounts_", type) if(!aname %in% assayNames(object)){ @@ -551,7 +558,8 @@ setMethod("counts", "FraserDataSet", function(object, type=NULL, #' setter for count data #' #' @rdname counts -setReplaceMethod("counts", "FraserDataSet", function(object, type=NULL, +setReplaceMethod("counts", "FraserDataSet", function(object, + type=currentType(object), side=c("ofInterest", "otherSide"), ..., value){ side <- match.arg(side) @@ -578,150 +586,289 @@ setAs("DataFrame", "matrix", function(from){ as.matrix(as(from, "data.table")) }) #' -#' retrieve a single sample result object -#' @noRd -resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores, psivals, - rawCts, rawTotalCts, deltaPsiVals, muPsi, psiType, fdrCut, - zscoreCut, dPsiCut, rowMeansK, rowMeansN, minCount, - additionalColumns){ - - zscore <- zscores[,sampleID] - dpsi <- deltaPsiVals[,sampleID] - pval <- pvals[,sampleID] - padj <- padjs[,sampleID] - - goodCut <- !logical(length(zscore)) - if(!is.na(zscoreCut)){ - goodCut <- goodCut & na2default(abs(zscore) >= zscoreCut, TRUE) - } - if(!is.na(dPsiCut)){ - goodCut <- goodCut & na2default(abs(dpsi) >= dPsiCut, TRUE) - } - if(!is.na(fdrCut)){ - goodCut <- goodCut & na2false(padj <= fdrCut) +#' Mapping of chromosome names +#' +#' @param fds FraserDataSet +#' @param style The style of the chromosome names. +#' @param ... Further parameters. For mapSeqLevels: further parameters +#' passed to GenomeInfoDb::mapSeqlevels(). 
+#' +#' @rdname fds-methods +#' @export +mapSeqlevels <- function(fds, style="UCSC", ...){ + + mappings <- na.omit(GenomeInfoDb::mapSeqlevels(seqlevels(fds), style, ...)) + # fix missing names() when fds has only a single chromosome + if(is.null(names(mappings))){ + names(mappings) <- seqlevels(fds) } - if(!is.na(minCount)){ - goodCut <- goodCut & rawTotalCts[,sampleID] >= minCount + + if(length(mappings) != length(seqlevels(fds))){ + message(date(), ": Drop non standard chromosomes for compatibility.") + fds <- keepStandardChromosomes(fds) + nonSplicedReads(fds) <- keepStandardChromosomes(nonSplicedReads(fds)) + validObject(fds) } + fds <- fds[as.vector(seqnames(fds)) %in% names(mappings)] + + seqlevels(fds) <- as.vector(mappings) + seqlevels(nonSplicedReads(fds)) <- as.vector(mappings) + + return(fds) +} +#' +#' retrieve a single sample result object +#' @noRd +resultsSingleSample <- function(sampleID, gr, pvals, padjs, + psivals, rawCts, rawTotalCts, rawNonsplitCts, + rawNsProportion, nsProportion_99quantile, + deltaPsiVals, psiType, rowMeansK, rowMeansN, + aberrant, aggregate, rho, + pvalsGene=NULL, padjsGene=NULL, + aberrantGene, additionalColumns, + geneColumn="hgnc_symbol"){ + mcols(gr)$idx <- seq_along(gr) + # if gene level results, find the most aberrant junction per gene first + if(isTRUE(aggregate)){ + goodGenes <- rownames(aberrantGene)[aberrantGene[,sampleID] & + !is.na(aberrantGene[,sampleID])] + geneJunctions <- findJunctionsForAberrantGenes(gr=gr, + aberrantGenes=goodGenes, + pvals=pvals[,sampleID], + dpsi=deltaPsiVals[,sampleID], + geneColumn=geneColumn, + aberrantJunctions=aberrant[,sampleID]) + goodCut <- rep(FALSE, nrow(pvals)) + goodCut[geneJunctions] <- TRUE + } else{ + goodCut <- aberrant[,sampleID] + } + ans <- granges(gr[goodCut]) - + if(!any(goodCut)){ return(ans) } - - if(!"hgnc_symbol" %in% colnames(mcols(gr))){ - mcols(gr)$hgnc_symbol <- NA_character_ + mcols(ans)$idx <- mcols(gr)$idx[goodCut] + + if(!geneColumn %in% colnames(mcols(gr))){ + mcols(gr)[,geneColumn] <- NA_character_ } - + # extract data mcols(ans)$sampleID <- Rle(sampleID) if("hgnc_symbol" %in% colnames(mcols(gr))){ - mcols(ans)$hgncSymbol <- Rle(mcols(gr[goodCut])$hgnc_symbol) - } - if("other_hgnc_symbol" %in% colnames(mcols(gr))){ - mcols(ans)$addHgncSymbols <- Rle(mcols(gr[goodCut])$other_hgnc_symbol) + mcols(ans)$hgncSymbol <- Rle(mcols(gr[goodCut])[,geneColumn]) } + mcols(ans)$type <- Rle(psiType) - mcols(ans)$pValue <- signif(pval[goodCut], 5) - mcols(ans)$padjust <- signif(padj[goodCut], 5) - mcols(ans)$zScore <- Rle(round(zscore[goodCut], 2)) + mcols(ans)$pValue <- signif(pvals[goodCut,sampleID], 5) + mcols(ans)$padjust <- signif(padjs[goodCut,sampleID], 5) mcols(ans)$psiValue <- Rle(round(psivals[goodCut,sampleID], 2)) - mcols(ans)$deltaPsi <- Rle(round(dpsi[goodCut], 2)) - mcols(ans)$meanCounts <- Rle(round(rowMeansK[goodCut], 2)) - mcols(ans)$meanTotalCounts <- Rle(round(rowMeansN[goodCut], 2)) + mcols(ans)$deltaPsi <- round(deltaPsiVals[goodCut,sampleID], 2) mcols(ans)$counts <- Rle(rawCts[goodCut, sampleID]) mcols(ans)$totalCounts <- Rle(rawTotalCts[goodCut, sampleID]) + mcols(ans)$meanCounts <- Rle(round(rowMeansK[goodCut], 2)) + mcols(ans)$meanTotalCounts <- Rle(round(rowMeansN[goodCut], 2)) + + if(psiType == "jaccard"){ + mcols(ans)$nonsplitCounts <- + Rle(round(rawNonsplitCts[goodCut, sampleID], 2)) + mcols(ans)$nonsplitProportion <- + Rle(round(rawNsProportion[goodCut, sampleID], 2)) + mcols(ans)$nonsplitProportion_99quantile <- + Rle(round(nsProportion_99quantile[goodCut], 2)) + 
} if(!is.null(additionalColumns)){ for(column in additionalColumns){ mcols(ans)[,column] <- Rle(mcols(gr[goodCut])[,column]) } } - - return(ans[order(mcols(ans)$pValue)]) + + if(isTRUE(aggregate)){ + # report junction more than once if it is significant for several genes + nrGenesPerJunction <- table(geneJunctions) + ans <- rep(ans, nrGenesPerJunction[as.character(mcols(ans)$idx)]) + mcols(ans)$hgncSymbol <- + as.data.table(ans)[, names(geneJunctions)[geneJunctions == idx], + by = eval(colnames(mcols(ans)))][,V1] + + # add gene level pvalue + mcols(ans)$pValueGene <- + signif(pvalsGene[mcols(ans)$hgncSymbol,sampleID], 5) + mcols(ans)$padjustGene <- + signif(padjsGene[mcols(ans)$hgncSymbol,sampleID], 5) + mcols(ans)$hgncSymbol <- Rle(mcols(ans)$hgncSymbol) + } + + # remove helper column + mcols(ans)$idx <- NULL + + + return(ans[order(mcols(ans)$pValue, -abs(mcols(ans)$deltaPsi))]) } -FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, - dPsiCutoff, psiType, BPPARAM=bpparam(), maxCols=20, - minCount, additionalColumns=NULL){ - - # check input - checkNaAndRange(fdrCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE) - checkNaAndRange(dPsiCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE) - checkNaAndRange(zscoreCutoff, min=0, max=100, scalar=TRUE, na.ok=TRUE) - checkNaAndRange(minCount, min=0, max=Inf, scalar=TRUE, na.ok=TRUE) - +FRASER.results <- function(object, sampleIDs, fdrCutoff, + dPsiCutoff, minCount, rhoCutoff, psiType, + maxCols=20, aggregate=FALSE, collapse=FALSE, + geneColumn="hgnc_symbol", BPPARAM=bpparam(), + subsetName=NULL, all=all, additionalColumns=NULL){ + stopifnot(is(object, "FraserDataSet")) stopifnot(all(sampleIDs %in% samples(object))) - + + if("annotatedJunction" %in% colnames(mcols(object, type="j")) && + !("annotatedJunction" %in% additionalColumns)){ + additionalColumns <- c(additionalColumns, "annotatedJunction") + } + + # only extract results for requested psiTypes if pvals exist for them + stopifnot(all(psiType %in% psiTypes)) + if(is.na(rhoCutoff)){ + rhoCutoff <- 1 + } + pvalsAvailable <- checkPadjAvailableForFilters(object, type=psiType, + filters=list(rho=rhoCutoff), + aggregate=aggregate, + subsetName=subsetName) + psiType <- psiType[pvalsAvailable] + if(all(isFALSE(pvalsAvailable))){ + stop("For the splice metric(s), pvalues are not yet computed. 
\n", + "Please compute them first by running the ", + "calculatePadjValues function.") + } + resultsls <- bplapply(psiType, BPPARAM=BPPARAM, function(type){ - message(date(), ": Collecting results for: ", type) + message(date(), ": Collecting results for: ", type, + ifelse(is.null(subsetName), " (transcriptome-wide)", + paste0(" (", subsetName, ")"))) currentType(object) <- type gr <- rowRanges(object, type=type) - + # first get row means rowMeansK <- rowMeans(K(object, type=type)) rowMeansN <- rowMeans(N(object, type=type)) - + + # get proportion of nonsplitCounts among all counts (N) for each intron + if(type == "jaccard"){ + rawNonsplitCts <- as.matrix(assay(object, "rawCountsJnonsplit")) + rawNsProportion <- rawNonsplitCts / as.matrix(N(object)) + nsProportion_99quantile <- + rowQuantiles(rawNsProportion, probs=0.99) + } else{ + rawNonsplitCts <- NULL + rawNsProportion <- NULL + nsProportion_99quantile <- NULL + } + # then iterate by chunk chunkCols <- getMaxChunks2Read(fds=object, assayName=type, max=maxCols) sampleChunks <- getSamplesByChunk(fds=object, sampleIDs=sampleIDs, - chunkSize=chunkCols) - + chunkSize=chunkCols) + ans <- lapply(seq_along(sampleChunks), function(idx){ message(date(), ": Process chunk: ", idx, " for: ", type) sc <- sampleChunks[[idx]] tmp_x <- object[,sc] - + # extract values rawCts <- as.matrix(K(tmp_x)) rawTotalCts <- as.matrix(N(tmp_x)) - pvals <- as.matrix(pVals(tmp_x)) - padjs <- as.matrix(padjVals(tmp_x)) - zscores <- as.matrix(zScores(tmp_x)) + pvals <- as.matrix(pVals(tmp_x, + filters=list(rho=rhoCutoff))) + padjs <- as.matrix(padjVals(tmp_x, + subsetName=subsetName, + filters=list(rho=rhoCutoff))) psivals <- as.matrix(assay(tmp_x, type)) muPsi <- as.matrix(predictedMeans(tmp_x)) psivals_pc <- (rawCts + pseudocount()) / - (rawTotalCts + 2*pseudocount()) - deltaPsiVals <- psivals_pc - muPsi - + (rawTotalCts + 2*pseudocount()) + deltaPsiVals <- deltaPsiValue(tmp_x, type) + rho <- rho(tmp_x, type) + aberrant <- aberrant.FRASER(tmp_x, type=type, + padjCutoff=fdrCutoff, + deltaPsiCutoff=dPsiCutoff, + minCount=minCount, + rhoCutoff=rhoCutoff, + aggregate=FALSE, + all=all, + geneColumn=geneColumn, + subsetName=subsetName) + if(isTRUE(aggregate)){ + pvalsGene <- as.matrix(pVals(tmp_x, level="gene", + filters=list(rho=rhoCutoff))) + padjsGene <- as.matrix(padjVals(tmp_x, level="gene", + subsetName=subsetName, + filters=list(rho=rhoCutoff))) + aberrantGene <- aberrant.FRASER(tmp_x, type=type, + padjCutoff=fdrCutoff, + deltaPsiCutoff=dPsiCutoff, + minCount=minCount, + rhoCutoff=rhoCutoff, + aggregate=TRUE, + all=all, + geneColumn=geneColumn, + subsetName=subsetName) + } else{ + pvalsGene <- NULL + padjsGene <- NULL + aberrantGene <- NULL + } + if(length(sc) == 1){ colnames(pvals) <- sc colnames(padjs) <- sc - colnames(zscores) <- sc colnames(deltaPsiVals) <- sc } # create result table sampleRes <- lapply(sc, - resultsSingleSample, gr=gr, pvals=pvals, padjs=padjs, - zscores=zscores, psiType=type, psivals=psivals, - deltaPsiVals=deltaPsiVals, muPsi=muPsi, rawCts=rawCts, - rawTotalCts=rawTotalCts, fdrCut=fdrCutoff, - zscoreCut=zscoreCutoff, dPsiCut=dPsiCutoff, - rowMeansK=rowMeansK, rowMeansN=rowMeansN, - minCount=minCount, additionalColumns=additionalColumns) - + resultsSingleSample, gr=gr, pvals=pvals, + padjs=padjs, psiType=type, + psivals=psivals, deltaPsiVals=deltaPsiVals, + rawCts=rawCts, rawTotalCts=rawTotalCts, + rawNonsplitCts=rawNonsplitCts[,sc,drop=FALSE], + rawNsProportion=rawNsProportion[,sc,drop=FALSE], + 
nsProportion_99quantile=nsProportion_99quantile, + rowMeansK=rowMeansK, rowMeansN=rowMeansN, + aberrant=aberrant, aggregate=aggregate, + rho=rho, geneColumn=geneColumn, + pvalsGene=pvalsGene, padjsGene=padjsGene, + aberrantGene=aberrantGene, + additionalColumns=additionalColumns) + # return combined result return(unlist(GRangesList(sampleRes))) }) - + unlist(GRangesList(ans)) }) - + # merge results ans <- unlist(GRangesList(resultsls)) - + # sort it if existing if(length(ans) > 0){ ans <- ans[order(ans$pValue)] + if(is.null(subsetName)){ + mcols(ans)[["FDR_set"]] <- "transcriptome-wide" + } else{ + mcols(ans)[["FDR_set"]] <- subsetName + } } - + + # collapse into one row per gene if requested + if(isTRUE(aggregate) && isTRUE(collapse)){ + ans <- collapseResTablePerGene(ans) + } + # return only the results return(ans) } + #' #' Extracting results and aberrant splicing events #' @@ -733,31 +880,41 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, #' @param sampleIDs A vector of sample IDs for which results should be #' retrieved #' @param padjCutoff The FDR cutoff to be applied or NA if not requested. -#' @param zScoreCutoff The z-score cutoff to be applied or NA if not requested. #' @param deltaPsiCutoff The cutoff on delta psi or NA if not requested. #' @param minCount The minimum count value of the total coverage of an intron #' to be considered as significant. #' result +#' @param rhoCutoff The cutoff value on the fitted rho value +#' (overdispersion parameter of the betabinomial) above which +#' junctions are filtered #' @param psiType The psi types for which the results should be retrieved. #' @param additionalColumns Character vector containing the names of additional #' columns from mcols(fds) that should appear in the result table #' (e.g. ensembl_gene_id). Default is \code{NULL}, so no additional columns #' are included. #' @param BPPARAM The BiocParallel parameter. -#' @param res Result as created with \code{results()} -#' @param geneColumn The name of the column in \code{mcols(res)} that contains -#' the gene symbols. -#' @param method The p.adjust method that is being used to adjust p values per -#' sample. #' @param type Splicing type (psi5, psi3 or theta) #' @param by By default \code{none} which means no grouping. But if #' \code{sample} or \code{feature} is specified the sum by #' sample or feature is returned -#' @param aggregate If TRUE the returned object is based on the grouped -#' features -#' @param ... Further arguments can be passed to the method. If "zscores", -#' "padjVals" or "dPsi" is given, the values of those arguments -#' are used to define the aberrant events. +#' @param aggregate If TRUE the returned object is aggregated to the feature +#' level (i.e. gene level). +#' @param collapse Only takes effect if \code{aggregate=TRUE}. +#' If TRUE, collapses results across the different psi +#' types to return only one row per feature (gene) and sample. +#' @param geneColumn The column name of the column that has the gene annotation +#' that will be used for gene-level pvalue computation. +#' @param all By default FALSE, only significant introns (or genes) are listed +#' in the results. If TRUE, results are assembled for all +#' samples and introns/genes regardless of significance. 
+#' @param returnTranscriptomewideResults If FDR corrected pvalues for subsets +#' of genes of interest have been calculated, this parameter +#' indicates whether additionally the transcriptome-wide results +#' should be returned as well (default), or whether only results +#' for those subsets should be retrieved. +#' @param ... Further arguments can be passed to the method. If "n", +#' "padjVals", "dPsi" or "rhoVals" are given, the values of those +#' arguments are used to define the aberrant events. #' #' @return For \code{results}: GRanges object containing significant results. #' For \code{aberrant}: Either a of logical values of size @@ -770,186 +927,212 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, #' # get data, fit and compute p-values and z-scores #' fds <- createTestFraserDataSet() #' -#' # extract results: for this example dataset, z score cutoff of 2 is used to -#' # get at least one result and show the output -#' res <- results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05) +#' # extract results: for this example dataset, no cutoffs are used to +#' # show the output of the results function +#' res <- results(fds, all=TRUE) #' res #' #' # aggregate the results by genes (gene symbols need to be annotated first #' # using annotateRanges() function) -#' resultsByGenes(res) +#' results(fds, padjCutoff=NA, deltaPsiCutoff=0.1, aggregate=TRUE) +#' +#' # aggregate the results by genes and collapse over all psi types to obtain +#' # only one row per gene in the results table +#' results(fds, padjCutoff=NA, deltaPsiCutoff=0.1, aggregate=TRUE, +#' collapse=TRUE) #' #' # get aberrant events per sample: on the example data, nothing is aberrant #' # based on the adjusted p-value -#' aberrant(fds, type="psi5", by="sample") +#' aberrant(fds, type="jaccard", by="sample") #' #' # get aberrant events per gene (first annotate gene symbols) #' fds <- annotateRangesWithTxDb(fds) -#' aberrant(fds, type="psi5", by="feature", zScoreCutoff=2, padjCutoff=NA, -#' aggregate=TRUE) +#' aberrant(fds, type="jaccard", by="feature", padjCutoff=NA, aggregate=TRUE) #' #' # find aberrant junctions/splice sites -#' aberrant(fds, type="psi5") -#' @export -setMethod("results", "FraserDataSet", function(object, - sampleIDs=samples(object), padjCutoff=0.05, - zScoreCutoff=NA, deltaPsiCutoff=0.3, - minCount=5, psiType=c("psi3", "psi5", "theta"), - additionalColumns=NULL, BPPARAM=bpparam(), ...){ - FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff, - zscoreCutoff=zScoreCutoff, dPsiCutoff=deltaPsiCutoff, - minCount=minCount, psiType=match.arg(psiType, several.ok=TRUE), - additionalColumns=additionalColumns, BPPARAM=BPPARAM) -}) - -#' @rdname results -#' @export -resultsByGenes <- function(res, geneColumn="hgncSymbol", method="BY"){ - # sort by pvalue - res <- res[order(res$pValue)] - - # extract subset - if(is(res, "GRanges")){ - ans <- as.data.table(mcols(res)[,c(geneColumn, "pValue", "sampleID")]) - colnames(ans) <- c("features", "pval", "sampleID") - } else { - ans <- featureNames <- res[,.( - features=get(geneColumn), pval=pValue, sampleID=sampleID)] - } - - # remove NAs - naIdx <- ans[,is.na(features)] - ansNoNA <- ans[!is.na(features)] - - # compute pvalues by gene - ansNoNA[,pByFeature:=min(p.adjust(pval, method="holm")), - by="sampleID,features"] - - # subset to lowest pvalue by gene - dupIdx <- duplicated(ansNoNA[,.(features,sampleID)]) - ansGenes <- ansNoNA[!dupIdx] - - # compute FDR - ansGenes[,fdrByFeature:=p.adjust(pByFeature, method=method), - 
by="sampleID"] - - # get final result table - finalAns <- res[!naIdx][!dupIdx] - finalAns$pValueGene <- ansGenes$pByFeature - finalAns$padjustGene <- ansGenes$fdrByFeature - finalAns -} - -#' -#' Mapping of chromosome names -#' -#' @param fds FraserDataSet -#' @param style The style of the chromosome names. -#' @param ... Further parameters. For mapSeqLevels: further parameters -#' passed to GenomeInfoDb::mapSeqlevels(). +#' aberrant(fds, type="jaccard") +#' +#' # retrieve results limiting FDR correction to only a subset of genes +#' # first, we need to create a list of genes per sample that will be tested +#' geneList <- list('sample1'=c("TIMMDC1"), 'sample2'=c("MCOLN1")) +#' fds <- calculatePadjValues(fds, type="jaccard", +#' subsets=list("exampleSubset"=geneList)) +#' results(fds, all=TRUE, returnTranscriptomewideResults=FALSE) #' -#' @rdname fds-methods #' @export -mapSeqlevels <- function(fds, style="UCSC", ...){ - - mappings <- na.omit(GenomeInfoDb::mapSeqlevels(seqlevels(fds), style, ...)) - # fix missing names() when fds has only a single chromosome - if(is.null(names(mappings))){ - names(mappings) <- seqlevels(fds) +setMethod("results", "FraserDataSet", function(object, + sampleIDs=samples(object), padjCutoff=0.1, + deltaPsiCutoff=0.1, + rhoCutoff=NA, aggregate=FALSE, collapse=FALSE, + minCount=5, psiType=psiTypes, + geneColumn="hgnc_symbol", all=FALSE, + returnTranscriptomewideResults=TRUE, + additionalColumns=NULL, BPPARAM=bpparam()){ + psiType <- match.arg(psiType, several.ok=TRUE) + FDRsets <- availableFDRsubsets(object) + + if(isFALSE(returnTranscriptomewideResults) && is.null(FDRsets)){ + warning("Retrieving transcriptome-wide results as no other ", + "FDR subsets are available in the fds object.") + returnTranscriptomewideResults <- TRUE + } + if(isTRUE(returnTranscriptomewideResults)){ + res <- FRASER.results(object=object, sampleIDs=sampleIDs, + fdrCutoff=padjCutoff, dPsiCutoff=deltaPsiCutoff, + rhoCutoff=rhoCutoff, minCount=minCount, + psiType=psiType, all=all, + aggregate=aggregate, collapse=collapse, geneColumn=geneColumn, + subsetName=NULL, additionalColumns=additionalColumns, + BPPARAM=BPPARAM) } - - if(length(mappings) != length(seqlevels(fds))){ - message(date(), ": Drop non standard chromosomes for compatibility.") - fds <- keepStandardChromosomes(fds) - nonSplicedReads(fds) <- keepStandardChromosomes(nonSplicedReads(fds)) - validObject(fds) + + # add results for FDR_subsets if requested + if(!is.null(FDRsets)){ + resls_subsets <- lapply(FDRsets, function(setName){ + res_sub <- FRASER.results(object=object, sampleIDs=sampleIDs, + fdrCutoff=padjCutoff, dPsiCutoff=deltaPsiCutoff, + rhoCutoff=rhoCutoff, minCount=minCount, + psiType=psiType, all=all, + aggregate=aggregate, collapse=collapse, geneColumn=geneColumn, + subsetName=setName, additionalColumns=additionalColumns, + BPPARAM=BPPARAM) + }) + + if(isTRUE(returnTranscriptomewideResults)){ + res <- unlist(GRangesList(unlist(list(res, resls_subsets)))) + } else{ + res <- unlist(GRangesList(unlist(resls_subsets))) + } + + # sort it if existing + if(length(res) > 0){ + res <- res[order(res$pValue)] + if(isTRUE(aggregate)){ + res <- res[!is.na(res$pValueGene)] + } + } } - fds <- fds[as.vector(seqnames(fds)) %in% names(mappings)] - - seqlevels(fds) <- as.vector(mappings) - seqlevels(nonSplicedReads(fds)) <- as.vector(mappings) - - return(fds) -} - - -aberrant.FRASER <- function(object, type=currentType(object), padjCutoff=0.05, - deltaPsiCutoff=0.3, zScoreCutoff=NA, minCount=5, - by=c("none", "sample", "feature"), 
aggregate=FALSE, ...){ + return(res) +}) - checkNaAndRange(zScoreCutoff, min=0, max=Inf, na.ok=TRUE) - checkNaAndRange(padjCutoff, min=0, max=1, na.ok=TRUE) - checkNaAndRange(deltaPsiCutoff, min=0, max=1, na.ok=TRUE) +aberrant.FRASER <- function(object, type=fitMetrics(object), + padjCutoff=0.1, deltaPsiCutoff=0.1, + minCount=5, rhoCutoff=NA, + by=c("none", "sample", "feature"), + aggregate=FALSE, geneColumn="hgnc_symbol", + subsetName=NULL, all=FALSE, ...){ + + checkNaAndRange(padjCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE) + checkNaAndRange(deltaPsiCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE) + checkNaAndRange(rhoCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE) + checkNaAndRange(minCount, min=0, max=Inf, scalar=TRUE, na.ok=TRUE) by <- match.arg(by) - + type <- match.arg(type) + + if(is.na(rhoCutoff)){ + rhoCutoff <- 1 + } + dots <- list(...) if("n" %in% names(dots)){ n <- dots[['n']] } else { n <- N(object, type=type) } - if("zscores" %in% names(dots)){ - zscores <- dots[['zscores']] - } else { - zscores <- zScores(object, type=type) - } if("padjVals" %in% names(dots)){ padj <- dots[['padjVals']] } else { - padj <- padjVals(object, type=type) + # check if padj values are available for the given filters + pvalsAvailable <- checkPadjAvailableForFilters(object, type=type, + filters=list(rho=rhoCutoff), + aggregate=aggregate, + subsetName=subsetName) + if(isFALSE(pvalsAvailable)){ + stop("For the given filters, pvalues are not yet computed. \n", + "Please compute them first by running the ", + "calculatePadjValues function with the requested filters.") + } + padj <- padjVals(object, type=type, level="site", subsetName=subsetName, + filters=list(rho=rhoCutoff)) } if("dPsi" %in% names(dots)){ dpsi <- dots[['dPsi']] } else { dpsi <- deltaPsiValue(object, type=type) } - - - # create cutoff matrix - goodCutoff <- matrix(TRUE, nrow=nrow(zscores), ncol=ncol(zscores), - dimnames=dimnames(zscores)) - if("hgnc_symbol" %in% colnames(mcols(object, type=type)) & - nrow(mcols(object, type=type)) == nrow(goodCutoff)){ - rownames(goodCutoff) <- mcols(object, type=type)[,"hgnc_symbol"] - } else if(isTRUE(aggregate)){ - stop("Please provide hgnc symbols to compute gene p values!") + if("rhoVals" %in% names(dots)){ + rho <- dots[['rhoVals']] + } else { + rho <- matrix(rho(object, type=type), + nrow=nrow(dpsi), ncol=ncol(dpsi)) + } + if(isTRUE(aggregate)){ + if("padjGeneVals" %in% names(dots)){ + padj_gene <- dots[['padjGeneVals']] + } else{ + padj_gene <- padjVals(object, type=type, level="gene", + subsetName=subsetName, + filters=list(rho=rhoCutoff)) + } + } - # check each cutoff if in use (not NA) - if(!is.na(minCount)){ - goodCutoff <- goodCutoff & as.matrix(n >= minCount) - } - if(!is.na(zScoreCutoff)){ - goodCutoff <- goodCutoff & as.matrix(abs(zscores) >= zScoreCutoff) - } - if(!is.na(deltaPsiCutoff)){ - goodCutoff <- goodCutoff & as.matrix(abs(dpsi) >= deltaPsiCutoff) + if(is.na(padjCutoff)){ + padjCutoff <- 1 } - if(!is.na(padjCutoff)){ - goodCutoff <- goodCutoff & as.matrix(padj <= padjCutoff) - } - goodCutoff[is.na(goodCutoff)] <- FALSE - # check if we should go for aggregation - # TODO to speed it up we only use any hit within a feature - # but should do a holm's + BY correction per gene and genome wide + if(isTRUE(all)){ + aberrantEvents <- matrix(TRUE, nrow=nrow(object), ncol=ncol(object)) + colnames(aberrantEvents) <- colnames(fds) + } else{ + aberrantEvents <- as.matrix(padj) <= padjCutoff + + # check each cutoff if in use (not NA) + if(!is.na(minCount)){ + aberrantEvents <- aberrantEvents & 
as.matrix(n >= minCount) + } + if(!is.na(deltaPsiCutoff)){ + aberrantEvents <- aberrantEvents & + as.matrix(abs(dpsi) >= deltaPsiCutoff) + } + if(!is.na(rhoCutoff)){ + aberrantEvents <- aberrantEvents & as.matrix(rho <= rhoCutoff) + } + aberrantEvents[is.na(aberrantEvents)] <- FALSE + } + if(isTRUE(aggregate)){ - goodCutoff <- as.matrix(data.table(goodCutoff, keep.rownames=TRUE)[, - as.data.table(t(colAnys(as.matrix(.SD)))), by=rn][,-1]) - rownames(goodCutoff) <- unique(mcols(object, type=type)[,"hgnc_symbol"]) - colnames(goodCutoff) <- colnames(zscores) + if(is.null(rownames(padj_gene))){ + stop("Missing rownames for gene-level padj values.") + } + # reduce aberrant matrix to one row per gene + # (TRUE if any junction is aberrant for each sample) + ab_dt <- data.table(geneID=getGeneIDs(object, type=type, unique=FALSE, + geneColumn=geneColumn), + aberrantEvents) + ab_dt[, dt_idx:=seq_len(.N)] + dt_tmp <- ab_dt[!is.na(geneID), splitGenes(geneID), by="dt_idx"] + ab_dt <- ab_dt[dt_tmp$dt_idx] + ab_dt[,`:=`(geneID=dt_tmp$V1, dt_idx=NULL)] + ab_dt <- ab_dt[,lapply(.SD, any), by="geneID"] + aberrantEvents <- as.matrix(ab_dt[,-1]) + rownames(aberrantEvents) <- ab_dt[,geneID] + + if(isFALSE(all)){ + aberrantEvents <- aberrantEvents & as.matrix( + padj_gene[rownames(aberrantEvents),colnames(aberrantEvents)] + ) <= padjCutoff + } } - # return results - if(by == "feature"){ - return(rowSums(goodCutoff)) - } - if(by == "sample"){ - return(colSums(goodCutoff)) - } - return(goodCutoff) + return(switch(match.arg(by), + none = aberrantEvents, + sample = colSums(aberrantEvents, na.rm=TRUE), + feature = rowSums(aberrantEvents, na.rm=TRUE) + )) } #' @rdname results #' @export setMethod("aberrant", "FraserDataSet", aberrant.FRASER) + diff --git a/R/FRASER-package.R b/R/FRASER-package.R index 87e5ad96..a175a0bd 100644 --- a/R/FRASER-package.R +++ b/R/FRASER-package.R @@ -22,15 +22,18 @@ #' ### GRange/Experiment/bamFile packages #' @importFrom BiocGenerics updateObject counts counts<- strand strand<- which -#' @importFrom GenomicFeatures makeTxDbFromGFF intronsByTranscript genes +#' @importFrom GenomicFeatures makeTxDbFromGFF intronsByTranscript genes exons +#' fiveUTRsByTranscript threeUTRsByTranscript seqlevels0 #' @importFrom GenomicAlignments junctions readGAlignments summarizeJunctions #' readGAlignmentPairs #' @importFrom SummarizedExperiment assay assay<- assays assays<- assayNames #' colData colData<- rowData rowRanges rowRanges<- SummarizedExperiment #' rbind Assays #' @importFrom GenomicRanges findOverlaps granges GRanges GRangesList -#' makeGRangesFromDataFrame invertStrand -#' @importFrom IRanges subsetByOverlaps from to IRanges ranges +#' makeGRangesFromDataFrame invertStrand start end start<- end<- +#' seqinfo<- +#' @importFrom IRanges subsetByOverlaps from to IRanges ranges nearest distance +#' %over% #' @importFrom Rsamtools ScanBamParam scanBamHeader bamMapqFilter #' bamWhich bamWhich<- BamFile idxstatsBam #' @importFrom Rsubread featureCounts @@ -41,16 +44,14 @@ #' @importFrom biomaRt useEnsembl getBM #' @importFrom AnnotationDbi select #' -#' ### Plotting #' #' @importFrom plotly plot_ly subplot layout add_trace ggplotly #' @importFrom pheatmap pheatmap #' @importFrom RColorBrewer brewer.pal -#' @importFrom cowplot theme_cowplot +#' @importFrom cowplot theme_cowplot background_grid #' @importFrom ggrepel geom_text_repel #' -#' ### Data handling #' #' @importFrom HDF5Array writeHDF5Array path HDF5Array @@ -74,18 +75,20 @@ #' @importFrom R.utils renameFile withTimeout #' @importFrom tools 
file_path_as_absolute #' @importFrom methods as callNextMethod is new show slot slot<- validObject -#' @importFrom utils capture.output packageVersion +#' @importFrom utils capture.output packageVersion tail #' #' #' ### To be added into the functions above #' #' @importFrom S4Vectors DataFrame metadata Rle SimpleList mcols mcols<- -#' start end metadata metadata<- subjectHits queryHits +#' start end metadata metadata<- subjectHits queryHits elementMetadata +#' values values<- #' @importFrom grDevices colorRampPalette #' @importFrom GenomeInfoDb keepStandardChromosomes seqlevels<- seqlevels #' seqlengths seqlengths<- seqlevelsStyle<- seqlevelsStyle seqnames -#' seqinfo standardChromosomes dropSeqlevels keepSeqlevels +#' seqinfo standardChromosomes dropSeqlevels keepSeqlevels +#' sortSeqlevels #' @importFrom DelayedArray rowMaxs rowMeans path<- cbind plogis qlogis #' DelayedArray #' @importFrom DelayedMatrixStats colSds rowMedians rowSds colMeans2 rowMeans2 @@ -100,9 +103,10 @@ #' scale_y_log10 scale_color_gradientn labs theme_bw theme #' scale_color_brewer scale_color_discrete scale_linetype_manual #' annotate geom_histogram scale_fill_manual xlim scale_colour_manual -#' element_blank annotation_logticks +#' element_blank annotation_logticks ylim quo_name facet_grid +#' facet_wrap geom_text #' -#' @importFrom tibble as_tibble +#' @importFrom tibble as_tibble %>% #' #' @useDynLib FRASER #' @@ -128,5 +132,12 @@ globalVariables(c(".", "J", ".N", ".asDataFrame", "End", "first_feature", "model", "mu", "n", ",nsubset", "o3", "o5", "obsPsi", "os", "pa", "padj", "passed", "pByFeature", "pointNr", "predPsi", "psi3", "psi5", "psiType", "psiValue", "seqlength", "seqlevel", "Step", "traceNr", - "uniqueID", "V1", "value", "zscore", "maxDTheta"), + "uniqueID", "V1", "value", "zscore", "maxDTheta", "par", "genes_donor", + "genes_acceptor", "gene_pval", "gene_padj", "dt_idx", + "blacklist", "potentialImpact", "causesFrameshift", "annotatedJunction", + "distNearestGene", "UTR_overlap", "meanCount", "medianCount", + "potentialImpact2", "nonsplitProportion", "nonsplitCounts", + "nonsplitProportion_99quantile", "startID", "endID", "j_idx", "jidx", + "start_idx", "end_idx", "pval_gene", "FDR_subset_gene", "gene_id", + "pvalue"), package="FRASER") diff --git a/R/Fraser-pipeline.R b/R/Fraser-pipeline.R index c3b9b967..92d0b492 100644 --- a/R/Fraser-pipeline.R +++ b/R/Fraser-pipeline.R @@ -31,7 +31,7 @@ #' splicing types. #' @param implementation The method that should be used to correct for #' confounders. -#' @param type The type of PSI (psi5, psi3 or theta for theta/splicing +#' @param type The type of PSI (jaccard, psi5, psi3 or theta for theta/splicing #' efficiency) #' @param iterations The maximal number of iterations. When the autoencoder has #' not yet converged after these number of iterations, the fit stops anyway. @@ -61,16 +61,32 @@ #' # The functions run inside the FRASER function can also be directly #' # run themselves. 
#' # To directly run the fit function: -#' fds <- fit(fds, implementation="PCA", q=2, type="psi5") +#' fds <- fit(fds, implementation="PCA", q=2, type="jaccard") #' #' # To directly run the nomial and adjusted p value and z score #' # calculation, the following functions can be used: -#' fds <- calculatePvalues(fds, type="psi5") -#' head(pVals(fds, type="psi5")) -#' fds <- calculatePadjValues(fds, type="psi5", method="BY") -#' head(padjVals(fds, type="psi5")) -#' fds <- calculateZscore(fds, type="psi5") -#' head(zScores(fds, type="psi5")) +#' fds <- calculatePvalues(fds, type="jaccard") +#' head(pVals(fds, type="jaccard")) +#' fds <- calculatePadjValues(fds, type="jaccard", method="BY") +#' head(padjVals(fds, type="jaccard")) +#' fds <- calculateZscore(fds, type="jaccard") +#' head(zScores(fds, type="jaccard")) +#' +#' # example of restricting FDR correction to subsets of genes of interest +#' genesOfInterest <- list("sample1"=c("TIMMDC1"), "sample2"=c("MCOLN1")) +#' fds <- calculatePadjValues(fds, type="jaccard", +#' subsets=list("exampleSubset"=genesOfInterest)) +#' padjVals(fds, type="jaccard", subsetName="exampleSubset") +#' padjVals(fds, type="jaccard", level="gene", subsetName="exampleSubset") +#' fds <- calculatePadjValues(fds, type="jaccard", +#' subsets=list("anotherExampleSubset"=c("TIMMDC1"))) +#' padjVals(fds, type="jaccard", subsetName="anotherExampleSubset") +#' +#' # only adding FDR corrected pvalues on a subset without calculating +#' # transcriptome-wide FDR again: +#' fds <- calculatePadjValuesOnSubset(fds, genesToTest=genesOfInterest, +#' subsetName="setOfInterest", type="jaccard") +#' padjVals(fds, type="jaccard", subsetName="setOfInterest") #' #' @seealso \code{\link[FRASER]{fit}} #' @@ -85,8 +101,9 @@ NULL #' the beta-binomial fit, the computation of Z scores and p values as well as #' the computation of delta-PSI values. 
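# A short sketch of the new `type` argument of FRASER() added in this patch
# (default fitMetrics(fds), i.e. the Intron Jaccard Index): the fit can be
# restricted to a single metric, or the legacy psi metrics can be requested
# explicitly. The q values used here are illustrative placeholders.
fds <- createTestFraserDataSet()
fds <- FRASER(fds, q=c(jaccard=2), type="jaccard", iterations=2)
# legacy FRASER(1) behaviour, fitting the psi5/psi3/theta metrics:
# fds <- FRASER(fds, q=c(psi5=2, psi3=2, theta=2),
#               type=c("psi5", "psi3", "theta"), iterations=2)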
#' @export -FRASER <- function(fds, q, implementation=c("PCA", "PCA-BB-Decoder", - "AE-weighted", "AE", "BB"), +FRASER <- function(fds, q, type=fitMetrics(fds), + implementation=c("PCA", "PCA-BB-Decoder", "AE-weighted", + "AE", "BB"), iterations=15, BPPARAM=bpparam(), correction, ...){ # Check input implementation <- match.arg(implementation) @@ -103,7 +120,7 @@ FRASER <- function(fds, q, implementation=c("PCA", "PCA-BB-Decoder", } # fit each splicing type separately - for(i in psiTypes){ + for(i in type){ # get type specific q if(missing(q)){ @@ -128,8 +145,8 @@ FRASER <- function(fds, q, implementation=c("PCA", "PCA-BB-Decoder", message(date(), ": Adjust p values for: '", i, "'.") fds <- calculatePadjValues(fds, type=i) - message(date(), ": Compute Z scores for: '", i, "'.") - fds <- calculateZscore(fds, type=i) + # message(date(), ": Compute Z scores for: '", i, "'.") + # fds <- calculateZscore(fds, type=i) } # return final analysis diff --git a/R/FraserDataSet-class.R b/R/FraserDataSet-class.R index 630361d4..62eee8e8 100644 --- a/R/FraserDataSet-class.R +++ b/R/FraserDataSet-class.R @@ -218,13 +218,13 @@ showFraserDataSet <- function(object) { cat("\n") cat("-------------------- BAM parameters --------------------\n") - if(identical(scanBamParam(FraserDataSet()), scanBamParam(object))){ - cat(paste0("Default used with: ", - "bamMapqFilter=", bamMapqFilter(scanBamParam(object)) - )) - } else { + # if(identical(scanBamParam(FraserDataSet()), scanBamParam(object))){ + # cat(paste0("Default used with: ", + # "bamMapqFilter=", bamMapqFilter(scanBamParam(object)) + # )) + # } else { show(scanBamParam(object)) - } + # } cat("\n\n") } diff --git a/R/RcppExports.R b/R/RcppExports.R index 659f3b4c..42ec8fbc 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -41,6 +41,10 @@ truncNLL_rho <- function(rho, yi, ki, ni) { .Call('_FRASER_truncNLL_rho', PACKAGE = 'FRASER', rho, yi, ki, ni) } +truncNLL_rho_penalized <- function(logit_rho, yi, ki, ni, lambda) { + .Call('_FRASER_truncNLL_rho_penalized', PACKAGE = 'FRASER', logit_rho, yi, ki, ni, lambda) +} + fullNLL <- function(y, rho, k, n, D, lambda, byRows = FALSE) { .Call('_FRASER_fullNLL', PACKAGE = 'FRASER', y, rho, k, n, D, lambda, byRows) } diff --git a/R/annotationOfRanges.R b/R/annotationOfRanges.R index 62f6c377..8796cd0b 100644 --- a/R/annotationOfRanges.R +++ b/R/annotationOfRanges.R @@ -17,6 +17,10 @@ #' \code{TxDb.Hsapiens.UCSC.hg19.knownGene}. #' @param orgDb An \code{orgDb} object or a data table to map the feature names. #' If this is NULL, then \code{org.Hs.eg.db} is used as the default. +#' @param filter A named list specifying the filters which should be applied to +#' subset to e.g. only protein-coding genes for annotation. +#' \code{names(filter)} needs to be column names in the given +#' orgDb object (default: no filtering). 
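# Hedged sketch of the new `filter` argument documented above: names(filter)
# must be column names in the supplied orgDb object. The "GENETYPE" column and
# the "protein-coding" value used here assume the org.Hs.eg.db annotation and
# are placeholders; adjust them to the orgDb that is actually used.
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(org.Hs.eg.db)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
orgDb <- org.Hs.eg.db
fds <- createTestFraserDataSet()
fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb,
                              filter=list(GENETYPE="protein-coding"))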
#' @param keytype The keytype or column name of gene IDs in the \code{TxDb} #' object (see #' \code{\link[AnnotationDbi:AnnotationDb-class]{keytypes}} @@ -32,13 +36,13 @@ #' # either using biomart with GRCh38 #' try({ #' fds <- annotateRanges(fds, GRCh=38) -#' rowRanges(fds, type="psi5")[,c("hgnc_symbol")] +#' rowRanges(fds, type="j")[,c("hgnc_symbol")] #' }) #' #' # either using biomart with GRCh37 #' try({ #' fds <- annotateRanges(fds, featureName="hgnc_symbol_37", GRCh=37) -#' rowRanges(fds, type="psi5")[,c("hgnc_symbol_37")] +#' rowRanges(fds, type="j")[,c("hgnc_symbol_37")] #' }) #' #' # or with a provided TxDb object @@ -47,7 +51,7 @@ #' require(org.Hs.eg.db) #' orgDb <- org.Hs.eg.db #' fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb) -#' rowRanges(fds, type="psi5")[,"hgnc_symbol"] +#' rowRanges(fds, type="j")[,"hgnc_symbol"] #' #' @rdname annotateRanges #' @export @@ -59,9 +63,6 @@ annotateRanges <- function(fds, feature="hgnc_symbol", featureName=feature, if(length(fds) == 0) return(fds) # useEnsembl only understands GRCh=37 or GRCh=NULL (uses 38 then) - if(is.null(GRCh)){ - GRCh <- 38 - } if(GRCh == 38){ GRCh <- NULL } @@ -89,17 +90,16 @@ annotateRanges <- function(fds, feature="hgnc_symbol", featureName=feature, annotation <- getFeatureAsGRange(ensembl, feature, featureName, biotype, useUSCS) - # annotate split reads - for(i in c("psi3", "theta")){ - gr <- rowRanges(fds, type=i) - if(any(strand(gr) == "*")){ - strand(annotation) <- "*" - } - annos <- getAnnotationFeature(data=gr, featureName, annotation) - mcols(fds, type=i)[[featureName]] <- annos[["feature"]] - mcols(fds, type=i)[[paste0("other_", featureName)]] <- - annos[["other_features"]] + # annotate splice sites first + gr <- rowRanges(fds, type="theta") + if(any(strand(gr) == "*")){ + strand(annotation) <- "*" } + annos <- getAnnotationFeature(data=gr, featureName, annotation) + mcols(fds, type="theta")[[featureName]] <- annos + + # annotate junctions with genes at donor and acceptor sites + fds <- annotateFeatureFromSpliceSite(fds, featureName) return(fds) } @@ -108,7 +108,7 @@ annotateRanges <- function(fds, feature="hgnc_symbol", featureName=feature, #' @export annotateRangesWithTxDb <- function(fds, feature="SYMBOL", featureName="hgnc_symbol", keytype="ENTREZID", - txdb=NULL, orgDb=NULL){ + txdb=NULL, orgDb=NULL, filter=list()){ gene_id <- NULL # check input @@ -132,41 +132,49 @@ annotateRangesWithTxDb <- function(fds, feature="SYMBOL", } } - for(i in c("psi3", "theta")){ - # get GRanges object with the split reads which should be annotated - gr <- rowRanges(fds, type=i) - - # get the annotation to compare to - anno <- genes(txdb) - if(is.data.table(orgDb)){ - tmp <- merge(x=as.data.table(anno)[,.(gene_id)], y=orgDb, - by.y=keytype, by.x="gene_id", all.x=TRUE, sort=FALSE)[, - .(gene_id, feature=get(feature))] - setnames(tmp, "feature", feature) - } else { - tmp <- as.data.table(select(orgDb, keys=mcols(anno)[,"gene_id"], - columns=feature, keytype=keytype)) - } - - # add the new feature to the annotation - tmp[, uniqueID := .GRP, by=keytype] - anno <- anno[tmp[,uniqueID]] - mcols(anno)[[featureName]] <- tmp[,get(feature)] + # get GRanges object with the splice sites which should be annotated + gr <- rowRanges(fds, type="theta") + + # get the annotation to compare to + anno <- genes(txdb) + if(is.data.table(orgDb)){ + tmp <- merge(x=as.data.table(anno)[,.(gene_id)], y=orgDb, + by.y=keytype, by.x="gene_id", all.x=TRUE, sort=FALSE)[, + c("gene_id", feature, names(filter)), with=FALSE] + } else { + tmp <- 
as.data.table(select(orgDb, keys=mcols(anno)[,"gene_id"], + columns=c(feature, names(filter)), keytype=keytype)) + } - # clean up of NA and "" ids - anno <- anno[!is.na(mcols(anno)[,featureName]),] - anno <- anno[mcols(anno)[,featureName] != "",] - if(any(strand(gr) == "*")){ - strand(anno) <- "*" + # filter genes as specified by user (e.g. only protein_coding) + tmp[, include:=TRUE] + if(!is.null(filter) & length(filter) > 0 & !is.null(names(filter))){ + for(n in names(filter)){ + stopifnot(n %in% colnames(tmp)) + tmp[!(get(n) %in% filter[[n]]), include:=FALSE] } + } + + # add the new feature to the annotation + tmp[, uniqueID := .GRP, by=keytype] + tmp <- tmp[include == TRUE,] + anno <- anno[tmp[,uniqueID]] + mcols(anno)[[featureName]] <- tmp[,get(feature)] - # retrieve the feature of interest for the split reads - annos <- getAnnotationFeature(data=gr, featureName, anno) - mcols(fds, type=i)[[featureName]] <- annos[["feature"]] - mcols(fds, type=i)[[paste0("other_", featureName)]] <- - annos[["other_features"]] + # clean up of NA and "" ids + anno <- anno[!is.na(mcols(anno)[,featureName]),] + anno <- anno[mcols(anno)[,featureName] != "",] + if(any(strand(gr) == "*")){ + strand(anno) <- "*" } + # retrieve the feature of interest for the splice sites + annos <- getAnnotationFeature(data=gr, featureName, anno) + mcols(fds, type="theta")[[featureName]] <- annos + + # transfer annoated features for splice sites to junctions + fds <- annotateFeatureFromSpliceSite(fds, featureName) + return(fds) } @@ -228,14 +236,12 @@ getAnnotationFeature <- function(data, feature, annotation){ } # extract only the feature and group them with a ";" - featureDT <- featureDT[, - list(first_feature=unique(feature)[1], - other_features=paste(unique(feature)[-1], collapse = ";")), - by="from" - ] + featureDT <- featureDT[,feature:=paste(unique(feature), collapse = ";"), + by="from"] + featureDT <- featureDT[!duplicated(featureDT),] + featureDT[feature == "NA", feature:=NA] - return(list(feature=featureDT[order(from),first_feature], - other_features=featureDT[order(from),other_features])) + return(featureDT[order(from),feature]) } @@ -314,4 +320,27 @@ findAnnotatedJunction <- function(fds, annotation, annotateNames=TRUE, fds } - +#' annotate junctions with genes at donor and acceptor sites +#' @noRd +annotateFeatureFromSpliceSite <- function(fds, featureName){ + ssdt <- data.table(spliceSiteID=mcols(fds, type="theta")$spliceSiteID, + genes=mcols(fds, type="theta")[[featureName]] + ) + junction_dt <- data.table(startID=mcols(fds, type="psi3")$startID, + endID=mcols(fds, type="psi3")$endID + ) + junction_dt <- merge(junction_dt, ssdt, all.x=TRUE, + by.x="startID", by.y="spliceSiteID", sort=FALSE) + setnames(junction_dt, "genes", "genes_donor") + junction_dt <- merge(junction_dt, ssdt, all.x=TRUE, + by.x="endID", by.y="spliceSiteID", sort=FALSE) + setnames(junction_dt, "genes", "genes_acceptor") + + junction_dt[,genes:=paste(uniqueIgnoreNA( + c(splitGenes(genes_donor), splitGenes(genes_acceptor))), + collapse=";"), + by="startID,endID"] + junction_dt[genes == "NA", genes:=NA] + mcols(fds, type="j")[[featureName]] <- junction_dt[,genes] + return(fds) +} diff --git a/R/autoencoder.R b/R/autoencoder.R index e0e066a3..d9d0e7a2 100644 --- a/R/autoencoder.R +++ b/R/autoencoder.R @@ -2,8 +2,9 @@ #' Main autoencoder fit function #' #' @noRd -fitAutoencoder <- function(fds, q, type="psi3", noiseAlpha=1, minDeltaPsi=0.1, - rhoRange=c(1e-5, 1-1e-5), lambda=0, convergence=1e-5, +fitAutoencoder <- function(fds, q, 
type=currentType(fds), noiseAlpha=1, + minDeltaPsi=0.1, + rhoRange=c(-30, 30), lambda=0, convergence=1e-5, iterations=15, initialize=TRUE, control=list(), BPPARAM=bpparam(), verbose=FALSE, nrDecoderBatches=5, weighted=FALSE, nSubset=15000, multiRho=FALSE, diff --git a/R/calculatePSIValue.R b/R/calculatePSIValue.R index e605ba67..f9f3f991 100644 --- a/R/calculatePSIValue.R +++ b/R/calculatePSIValue.R @@ -14,14 +14,14 @@ #' #' @inheritParams countRNA #' @param types A vector with the psi types which should be calculated. Default -#' is all of psi5, psi3 and theta. +#' is all of jaccard, psi5, psi3 and theta. #' @param overwriteCts FALSE or TRUE (the default) the total counts (aka N) will #' be recalculated based on the existing junction counts (aka K) #' @return FraserDataSet #' @export #' @examples #' fds <- createTestFraserDataSet() -#' fds <- calculatePSIValues(fds, types="psi5") +#' fds <- calculatePSIValues(fds, types="jaccard") #' #' ### usually one would run this function for all psi types by using: #' # fds <- calculatePSIValues(fds) @@ -36,6 +36,10 @@ calculatePSIValues <- function(fds, types=psiTypes, overwriteCts=FALSE, overwriteCts=overwriteCts, BPPARAM=BPPARAM) } + # calculate intron jaccard index + fds <- calculateIntronNonsplitSum(fds, overwriteCts=overwriteCts) + fds <- calculateJaccardIntronIndex(fds, overwriteCts=overwriteCts) + # calculate the delta psi value for(psiType in types){ assayName <- paste0("delta_", psiType) @@ -183,7 +187,7 @@ calculateSitePSIValue <- function(fds, overwriteCts, BPPARAM){ # check input stopifnot(is(fds, "FraserDataSet")) - message(date(), ": Calculate the PSI site values ...") + message(date(), ": Calculate the theta values ...") psiName <- "theta" psiROCName <- "rawOtherCounts_theta" @@ -319,3 +323,101 @@ getOtherCountsCacheFolder <- function(fds){ # return it return(cachedir) } + +#' +#' calculates the jaccard intron value for the given junctions +#' +#' @noRd +calculateJaccardIntronIndex <- function(fds, overwriteCts){ + stopifnot(is(fds, "FraserDataSet")) + + message(date(), ": Calculate the Jaccard Intron values ...") + + # check if we have computed N_psi3, N_psi5 and K_nonsplit already + if(!all(c(paste0("rawOtherCounts_psi", c(5, 3)), "rawCountsJnonsplit") %in% + assayNames(fds))){ + stop("Please calculate N_psi3, N_psi5 and K_nonsplit first before ", + "calling this function.") + } + + # calculate intron jaccard value + jaccard_denom <- N(fds, "psi3") + N(fds, "psi5") + + assay(fds, "rawCountsJnonsplit") - K(fds, type="j") + jaccardValues <- K(fds, type="j") / jaccard_denom + otherCounts_jaccard <- jaccard_denom - K(fds, type="j") + + # assign it to our object + assay(fds, type="j", "jaccard", withDimnames=FALSE) <- jaccardValues + + if(isTRUE(overwriteCts) || + !("rawOtherCounts_jaccard" %in% assayNames(fds))){ + assay(fds, type="j", "rawOtherCounts_jaccard", + withDimnames=FALSE) <- otherCounts_jaccard + } + + return(fds) +} + +#' Calculates the sum of nonsplit reads overlapping either the donor or +#' acceptor splice site and stores it as a new assay (one value for each +#' junction and sample). 
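# The Intron Jaccard Index computed in calculateJaccardIntronIndex() above is
#   jaccard = K_j / (N_psi5 + N_psi3 + K_nonsplit - K_j),
# i.e. the split reads supporting an intron divided by all reads supporting
# this intron, a competing intron, or unspliced transcripts at its two splice
# sites. A minimal sketch recomputing it from the accessors used above,
# assuming the psi5/psi3 other counts and the nonsplit assay already exist:
fds <- createTestFraserDataSet()
jaccard_manual <- K(fds, type="j") /
    (N(fds, "psi5") + N(fds, "psi3") +
         assay(fds, "rawCountsJnonsplit") - K(fds, type="j"))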
+#' +#' @noRd +calculateIntronNonsplitSum <- function(fds, overwriteCts){ + stopifnot(is(fds, "FraserDataSet")) + + message(date(), ": Calculate the total nonsplit counts for each intron ", + "...") + + + # get splice site nonsplit counts + nsr_ss <- K(fds, "theta") + + # retrieve junction and splice site annotation + junction_dt <- as.data.table(rowRanges(fds, type="j"))[, + .(seqnames, start, end, + strand, startID, endID)] + junction_dt[, j_idx:=seq_len(.N)] + ss_map <- data.table(spliceSiteID=rowRanges(fds, type="ss")$spliceSiteID, + nsr_idx=seq_len(nrow(nonSplicedReads(fds)))) + + junction_dt <- merge(junction_dt, ss_map, + by.x="startID", by.y="spliceSiteID", + all.x=TRUE) + setnames(junction_dt, "nsr_idx", "start_idx") + junction_dt <- merge(junction_dt, ss_map, + by.x="endID", by.y="spliceSiteID", + all.x=TRUE) + setnames(junction_dt, "nsr_idx", "end_idx") + + # for each junction, find the two rows in K_theta corresponding to its + # donor and acceptor splice site + donor_sites <- junction_dt[!is.na(start_idx),] + acc_sites <- junction_dt[!is.na(end_idx),] + + # set nsr counts to 0 for junctions for which no mapping by spliceSiteID + # could be found + nsr_donor <- matrix(0, nrow=nrow(fds), ncol=ncol(fds)) + nsr_acc <- matrix(0, nrow=nrow(fds), ncol=ncol(fds)) + + nsr_donor[donor_sites[,j_idx],] <- + as.matrix(nsr_ss[donor_sites[,start_idx],]) + nsr_acc[acc_sites[,j_idx],] <- + as.matrix(nsr_ss[acc_sites[,end_idx],]) + + # sum them + nsr_j <- nsr_donor + nsr_acc + + if(nrow(nsr_j) != nrow(fds)){ + warning("Unequal number of junctions in fds and junctions with ", + "computed nonsplit count sum!") + } + + # assign it to our object + if(isTRUE(overwriteCts) || + !("rawCountsJnonsplit" %in% assayNames(fds))){ + assay(fds, type="j", "rawCountsJnonsplit", withDimnames=FALSE) <- nsr_j + } + + return(fds) +} diff --git a/R/countRNAseqData.R b/R/countRNAseqData.R index f68c4bdc..9ef6ef5f 100644 --- a/R/countRNAseqData.R +++ b/R/countRNAseqData.R @@ -361,7 +361,7 @@ getNonSplitReadCountsForAllSamples <- function(fds, splitCountRanges, " splice junctions are found.") # extract donor and acceptor sites - spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges, fds) + spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges) message(date(), ": In total ", length(spliceSiteCoords), " splice sites (acceptor/donor) will be counted ...") @@ -569,6 +569,10 @@ countSplitReadsPerChromosome <- function(chromosome, bamFile, bamFile, param=param, strandMode=strandMode) } + # remove read pairs with NA seqnames + # (occurs if reads of a pair align to different chromosomes) + galignment <- galignment[!is.na(seqnames(galignment))] + # remove the strand information if unstranded data if(isFALSE(as.logical(strandMode))){ strand(galignment) <- "*" @@ -852,7 +856,7 @@ countNonSplicedReads <- function(sampleID, splitCountRanges, fds, } # extract donor and acceptor sites - spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges, fds) + spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges) } @@ -895,7 +899,8 @@ countNonSplicedReads <- function(sampleID, splitCountRanges, fds, # extract the counts with Rsubread tmp_ssc <- checkSeqLevelStyle(spliceSiteCoords, fds, sampleID, TRUE) - anno <- GRanges2SAF(tmp_ssc, minAnchor=minAnchor) + # use minAnchor+1 here to allow for small variants in the anchor region + anno <- GRanges2SAF(tmp_ssc, minAnchor=(minAnchor+1)) rsubreadCounts <- featureCounts(files=bamFile, annot.ext=anno, minOverlap=minAnchor*2, allowMultiOverlap=TRUE, @@ 
-974,17 +979,12 @@ readJunctionMap <- function(junctionMap){ #' extracts the splice site coordinates from a junctions GRange object ( #' @noRd -extractSpliceSiteCoordinates <- function(junctions, fds){ +extractSpliceSiteCoordinates <- function(junctions){ - if(strandSpecific(fds) >= 1L){ - spliceSiteCoords <- unlist(GRangesList( - extractSpliceSiteCoordsPerStrand(junctions, "+"), - extractSpliceSiteCoordsPerStrand(junctions, "-") - )) - } else { - strand(junctions) <- "*" - spliceSiteCoords <- extractSpliceSiteCoordsPerStrand(junctions, "*") - } + spliceSiteCoords <- unlist(GRangesList( + lapply(unique(strand(junctions)), extractSpliceSiteCoordsPerStrand, + junctions=junctions) + )) return(unique(sort(spliceSiteCoords))) } @@ -1049,15 +1049,21 @@ annotateSpliceSite <- function(gr){ dt <- GRanges2SAF(gr) # extract donor/acceptor annotation - startSideDT <- dt[,.(End=Start, type="start"),by="Chr,Start,Strand"] - endSideDT <- dt[,.(Start=End, type="end" ),by="Chr,End,Strand"] + startSiteDT <- dt[,.(End=Start, type="start"),by="Chr,Start,Strand"] + endSiteDT <- dt[,.(Start=End, type="end" ),by="Chr,End,Strand"] + startSiteDT[,Start:=Start-1] + endSiteDT[,End:=End+1] # annotate and enumerate donor/acceptor - annotadedDT <- rbind(startSideDT, endSideDT) - annotadedDT[,id:=seq_len(nrow(annotadedDT))] + annotatedDT <- rbind(startSiteDT, endSiteDT) + annotatedDT[,id:=.GRP, by="Chr,Start,End,Strand"] + + # set back start / end positions for merging with junction ranges + annotatedDT[type == "start", Start:=End] + annotatedDT[type == "end", End:=Start] # convert back to granges - annogr <- makeGRangesFromDataFrame(annotadedDT, keep.extra.columns=TRUE) + annogr <- makeGRangesFromDataFrame(annotatedDT, keep.extra.columns=TRUE) ids <- lapply(c("start", "end"), function(type){ # reduce annogr to only the specific type to prevent overlap diff --git a/R/example_functions.R b/R/example_functions.R index 620b5ffc..c4b992be 100644 --- a/R/example_functions.R +++ b/R/example_functions.R @@ -61,7 +61,7 @@ createTestFraserDataSet <- function(workingDir="FRASER_output", rerun=FALSE){ if(all(file.exists(hdf5Files))){ if(isFALSE(rerun)){ fds <- loadFraserDataSet(workingDir, name="Data_Analysis") - if(all(paste0(c("zScores", "padjBetaBinomial", "predictedMeans"), + if(all(paste0(c("padjBetaBinomial", "predictedMeans"), "_", rep(psiTypes, 3)) %in% assayNames(fds))){ message(date(), ": Use existing cache data.") return(fds) @@ -80,12 +80,12 @@ createTestFraserDataSet <- function(workingDir="FRASER_output", rerun=FALSE){ fds <- filterExpressionAndVariability(fds, minExpressionInOneSample=5, minDeltaPsi=0, quantileMinExpression=0) - # run FRASER pipeline - fds <- FRASER(fds, q=c(psi5=2, psi3=2, theta=2), iterations=2) - # annotate it suppressMessages({ fds <- annotateRangesWithTxDb(fds) }) + # run FRASER pipeline + fds <- FRASER(fds, q=c(jaccard=2, psi5=2, psi3=2, theta=2), iterations=2) + # save data for later fds <- saveFraserDataSet(fds) diff --git a/R/filterExpression.R b/R/filterExpression.R index dde3b29a..6c7eab8c 100644 --- a/R/filterExpression.R +++ b/R/filterExpression.R @@ -17,6 +17,9 @@ #' passed all filters is returned. If FALSE, no subsetting is done and the #' information of whether an intron passed the filters is only stored in the #' mcols. +#' @param filterOnJaccard If TRUE, the Intron Jaccard Metric is used to define +#' express introns during fitlering. Otherwise, the psi5, psi3 and theta +#' metrics are used (default: TRUE). 
#' @param delayed If FALSE, count matrices will be loaded into memory, #' otherwise the function works on the delayedMatrix representations. The #' default value depends on the number of samples in the fds-object. @@ -27,8 +30,8 @@ #' @examples #' fds <- createTestFraserDataSet() #' fds <- filterExpressionAndVariability(fds, minDeltaPsi=0.1, filter=FALSE) -#' mcols(fds, type="psi5")[, c( -#' "maxCount", "passedExpression", "maxDPsi3", "passedVariability")] +#' mcols(fds, type="jaccard")[, c( +#' "maxCount", "passedExpression", "maxDJaccard", "passedVariability")] #' #' plotFilterExpression(fds) #' plotFilterVariability(fds) @@ -42,21 +45,25 @@ NULL #' read support and introns that are not variable across samples. #' @export filterExpressionAndVariability <- function(object, minExpressionInOneSample=20, - quantile=0.95, quantileMinExpression=10, minDeltaPsi=0.05, + quantile=0.75, quantileMinExpression=10, minDeltaPsi=0.0, filter=TRUE, delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), + filterOnJaccard=TRUE, BPPARAM=bpparam()){ + # filter introns with low read support and corresponding splice sites object <- filterExpression(object, minExpressionInOneSample=minExpressionInOneSample, quantile=quantile, quantileMinExpression=quantileMinExpression, filter=filter, delayed=delayed, + filterOnJaccard=filterOnJaccard, BPPARAM=BPPARAM) # filter introns that are not variable across samples object <- filterVariability(object, minDeltaPsi=minDeltaPsi, filter=filter, - delayed=delayed, BPPARAM=BPPARAM) + delayed=delayed, filterOnJaccard=filterOnJaccard, + BPPARAM=BPPARAM) # return fds message(date(), ": Filtering done!") @@ -64,11 +71,307 @@ filterExpressionAndVariability <- function(object, minExpressionInOneSample=20, } -filterExpression.FRASER <- function(object, minExpressionInOneSample=20, - quantile=0.95, quantileMinExpression=10, filter=TRUE, +#' @noRd +filterExpression.FRASER2 <- function(object, minExpressionInOneSample=20, + quantile=0.75, quantileMinExpression=10, filter=TRUE, + delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), + filterOnJaccard=TRUE, BPPARAM=bpparam()){ + if(isTRUE(filterOnJaccard)){ + return(filterExpression_jaccard(object, + minExpressionInOneSample=minExpressionInOneSample, + quantile=quantile, + quantileMinExpression=quantileMinExpression, + filter=filter, delayed=delayed, + BPPARAM=BPPARAM)) + } else{ + return(filterExpression.FRASER(object, + minExpressionInOneSample=minExpressionInOneSample, + quantile=quantile, + quantileMinExpression=quantileMinExpression, + filter=filter, delayed=delayed, + BPPARAM=BPPARAM)) + } +} + +#' @describeIn filtering This function filters out introns and corresponding +#' splice sites that have low read support in all samples. +#' @export +setMethod("filterExpression", signature="FraserDataSet", + filterExpression.FRASER2) + +#' This function filters out introns and corresponding +#' splice sites which are expressed at very low levels across samples. 
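With filterOnJaccard=TRUE, an intron passes the expression filter if at least one sample has at least minExpressionInOneSample split reads and the chosen quantile (default 0.75) of its total Intron Jaccard coverage across samples reaches quantileMinExpression. A small sketch of that per-intron rule on plain matrices; the counts are made up, and the implementation below applies the same cutoffs via rowMaxs() and rowQuantiles() on the K and N assays:

# two introns (rows) x three samples (columns), hypothetical counts
K <- matrix(c(25, 3, 0,
               4, 2, 1), nrow=2, byrow=TRUE)    # split reads (K)
N <- matrix(c(40, 12, 9,
               8,  6, 5), nrow=2, byrow=TRUE)   # jaccard denominator (N)

maxCount         <- apply(K, 1, max)                        # 25, 4
quantileValueN   <- apply(N, 1, quantile, probs=0.75)       # 26, 7
passedExpression <- maxCount >= 20 & quantileValueN >= 10   # TRUE, FALSE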
+#' @noRd +filterExpression_jaccard <- function(object, minExpressionInOneSample=20, + quantile=0.75, quantileMinExpression=10, filter=TRUE, delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), BPPARAM=bpparam()){ + + stopifnot(is(object, "FraserDataSet")) + + message(date(), ": Filtering out introns with low read support ...") + + # extract counts + cts <- K(object, type="j") + ctsN <- N(object, type="jaccard") + + if(isFALSE(delayed)){ + cts <- as.matrix(cts) + ctsN <- as.matrix(ctsN) + } + + # cutoff functions + f1 <- function(cts, ...){ + rowMaxs(cts) } + f2 <- function(cts, ctsN, quantile, ...){ + rowQuantiles(ctsN, probs=quantile, drop=FALSE)[,1] } + + funs <- c(maxCount=f1, quantileValueN=f2) + + # run it in parallel + cutoffs <- bplapply(funs, function(f, ...) f(...), BPPARAM=BPPARAM, + cts=cts, ctsN=ctsN, quantile=quantile) + + # add annotation to object + for(n in names(cutoffs)){ + mcols(object, type="j")[n] <- cutoffs[[n]] + } + + mcols(object, type="j")[['passedExpression']] <- + cutoffs$maxCount >= minExpressionInOneSample & + cutoffs$quantileValueN >= quantileMinExpression + if("passedVariability" %in% colnames(mcols(object, type="j"))){ + mcols(object, type="j")[['passed']] <- + mcols(object, type="j")[['passedExpression']] & + mcols(object, type="j")[['passedVariability']] + } else{ + mcols(object, type="j")[['passed']] <- + mcols(object, type="j")[['passedExpression']] + } + + # filter if requested + if(isTRUE(filter)){ + object <- applyExpressionFilters_jaccard(object, + minExpressionInOneSample, + quantileMinExpression) + } + + validObject(object) + return(object) +} +#' @noRd +filterVariability.FRASER2 <- function(object, minDeltaPsi=0, filter=TRUE, + delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), + filterOnJaccard=TRUE, BPPARAM=bpparam()){ + if(isTRUE(filterOnJaccard)){ + object <- filterVariability_jaccard(object, minDeltaPsi=minDeltaPsi, + filter=filter, delayed=delayed, BPPARAM=BPPARAM) + } else{ + object <- filterVariability.FRASER(object, minDeltaPsi=minDeltaPsi, + filter=filter, delayed=delayed, BPPARAM=BPPARAM) + } +} + +#' @describeIn filtering This function filters out introns and corresponding +#' splice sites which do not show variability across samples. +#' @export +setMethod("filterVariability", signature="FraserDataSet", + filterVariability.FRASER2) + + +#' This function filters out introns and corresponding +#' splice sites which do not show variability across samples. +#' @noRd +filterVariability_jaccard <- function(object, minDeltaPsi=0, filter=TRUE, + delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), + BPPARAM=bpparam()){ + + message(date(), ": Filtering out non-variable introns ...") + + # extract counts + cts <- K(object, type="j") + ctsN <- N(object, type="jaccard") + + if(isFALSE(delayed)){ + cts <- as.matrix(cts) + ctsN <- as.matrix(ctsN) + } + + # cutoff functions + f1 <- function(cts, ctsN, ...) { + jaccard <- cts/ctsN + rowMaxs(abs(jaccard - rowMeans2(jaccard, na.rm=TRUE)), + na.rm=TRUE) } + + funs <- c(maxDJaccard=f1) + + # run it in parallel + cutoffs <- bplapply(funs, function(f, ...)
f(...), BPPARAM=BPPARAM, + cts=cts, ctsN=ctsN) + + # add annotation to object + for(n in names(cutoffs)){ + mcols(object, type="j")[n] <- cutoffs[[n]] + } + + # add annotation of theta on splice sites of introns to mcols + intron_dt <- as.data.table(rowRanges(object, type="j")) + + # check which introns pass the filter + mcols(object, type="j")[['passedVariability']] <- pmax(na.rm=TRUE, + cutoffs$maxDJaccard, + 0) >= minDeltaPsi + if("passedExpression" %in% colnames(mcols(object, type="j"))){ + mcols(object, type="j")[['passed']] <- + mcols(object, type="j")[['passedExpression']] & + mcols(object, type="j")[['passedVariability']] + } else{ + mcols(object, type="j")[['passed']] <- + mcols(object, type="j")[['passedVariability']] + } + + # filter if requested + if(isTRUE(filter)){ + object <- applyVariabilityFilters_jaccard(object, minDeltaPsi) + } + + validObject(object) + return(object) +} + +#' Applies previously calculated filters for expression filters +#' @noRd +applyExpressionFilters_jaccard <- function(fds, minExpressionInOneSample, + quantileMinExpression){ + + maxCount <- mcols(fds, type="j")[['maxCount']] + quantileValueN <- mcols(fds, type="j")[['quantileValueN']] + + # report rare junctions that passed minExpression filter but not + # quantileFilter as SE obj + junctionsToReport <- maxCount >= minExpressionInOneSample & + !(quantileValueN >= quantileMinExpression) + outputDir <- file.path(workingDir(fds), "savedObjects", nameNoSpace(fds)) + + if(any(junctionsToReport)){ + # get SE object of junctions to report + rareJunctions <- asSE(fds[junctionsToReport, by="j"]) + for(aname in assayNames(rareJunctions)){ + if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5", + "rawOtherCounts_psi3", "psi5", "psi3", + "delta_psi5", "delta_psi3", "jaccard", + "rawOtherCounts_intron_jaccard"))){ + assay(rareJunctions, aname) <- NULL + } + } + rareJunctions <- saveHDF5SummarizedExperiment(rareJunctions, + dir=file.path(tempdir(), "tmp_rJ"), + replace=TRUE) + + # check if folder already exists from previous filtering + rareJctsDir <- file.path(outputDir, "rareJunctions") + if(dir.exists(rareJctsDir)){ + warning("Filtering has already been applied previously. Introns ", + "that were already filtered out but should be kept now ", + "cannot be restored.") + rJ_stored <- loadHDF5SummarizedExperiment(dir=rareJctsDir) + toReport <- mcols(rJ_stored)$maxCount >= minExpressionInOneSample & + !(mcols(rJ_stored)$quantileValueN >= quantileMinExpression) + + rJ_tmp <- rbind(rJ_stored[toReport,], rareJunctions) + + for(aname in assayNames(rJ_tmp)){ + assay(rJ_tmp, aname) <- + rbind(as.matrix(assay(rareJunctions, aname)), + as.matrix(assay(rJ_stored[toReport,], aname)) ) + } + rareJunctions <- rJ_tmp + rm(rJ_tmp) + } + + rareJunctions <- saveHDF5SummarizedExperiment(rareJunctions, + dir=rareJctsDir, replace=TRUE) + } + + # apply filter + numFilt <- sum(mcols(fds, type="j")[['passedExpression']]) + message(paste0("Keeping ", numFilt, " junctions out of ", length(fds), + ". 
This is ", signif(numFilt/length(fds)*100, 3), + "% of the junctions")) + fds <- fds[mcols(fds, type="j")[['passedExpression']], by="psi5"] + + return(fds) +} + + +#' Applies previously calculated variablilty filters +#' @noRd +applyVariabilityFilters_jaccard <- function(fds, minDeltaPsi){ + + # + passedVariability <- mcols(fds, type="j")[['passedVariability']] + + # store information of non-variable junctions + filtered <- !passedVariability + + outputDir <- file.path(workingDir(fds), "savedObjects", nameNoSpace(fds)) + if(any(filtered)){ + # get SE object of junctions to report + nonVariableJunctions <- asSE(fds[filtered, by="j"]) + for(aname in assayNames(nonVariableJunctions)){ + if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5", + "rawOtherCounts_psi3", "psi5", "psi3", + "delta_psi5", "delta_psi3", "jaccard", + "rawOtherCounts_intron_jaccard"))){ + assay(nonVariableJunctions, aname) <- NULL + } + } + nonVariableJunctions <- saveHDF5SummarizedExperiment(replace=TRUE, + nonVariableJunctions, + dir=file.path(tempdir(), "tmp_nvJ")) + + # check if folder already exists from previous filtering + nonVarJctsDir <- file.path(outputDir, "nonVariableJunctions") + if(dir.exists(nonVarJctsDir)){ + warning("Filtering has already been applied previously. Introns ", + "that were already filtered out but should be kept now ", + "cannot be restored.") + nV_stored <- loadHDF5SummarizedExperiment(dir=nonVarJctsDir) + toReport <- mcols(nV_stored)$maxDJaccard < minDeltaPsi + + nVJunctions <- rbind(nonVariableJunctions, nV_stored[toReport,]) + for(aname in assayNames(nVJunctions)){ + assay(nVJunctions, aname) <- + rbind(as.matrix(assay(nonVariableJunctions, aname)), + as.matrix(assay(nV_stored[toReport,], aname)) ) + } + nonVariableJunctions <- nVJunctions + rm(nVJunctions) + } + + nonVariableJunctions <- saveHDF5SummarizedExperiment(dir=nonVarJctsDir, + x=nonVariableJunctions, replace=TRUE) + + } + + # apply filtering + numFilt <- sum(passedVariability) + message(paste0("Keeping ", numFilt, " junctions out of ", length(fds), + ". This is ", signif(numFilt/length(fds)*100, 3), + "% of the junctions")) + fds <- fds[mcols(fds, type="j")[['passedVariability']], by="psi5"] + return(fds) +} + + +#' Old FRASER1 filtering functions +#' @noRd +filterExpression.FRASER <- function(object, minExpressionInOneSample=20, + quantile=0.95, quantileMinExpression=10, filter=TRUE, + delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), + BPPARAM=bpparam()){ + stopifnot(is(object, "FraserDataSet")) message(date(), ": Filtering out introns with low read support ...") @@ -83,30 +386,30 @@ filterExpression.FRASER <- function(object, minExpressionInOneSample=20, ctsN5 <- as.matrix(ctsN5) ctsN3 <- as.matrix(ctsN3) } - + # cutoff functions f1 <- function(cts, ...){ - rowMaxs(cts) } + rowMaxs(cts) } f2 <- function(cts, ctsN5, quantile, ...){ - rowQuantiles(ctsN5, probs=quantile, drop=FALSE)[,1] } + rowQuantiles(ctsN5, probs=quantile, drop=FALSE)[,1] } f3 <- function(cts, ctsN3, quantile, ...) { - rowQuantiles(ctsN3, probs=quantile, drop=FALSE)[,1] } - + rowQuantiles(ctsN3, probs=quantile, drop=FALSE)[,1] } + funs <- c(maxCount=f1, quantileValue5=f2, quantileValue3=f3) - + # run it in parallel cutoffs <- bplapply(funs, function(f, ...) 
f(...), BPPARAM=BPPARAM, - cts=cts, ctsN3=ctsN3, ctsN5=ctsN5, quantile=quantile) - + cts=cts, ctsN3=ctsN3, ctsN5=ctsN5, quantile=quantile) + # add annotation to object for(n in names(cutoffs)){ mcols(object, type="j")[n] <- cutoffs[[n]] } mcols(object, type="j")[['passedExpression']] <- - cutoffs$maxCount >= minExpressionInOneSample & - (cutoffs$quantileValue5 >= quantileMinExpression & - cutoffs$quantileValue3 >= quantileMinExpression) + cutoffs$maxCount >= minExpressionInOneSample & + (cutoffs$quantileValue5 >= quantileMinExpression & + cutoffs$quantileValue3 >= quantileMinExpression) if("passedVariability" %in% colnames(mcols(object, type="j"))){ mcols(object, type="j")[['passed']] <- mcols(object, type="j")[['passedExpression']] & @@ -119,25 +422,18 @@ filterExpression.FRASER <- function(object, minExpressionInOneSample=20, # filter if requested if(isTRUE(filter)){ object <- applyExpressionFilters(object, minExpressionInOneSample, - quantileMinExpression) + quantileMinExpression) } - + validObject(object) return(object) } -#' @describeIn filtering This function filters out introns and corresponding -#' splice sites that have low read support in all samples. -#' @export -setMethod("filterExpression", signature="FraserDataSet", - filterExpression.FRASER) - -#' @describeIn filtering This function filters out introns and corresponding -#' splice sites which do not show variablity across samples. -#' @export -filterVariability <- function(object, minDeltaPsi=0.05, filter=TRUE, - delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), - BPPARAM=bpparam()){ +#' Old FRASER1 filtering functions +#' @noRd +filterVariability.FRASER <- function(object, minDeltaPsi=0.05, filter=TRUE, + delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), + BPPARAM=bpparam()){ message(date(), ": Filtering out non-variable introns ...") @@ -167,7 +463,7 @@ filterVariability <- function(object, minDeltaPsi=0.05, filter=TRUE, theta <- ctsSE/ctsNSE dTheta <- rowMaxs(abs(theta - rowMeans2(theta, na.rm=TRUE)), na.rm=TRUE) } - + funs <- c(maxDPsi3=f1, maxDPsi5=f2, maxDTheta=f3) @@ -194,14 +490,14 @@ filterVariability <- function(object, minDeltaPsi=0.05, filter=TRUE, mcols(object, type="j")["maxDThetaAcceptor"] <- merge(intron_dt, ss_dt, by.x="endID", by.y="spliceSiteID", all.x=TRUE, sort=FALSE)[,maxDTheta] - + # check which introns pass the filter mcols(object, type="j")[['passedVariability']] <- pmax(na.rm=TRUE, - cutoffs$maxDPsi3, - cutoffs$maxDPsi5, - mcols(object, type="j")$maxDThetaDonor, - mcols(object, type="j")$maxDThetaAcceptor, - 0) >= minDeltaPsi + cutoffs$maxDPsi3, + cutoffs$maxDPsi5, + mcols(object, type="j")$maxDThetaDonor, + mcols(object, type="j")$maxDThetaAcceptor, + 0) >= minDeltaPsi if("passedExpression" %in% colnames(mcols(object, type="j"))){ mcols(object, type="j")[['passed']] <- mcols(object, type="j")[['passedExpression']] & @@ -232,8 +528,8 @@ applyExpressionFilters <- function(fds, minExpressionInOneSample, # report rare junctions that passed minExpression filter but not # quantileFilter as SE obj junctionsToReport <- maxCount >= minExpressionInOneSample & - !(quantileValue5 >= quantileMinExpression & - quantileValue3 >= quantileMinExpression) + !(quantileValue5 >= quantileMinExpression & + quantileValue3 >= quantileMinExpression) outputDir <- file.path(workingDir(fds), "savedObjects", nameNoSpace(fds)) if(any(junctionsToReport)){ @@ -247,8 +543,8 @@ applyExpressionFilters <- function(fds, minExpressionInOneSample, } } rareJunctions <- saveHDF5SummarizedExperiment(rareJunctions, - dir=file.path(tempdir(), 
"tmp_rJ"), - replace=TRUE) + dir=file.path(tempdir(), "tmp_rJ"), + replace=TRUE) # check if folder already exists from previous filtering rareJctsDir <- file.path(outputDir, "rareJunctions") @@ -284,7 +580,6 @@ applyExpressionFilters <- function(fds, minExpressionInOneSample, fds <- fds[mcols(fds, type="j")[['passedExpression']], by="psi5"] return(fds) - } #' Applies previously calculated variablilty filters @@ -308,8 +603,8 @@ applyVariabilityFilters <- function(fds, minDeltaPsi){ nonVariableJunctions <- asSE(fds[filtered, by="j"]) for(aname in assayNames(nonVariableJunctions)){ if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5", - "rawOtherCounts_psi3", "psi5", "psi3", - "delta_psi5", "delta_psi3"))){ + "rawOtherCounts_psi3", "psi5", "psi3", + "delta_psi5", "delta_psi3"))){ assay(nonVariableJunctions, aname) <- NULL } } @@ -325,15 +620,15 @@ applyVariabilityFilters <- function(fds, minDeltaPsi){ "cannot be restored.") nV_stored <- loadHDF5SummarizedExperiment(dir=nonVarJctsDir) toReport <- mcols(nV_stored)$maxDPsi5 < minDeltaPsi & - mcols(nV_stored)$maxDPsi3 < minDeltaPsi & - mcols(nV_stored)$maxDThetaDonor < minDeltaPsi & - mcols(nV_stored)$maxDThetaAcceptor < minDeltaPsi + mcols(nV_stored)$maxDPsi3 < minDeltaPsi & + mcols(nV_stored)$maxDThetaDonor < minDeltaPsi & + mcols(nV_stored)$maxDThetaAcceptor < minDeltaPsi nVJunctions <- rbind(nonVariableJunctions, nV_stored[toReport,]) for(aname in assayNames(nVJunctions)){ assay(nVJunctions, aname) <- - rbind(as.matrix(assay(nonVariableJunctions, aname)), - as.matrix(assay(nV_stored[toReport,], aname)) ) + rbind(as.matrix(assay(nonVariableJunctions, aname)), + as.matrix(assay(nV_stored[toReport,], aname)) ) } nonVariableJunctions <- nVJunctions rm(nVJunctions) @@ -351,5 +646,5 @@ applyVariabilityFilters <- function(fds, minDeltaPsi){ "% of the junctions")) fds <- fds[mcols(fds, type="j")[['passedVariability']], by="psi5"] return(fds) - + } diff --git a/R/find_encoding_dimensions.R b/R/find_encoding_dimensions.R index b01a4d1a..ad4873ec 100644 --- a/R/find_encoding_dimensions.R +++ b/R/find_encoding_dimensions.R @@ -27,6 +27,8 @@ predict_outliers <- function(fds, type, implementation, BPPARAM){ fds <- calculatePvalues(fds, type=type, implementation=implementation, BPPARAM=BPPARAM) + fds <- calculatePadjValues(fds, type=type, geneLevel=FALSE, + BPPARAM=BPPARAM) return(fds) } @@ -50,7 +52,7 @@ eval_prot <- function(fds, type){ }, FUN.VALUE=logical(length(unique(index))) ) ) + 0 if(any(is.na(scores))){ - warning(sum(is.na(scores)), " P-values where NAs.") + # warning(sum(is.na(scores)), " P-values where NAs.") scores[is.na(scores)] <- min(scores, na.rm=TRUE)-1 } pr <- pr.curve(scores, weights.class0=labels) @@ -111,17 +113,18 @@ findEncodingDim <- function(i, fds, type, params, implementation, #' @examples #' # generate data #' fds <- makeSimulatedFraserDataSet(m=15, j=20) +#' fds <- calculatePSIValues(fds) #' #' # run hyperparameter optimization -#' fds <- optimHyperParams(fds, type="psi5", q_param=c(2, 5)) +#' fds <- optimHyperParams(fds, type="jaccard", q_param=c(2, 5)) #' #' # get estimated optimal dimension of the latent space -#' bestQ(fds, type="psi5") -#' hyperParams(fds, type="psi5") +#' bestQ(fds, type="jaccard") +#' hyperParams(fds, type="jaccard") #' #' @export -optimHyperParams <- function(fds, type, implementation="PCA", - q_param=seq(2, min(40, ncol(fds)), by=3), +optimHyperParams <- function(fds, type=psiTypes, implementation="PCA", + q_param=getEncDimRange(fds), noise_param=0, minDeltaPsi=0.1, iterations=5, setSubset=50000, 
injectFreq=1e-2, BPPARAM=bpparam(), internalThreads=1, plot=TRUE, @@ -227,3 +230,19 @@ optimHyperParams <- function(fds, type, implementation="PCA", return(fds) } +#' Get default range of latent space dimensions to test during hyper param opt +#' @noRd +getEncDimRange <- function(fds, mp=3){ + # Get range for latent space dimension + a <- 2 + b <- min(ncol(fds), nrow(fds)) / mp # N/mp + + maxSteps <- 12 + if(mp < 6){ + maxSteps <- 15 + } + + Nsteps <- min(maxSteps, b) + pars_q <- round(exp(seq(log(a),log(b),length.out = Nsteps))) %>% unique + return(pars_q) +} diff --git a/R/fitCorrectionMethods.R b/R/fitCorrectionMethods.R index f9b67599..1b726403 100644 --- a/R/fitCorrectionMethods.R +++ b/R/fitCorrectionMethods.R @@ -40,7 +40,7 @@ fit.FraserDataSet <- function(object, implementation=c("PCA", "PCA-BB-Decoder", "AE", "AE-weighted", "PCA-BB-full", "fullAE", "PCA-regression", "PCA-reg-full", "PCA-BB-Decoder-no-weights", "BB"), - q, type="psi3", rhoRange=c(1e-8, 1-1e-8), + q, type=psiTypes, rhoRange=c(-30, 30), weighted=FALSE, noiseAlpha=1, convergence=1e-5, iterations=15, initialize=TRUE, control=list(), BPPARAM=bpparam(), nSubset=15000, @@ -51,6 +51,7 @@ fit.FraserDataSet <- function(object, implementation=c("PCA", "PCA-BB-Decoder", paste(names(list(...)), collapse=", ")) } method <- match.arg(implementation) + type <- match.arg(type) verbose <- verbose(object) > 0 diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R index 94e8ca5e..00fb0055 100644 --- a/R/getNSetterFuns.R +++ b/R/getNSetterFuns.R @@ -10,11 +10,14 @@ #' @param level Indicates if the retrieved p values should be adjusted on the #' donor/acceptor site-level (default) or if unadjusted junction-level #' p values should be returned. +#' @param filters A named list giving the filters that were applied for masking +#' during p value correction. Used for storing and retrieving the +#' correct set of requested p values. #' @param value The new value to be assigned. #' @param all Logical value indicating whether \code{hyperParams(fds)} should #' return the results of all evaluated parameter combinations or only #' for the optimal parameter combination. -#' @param ... Internally used parameteres. +#' @param ... Internally used parameters. #' @return A (delayed) matrix or vector dependent on the type of data retrieved. 
#' #' @name getter_setter_functions @@ -29,7 +32,7 @@ #' dontWriteHDF5 <- TRUE #' #' # get/set the splice metric for which results should be retrieved -#' currentType(fds) <- "psi5" +#' currentType(fds) <- "jaccard" #' currentType(fds) #' #' # get fitted parameters @@ -40,6 +43,9 @@ #' # get statistics #' pVals(fds) #' padjVals(fds) +#' +#' # zscore not calculated by default +#' fds <- calculateZscore(fds, type="jaccard") #' zScores(fds) #' #' # set and get pseudocount @@ -47,9 +53,9 @@ #' pseudocount() #' #' # retrieve or set a mask to exclude certain junctions in the fitting step -#' featureExclusionMask(fds, type="theta") <- sample( -#' c(FALSE, TRUE), nrow(mcols(fds, type="theta")), replace=TRUE) -#' featureExclusionMask(fds, type="theta") +#' featureExclusionMask(fds, type="jaccard") <- sample( +#' c(FALSE, TRUE), nrow(mcols(fds, type="jaccard")), replace=TRUE) +#' featureExclusionMask(fds, type="jaccard") #' #' # controlling the verbosity level of the output of some algorithms #' verbose(fds) <- 2 @@ -198,7 +204,7 @@ predictY <- function(fds, type=currentType(fds), noiseAlpha=NULL){ } -`setAssayMatrix<-` <- function(fds, name, type, ..., value){ +`setAssayMatrix<-` <- function(fds, name, type=currentType(fds), ..., value){ if(!is.matrix(value)){ value <- matrix(value, ncol=ncol(fds), nrow=nrow(mcols(fds, type=type))) } @@ -217,7 +223,7 @@ predictY <- function(fds, type=currentType(fds), noiseAlpha=NULL){ fds } -getAssayMatrix <- function(fds, name, type, byGroup=FALSE){ +getAssayMatrix <- function(fds, name, type=currentType(fds), byGroup=FALSE){ if(missing(name)){ name <- type } else { @@ -248,8 +254,8 @@ zScores <- function(fds, type=currentType(fds), byGroup=FALSE, ...){ #' @describeIn getter_setter_functions This returns the calculated p-values. #' @export pVals <- function(fds, type=currentType(fds), level="site", - dist="BetaBinomial", ...){ - level <- match.arg(level, choices=c("site", "junction")) + filters=list(), dist="BetaBinomial", ...){ + level <- match.arg(level, choices=c("site", "junction", "gene")) dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) aname <- paste0("pvalues", dist) if(level == "junction"){ @@ -260,33 +266,148 @@ pVals <- function(fds, type=currentType(fds), level="site", warning("Did not find junction-level p values. ", "Using site-level p values instead.") } + } else{ + aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname) + # add information on used filters + if(is.null(names(filters))){ + filters <- list(rho=1) + } + for(n in sort(names(filters))){ + aname_new <- paste0(aname, "_", n, filters[[n]]) + if(n == "rho" && filters[[n]] == 1){ + if(any(grepl(aname_new, assayNames(fds))) || + any(grepl(aname_new, names(metadata(fds))))){ + aname <- aname_new + } + }else{ + aname <- aname_new + } + } + if(level == "gene"){ + if(!paste(aname, type, sep="_") %in% names(metadata(fds))){ + stop("Did not find gene-level p values. ", + "Please compute them first.") + } + return(metadata(fds)[[paste(aname, type, sep="_")]]) + } } + getAssayMatrix(fds, aname, type=type, ...) } `pVals<-` <- function(fds, type=currentType(fds), level="site", + filters=list(), dist="BetaBinomial", ..., value){ - level <- match.arg(level, choices=c("site", "junction")) + level <- match.arg(level, choices=c("site", "junction", "gene")) dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) aname <- paste0("pvalues", dist) if(level == "junction"){ aname <- paste0(aname, "_junction") + setAssayMatrix(fds, name=aname, type=type, ...) 
<- value + return(fds) + } else if(level == "gene"){ + aname <- paste0(aname, "_gene") + } + # add information on used filters + for(n in sort(names(filters))){ + aname <- paste0(aname, "_", n, filters[[n]]) + } + + if(level == "gene"){ + if(is.null(rownames(value))){ + stop("Missing rownames when storing gene-level pvalues.") + } + metadata(fds)[[paste(aname, type, sep="_")]] <- value + } else{ + setAssayMatrix(fds, name=aname, type=type, ...) <- value } - setAssayMatrix(fds, name=aname, type=type, ...) <- value return(fds) } #' @describeIn getter_setter_functions This returns the adjusted p-values. #' @export -padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"), ...){ +padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"), + level="site", subsetName=NULL, filters=list(), ...){ + level <- match.arg(level, choices=c("site", "gene")) dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) - return(getAssayMatrix(fds, paste0("padj", dist), type=type, ...)) + aname <- paste0("padj", dist) + aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname) + if(!is.null(subsetName)){ + aname <- paste0(aname, "_", subsetName) + } + # add information on used filters + if(is.null(names(filters))){ + filters <- list(rho=1) + } + for(n in sort(names(filters))){ + aname_new <- paste0(aname, "_", n, filters[[n]]) + if(n == "rho" && filters[[n]] == 1){ + if(any(grepl(aname_new, assayNames(fds))) || + any(grepl(aname_new, names(metadata(fds))))){ + aname <- aname_new + } + }else{ + aname <- aname_new + } + } + if(level == "gene"){ + if(!paste(aname, type, sep="_") %in% names(metadata(fds))){ + stop("Did not find gene-level padj values. ", + "Please compute them first.") + } + return(metadata(fds)[[paste(aname, type, sep="_")]]) + } + return(getAssayMatrix(fds, aname, type=type, ...)) } -`padjVals<-` <- function(fds, type=currentType(fds), - dist="BetaBinomial", ..., value){ +`padjVals<-` <- function(fds, type=currentType(fds), level="site", + dist="BetaBinomial", subsetName=NULL, filters=list(), ..., + value){ + level <- match.arg(level, choices=c("site", "gene")) dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) - setAssayMatrix(fds, name=paste0("padj", dist), type=type, ...) <- value + aname <- paste0("padj", dist) + aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname) + if(!is.null(subsetName)){ + aname <- paste0(aname, "_", subsetName) + } + # add information on used filters + for(n in sort(names(filters))){ + aname <- paste0(aname, "_", n, filters[[n]]) + } + if(level == "gene"){ + if(is.null(rownames(value))){ + stop("Missing rownames when storing gene-level pvalues.") + } + metadata(fds)[[paste(aname, type, sep="_")]] <- value + } else{ + setAssayMatrix(fds, name=aname, type=type, ...) <- value + } + return(fds) +} + +#' @describeIn getter_setter_functions This returns the names of FDR subsets +#' for which adjusted p values have been calculated. 
+#' @export +availableFDRsubsets <- function(fds){ + ans <- metadata(fds)[["FDRsubsets"]] + return(ans) +} + +`availableFDRsubsets<-` <- function(fds, value){ + metadata(fds)[["FDRsubsets"]] <- value + return(fds) +} + +`addToAvailableFDRsubsets<-` <- function(fds, value){ + if(!isScalarCharacter(value)){ + stop("The assigned value needs to be a scalar character.") + } + ans <- metadata(fds)[["FDRsubsets"]] + if(is.null(ans)){ + metadata(fds)[["FDRsubsets"]] <- value + } else{ + metadata(fds)[["FDRsubsets"]] <- unique(c(ans, value)) + } return(fds) } @@ -311,10 +432,14 @@ deltaPsiValue <- function(fds, type=currentType(fds)){ #' @describeIn getter_setter_functions Returns the psi type that is used -#' within several methods in the FRASER package. +#' within several methods in the FRASER package (defaults to jaccard). #' @export currentType <- function(fds){ - return(metadata(fds)[['currentType']]) + curType <- metadata(fds)[['currentType']] + if(is.null(curType)){ + curType <- "jaccard" + } + return(curType) } #' @describeIn getter_setter_functions Sets the psi type that is to be used @@ -326,6 +451,27 @@ currentType <- function(fds){ return(fds) } +#' @describeIn getter_setter_functions Returns the splice metrics that will be +#' fitted (defaults to jaccard, used within several methods in the +#' FRASER package). +#' @export +fitMetrics <- function(fds){ + metrics <- metadata(fds)[['fit_metrics']] + if(is.null(metrics)){ + metrics <- "jaccard" + } + return(metrics) +} + +#' @describeIn getter_setter_functions Sets the splice metrics that will be +#' fitted (used within several methods in the FRASER package). +#' @export +`fitMetrics<-` <- function(fds, value){ + stopifnot(is.character(whichPSIType(value))) + metadata(fds)[['fit_metrics']] <- whichPSIType(value) + return(fds) +} + #' @describeIn getter_setter_functions Sets and returns the pseudo count used #' within the FRASER fitting procedure. #' @export @@ -342,7 +488,7 @@ pseudocount <- function(value=NULL){ # set pseudo count if provided stopifnot(isScalarNumeric(value)) stopifnot(value >= 0) - value <- as.integer(value) + value <- as.numeric(value) options('FRASER.pseudoCount'=value) devNULL <- .setPseudoCount(value) stopifnot(value == devNULL) @@ -433,7 +579,7 @@ dontWriteHDF5 <- function(fds){ return(fds) } -getTrueOutliers <- function(fds, type, byGroup=FALSE, ...){ +getTrueOutliers <- function(fds, type=currentType(fds), byGroup=FALSE, ...){ ans <- getAssayMatrix(fds, "trueOutliers", type) if(isTRUE(byGroup)){ ans <- getAbsMaxByGroup(fds, type, ans, ...) @@ -443,7 +589,7 @@ getTrueOutliers <- function(fds, type, byGroup=FALSE, ...){ pmin(pmax(ans, -1), 1) } -getTrueDeltaPsi <- function(fds, type, byGroup=FALSE, ...){ +getTrueDeltaPsi <- function(fds, type=currentType(fds), byGroup=FALSE, ...){ ans <- getAssayMatrix(fds, "trueDeltaPSI", type) if(isTRUE(byGroup)){ ans <- getAbsMaxByGroup(fds, type, ans, ...) 
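The pVals()/padjVals() getters above encode the p-value level, the FDR subset and any masking filters directly into the assay or metadata name (e.g. padjBetaBinomial_gene_<subset>_rho<cutoff>). A short usage sketch; the subset name "OMIM" and the rho cutoff of 0.1 are placeholders and have to match what was used when the adjusted p values were computed:

# list the FDR subsets for which adjusted p values are stored
availableFDRsubsets(fds)

# transcriptome-wide gene-level adjusted p values for the jaccard metric
padj_gene <- padjVals(fds, type="jaccard", level="gene")

# adjusted p values computed on a gene subset (subset name is hypothetical)
padj_sub  <- padjVals(fds, type="jaccard", level="gene", subsetName="OMIM")

# site-level adjusted p values stored under a rho-based masking filter
padj_rho  <- padjVals(fds, type="jaccard", filters=list(rho=0.1))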
@@ -451,7 +597,8 @@ getTrueDeltaPsi <- function(fds, type, byGroup=FALSE, ...){ ans } -getAbsMaxByGroup <- function(fds, type, mat, index=NULL, BPPARAM=bpparam()){ +getAbsMaxByGroup <- function(fds, type=currentType(fds), mat, index=NULL, + BPPARAM=bpparam()){ if(is.null(index)){ index <- getSiteIndex(fds, type) } @@ -470,13 +617,13 @@ getAbsMaxByGroup <- function(fds, type, mat, index=NULL, BPPARAM=bpparam()){ return(values) } -getByGroup <- function(fds, type, value){ +getByGroup <- function(fds, type=currentType(fds), value){ index <- getSiteIndex(fds, type) idx <- !duplicated(index) return(value[idx,]) } -getDeltaPsi <- function(fds, type, byGroup=FALSE, ...){ +getDeltaPsi <- function(fds, type=currentType(fds), byGroup=FALSE, ...){ mu <- predictedMeans(fds, type) dataPsi <- (K(fds, type) + pseudocount())/(N(fds, type) + 2*pseudocount()) deltaPSI <- dataPsi - mu @@ -488,13 +635,14 @@ getDeltaPsi <- function(fds, type, byGroup=FALSE, ...){ # calculate FRASER weights -calcFraserWeights <- function(fds, psiType){ +calcFraserWeights <- function(fds, psiType=currentType(fds)){ k <- as.matrix(K(fds, psiType)) n <- as.matrix(N(fds, psiType)) mu <- t(predictMu(fds, psiType)) rho <- rho(fds, psiType) - dataPsi <- plogis(t( - x(fds, type=psiType, all=TRUE, center=FALSE, noiseAlpha=NULL))) + # dataPsi <- plogis(t( + # x(fds, type=psiType, all=TRUE, center=FALSE, noiseAlpha=NULL))) + dataPsi <- k / n # pearson residuals for BB # on counts of success k @@ -503,18 +651,23 @@ calcFraserWeights <- function(fds, psiType){ # (1+((n+2*pseudocount())-1)*rho)) # on probability of success mu r <- (dataPsi - mu) / sqrt( - mu * (1-mu) * (1+((n+2*pseudocount())-1)*rho) / - (n+2*pseudocount())) + # mu * (1-mu) * (1+((n+2*pseudocount())-1)*rho) / + # (n+2*pseudocount())) + mu * (1-mu) * (1+(n-1)*rho) / n + ) # weights according to Huber function (as in edgeR) c <- 1.345; # constant, as suggested in edgeR paper w <- ifelse(abs(r) > c, c/abs(r) , 1) + # set weights to 0 if NA (i.e. 
N=0) + w[is.na(w)] <- 0 + return(w) } # get FRASER weights -weights <- function(fds, type){ +weights <- function(fds, type=currentType(fds)){ return(getAssayMatrix(fds, "weights", type)) } @@ -524,7 +677,7 @@ weights <- function(fds, type){ return(fds) } -getIndexFromResultTable <- function(fds, resultTable, padj.method="holm"){ +getIndexFromResultTable <- function(fds, resultTable){ type <- as.character(resultTable$type) target <- makeGRangesFromDataFrame(resultTable) if(type == "theta"){ @@ -541,8 +694,9 @@ getIndexFromResultTable <- function(fds, resultTable, padj.method="holm"){ ov } -getPlottingDT <- function(fds, axis=c("row", "col"), type=NULL, result=NULL, - idx=NULL, aggregate=FALSE, pvalLevel="site", Ncpus=3, ...){ +getPlottingDT <- function(fds, axis=c("row", "col"), type=currentType(fds), + result=NULL, idx=NULL, aggregate=FALSE, pvalLevel="site", + Ncpus=3, geneColumn="hgnc_symbol", ...){ if(!is.null(result)){ type <- as.character(result$type) idx <- getIndexFromResultTable(fds, result) @@ -564,8 +718,8 @@ getPlottingDT <- function(fds, axis=c("row", "col"), type=NULL, result=NULL, spliceID <- getSiteIndex(fds, type=type)[idxrow] feature_names <- rownames(mcols(fds, type=type))[idxrow] - if("hgnc_symbol" %in% colnames(mcols(fds, type=type))){ - feature_names <- mcols(fds, type=type)[idxrow,"hgnc_symbol"] + if(geneColumn %in% colnames(mcols(fds, type=type))){ + feature_names <- mcols(fds, type=type)[idxrow, geneColumn] } if(is.null(feature_names)){ feature_names <- as.character(seq_row(mcols(fds, type=type)))[idxrow] @@ -587,42 +741,62 @@ getPlottingDT <- function(fds, axis=c("row", "col"), type=NULL, result=NULL, pval = c(pVals(fds, type=type, level=pvalLevel)[idxrow, idxcol]), padj = c(padjVals(fds, type=type)[idxrow, idxcol]), - zscore = c(zScores(fds, type=type)[idxrow, idxcol]), - obsPsi = c((k + pseudocount())/(n + 2*pseudocount())), - predPsi = c(predictedMeans(fds, type)[idxrow, idxcol])) + obsPsi = c(k/n), + predPsi = c(predictedMeans(fds, type)[idxrow, idxcol]), + rho = rep(rho(fds, type=type)[idxrow], + ifelse(isTRUE(idxcol), ncol(fds), sum(idxcol))) + ) dt[, deltaPsi:=obsPsi - predPsi] # add aberrant information to it aberrantVec <- aberrant(fds, ..., padjVals=dt[,.(padj)], - dPsi=dt[,.(deltaPsi)], zscores=dt[,.(zscore)], n=dt[,.(n)]) + dPsi=dt[,.(deltaPsi)], n=dt[,.(n)], + rhoVals=dt[,.(rho)], aggregate=FALSE) dt[,aberrant:=aberrantVec] - # if requested return gene p values (correct for multiple testing again) + # if requested return gene p values if(isTRUE(aggregate)){ dt <- dt[!is.na(featureID)] - - # correct by gene and take the smallest p value - dt <- rbindlist(bplapply(unique(dt[,sampleID]), - BPPARAM=getBPParam(Ncpus, length(unique(dt[,sampleID]))), - FUN=function(x){ - dttmp <- dt[sampleID == x] - dttmp[, pval:=p.adjust(pval, method="holm"), - by="sampleID,featureID,type"] - dttmp <- dttmp[order(sampleID, featureID, type, -aberrant, - pval, -abs(deltaPsi))][ - !duplicated(data.table(sampleID, featureID, type))] - dttmp <- dttmp[, padj:=p.adjust(pval, method="BY"), - by="sampleID,type"] - dttmp - })) + # split featureID into several rows if more than one + dt[, dt_idx:=seq_len(.N)] + dt_tmp <- dt[, splitGenes(featureID), by="dt_idx"] + dt <- dt[dt_tmp$dt_idx,] + dt[,`:=`(featureID=dt_tmp$V1, dt_idx=NULL)] + + # get gene-level pvalue matrices + pvalsGene <- lapply(c("pval", "padj"), function(x){ + if(x == "pval"){ + pvalsGene <- pVals(fds, type=type, + level="gene")[,idxcol,drop=FALSE] + } else { + pvalsGene <- padjVals(fds, type=type, + 
level="gene")[,idxcol,drop=FALSE] + } + pvalsGene <- data.table(featureID=rownames(pvalsGene), pvalsGene) + pvalsGene <- melt(pvalsGene, value.name=paste0("gene_", x), + id.vars="featureID", variable.name="sampleID") + return(pvalsGene) + }) + pvalsGene <- merge(pvalsGene[[1]], pvalsGene[[2]], + by=c("featureID", "sampleID")) + + # merge with gene pval matrix + dt <- merge(dt, pvalsGene, by=c("featureID", "sampleID")) + dt[,`:=`(pval=gene_pval, padj=gene_padj, + gene_pval=NULL, gene_padj=NULL)] + + # sort + dt <- dt[order(sampleID, featureID, type, -aberrant, + padj, -abs(deltaPsi))][ + !duplicated(data.table(sampleID, featureID, type))] } - + # return object dt } -#' @describeIn getter_setter_functions Dependend on the level of verbosity +#' @describeIn getter_setter_functions Dependent on the level of verbosity #' the algorithm reports more or less to the user. 0 means being quiet #' and 10 means everything. #' @export diff --git a/R/helper-functions.R b/R/helper-functions.R index 15b91fde..03d8de3b 100644 --- a/R/helper-functions.R +++ b/R/helper-functions.R @@ -71,7 +71,7 @@ checkReadType <- function(fds, type){ # check if type is null or missing if(missing(type) | is.null(type)){ - if(verbose(fds) > 0){ + if(verbose(fds) > 3){ warning("Read type was not specified!", "We will assume the default: 'j'") } @@ -79,7 +79,7 @@ checkReadType <- function(fds, type){ } type <- unique(type) stopifnot(isScalarCharacter(type)) - correctTypes <- c(psi3="j", psi5="j", theta="ss") + correctTypes <- c(psi3="j", psi5="j", theta="ss", jaccard="j") # check if it is already the correct type if(type %in% correctTypes) return(type) @@ -109,7 +109,7 @@ checkReadType <- function(fds, type){ #' #' @noRd whichPSIType <- function(type){ - unlist(regmatches(type, gregexpr("psi(3|5)|theta", type, perl=TRUE))) + unlist(regmatches(type, gregexpr("psi(3|5)|theta|jaccard", type, perl=TRUE))) } #' @@ -122,7 +122,8 @@ whichReadType <- function(fds, name){ # check writing if(name == "ss" | endsWith(name, "theta")) return("ss") - if(name == "j" | endsWith(name, "psi5") | endsWith(name, "psi3")) + if(name == "j" | endsWith(name, "psi5") | endsWith(name, "psi3") | + endsWith(name, "jaccard")) return("j") # check assay names @@ -355,18 +356,18 @@ assayExists <- function(fds, assayName){ return(aexists) } -getAssayAsVector <- function(fds, prefix, psiType, sampleID){ +getAssayAsVector <- function(fds, prefix, psiType=currentType(fds), sampleID){ as.vector(assay(fds, paste0(prefix, psiType))[,sampleID]) } -variableJunctions <- function(fds, type, minDeltaPsi=0.1){ +variableJunctions <- function(fds, type=currentType(fds), minDeltaPsi=0.1){ psi <- K(fds, type=type)/N(fds, type=type) j2keep <- rowMaxs(abs(psi - rowMeans(psi, na.rm=TRUE)), na.rm=TRUE) j2keep >= minDeltaPsi } -subsetKMostVariableJunctions <- function(fds, type, n){ +subsetKMostVariableJunctions <- function(fds, type=currentType(fds), n){ curX <- x(fds, type=type, all=TRUE, center=FALSE, noiseAlpha=NULL) xsd <- colSds(curX) nMostVarJuncs <- which(xsd >= sort(xsd, TRUE)[min(length(xsd), n*2)]) @@ -375,7 +376,8 @@ subsetKMostVariableJunctions <- function(fds, type, n){ ans } -getSubsetVector <- function(fds, type, minDeltaPsi=0.1, nSubset=15000){ +getSubsetVector <- function(fds, type=currentType(fds), minDeltaPsi=0.1, + nSubset=15000){ # get any variable intron ans <- variableJunctions(fds, type, minDeltaPsi=minDeltaPsi) @@ -549,6 +551,130 @@ getStrandString <- function(fds){ return(strand) } + +#' +#' Check if adjusted pvalues have been computed for a given set of 
filters. +#' @noRd +checkPadjAvailableForFilters <- function(fds, type=currentType(fds), + filters=list(), dist="BetaBinomial", aggregate=FALSE, + subsetName=NULL){ + dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) + aname <- paste0("padj", dist) + aname <- ifelse(isTRUE(aggregate), paste0(aname, "_gene"), aname) + aname <- ifelse(!is.null(subsetName), paste0(aname, "_", subsetName), aname) + + # add information on used filters + for(n in sort(names(filters))){ + aname_new <- paste0(aname, "_", n, filters[[n]]) + if(n == "rho" && filters[[n]] == 1){ + if(any(grepl(aname_new, assayNames(fds))) || + any(grepl(aname_new, names(metadata(fds))))){ + aname <- aname_new + } + }else{ + aname <- aname_new + } + } + aname <- paste(aname, type, sep="_") + if(isTRUE(aggregate)){ + pvalsAvailable <- aname %in% names(metadata(fds)) + } else{ + pvalsAvailable <- aname %in% assayNames(fds) + } + return(pvalsAvailable) +} + +#' +#' Find most aberrant junction for each aberrant gene +#' +#' @param gr GRanges object with information about junctions. +#' @param aberrantGenes Significant genes for which the corresponding junction +#' should be extracted. +#' @param pvals Vector of pvalues (for one sample). +#' @param dpsi Vector of delta psi values (for one sample). +#' @param aberrantJunctions Vector indicating which junctions are considered +#' aberrant. +#' @param geneColumn Name of the column in mcols(fds) that has gene annotation. +#' @noRd +findJunctionsForAberrantGenes <- function(gr, aberrantGenes, pvals, dpsi, + aberrantJunctions, geneColumn="hgnc_symbol"){ + dt <- data.table(idx=mcols(gr)$idx, + geneID=mcols(gr)[,geneColumn], + pval=pvals, + dpsi=abs(dpsi), + aberrant=aberrantJunctions) + dt[, dt_idx:=seq_len(.N)] + dt_tmp <- dt[, splitGenes(geneID), by="dt_idx"] + dt <- dt[dt_tmp$dt_idx,] + dt[,`:=`(geneID=dt_tmp$V1, dt_idx=NULL)] + dt <- dt[geneID %in% aberrantGenes,] + dt <- dt[!is.na(aberrant) & aberrant == TRUE,] + + # sort per gene by lowest pvalue / highest deltaPsi and return index + dt <- dt[order(geneID, -aberrant, pval, -dpsi)] + dt <- dt[!duplicated(dt, by="geneID"),] + + # remove gene-level significant result if no junction in that gene passed + # the filters + dt <- dt[!is.na(pval),] + + junctionsToReport <- dt[,idx] + names(junctionsToReport) <- dt[,geneID] + junctionsToReport <- sort(junctionsToReport) + return(junctionsToReport) +} + +collapseResTablePerGene <- function(res, geneColumn="hgncSymbol"){ + if(length(res) == 0){ + return(res) + } + if(!is.data.table(res)){ + res <- as.data.table(res) + } + + if(any(!c("pValue", "pValueGene", geneColumn) %in% colnames(res))){ + stop("For collapsing per gene, the results table needs to contain ", + "the columns pValue, pValueGene and ", geneColumn, ".") + } + + res <- res[order(res$pValueGene, res$pValue)] + naIdx <- is.na(res[, get(geneColumn)]) + ansNoNA <- res[!is.na(res[, get(geneColumn)]),] + + # get final result table + dupIdx <- duplicated(data.table(as.vector(ansNoNA[, get(geneColumn)]), + as.vector(ansNoNA$sampleID))) + ans <- res[!naIdx,][!dupIdx,] + return(ans) +} + +#' ignores NA in unique if other values than NA are present +#' @noRd +uniqueIgnoreNA <- function(x){ + uniq <- unique(x) + if(length(uniq) > 1) uniq <- uniq[!is.na(uniq)] + return(uniq) +} + +#' split string of gene names into vector +#' @noRd +splitGenes <- function(x, sep=";"){ + return(unlist(strsplit(as.character(x), sep, fixed=TRUE))) +} + +#' cap string of gene names to show max 3 gene names +#' @noRd +limitGeneNamesList <- 
function(gene_names, maxLength=3){ + gene_names <- as.character(gene_names) + numFeatures <- unlist(lapply(gene_names, function(x) length(splitGenes(x)))) + gene_names[numFeatures > maxLength] <- + unlist(lapply(gene_names[numFeatures > maxLength], function(x){ + paste(c(splitGenes(x)[seq_len(maxLength)], "..."), + collapse=";") + } )) + return(gene_names) +} + checkForAndCreateDir <- function(object, dir){ verbose <- 0 if(is(object, "FraserDataSet")){ @@ -569,4 +695,3 @@ checkForAndCreateDir <- function(object, dir){ } return(TRUE) } - diff --git a/R/makeSimulatedDataset.R b/R/makeSimulatedDataset.R index 6e29adf8..7a1fb6f9 100644 --- a/R/makeSimulatedDataset.R +++ b/R/makeSimulatedDataset.R @@ -435,9 +435,10 @@ makeSimulatedFraserDataSet_Multinomial <- function(m=200, j=1000, q=10, #' @examples #' # A generic dataset #' fds <- makeSimulatedFraserDataSet() +#' fds <- calculatePSIValues(fds) #' fds <- injectOutliers(fds, minDpsi=0.2, freq=1E-3) #' @export -injectOutliers <- function(fds, type=c("psi5", "psi3", "theta"), +injectOutliers <- function(fds, type=psiTypes, freq=1E-3, minDpsi=0.2, minCoverage=2, deltaDistr="uniformDistr", verbose=FALSE, method=c('samplePSI', 'meanPSI', 'simulatedPSI'), @@ -472,6 +473,9 @@ injectOutliers <- function(fds, type=c("psi5", "psi3", "theta"), setAssayMatrix(fds, type="psi3", "originalOtherCounts", withDimnames=FALSE) <- counts(fds, type="psi3", side="other") + setAssayMatrix(fds, type="jaccard", "originalOtherCounts", + withDimnames=FALSE) <- + counts(fds, type="jaccard", side="other") } # get infos from the fds @@ -500,7 +504,9 @@ injectOutliers <- function(fds, type=c("psi5", "psi3", "theta"), dt[,groupSize:=.N, by=groupID] # Get groups where outlier can be injected - available_groups <- dt[groupSize > ifelse(type == "theta", 0, 1), unique(groupID)] + available_groups <- dt[groupSize > ifelse(type == "theta" | + type == "jaccard", 0, 1), + unique(groupID)] # e.g. for psi3/5: no donor/acceptor # groups with at least 2 junctions (e.g in simulationBB) diff --git a/R/plotMethods.R b/R/plotMethods.R index 6d824392..6f7a9a22 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -15,6 +15,10 @@ #' \item plotFilterExpression() #' \item plotFilterVariability() #' \item plotEncDimSearch() +#' \item plotBamCoverage() +#' \item plotBamCoverageFromResultTable() +#' \item plotManhattan() +#' \item plotSpliceMetricRank() #' } #' #' For a detailed description of each plot function please see the details. @@ -25,10 +29,10 @@ #' @param type The psi type: either psi5, psi3 or theta (for SE). #' @param sampleID A sample ID which should be plotted. Can also be a vector. #' Integers are treated as indices. -#' @param idx,site A junction site ID or gene ID or one of both, which +#' @param idx A junction site ID or gene ID or one of both, which #' should be plotted. Can also be a vector. Integers are treated #' as indices. -#' @param padjCutoff,zScoreCutoff,deltaPsiCutoff Significance, Z-score or delta +#' @param padjCutoff,deltaPsiCutoff Significance or delta #' psi cutoff to mark outliers #' @param global Flag to plot a global Q-Q plot, default FALSE #' @param normalized If TRUE, the normalized psi values are used, the default, @@ -52,11 +56,61 @@ #' sample-sample correlation heatmap or \code{"junctionSample"} #' for a junction-sample correlation heatmap. #' @param onlyVariableIntrons Logical value indicating whether to show only -#' introns that also pass the variability filter. Defaults to -#' FALSE. +#' introns that also pass the variability filter. 
Defaults to +#' FALSE. #' @param onlyExpressedIntrons Logical value indicating whether to show only -#' introns that also pass the expression filter. Defaults to -#' FALSE. +#' introns that also pass the expression filter. Defaults to +#' FALSE. +#' @param gr A GRanges object indicating the genomic range that should be shown +#' in \code{plotBamCoverage}. +#' @param control_samples The sampleIDs of the samples used as control in +#' \code{plotBamCoverage}. +#' @param min_junction_count The minimal junction count across samples required +#' for a junction to appear in the splicegraph and coverage tracks +#' of \code{plotBamCoverage}. +#' @param txdb A TxDb object giving the gene/transcript annotation to use. +#' @param orgDb A OrgDb object giving the mapping of gene ids and symbols. +#' @param show_full_gene Should the full genomic range of the gene be shown in +#' \code{plotBamCoverageFromResultTable} (default: FALSE)? +#' If FALSE, only a certain region (see parameters left_extension +#' and right_extension) around the outlier junction is shown. +#' @param left_extension Indicating how far the plotted range around the outlier +#' junction should be extended to the left in +#' \code{plotBamCoverageFromResultTable}. +#' @param right_extension Indicating how far the plotted range around the +#' outlier junction should be extended to the right in +#' \code{plotBamCoverageFromResultTable}. +#' @param res_gene_col The column name in the given results table that +#' contains the gene annotation. +#' @param res_geneid_type The type of gene annotation in the results table in +#' \code{res_gene_col} (e.g. SYMBOL or ENTREZID etc.). This +#' information is needed for mapping between the results table and +#' the provided annotation in the txdb object. +#' @param txdb_geneid_type The type of gene_id present in \code{genes(txdb)} +#' (e.g. ENTREZID). This information is needed for +#' mapping between the results table and the provided annotation +#' in the txdb object. +#' @param highlight_range A \code{GenomicRanges} or \code{GenomicRangesList} +#' object of ranges to be highlighted in the splicegraph of +#' \code{plotBamCoverage}. +#' @param highlight_range_color The color of highlighted ranges in +#' the splicegraph of \code{plotBamCoverage}. +#' @param toscale In \code{plotBamCoverage}, indicates which part of the +#' plotted region should be drawn to scale. Possible values are +#' 'exon' (exonic regions are drawn to scale), +#' 'gene' (both exonic and intronic regions are drawn to scale) or +#' 'none' (exonic and intronic regions have constant length) +#' (see SGSeq package). +#' @param splicegraph_labels Indicated the format of exon/splice junction +#' labels in the splicegraph of \code{plotBamCoverage}. +#' Possible values are 'genomic_range' (gives the start position +#' of the first exon and the end position of the last exon that +#' are shown), 'id' (format E1,... J1,...), 'name' (format +#' type:chromosome:start-end:strand for each feature), +#' 'none' for no labels (see SGSeq package). +#' @param splicegraph_position The position of the splicegraph relative to the +#' coverage tracks in \code{plotBamCoverage}. Possible values +#' are 'top' (default) and 'bottom'. #' #### Graphical parameters #' @param main Title for the plot, if missing a default title will be used. @@ -89,6 +143,31 @@ #' @param bins Set the number of bins to be used in the histogram. #' @param legend.position Set legend position (x and y coordinate), defaults to #' the top right corner. 
+#' @param color_annotated The color for exons and junctions present in +#' the given annotation (in the splicegraph of +#' \code{plotBamCoverage}). +#' @param color_novel The color for novel exons and junctions not present in +#' the given annotation (in the splicegraph of +#' \code{plotBamCoverage}). +#' @param color_sample_interest The color in \code{plotBamCoverage} for the +#' sample of interest. +#' @param color_control_samples The color in \code{plotBamCoverage} for the +#' samples used as controls. +#' @param curvature_splicegraph The curvature of the junction arcs in the +#' splicegraph in \code{plotBamCoverage}. Decrease this value +#' for flatter arcs and increase it for steeper arcs. +#' @param curvature_coverage The curvature of the junction arcs in the +#' coverage tracks of \code{plotBamCoverage}. Decrease this +#' value for flatter arcs and increase it for steeper arcs. +#' @param mar The margin of the plot area for \code{plotBamCoverage} +#' (b,l,t,r). +#' @param cex For controlling the size of text and numbers in +#' \code{plotBamCoverage}. +#' @param chr Vector of chromosome names to show in \code{plotManhattan}. The +#' default is to show all chromosomes. +#' @param value Indicates which assay is shown in the manhattan plot. Defaults +#' to 'pvalue'. Other options are 'deltaPsi' and 'zScore'. +#' @param chrColor Interchanging colors by chromosome for \code{plotManhattan}. #' #### Additional ... parameter #' @param ... Additional parameters passed to plot() or plot_ly() if not stated @@ -116,6 +195,9 @@ #' #' \code{plotExpectedVsObservedPsi}: A scatter plot of the observed psi #' against the predicted psi for a given site. +#' +#' \code{plotSpliceMetricRank}: This function plots for a given intron the +#' observed values of the selected splice metrix against the sample rank. #' #' \code{plotCountCorHeatmap}: The correlation heatmap of the count data either #' of the full data set (i.e. sample-sample correlations) or of the top x most @@ -137,6 +219,17 @@ #' It plots the encoding dimension against the achieved loss (area under the #' precision-recall curve). From this plot the optimum should be choosen for #' the \code{q} in fitting process. +#' +#' \code{plotManhattan}: A Manhattan plot showing the junction pvalues by +#' genomic position. Useful to identify if outliers cluster by genomic position. +#' +#' \code{plotBamCoverage}: A sashimi plot showing the read coverage from +#' the underlying bam files for a given genomic range and sampleIDs. +#' +#' \code{plotBamCoverageFromResultTable}: A sashimi plot showing the read +#' coverage from the underlying bam files for a row in the results table. Can +#' either show the full range of the gene with the outlier junction or only a +#' certain region around the outlier. #' #' @return If base R graphics are used nothing is returned else the plotly or #' the gplot object is returned. 
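Of the plotting functions documented above, plotSpliceMetricRank() is the only one not exercised in the examples of the next hunk, so a minimal usage sketch follows. It assumes an fds object that has already been fit (e.g. with FRASER()) and a results object res as produced in those examples; the splice metric and row index are illustrative placeholders, not part of the diff.

    # rank plot of the observed splice metric for the first reported intron
    plotSpliceMetricRank(fds, result=res[1])
    # or address an intron directly by its row index and splice metric
    plotSpliceMetricRank(fds, type="jaccard", idx=10)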
@@ -145,42 +238,80 @@ #' @rdname plotFunctions #' @aliases plotFunctions plotAberrantPerSample plotVolcano plotQQ #' plotExpression plotCountCorHeatmap plotFilterExpression -#' plotExpectedVsObservedPsi plotEncDimSearch +#' plotExpectedVsObservedPsi plotEncDimSearch plotManhattan +#' plotBamCoverage plotBamCoverageFromResultTable #' @examples #' # create full FRASER object #' fds <- makeSimulatedFraserDataSet(m=40, j=200) #' fds <- calculatePSIValues(fds) #' fds <- filterExpressionAndVariability(fds, filter=FALSE) -#' # this step should be done for all splicing metrics and more dimensions -#' fds <- optimHyperParams(fds, "psi5", q_param=c(2,5,10,25)) +#' # this step should be done for more dimensions in practice +#' fds <- optimHyperParams(fds, "jaccard", q_param=c(2,5,10,25)) #' fds <- FRASER(fds) #' #' # QC plotting #' plotFilterExpression(fds) #' plotFilterVariability(fds) -#' plotCountCorHeatmap(fds, "theta") -#' plotCountCorHeatmap(fds, "theta", normalized=TRUE) -#' plotEncDimSearch(fds, type="psi5") +#' plotCountCorHeatmap(fds, "jaccard") +#' plotCountCorHeatmap(fds, "jaccard", normalized=TRUE) +#' plotEncDimSearch(fds, type="jaccard") #' #' # extract results #' plotAberrantPerSample(fds, aggregate=FALSE) -#' plotVolcano(fds, "sample1", "psi5") +#' plotVolcano(fds, "sample1", "jaccard") #' #' # dive into gene/sample level results #' res <- results(fds) #' res #' plotExpression(fds, result=res[1]) #' plotQQ(fds, result=res[1]) -#' plotExpectedVsObservedPsi(fds, type="psi5", res=res[1]) +#' plotExpectedVsObservedPsi(fds, res=res[1]) +#' +#' # create manhattan plot of pvalues by genomic position +#' if(require(ggbio)){ +#' plotManhattan(fds, type="jaccard", sampleID="sample10") +#' } +#' +#' # plot splice graph and coverage from bam files in a given region +#' if(require(SGSeq)){ +#' fds <- createTestFraserSettings() +#' gr <- GRanges(seqnames="chr19", +#' IRanges(start=7587496, end=7598895), +#' strand="+") +#' plotBamCoverage(fds, gr=gr, sampleID="sample3", +#' control_samples="sample2", min_junction_count=5, +#' curvature_splicegraph=1, curvature_coverage=1, +#' mar=c(1, 7, 0.1, 3)) +#' +#' # plot coverage from bam file for a row in the result table +#' fds <- createTestFraserDataSet() +#' require(TxDb.Hsapiens.UCSC.hg19.knownGene) +#' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene +#' require(org.Hs.eg.db) +#' orgDb <- org.Hs.eg.db +#' +#' res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA) +#' res_dt <- as.data.table(res) +#' res_dt <- res_dt[sampleID == "sample2",] +#' +#' # plot full range of gene containing outlier junction +#' plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=TRUE, +#' txdb=txdb, orgDb=orgDb, control_samples="sample3") +#' +#' # plot only certain range around outlier junction +#' plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, +#' control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb, +#' curvature_coverage=0.5, right_extension=5000, left_extension=5000, +#' splicegraph_labels="id") +#' } #' -#' NULL plotVolcano.FRASER <- function(object, sampleID, - type=c("psi3", "psi5", "theta"), basePlot=TRUE, + type=fitMetrics(object), basePlot=TRUE, aggregate=FALSE, main=NULL, label=NULL, - deltaPsiCutoff=0.3, padjCutoff=0.1, ...){ + deltaPsiCutoff=0.1, padjCutoff=0.1, ...){ type <- match.arg(type) @@ -213,7 +344,8 @@ plotVolcano.FRASER <- function(object, sampleID, } if(!is.na(padjCutoff)){ - if(dt[,any(padj <= padjCutoff)]){ + if(dt[padj <= padjCutoff, .N] > 0){ + padj_line <- min(dt[padj <= padjCutoff, 
-log10(pval)]) padj_line <- min(dt[padj <= padjCutoff, -log10(pval)]) } if(!"padj_line" %in% ls() || padj_line > 10 || is.na(padj_line)){ @@ -278,8 +410,8 @@ setMethod("plotVolcano", signature="FraserDataSet", plotVolcano.FRASER) plotAberrantPerSample.FRASER <- function(object, main, - type=c("psi3", "psi5", "theta"), - padjCutoff=0.1, zScoreCutoff=NA, deltaPsiCutoff=0.3, + type=fitMetrics(object), + padjCutoff=0.1, deltaPsiCutoff=0.1, aggregate=TRUE, BPPARAM=bpparam(), ...){ type <- match.arg(type, several.ok=TRUE) @@ -293,7 +425,7 @@ plotAberrantPerSample.FRASER <- function(object, main, # extract outliers outliers <- bplapply(type, aberrant, object=object, by="sample", - padjCutoff=padjCutoff, zScoreCutoff=zScoreCutoff, + padjCutoff=padjCutoff, deltaPsiCutoff=deltaPsiCutoff, aggregate=aggregate, ..., BPPARAM=BPPARAM) dt2p <- rbindlist(lapply(seq_along(outliers), function(idx){ @@ -307,7 +439,7 @@ plotAberrantPerSample.FRASER <- function(object, main, geom_line() + geom_hline(aes(yintercept=median, color=type, lty="Median")) + theme_bw() + - theme_cowplot() + + theme_cowplot() + background_grid(major="xy", minor="xy") + ggtitle(main) + xlab("Sample rank") + ylab("Number of outliers") + @@ -316,7 +448,8 @@ plotAberrantPerSample.FRASER <- function(object, main, scale_linetype_manual(name="", values=2, labels="Median") if(!all(dt2p[,value] == 0)){ - g <- g + scale_y_log10() + g <- g + scale_y_log10(limits=c(1, max(unlist(outliers)))) + + annotation_logticks(sides="l") } g @@ -341,17 +474,19 @@ setMethod("plotAberrantPerSample", signature="FraserDataSet", #' #' @rdname plotFunctions #' @export -plotExpression <- function(fds, type=c("psi5", "psi3", "theta"), - site=NULL, result=NULL, colGroup=NULL, +plotExpression <- function(fds, type=fitMetrics(fds), + idx=NULL, result=NULL, colGroup=NULL, basePlot=TRUE, main=NULL, label="aberrant", ...){ if(!is.null(result)){ type <- as.character(result$type) - site <- getIndexFromResultTable(fds, result) + idx <- getIndexFromResultTable(fds, result) } else { type <- match.arg(type) } - dt <- getPlottingDT(fds, axis="row", type=type, idx=site, ...) + dt <- getPlottingDT(fds, axis="row", type=type, idx=idx, ...) + dt[,featureID:=limitGeneNamesList(featureID, maxLength=3)] + if(!is.null(colGroup)){ if(all(colGroup %in% samples(fds))){ colGroup <- samples(fds) %in% colGroup @@ -422,6 +557,116 @@ plotExpression <- function(fds, type=c("psi5", "psi3", "theta"), plotBasePlot(g, basePlot) } +#' +#' Junction splice metric plot +#' +#' Plots the observed values of the splice metric across samples for a +#' junction of interest. +#' +#' @rdname plotFunctions +#' @export +plotSpliceMetricRank <- function(fds, type=fitMetrics(fds), + idx=NULL, result=NULL, colGroup=NULL, + basePlot=TRUE, main=NULL, label="aberrant", ...){ + if(!is.null(result)){ + type <- as.character(result$type) + idx <- getIndexFromResultTable(fds, result) + } else { + type <- match.arg(type) + } + + dt <- getPlottingDT(fds, axis="row", type=type, idx=idx, ...) 
+    dt[,featureID:=limitGeneNamesList(featureID, maxLength=3)]
+
+    # rank on observed value of splice metric of interest
+    dt[, rank := rank(obsPsi, ties.method="random", na.last=FALSE)]
+
+    if(!is.null(colGroup)){
+        if(all(colGroup %in% samples(fds))){
+            colGroup <- samples(fds) %in% colGroup
+        }
+        dt[colGroup,aberrant:=TRUE]
+    }
+    dt[,aberrant:=factor(aberrant, levels=c("TRUE", "FALSE"))]
+
+    gr <- granges(rowRanges(fds,type=type)[idx,])
+    genomic_pos_label <- paste0(seqnames(gr), ":", start(gr), "-", end(gr),
+                                ":", strand(gr))
+
+    if(is.null(main)){
+        if(isTRUE(basePlot)){
+            # main <- as.expression(bquote(bold(paste(
+            #     .(ggplotLabelPsi(type)[[1]]), " rank plot: ",
+            #     .(genomic_pos_label),
+            #     " (", bolditalic(.(as.character(dt[,unique(featureID)]))),
+            #     ")"))))
+            main <- as.expression(bquote(bold(paste(
+                .(genomic_pos_label),
+                " (", bolditalic(.(as.character(dt[,unique(featureID)]))),
+                ")"))))
+        } else{
+            # main <- paste0(ggplotLabelPsi(type, asCharacter=TRUE)[[1]],
+            #     " rank plot: ", dt[,unique(featureID)],
+            #     " (site ", dt[,unique(idx)], ")")
+            main <- paste0(genomic_pos_label,
+                " (", dt[,unique(featureID)], ")")
+        }
+    }
+
+    if(isTRUE(basePlot)){
+        ylab <- bquote("Observed " ~ .(ggplotLabelPsi(type)[[1]]))
+    } else{
+        ylab <- paste("Observed", ggplotLabelPsi(type, asCharacter=TRUE)[[1]])
+    }
+
+    g <- ggplot(dt, aes(x=rank, y=obsPsi, color=aberrant, label=sampleID,
+                text=paste0(
+                    "Sample: ", sampleID, "<br>",
+                    "Counts (K): ", k, "<br>",
+                    "Total counts (N): ", n, "<br>",
+                    "p value: ", signif(pval, 5), "<br>",
+                    "padjust: ", signif(padj, 5), "<br>",
+                    "Observed Psi: ", round(obsPsi, 2), "<br>",
+                    "Predicted mu: ", round(predPsi, 2), "<br>
"))) + + geom_point(alpha=ifelse(as.character(dt$aberrant) == "TRUE", 1, 0.7)) + + theme_bw() + + theme(legend.position="none", title=) + + xlab("Sample rank") + + ylab(ylab) + + ggtitle(main, subtitle=paste0("fds row index: ", dt[,unique(idx)])) + + ylim(0,1) + + + if(isTRUE(basePlot) && !is.null(label)){ + if(isScalarCharacter(label) && label == "aberrant"){ + if(nrow(dt[aberrant == TRUE,]) > 0){ + g <- g + geom_text_repel(data=dt[aberrant == TRUE,], + aes(col=aberrant), + fontface='bold', hjust=-.2, vjust=-.2) + } + } + else{ + if(nrow(dt[sampleID %in% label]) > 0){ + g <- g + geom_text_repel(data=subset(dt, sampleID %in% label), + aes(col=aberrant), + fontface='bold', hjust=-.2, vjust=-.2) + } + if(any(!(label %in% dt[,sampleID]))){ + warning("Did not find sample(s) ", + paste(label[!(label %in% dt[,sampleID])], + collapse=", "), " to label.") + } + } + } + + if(is.null(colGroup)){ + g <- g + scale_colour_manual( + values=c("FALSE"="gray70", "TRUE"="firebrick")) + } + + plotBasePlot(g, basePlot) +} + #' #' Expected over Overserved plot @@ -431,7 +676,7 @@ plotExpression <- function(fds, type=c("psi5", "psi3", "theta"), #' #' @rdname plotFunctions #' @export -plotExpectedVsObservedPsi <- function(fds, type=c("psi5", "psi3", "theta"), +plotExpectedVsObservedPsi <- function(fds, type=fitMetrics(fds), idx=NULL, result=NULL, colGroup=NULL, main=NULL, basePlot=TRUE, label="aberrant", ...){ type <- match.arg(type) @@ -441,6 +686,7 @@ plotExpectedVsObservedPsi <- function(fds, type=c("psi5", "psi3", "theta"), idx=idx, ...) type <- as.character(unique(dt$type)) idx <- unique(dt$idx) + dt[,featureID:=limitGeneNamesList(featureID, maxLength=3)] if(is.null(main)){ if(isTRUE(basePlot)){ @@ -484,6 +730,7 @@ plotExpectedVsObservedPsi <- function(fds, type=c("psi5", "psi3", "theta"), geom_point(alpha=ifelse(dt$aberrant, 1, 0.5), color=c("gray70", "firebrick")[dt$aberrant + 1]) + geom_abline(intercept = 0, slope=1) + + xlim(c(0,1)) + ylim(c(0,1)) + theme_bw() + theme(legend.position="none") + xlab(xlab) + @@ -537,7 +784,7 @@ plotQQ.FRASER <- function(object, type=NULL, idx=NULL, result=NULL, if(isTRUE(global)){ if(is.null(type)){ - type <- psiTypes + type <- fitMetrics(object) } dt <- rbindlist(bplapply(type, getPlottingDT, fds=object, axis="col", idx=TRUE, aggregate=aggregate, Ncpus=Ncpus, ...)) @@ -562,6 +809,7 @@ plotQQ.FRASER <- function(object, type=NULL, idx=NULL, result=NULL, } else { type <- as.character(dt[,unique(type)]) featureID <- as.character(dt[,unique(featureID)]) + featureID <- limitGeneNamesList(featureID, maxLength=3) if(isTRUE(basePlot)){ main <- as.expression(bquote(bold(paste( .(ggplotLabelPsi(type)[[1]]), @@ -692,7 +940,7 @@ plotQQ.FRASER <- function(object, type=NULL, idx=NULL, result=NULL, setMethod("plotQQ", signature="FraserDataSet", plotQQ.FRASER) -plotEncDimSearch.FRASER <- function(object, type=c("psi3", "psi5", "theta"), +plotEncDimSearch.FRASER <- function(object, type=psiTypes, plotType=c("auc", "loss")){ type <- match.arg(type) plotType <- match.arg(plotType) @@ -717,6 +965,8 @@ plotEncDimSearch.FRASER <- function(object, type=c("psi3", "psi5", "theta"), geom_smooth(method="loess", formula=y~x) + geom_vline(data=data[isOptimalQ == TRUE,], mapping=aes(xintercept=q, col=nsubset, linetype=noise)) + + geom_text(data=data[isOptimalQ == TRUE,], + aes(y=0.0, q+1, label=q)) + ggtitle(as.expression(bquote(bold(paste( "Q estimation for ", .(ggplotLabelPsi(type)[[1]])))))) + xlab("Estimated q") + @@ -860,8 +1110,8 @@ plotFilterVariability <- function(fds, bins=200, 
legend.position=c(0.8, 0.8), plotCountCorHeatmap.FRASER <- function(object, - type=c("psi5", "psi3", "theta"), logit=FALSE, topN=50000, - topJ=5000, minMedian=1, minCount=10, + type=psiTypes, logit=FALSE, + topN=50000, topJ=5000, minMedian=1, minCount=10, main=NULL, normalized=FALSE, show_rownames=FALSE, show_colnames=FALSE, minDeltaPsi=0.1, annotation_col=NA, annotation_row=NA, border_color=NA, nClust=5, @@ -903,7 +1153,7 @@ plotCountCorHeatmap.FRASER <- function(object, object <- object[,ids2plot] } - xmat <- (skmat + 1)/(snmat + 2) + xmat <- (skmat + 1*pseudocount())/(snmat + 2*pseudocount()) if(isTRUE(logit)){ xmat <- qlogisWithCap(xmat) } @@ -1056,6 +1306,355 @@ plotCountCorHeatmap.FRASER <- function(object, setMethod("plotCountCorHeatmap", signature="FraserDataSet", plotCountCorHeatmap.FRASER) +#' +#' Plot coverage from bam files for given genomic range and sample ids +#' +#' @rdname plotFunctions +#' @export +plotBamCoverage <- function(fds, gr, sampleID, + control_samples=sample( + samples(fds[, which(samples(fds) != sampleID)]), + min(3, ncol(fds)-length(sampleID))), + txdb=NULL, min_junction_count=20, + highlight_range=NULL, highlight_range_color="firebrick", + color_annotated="gray", color_novel="goldenrod3", + color_sample_interest="firebrick", color_control_samples="dodgerblue4", + toscale=c("exon", "gene", "none"), mar=c(2, 10, 0.1, 5), + curvature_splicegraph=1, curvature_coverage=1, cex=1, + splicegraph_labels=c("genomic_range", "id", "name", "none"), + splicegraph_position=c("top", "bottom"), ...){ + + if(missing(fds)){ + stop("Missing input: fds (FraserDataSet object)") + } else{ + stopifnot(is(fds, "FraserDataSet")) + } + if(missing(gr)){ + stop("Missing input gr (genomic range to plot).") + } else{ + stopifnot(is(gr, "GenomicRanges")) + stopifnot(length(gr) > 0) + } + if(missing(sampleID)){ + stop("Missing input: sample_of_interest") + } + toscale <- match.arg(toscale) + splicegraph_labels <- match.arg(splicegraph_labels) + splicegraph_position <- match.arg(splicegraph_position) + + # extract bam info for sample ids to plot + all_sids <- c(sampleID, control_samples) + si_out <- getSGSeqSI(fds, all_sids) + sgseq_si <- si_out[[1]] + fds <- si_out[[2]] + + # collapse input ranges if several + if(any(strand(gr) == "*")){ + # seems to throw an error with * strand so guessing strand instead + if(all(strand(gr) == "*")){ + guessStrand <- "+" + } else{ + guessStrand <- strand(gr[strand(gr) != "*"])[1] + } + strand(gr)[strand(gr) == "*"] <- guessStrand + warning("Input genomic ranges contained unstranded ranges.\n", + "This function needs strand information, guessing strand to ", + "be ", guessStrand, ".") + } + if(!all(strand(gr) == strand(gr[1,]))){ + warning("Input genomic ranges contained ranges on different strands,\n", + "only showing coverage for the ", strand(gr[1,]), " strand.") + strand(gr) <- rep(strand(gr[1,]), length(gr)) + } + gr <- range(gr) + gr <- keepSeqlevels(gr, unique(as.character(seqnames(gr)))) + + # convert highlight_range to GRangesList if not + if(!is.null(highlight_range) && !is(highlight_range, "GRangesList")){ + stopifnot(is(highlight_range, "GRanges")) + highlight_range <- GRangesList(highlight_range) + } + + # extract splice graph + sgfc_pred <- SGSeq::analyzeFeatures(sgseq_si, which = gr, + min_junction_count=min_junction_count, psi=0, + ...) 
+ + # overlap detected junctions with annotation + if(!is.null(txdb)){ + # subset to chr of interest + seqlevels(txdb) <- unique(as.character(seqnames(gr))) + + # extract transcript features with SGSeq package + txf <- SGSeq::convertToTxFeatures(txdb) + txf <- txf[txf %over% gr] + + # restore seqlevels of txdb object + seqlevels(txdb) <- seqlevels0(txdb) + + # annotate splice junctions with annotation features + sgfc_pred <- SGSeq::annotate(sgfc_pred, txf) + } else{ + # when no annotation is given, show everything in the same color + color_novel <- color_annotated + } + + # get genomic positions for first and last exon in given range + if(splicegraph_labels == "genomic_range"){ + # tell plotSpliceGraph function to use custom labels + splicegraph_labels <- "label" + # create custom labels (only for first and last exon for readability) + mcols(sgfc_pred)$label <- "" + exons <- which(SGSeq::type(sgfc_pred) == "E" & + rowRanges(sgfc_pred) %over% gr) + exons <- unique(c(exons[1], tail(exons, n=1))) + if(length(exons) == 1){ + mcols(sgfc_pred)$label[exons] <- + paste(seqnames(sgfc_pred), + paste(start(sgfc_pred), end(sgfc_pred), sep="-"), + strand(sgfc_pred), sep=":")[exons] + } + if(length(exons) == 2){ + mcols(sgfc_pred)$label[exons[1]] <- + paste(seqnames(sgfc_pred), + start(sgfc_pred), + strand(sgfc_pred), sep=":")[exons[1]] + mcols(sgfc_pred)$label[exons[2]] <- + paste(seqnames(sgfc_pred), + end(sgfc_pred), + strand(sgfc_pred), sep=":")[exons[2]] + } + } + + # plot splice graph and coverage of junctions from bam + nr_sa2p <- length(all_sids) + par(mfrow = c(nr_sa2p+1, 1), mar=mar, cex=cex) + if(splicegraph_position == "top"){ + SGSeq::plotSpliceGraph(rowRanges(sgfc_pred), + which=gr, + toscale=toscale, + color=color_annotated, + color_novel=color_novel, + ypos=c(0.25, 0.1), + ranges=highlight_range, + ranges_color=highlight_range_color, + ranges_ypos=c(0.01, 0.02), + curvature=curvature_splicegraph, + label=splicegraph_labels) + } + for (j in seq_along(sampleID)) { + SGSeq::plotCoverage( + sgfc_pred[, which(colnames(sgfc_pred) == sampleID[j])], + which = gr, + toscale = toscale, + label=sampleID[j], + color=color_sample_interest, + curvature=curvature_coverage) + } + for (j in seq_along(control_samples)) { + SGSeq::plotCoverage( + sgfc_pred[, which(colnames(sgfc_pred) == control_samples[j])], + which = gr, + toscale = toscale, + label=control_samples[j], + color=color_control_samples, + curvature=curvature_coverage) + } + if(splicegraph_position == "bottom"){ + SGSeq::plotSpliceGraph(rowRanges(sgfc_pred), + which=gr, + toscale=toscale, + color_novel=color_novel, + ypos=c(0.25, 0.1), + ranges=highlight_range, + ranges_color=highlight_range_color, + ranges_ypos=c(0.01, 0.02), + curvature=curvature_splicegraph, + label=splicegraph_labels) + } + + return(invisible(fds)) +} + +#' +#' Plot coverage from bam files for given row of results table +#' +#' @rdname plotFunctions +#' @export +plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE, + txdb=NULL, orgDb=NULL, res_gene_col="hgncSymbol", + res_geneid_type="SYMBOL", txdb_geneid_type="ENTREZID", + left_extension=1000, right_extension=1000, ...){ + stopifnot(is(fds, "FraserDataSet")) + + if(is(result, "GenomicRanges")){ + result <- as.data.table(result) + } + + stopifnot(is.data.table(result)) + stopifnot(result[,.N] == 1) + + sid <- result[,sampleID] + jidx <- getIndexFromResultTable(fds, result) + outlier_range <- rowRanges(fds, type=result[,type])[jidx,] + + # showing either full range of the gene in which the outlier 
occured + if(show_full_gene == TRUE){ + if(missing(txdb)){ + stop("Missing input: txdb (for extracting gene range)") + } + if(missing(orgDb)){ + stop("Missing input: orgDb (for mapping of IDs to txdb)") + } + result_gene <- result[,get(res_gene_col)] + result_gene <- strsplit(result_gene, ";", fixed=TRUE)[[1]] + if(is.data.table(orgDb)){ + tmp <- merge(x=as.data.table(genes(txdb))[,.(gene_id)], y=orgDb, + by.y=txdb_geneid_type, by.x="gene_id", all.x=TRUE, + sort=FALSE)[,.(gene_id, feature=get(res_geneid_type))] + setnames(tmp, "feature", res_geneid_type) + txdb_geneid <- tmp[get(res_geneid_type) %in% result_gene, gene_id] + } else { + tmp <- as.data.table( + select(orgDb, + keys=result_gene, + columns=txdb_geneid_type, + keytype=res_geneid_type) + ) + txdb_geneid <- tmp[, get(txdb_geneid_type)] + } + gr <- genes(txdb, filter=list("gene_id"=txdb_geneid)) + if(length(gr) == 0){ + stop("Could not extract genomic coordinates for input gene.") + } + } else{ + # or just showing a certain region around the outlier junction + gr <- outlier_range + start(gr) <- start(gr) - left_extension + end(gr) <- end(gr) + right_extension + } + + # if several genes overlap, only show those on same strand as outlier + if(as.character(strand(outlier_range)) != "*" & + length(gr[strand(gr) == strand(outlier_range),]) > 0){ + gr <- gr[strand(gr) == strand(outlier_range),] + } + + # create the coverage plot for the given outlier + fds <- plotBamCoverage(fds, + gr=gr, + sampleID=sid, + txdb=txdb, + highlight_range=outlier_range, + ...) + return(invisible(fds)) +} + +plotManhattan.FRASER <- function(object, sampleID, value="pvalue", + type=fitMetrics(object), chr=NULL, + main=paste0("sample: ", sampleID), + chrColor=c("black", "darkgrey"), + ...){ + # check necessary packages + if (!requireNamespace('ggbio')){ + stop("For this function, the ggbio package is required.") + } + if (!requireNamespace('biovizBase')){ + stop("For this function, the biovizBase package is required.") + } + + # check arguments + stopifnot(is(object, "FraserDataSet")) + stopifnot(sampleID %in% samples(object)) + type <- match.arg(type) + additional_args <- list(...) 
+ padjCutoff <- 0.1 + if("padjCutoff" %in% names(additional_args)){ + padjCutoff <- additional_args$padjCutoff + } + deltaPsiCutoff <- ifelse(type == "jaccard", 0.1, 0.3) + if("deltaPsiCutoff" %in% names(additional_args)){ + deltaPsiCutoff <- additional_args$deltaPsiCutoff + } + + # extract neccessary informations + gr_sample <- rowRanges(object, type=type) + seqlevelsStyle(gr_sample) <- seqlevelsStyle(object) + mcols(gr_sample)[,"pvalue"] <- -log10( + pVals(object, type=type, level="junction")[,sampleID]) + mcols(gr_sample)[,"padjust"] <- -log10( + padjVals(object, type=type, level="site")[,sampleID]) + mcols(gr_sample)[,"delta"] <- deltaPsiValue(object, type=type)[,sampleID] + + # Add values to granges + if(value %in% c('pvalue', 'pValue', 'pv')){ + gr_sample$value <- mcols(gr_sample)[, "pvalue"] + ylabel <- expression(paste(-log[10], "(P-value)")) + } + if(value %in% c('zscore', 'zScore')){ + gr_sample$value <- zScores(object, type=type)[, sampleID] + ylabel <- value + } + if(value %in% c('delta', 'deltaPsi', 'deltaJaccard')){ + gr_sample$value <- mcols(gr_sample)[, "delta"] + ylabel <- bquote(Delta ~ .(ggplotLabelPsi(type)[[1]])) + } + + # only one point per donor/acceptor site (relevant only for psi5 and psi3) + index <- getSiteIndex(object, type=type) + nonDup <- !duplicated(index) + gr_sample <- gr_sample[nonDup,] + + # Sort granges for plot + gr_sample <- sortSeqlevels(gr_sample) + gr_sample <- sort(gr_sample) + + # subset to chromosomes in chrSubset if requested + if(!is.null(chr)){ + # check input + if(!all(chr %in% unique(seqnames(gr_sample)))){ + stop("Not all chromosomes selected for subsetting are present ", + "in the GRanges object.") + } + + # subset + gr_sample <- gr_sample[as.character(seqnames(gr_sample)) %in% chr] + + # add chr to plot title if only one chr given + if(length(chr) == 1){ + main <- paste0(main, "; ", + paste(chr, collapse=", ", sep="")) + } + } + + # find outlier indices + if(!type %in% c("psi3", "psi5")){ + outlier_idx <- which(gr_sample$padjust >= -log10(padjCutoff) & + abs(gr_sample$delta) >= deltaPsiCutoff) + } else{ + outlier_idx <- which(gr_sample$padjust >= -log10(padjCutoff)) + } + message("highlighting ", length(gr_sample[outlier_idx,]), " outliers ...") + + # plot manhattan plot + plotGrandLinear.adapted(gr_sample, aes(y=value), + color=chrColor, + highlight.gr=gr_sample[outlier_idx,], + highlight.overlap="equal", + use.genome.coords=is.null(chr)) + + labs(title=main, x="", y=ylabel) + +} + +#' +#' Plot manhattan plot of junction pvalues +#' +#' @rdname plotFunctions +#' @export +setMethod("plotManhattan", signature="FraserDataSet", + plotManhattan.FRASER) + + #' #' helper function to get the annotation as data frame from the col data object #' @@ -1099,6 +1698,7 @@ ggplotLabelPsi <- function(type, asCharacter=FALSE){ if(isFALSE(asCharacter)){ vapply(type, FUN=function(x) switch (x, + jaccard = c(bquote(Intron~Jaccard~Index)), psi5 = c(bquote(psi[5])), psi3 = c(bquote(psi[3])), theta = c(bquote(theta))), @@ -1106,9 +1706,180 @@ ggplotLabelPsi <- function(type, asCharacter=FALSE){ } else{ vapply(type, FUN=function(x) switch (x, + jaccard = "Intron-Jaccard-Index", psi5 = "psi[5]", psi3 = "psi[3]", theta = "theta"), FUN.VALUE=character(1)) } } + +#' +#' Extract info from bam files needed for SGSeq functions to work +#' +#' @noRd +getSGSeqSI <- function(fds, sample_ids){ + + # check if bam info is already stored in fds for given samples + if("SGSeq_sampleinfo" %in% names(metadata(fds))){ + si <- metadata(fds)[["SGSeq_sampleinfo"]] + si <- 
si[si$sample_name %in% sample_ids,] + if(nrow(si) != length(sample_ids)){ + # add bam info for missing sample_ids + missing_ids <- sample_ids[!sample_ids %in% si$sample_name] + message("Extracting SGSeq sample info from BAM files for samples ", + paste(missing_ids, collapse=", "), " ...") + df_missing <- data.frame( + sample_name=samples(fds)[samples(fds) %in% missing_ids], + file_bam=bamFile(fds)[samples(fds) %in% missing_ids]) + si_new <- SGSeq::getBamInfo(df_missing, yieldSize=1e6) + si_new$lib_size <- 50e6 # dummy value to speed up this part + si <- rbind(si, si_new) + metadata(fds)[["SGSeq_sampleinfo"]] <- + rbind(metadata(fds)[["SGSeq_sampleinfo"]], si_new) + } + return(list(si, fds)) + } else{ + message("Extracting SGSeq sample info from BAM files for samples ", + paste(sample_ids, collapse=", "), " ...") + df <- data.frame( + sample_name=samples(fds)[samples(fds) %in% sample_ids], + file_bam=bamFile(fds)[samples(fds) %in% sample_ids]) + si <- SGSeq::getBamInfo(df, yieldSize=1e6) + si$lib_size <- 50e6 # dummy value to speed up this part + metadata(fds)[["SGSeq_sampleinfo"]] <- si + return(list(si, fds)) + } +} + +#' +#' Adapted function from ggbio package to create manhattan plot. +#' Adapted to allow highlighting only ranges that exactly match. Uses functions +#' from package biovizBase. +#' +#' @noRd +plotGrandLinear.adapted <- function (obj, ..., facets, space.skip = 0.01, + geom = NULL, cutoff = NULL, cutoff.color = "red", cutoff.size = 1, + legend = FALSE, xlim, ylim, xlab, ylab, main, highlight.gr = NULL, + highlight.name = NULL, highlight.col = "red", highlight.label = TRUE, + highlight.label.size = 5, highlight.label.offset = 0.05, + highlight.label.col = "black", + highlight.overlap = c("any", "start", "end", "within", "equal"), + spaceline = FALSE, use.genome.coords=TRUE){ + if (is.null(geom)) + geom <- "point" + args <- list(...) 
+ args.aes <- biovizBase::parseArgsForAes(args) + args.non <- biovizBase::parseArgsForNonAes(args) + two.color <- c("#0080FF", "#4CC4FF") + .is.seq <- FALSE + if (!"colour" %in% names(args.aes)) { + if (!any(c("color", "colour") %in% names(args.non))) { + .color <- two.color + args.aes$color <- as.name("seqnames") + .is.seq <- TRUE + } + else { + if (length(args.non$color) > 1) { + .color <- args.non$color + args.aes$color <- as.name("seqnames") + .is.seq <- TRUE + args.non <- args.non[!names(args.non) %in% c("colour", + "color")] + } + } + } + else { + if (quo_name(args.aes$colour) == "seqnames") + args.aes$colour <- as.name("seqnames") + } + if (!"y" %in% names(args.aes)) + stop("need to provide y") + if(isTRUE(use.genome.coords)){ + args.non$coord <- "genome" + } + args.non$space.skip <- space.skip + args.non$geom <- geom + args.non$object <- obj + aes.res <- do.call(aes, args.aes) + p <- do.call(ggbio::autoplot, c(list(aes.res), args.non)) + if (!legend) + p <- p + theme(legend.position = "none") + if (!missing(ylab)) + p <- p + ylab(ylab) + if (!is.null(cutoff)) + p <- p + geom_hline(yintercept = cutoff, color = cutoff.color, + size = cutoff.size) + chrs <- names(seqlengths(obj)) + if (.is.seq) { + N <- length(chrs) + cols <- rep(.color, round(N/length(.color)) + 1)[1:N] + names(cols) <- chrs + p <- p + scale_color_manual(values = cols) + } + if (!missing(facets)) { + args$facets <- facets + args.facets <- biovizBase::subsetArgsByFormals(args, facet_grid, + facet_wrap) + facet <- ggbio:::.buildFacetsFromArgs(obj, args.facets) + p <- p + facet + } + p <- p + theme(panel.grid.minor = element_blank()) + if (!is.null(highlight.gr)) { + highlight.overlap <- match.arg(highlight.overlap) + idx <- findOverlaps(obj, highlight.gr, type=highlight.overlap) + .h.pos <- lapply(split(queryHits(idx), subjectHits(idx)), + function(id) { + gr <- GRanges(as.character(seqnames(p@data))[id][1], + IRanges(start = min(start(p@data[id])), end = max(end(p@data[id])))) + val <- max(as.numeric(values(p@data[id])[, quo_name(args.aes$y)])) + val <- val * (1 + highlight.label.offset) + values(gr)$val <- val + gr + }) + .h.pos <- suppressWarnings(do.call("c", unname(.h.pos))) + if (length(.h.pos)) { + if (is.null(highlight.name)) { + highlight.name <- names(highlight.gr) + } + else { + highlight.name <- values(highlight.gr)[, highlight.name] + } + p <- p + geom_point(data = biovizBase::mold(p@data[queryHits(idx)]), + do.call(aes, list(x = substitute(midpoint), y = args.aes$y)), + color = highlight.col) + if (!is.null(highlight.name)) { + seqlevels(.h.pos, pruning.mode = "coarse") <- seqlevels(obj) + suppressWarnings(seqinfo(.h.pos) <- seqinfo(obj)) + .trans <- biovizBase::transformToGenome(.h.pos, space.skip = space.skip) + values(.trans)$mean <- (start(.trans) + end(.trans))/2 + values(.trans)$names <- highlight.name + p <- p + geom_text(data = biovizBase::mold(.trans), + size = highlight.label.size, + vjust = 0, color = highlight.label.col, do.call(aes, + list(x = substitute(mean), y = as.name("val"), + label = as.name("names")))) + } + } + } + if (spaceline) { + vline.df <- p@ggplot$data + vline.df <- do.call(rbind, by(vline.df, vline.df$seqnames, + function(dd) { + data.frame(start = min(dd$start), end = max(dd$end)) + })) + gap <- (vline.df$start[-1] + vline.df$end[-nrow(vline.df)])/2 + p <- p + geom_vline(xintercept = gap, alpha = 0.5, color = "gray70") + + theme(panel.grid = element_blank()) + } + if (!missing(main)) + p <- p + labs(title = main) + if (!missing(xlim)) + p <- p + xlim(xlim) + if 
(!missing(ylim)) + p <- p + ylim(ylim) + if (missing(xlab)) + xlab <- "" + p <- p + ggplot2::xlab(xlab) + p +} diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R index deac8b59..be256009 100644 --- a/R/pvalsNzscore.R +++ b/R/pvalsNzscore.R @@ -33,7 +33,7 @@ calculateZscore <- function(fds, type=currentType(fds), logit=TRUE){ #' @describeIn FRASER This function calculates two-sided p-values based on #' the beta-binomial distribution (or binomial or normal if desired). The -#' returned p values are already adjusted with Holm's method per donor or +#' returned p values are not yet adjusted with Holm's method per donor or #' acceptor site, respectively. #' #' @param distributions The distribution based on which the p-values are @@ -109,11 +109,6 @@ calculatePvalues <- function(fds, type=currentType(fds), pvals <- 2 * pmin(pval, 1 - pval + dval, 0.5) pVals(fds, dist="BetaBinomial", level="junction", withDimnames=FALSE) <- pvals - fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues, - pvals=pvals, index, BPPARAM=BPPARAM) - fwer_pvals <- do.call(cbind, fwer_pval) - pVals(fds, dist="BetaBinomial", level="site", - withDimnames=FALSE) <- fwer_pvals } if("binomial" %in% distributions){ @@ -125,11 +120,6 @@ calculatePvalues <- function(fds, type=currentType(fds), pvals <- 2 * pmin(pval, 1 - pval + dval, 0.5) pVals(fds, dist="Binomial", level="junction", withDimnames=FALSE) <- pvals - fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues, - pvals=pvals, index, BPPARAM=BPPARAM) - fwer_pvals <- do.call(cbind, fwer_pval) - pVals(fds, dist="Binomial", level="site", - withDimnames=FALSE) <- fwer_pvals } if("normal" %in% distributions){ @@ -142,22 +132,57 @@ calculatePvalues <- function(fds, type=currentType(fds), pvals <- 2 * pmin(pval, 1 - pval, 0.5) pVals(fds, dist="Normal", level="junction", withDimnames=FALSE) <- pvals - fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues, - pvals=pvals, index, BPPARAM=BPPARAM) - fwer_pvals <- do.call(cbind, fwer_pval) - pVals(fds, dist="Normal", level="site", - withDimnames=FALSE) <- fwer_pvals } fds } -adjust_FWER_PValues <- function(i, pvals=pvals, index=index){ - dt <- data.table(p=pvals[,i], idx=index) - dt2 <- dt[,.(pa=min(p.adjust(p, method="holm"), na.rm=TRUE)),by=idx] +adjust_FWER_PValues <- function(i, pvals, index, rho, rhoCutoff, + method="holm"){ + dt <- data.table(p=pvals[,i], idx=index, rho=rho) + dt[rho > rhoCutoff, p:=NA] + suppressWarnings(dt2 <- dt[,.(pa=min(p.adjust(p, method=method), + na.rm=TRUE)),by=idx]) + dt2[is.infinite(pa), pa:=NA] setkey(dt2, "idx")[J(index)][,pa] } +adjust_FWER_PValues_per_idx <- function(i, pvals, index, rho, rhoCutoff, + method="holm"){ + pvals[rho > rhoCutoff,] <- NA + dttmp <- data.table(idx=index, rho=rho, + apply(pvals, 2, as.numeric))[idx == i,] + suppressWarnings( + pa <- apply(as.matrix(dttmp[,-c("idx", "rho")]), 2, + function(x) min(p.adjust(x, method=method), + na.rm = TRUE) ) + ) + pa[is.infinite(pa)] <- NA + return(pa) +} + +getFWERpvals_bySample <- function(pvals, index, rho, method="holm", + rhoCutoff, BPPARAM=bpparam()){ + fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues, + pvals=pvals, index, BPPARAM=BPPARAM, + method=method, rho=rho, rhoCutoff=rhoCutoff) + fwer_pvals <- do.call(cbind, fwer_pval) + return(fwer_pvals) +} + +getFWERpvals_byIdx <- function(pvals, index, rho, method="holm", + rhoCutoff, BPPARAM=bpparam()){ + unique_idx <- unique(index) + fwer_pval <- bplapply(unique_idx, adjust_FWER_PValues_per_idx, + pvals=pvals, index, BPPARAM=BPPARAM, + method=method, rho=rho, 
rhoCutoff=rhoCutoff) + fwer_pvals <- do.call(rbind, fwer_pval) + fwer_pvals <- as.matrix( + setkey(data.table(idx=unique_idx, fwer_pvals), + "idx")[J(index)][,-c("idx")]) + return(fwer_pvals) +} + singlePvalueBetaBinomial <- function(idx, k, n, mu, rho){ ki <- k[idx,] @@ -171,7 +196,7 @@ singlePvalueBetaBinomial <- function(idx, k, n, mu, rho){ pvals <- pmin(1, pbbinom(ki, ni, alphai, betai)) if(any(is.na(pvals))){ - message(date(), " : ", idx) + message(date(), ": obtained NA pvalues for junction ", idx) } return (pvals) @@ -188,37 +213,159 @@ singlePvalueBinomial <- function(idx, k, n, mu){ } #' @describeIn FRASER This function adjusts the previously calculated -#' p-values per sample for multiple testing. +#' p-values per sample for multiple testing. First, the previoulsy calculated +#' junction-level p values are adjusted with Holm's method per donor or +#' acceptor site, respectively. Then, if gene symbols have been annotated to +#' junctions (and not otherwise requested), gene-level p values are computed. #' -#' @param method The p.adjust method that should be used. +#' @param method The p.adjust method that should be used for genome-wide +#' multiple testing correction. +#' @param rhoCutoff The cutoff value on the fitted rho value +#' (overdispersion parameter of the betabinomial) above which junctions are +#' masked with NA during p value adjustment (default: NA, no masking). +#' @param geneLevel Logical value indiciating whether gene-level p values +#' should be calculated. Defaults to TRUE. +#' @param geneColumn The column name of the column that has the gene annotation +#' that will be used for gene-level pvalue computation. +#' @param subsets A named list of named lists specifying any number of gene +#' subsets (can differ per sample). For each subset, FDR correction +#' will be limited to genes in the subset, and the FDR corrected +#' pvalues stored as an assay in the fds object in addition to the +#' transcriptome-wide FDR corrected pvalues. See the examples for +#' how to use this argument. 
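The examples referenced above live outside this hunk, so a short sketch of the expected subsets format may help; the subset names, gene symbols, and sample IDs below are illustrative placeholders. Each list entry is either a character vector of genes applied to every sample or itself a named list keyed by sampleID, and junctions must have been annotated with gene symbols beforehand (e.g. via annotateRanges()):

    # one subset shared by all samples plus one that differs per sample
    subsets <- list(
        relevantGenes = c("geneA", "geneB"),
        perSampleGenes = list(sample1 = c("geneA"),
                              sample2 = c("geneB", "geneC")))
    fds <- calculatePadjValues(fds, type="jaccard", subsets=subsets)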
#' #' @export -calculatePadjValues <- function(fds, type=currentType(fds), method="BY"){ +calculatePadjValues <- function(fds, type=currentType(fds), method="BY", + rhoCutoff=NA, geneLevel=TRUE, + geneColumn="hgnc_symbol", subsets=NULL, + BPPARAM=bpparam()){ currentType(fds) <- type index <- getSiteIndex(fds, type=type) idx <- !duplicated(index) for(i in c("BetaBinomial", "Binomial", "Normal")){ # only do it if it exists - if(!paste0("pvalues", i, "_", type) %in% assayNames(fds)){ + if(!paste0("pvalues", i, "_junction_", type) %in% assayNames(fds)){ next } - pvals <- pVals(fds, dist=i) - padj <- apply(pvals[idx,], 2, p.adjust, method=method) + pvals <- pVals(fds, dist=i, level="junction") + rho <- rho(fds, type=type) + + # splice site-level pval correction + message(date(), ": adjusting junction-level pvalues ...") + fwer_pvals <- getFWERpvals_bySample(pvals, index, rho, method="holm", + rhoCutoff=ifelse(is.na(rhoCutoff), 1, rhoCutoff), + BPPARAM=BPPARAM) + if(!is.na(rhoCutoff)){ + filters <- list(rho=rhoCutoff) + } else{ + filters <- list() + } + pVals(fds, dist=i, level="site", filters=filters, + withDimnames=FALSE) <- fwer_pvals + + # junction-level FDR correction + message(date(), ": genome-wide FDR for junction-level pvalues ...") + padj <- apply(fwer_pvals[idx,], 2, p.adjust, method=method) padjDT <- data.table(cbind(i=unique(index), padj), key="i")[J(index)] padjDT[,i:=NULL] - padjVals(fds, dist=i, withDimnames=FALSE) <- as.matrix(padjDT) + padjVals(fds, dist=i, level="site", filters=filters, + withDimnames=FALSE) <- as.matrix(padjDT) + + # gene-level pval correction and FDR + if(isTRUE(geneLevel) && + geneColumn %in% colnames(mcols(fds, type=type))){ + message(date(), ": calculating gene-level pvalues ...") + gene_pvals <- getPvalsPerGene(fds=fds, type=type, pvals=fwer_pvals, + method="holm", FDRmethod=method, + geneColumn=geneColumn, + BPPARAM=BPPARAM) + pVals(fds, dist=i, level="gene", filters=filters, + withDimnames=FALSE) <- gene_pvals[["pvals"]] + padjVals(fds, dist=i, level="gene", filters=filters, + withDimnames=FALSE) <- gene_pvals[["padj"]] + } else if(isTRUE(geneLevel)){ + warning("Gene-level pvalues could not be calculated as column ", + geneColumn, "\nwas not found for the given fds object. 
", + "Please annotate gene symbols \nfirst using the ", + "annotateRanges function.") + } + + # calculate FDR for each provided subset and assign to fds + if(!is.null(subsets)){ + stopifnot(is.list(subsets)) + stopifnot(!is.null(names(subsets))) + for(setName in names(subsets)){ + geneListSubset <- subsets[[setName]] + fds <- calculatePadjValuesOnSubset(fds=fds, + genesToTest=geneListSubset, + subsetName=setName, + type=type, method=method, + geneColumn=geneColumn, + BPPARAM=BPPARAM) + } + } } return(fds) } -getSiteIndex <- function(fds, type){ +getPvalsPerGene <- function(fds, type=currentType(fds), + pvals=pVals(fds, type=type, level="site"), + sampleID=NULL, method="holm", FDRmethod="BY", + geneColumn="hgnc_symbol", BPPARAM=bpparam()){ + # extract data and take only the first index of per site + message(date(), ": starting gene-level pval computation for type ", type) + samples <- samples(fds) + if(is.null(colnames(pvals))){ + colnames(pvals) <- samples + } + dt <- data.table( + idx=getSiteIndex(fds, type=type), + geneID=getGeneIDs(fds, type=type, unique=FALSE, + geneColumn=geneColumn), + as.data.table(pvals)) + dt <- dt[!is.na(geneID)] + geneIDs <- getGeneIDs(fds, type=type, unique=TRUE, + geneColumn=geneColumn) + + # separate geneIDs into rows + dt[, dt_idx:=seq_len(.N)] + dt_tmp <- dt[, splitGenes(geneID), by="dt_idx"] + dt <- dt[dt_tmp$dt_idx,] + dt[,`:=`(geneID=dt_tmp$V1, dt_idx=NULL)] + setkey(dt, geneID) + + # extract samples + if(!is.null(sampleID)){ + samples <- sampleID + } + + # aggregate pvalues to gene level per sample + message(date(), ": gene-level pval computation per gene (n=", + length(geneIDs), ")") + pvalsPerGene <- genePvalsByGeneID(dt, samples=samples, geneIDs=geneIDs, + method=method, BPPARAM=BPPARAM) + + # compute FDR + message(date(), ": genome-wide FDR for gene-level pvals for type ", type) + padjPerGene <- apply(pvalsPerGene, 2, p.adjust, method=FDRmethod) + + message(date(), ": finished gene-level pval computation for type ", type) + return(list(pvals=pvalsPerGene, padj=padjPerGene)) + +} + +getSiteIndex <- function(fds, type=currentType(fds)){ if(type == "theta"){ return(mcols(fds, type=type)[['spliceSiteID']]) } + if(type == "jaccard"){ + return(seq_len(nrow(fds))) + } + startId <- mcols(fds, type=type)[,"startID"] endId <- mcols(fds, type=type)[,"endID"] strand <- strand(rowRanges(fds, type=type)) @@ -233,48 +380,178 @@ getSiteIndex <- function(fds, type){ ans[selectionMat] } -getGeneIDs <- function(fds, type, unique=TRUE){ - geneIDs <- mcols(fds, type=type)$hgnc_symbol +getGeneIDs <- function(fds, type=currentType(fds), unique=TRUE, + geneColumn="hgnc_symbol"){ + if(!geneColumn %in% colnames(mcols(fds, type=type))){ + stop("Did not find column '", geneColumn, "' in mcols(fds, type='", + type, "'). Please assign introns\nto genes first using the ", + "annotateRanges(fds, ...) or annotateRangesWithTxDb(fds, ...) 
", + "function.") + } + + geneIDs <- mcols(fds, type=type)[[geneColumn]] if(isTRUE(unique)){ - geneIDs <- unique(geneIDs) + geneIDs <- unique(unlist(lapply(geneIDs, FUN=function(g){ + unlist(strsplit(g, ";"))}) )) geneIDs <- geneIDs[!is.na(geneIDs)] } geneIDs } -getPvalsPerGene <- function(fds, type, pvals=pVals(fds, type=type), - sampleID=NULL, method="holm", BPPARAM=bpparam()){ - # extract data and take only the first index of per site - dt <- data.table( - idx=getSiteIndex(fds, type=type), - geneID=getGeneIDs(fds, type=type, unique=FALSE), - as.data.table(pvals)) - dt <- dt[!duplicated(idx) & !is.na(geneID)] - setkey(dt, geneID) +genePvalsByGeneID <- function(dt, samples, geneIDs, method, BPPARAM){ + pvalsPerGene <- bplapply(geneIDs, BPPARAM=BPPARAM, + FUN=function(g) { + dttmp <- dt[geneID == g][!duplicated(idx)] + suppressWarnings( + pval_g <- apply(as.matrix(dttmp[,-c("idx", "geneID")]), 2, + function(x) min(p.adjust(x, method=method), na.rm = TRUE) ) + ) + pval_g[is.infinite(pval_g)] <- NA + pval_g + }) + pvalsPerGene <- do.call(rbind, pvalsPerGene) + rownames(pvalsPerGene) <- geneIDs + return(pvalsPerGene) +} + +#' @describeIn FRASER This function does FDR correction only for all junctions +#' in a certain subset of genes which can differ per sample. Requires gene +#' symbols to have been annotated to junctions. As with the full FDR +#' correction across all junctions, first the previously calculated +#' junction-level p values are adjusted with Holm's method per donor or +#' acceptor site, respectively. Then, gene-level p values are computed. +#' +#' @param genesToTest A named list with the subset of genes to test per sample. +#' The names must correspond to the sampleIDs in the given fds object. +#' @param subsetName The name under which the resulting FDR corrected pvalues +#' will be stored in metadata(fds). 
+#' +#' @export +calculatePadjValuesOnSubset <- function(fds, genesToTest, subsetName, + type=currentType(fds), method='BY', + geneColumn="hgnc_symbol", BPPARAM=bpparam()){ - samples <- samples(fds) - if(!is.null(sampleID)){ - samples <- sampleID + # check input + stopifnot(!is.null(genesToTest)) + stopifnot(is.list(genesToTest) || is.vector(genesToTest)) + + # replicate subset genes for each sample if given as vector + if(!is.list(genesToTest)){ + genesToTest <- rep(list(genesToTest), ncol(fds)) + names(genesToTest) <- colnames(fds) } - pvalsPerGene <- matrix(unlist(bplapply(samples, BPPARAM=BPPARAM, - function(i){ - dttmp <- dt[,min(p.adjust(get(i), method=method)),by=geneID] - setkey(dttmp, geneID) - dttmp[J(getGeneIDs(fds, type=type)), V1] - })), ncol=length(samples)) + # check that names are present and correspond to samples in ods + stopifnot(!is.null(names(genesToTest))) + if(!all(names(genesToTest) %in% colnames(fds))){ + stop("names(genesToTest) need to be sampleIDs in the given fds object.") + } - colnames(pvalsPerGene) <- samples - rownames(pvalsPerGene) <- getGeneIDs(fds, type=type) + # get genes from fds object + fds_genes <- getGeneIDs(fds, unique=TRUE, type=type, geneColumn=geneColumn) + ngenes <- length(fds_genes) - return(pvalsPerGene) - + # site index (for psi3/5) + site_idx <- getSiteIndex(fds, type=type) + + # compute FDR on the given subsets of genes + message(date(), ": starting FDR calculation on subset of genes...") + + # compute FDR on the given subsets of genes + fdrSubset <- bplapply(colnames(fds), FUN=function(sampleId){ + + # get genes to test for this sample + genesToTestSample <- genesToTest[[sampleId]] + padj <- rep(NA, nrow(mcols(fds, type=type))) + padj_gene <- rep(NA, ngenes) + + # if no genes present in the subset for this sample, return NAs + if(is.null(genesToTestSample)){ + return(list(padj=padj, padj_gene=padj_gene)) + } + + # get idx of junctions corresponding to genes to test + if(is.character(genesToTestSample)){ + rowIdx <- sort(which(fds_genes %in% genesToTestSample)) + rowIdx <- unlist(lapply(genesToTestSample, function(gene){ + idx <- which(grepl(paste0("(^|;)", gene, "(;|$)"), + mcols(fds, type=type)[, geneColumn])) + names(idx) <- rep(gene, length(idx)) + if(length(idx) == 0 && verbose(fds) > 0){ + warning("No introns found in fds object for gene: ", gene, + " and sample: ", sampleId, ". 
Skipping this gene.") + } + return(idx) + })) + rowIdx <- sort(rowIdx[!duplicated(rowIdx)]) + } else{ + stop("Genes in the list to test must be a character vector ", + "of geneIDs.") + } + + # check that rowIdx is not empty vector + if(length(rowIdx) == 0){ + warning("No genes from the given subset found in the fds ", + "object for sample: ", sampleId) + return(list(padj=padj, padj_gene=padj_gene)) + } + + + + # retrieve pvalues of introns to test + p <- as.matrix(pVals(fds, type=type)) + if(ncol(p) == 1){ + colnames(p) <- colnames(fds) + } + p <- p[rowIdx, sampleId] + + # FDR correction on subset + non_dup_site_idx <- !duplicated(site_idx[rowIdx]) + padjSub <- p.adjust(p[non_dup_site_idx], method=method) + + # set intron FDR on subset (filled with NA for all other genes) + padj[rowIdx] <- padjSub + + # gene level pvals + dt <- data.table(sampleID=sampleId, type=type, pval=p, + gene=names(rowIdx), jidx=rowIdx, site_idx=site_idx[rowIdx]) + dt <- merge(dt, + data.table(site_idx=site_idx[rowIdx][non_dup_site_idx], + FDR_subset=padjSub), + by="site_idx") + dt[!duplicated(dt$site_idx), + pval_gene:=min(p.adjust(pval, method="holm")), by="gene"] + dt[, pval_gene := .SD[!is.na(pval_gene), unique(pval_gene)], by="gene"] + + # gene level FDR + dt2 <- dt[, unique(pval_gene), by="gene"] + dt2[, FDR_subset_gene := p.adjust(V1, method=method)] + dt2[, gene_rowIdx := sapply(gene, function(g) which(fds_genes == g))] + + # set intron FDR on subset (filled with NA for all other genes) + padj_gene[dt2[,gene_rowIdx]] <- dt2[, FDR_subset_gene] + + # return new FDR + return(list(padj=padj, padj_gene=padj_gene)) + + }, BPPARAM=BPPARAM) + padjSub <- vapply(fdrSubset, '[[', + double(nrow(mcols(fds, type=type))), 'padj') + padjSub_gene <- vapply(fdrSubset, '[[', double(ngenes), 'padj_gene') + + colnames(padjSub) <- colnames(fds) + rownames(padjSub_gene) <- fds_genes + colnames(padjSub_gene) <- colnames(fds) + + # add FDR subset info to ods object and return + padjVals(fds, type=type, level="site", subsetName=subsetName, + withDimnames=FALSE) <- padjSub + padjVals(fds, type=type, level="gene", subsetName=subsetName, + withDimnames=FALSE) <- padjSub_gene + addToAvailableFDRsubsets(fds) <- subsetName + + message(date(), ": finished FDR calculation on subset of genes.") + validObject(fds) + return(fds) } -getPadjPerGene <- function(pvals, method="BY"){ - - padjPerGene <- apply(pvals, 2, p.adjust, method=method) - - return(padjPerGene) - -} diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R new file mode 100644 index 00000000..7ab9b61d --- /dev/null +++ b/R/resultAnnotations.R @@ -0,0 +1,974 @@ +#' +#' @title Additional result annotations +#' +#' @description These functions work on the result table and add additional +#' annotations to the reported introns: the type of potential impact on +#' splicing (e.g. exon skipping, exon truncation, ...), potential occurence +#' of frameshift, overlap with UTR regions as well as a flag for introns +#' that are located in blacklist regions of the genome. +#' +#' \code{\link{annotateIntronReferenceOverlap}} adds basic annotations to the +#' fds for each intron based on the overlap of the intron's location with +#' the reference annotation. Has to be run before the result table is +#' created so that the new column can be included in it (see examples). +#' +#' \code{\link{annotatePotentialImpact}} annotates each intron in the results +#' table with the type of potential impact on splicing and potential +#' occurence of frameshift (likely, unlikely, inconclusive). 
Can also +#' calculate overlap with annotated UTR regions. Potential impact can be: +#' annotatedIntron_increasedUsage, annotatedIntron_reducedUsage, +#' exonTruncation, exonElongation, exonTruncation&Elongation, +#' exonSkipping, splicingBeyondGene, +#' multigenicSplicing, downstreamOfNearestGene, upstreamOfNearestGene, +#' complex (everything else). +#' Splice sites (theta metric) annotations indicate how the splice site is +#' located with respect to the reference annotation. The annotated types +#' are: annotatedSpliceSite, exonicRegion, intronicRegion. +#' +#' \code{\link{flagBlacklistRegions}} flags introns in the results table on +#' whether or not they are located in a blacklist region of the genome. By +#' default, the blacklist regions as reported in +#' \cite{Amemiya, Kundaje & Boyle (2019)} and downloaded from +#' \href{https://www.encodeproject.org/annotations/ENCSR636HFF/}{here} +#' are used. +#' +#' @param fds A FraserDataSet +#' @param txdb A txdb object providing the reference annotation. +#' @param result A result table as generated by FRASER, including the column +#' \code{annotatedJunction} as generated by the function +#' \code{annotateIntronReferenceOverlap}. +#' @param addPotentialImpact Logical, indicating if the type of the potential +#' impact should be added to the results table. Defaults to \code{TRUE}. +#' @param addUTRoverlap Logical, indicating if the overlap with UTR regions +#' should checked and added to the results table. Defaults to \code{TRUE}. +#' @param minoverlap Integer value defining the number of base pairs around the +#' splice site that need to overlap with UTR or blacklist region, +#' respectivly, to be considered matching. Defaults to 5 bp. +#' @param blacklist_regions A BED file that contains the blacklist regions. +#' If \code{NULL} (default), the BED files that are packaged with FRASER +#' are used (see Details for more information). +#' @param assemblyVersion Indicates the genome assembly version of the intron +#' coordinates. Only used if blacklist_regions is NULL. For other versions, +#' please provide the BED file containing the blacklist regions directly. +#' @param BPPARAM For controlling parallelization behavior. Defaults to +#' \code{bpparam()}. +#' @return An annotated FraserDataSet or results table, respectively +#' +#' @name potentialImpactAnnotations +#' @rdname potentialImpactAnnotations +#' +#' @examples +#' # get data, fit and compute p-values and z-scores +#' fds <- createTestFraserDataSet() +#' +#' # load reference annotation +#' library(TxDb.Hsapiens.UCSC.hg19.knownGene) +#' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene +#' +#' # add basic annotations for overlap with the reference annotation +#' # run this function before creating the results table +#' fds <- annotateIntronReferenceOverlap(fds, txdb) +#' +#' # extract results: for this small example dataset, no cutoffs used +#' # to get some results +#' res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA) +#' +#' # annotate the type of potential impact on splicing and UTR overlap +#' res <- annotatePotentialImpact(result=res, txdb=txdb, fds=fds) +#' +#' # annotate overlap with blacklist regions +#' res <- flagBlacklistRegions(result=res, assemblyVersion="hg19") +#' +#' # show results table containing additional annotations +#' res +#' +NULL + +#' @describeIn potentialImpactAnnotations This method calculates basic annotations +#' based on overlap with the reference annotation (start, end, none, both) +#' for the full fds. 
The overlap type is added as a new column +#' \code{annotatedJunction} in \code{mcols(fds)}. +#' @export +annotateIntronReferenceOverlap <- function(fds, txdb, BPPARAM=bpparam()){ + message("loading introns ...") + #seqlevelsStyle(fds) <- seqlevelsStyle(txdb)[1] + introns <- unique(unlist(intronsByTranscript(txdb))) + # reduce the introns to only the actually expressed introns + fds_known <- fds[unique(to(findOverlaps(introns, + rowRanges(fds, type = "j"), type = "equal"))),] + anno_introns <- as.data.table(rowRanges(fds_known, + type="j"))[,.(seqnames, start, end, strand)] + + # calculate extra columns with mean/median intron expression count + # add the new columns + sampleCounts <- as.matrix(K(fds_known, type = "j")) + anno_introns[, meanCount := rowMeans(sampleCounts)] + anno_introns[, medianCount := rowMedians(sampleCounts)] + # order by medianCount (highest first) + setorderv(anno_introns, "medianCount", order=-1) + anno_introns_ranges <- makeGRangesFromDataFrame(anno_introns, + keep.extra.columns = TRUE) + + # get all fds junctions + fds_junctions <- rowRanges(fds, type = "j") + + # Do the annotation just for the intron with highest median expression + message("start calculating basic annotations ...") + overlaps <- findOverlaps(fds_junctions, anno_introns_ranges, select="first") + annotations <- bplapply(seq_len(length(fds_junctions)), + function(i, overlaps, fds_junctions, anno_introns_ranges){ + # only select first intron as already ordered by medianCount beforehand + overlap <- overlaps[i] + if(is.na(overlap)) return("none") #no overlap with any intron + + hit_equal <- from(findOverlaps(fds_junctions[i], + anno_introns_ranges[overlap], + type="equal")) + if(length(hit_equal) > 0) return("both") + + hit_start <- from(findOverlaps(fds_junctions[i], + anno_introns_ranges[overlap], + type="start")) + if(length(hit_start) > 0) return("start") + hit_end <- from(findOverlaps(fds_junctions[i], + anno_introns_ranges[overlap], + type="end")) + if(length(hit_end) > 0) return("end") + + # overlaps but no start/end match + return("none") + }, overlaps=overlaps, fds_junctions=fds_junctions, + anno_introns_ranges=anno_introns_ranges, BPPARAM=BPPARAM) + annotations <- unlist(annotations) + + rowRanges(fds)$annotatedJunction <- annotations + mcols(fds, type="ss")$annotatedJunction <- "not computed" + message("basic annotations done") + return(fds) +} + +#' @describeIn potentialImpactAnnotations This method annotates the splice event +#' type to junctions in the given results table. 
+#' @export +annotatePotentialImpact <- function(result, txdb, fds, addPotentialImpact=TRUE, + addUTRoverlap=TRUE, minoverlap=5, + BPPARAM=bpparam()){ + + # convert to data.table if not already + if(!is.data.table(result)){ + annoResult <- as.data.table(result) + } else{ + annoResult <- result + } + + # Create basic annotation of overlap with reference + if(!("annotatedJunction" %in% colnames(annoResult))){ + stop("Column 'annotatedJunction' not found in the results table!\n", + "Please run 'fds <- annotateIntronReferenceOverlap(fds, txdb)' ", + "first and then extract \nthe results table using the ", + "'results(fds, ...)' function before calling this function.") + } + + # Calculate splice types and frameshift + if(isTRUE(addPotentialImpact)){ + annoResult <- addPotentialImpactLabels(annoResult, fds, txdb) + annoResult[potentialImpact == "singleExonSkipping", + potentialImpact := "exonSkipping"] + } + + # Add UTR labels + if(isTRUE(addUTRoverlap)){ + annoResult <- addUTRLabels(annoResult, txdb) + } + + if(is(result, "GenomicRanges")){ + annoResult <- makeGRangesFromDataFrame(annoResult, + keep.extra.columns=TRUE) + } + + return(annoResult) +} + +#' @describeIn potentialImpactAnnotations This method flags all introns and +#' splice sites in the given results table for which at least one splice +#' site (donor or acceptor) is located in a blacklist region. Blacklist +#' regions of the genome are determined from the provided BED file. +#' @export +flagBlacklistRegions <- function(result, blacklist_regions=NULL, + assemblyVersion=c('hg19', 'hg38'), + minoverlap=5){ + + # convert to data.table if not already + if(!is.data.table(result)){ + annoResult <- as.data.table(result) + } else{ + annoResult <- result + } + + assemblyVersion <- match.arg(assemblyVersion) + if(is.null(blacklist_regions)){ + blacklist_regions <- + system.file("extdata", "blacklist_regions", + paste0(assemblyVersion, "-blacklist.v2.bed.gz"), + package = "FRASER") + } + if(!file.exists(blacklist_regions)){ + stop("BED file with blacklist regions does not exist: ", + blacklist_regions) + } + message("Importing blacklist regions ...") + blacklist_gr <- rtracklayer::import(blacklist_regions, format = "BED") + annoResult <- addBlacklistLabels(annoResult, blacklist_gr) + + if(is(result, "GenomicRanges")){ + annoResult <- makeGRangesFromDataFrame(annoResult, + keep.extra.columns=TRUE) + } + + return(annoResult) +} + +############# helper functions ############################## + +#' blacklist annotation for aberrant splicing events +#' @noRd +addBlacklistLabels <- function(junctions_dt, blacklist_gr, minoverlap=5){ + # add the blacklist information + colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" + junctions_gr <- makeGRangesFromDataFrame(junctions_dt) + + # get gr with start/end positions of each intron + gr_start_ss <- junctions_gr + end(gr_start_ss) <- start(gr_start_ss) + minoverlap - 1 + start(gr_start_ss) <- start(gr_start_ss) - minoverlap + gr_end_ss <- junctions_gr + start(gr_end_ss) <- end(gr_end_ss) - minoverlap + 1 + end(gr_end_ss) <- end(gr_end_ss) + minoverlap + + # set to the same seqlevelsstyle + seqlevelsStyle(blacklist_gr) <- seqlevelsStyle(junctions_gr) + + ## create overlap with blacklist and annotate extra column + message("finding blacklist overlap ...") + black_hits_start_ss <- unique(from(findOverlaps(gr_start_ss, blacklist_gr))) + black_hits_end_ss <- unique(from(findOverlaps(gr_end_ss, blacklist_gr))) + junctions_dt[, blacklist := FALSE] + + junctions_dt[black_hits_start_ss, 
blacklist := TRUE] + junctions_dt[black_hits_end_ss, blacklist := TRUE] + colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND" + + message("blacklist labels done") + return(junctions_dt) +} + +#' adds UTR overlap annotation to results table +#' @noRd +addUTRLabels <- function(junctions_dt, txdb, minoverlap=5){ + colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" + junctions_gr <- makeGRangesFromDataFrame(junctions_dt) + seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr) + + # get gr with start/end positions of each intron + gr_start_ss <- junctions_gr + end(gr_start_ss) <- start(gr_start_ss) + minoverlap - 1 + start(gr_start_ss) <- start(gr_start_ss) - minoverlap + gr_end_ss <- junctions_gr + start(gr_end_ss) <- end(gr_end_ss) - minoverlap + 1 + end(gr_end_ss) <- end(gr_end_ss) + minoverlap + + ### UTR labels based on txdb file + ### add 5' 3' UTR labels + message("finding UTR overlap ...") + threes_start <- unique(from(findOverlaps(gr_start_ss, + threeUTRsByTranscript(txdb, use.names = TRUE)))) + threes_end <- unique(from(findOverlaps(gr_end_ss, + threeUTRsByTranscript(txdb, use.names = TRUE)))) + fives_start <- unique(from(findOverlaps(gr_start_ss, + fiveUTRsByTranscript(txdb, use.names = TRUE)))) + fives_end <- unique(from(findOverlaps(gr_end_ss, + fiveUTRsByTranscript(txdb, use.names = TRUE)))) + junctions_dt[, UTR_overlap := "no"] + junctions_dt[threes_start, UTR_overlap := "3'-UTR"] + junctions_dt[threes_end, UTR_overlap := "3'-UTR"] + junctions_dt[fives_start, UTR_overlap := "5'-UTR"] + junctions_dt[fives_end, UTR_overlap := "5'-UTR"] + colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND" + message("UTR labels done") + return(junctions_dt) +} + + + +#' adds type of splicing to each intron in the results table +#' @noRd +addPotentialImpactLabels <- function(junctions_dt, fds, txdb){ + message("preparing ...") + psi_positions <- which(junctions_dt$type != "theta") + colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" + junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions], + keep.extra.columns = TRUE) + seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr) + + introns_tmp <- unique(unlist(intronsByTranscript(txdb))) + exons <- exons(txdb) + + # seqlevelsStyle(fds) <- seqlevelsStyle(txdb)[1] + fds_known <- fds[unique(to(findOverlaps(introns_tmp, + rowRanges(fds, type = "j"), + type = "equal"))),] + grIntrons <- rowRanges(fds_known, type="j") + introns <- as.data.table(grIntrons) + introns <- introns[,.(seqnames, start, end, strand)] + + sampleCounts <- K(fds_known, type = "j") + introns[, "meanCount" := rowMeans(sampleCounts)] + introns[, "medianCount" := rowMedians(as.matrix(sampleCounts))] + intron_ranges <- makeGRangesFromDataFrame(introns, + keep.extra.columns = TRUE) + + # prepare the results column + junctions_dt[, potentialImpact := "complex"] + junctions_dt[, causesFrameshift := "inconclusive"] + junctions_dt[annotatedJunction == "both" & deltaPsi >= 0, + potentialImpact := "annotatedIntron_increasedUsage"] + junctions_dt[annotatedJunction == "both" & deltaPsi < 0, + potentialImpact := "annotatedIntron_reducedUsage"] + junctions_dt[annotatedJunction == "both", causesFrameshift := "unlikely"] + + if(all(c("nonsplitProportion", "nonsplitProportion_99quantile") %in% + colnames(junctions_dt))){ + junctions_dt[potentialImpact == "annotatedIntron_reducedUsage" & + type == "jaccard" & + nonsplitProportion >= nonsplitProportion_99quantile + 0.05 & + nonsplitCounts >= 10, + 
potentialImpact := "(partial)intronRetention"] + + # TODO check frameshift for intron retention + junctions_dt[potentialImpact == "(partial)intronRetention", + causesFrameshift := "inconclusive"] + } + + starts <- which(junctions_dt[psi_positions]$annotatedJunction=="start") + ends <- which(junctions_dt[psi_positions]$annotatedJunction=="end") + nones <- which(junctions_dt[psi_positions]$annotatedJunction=="none") + + message("calculating splice types ...") + # start junctions + start_results <- sapply(starts, function(i){ + # find the most freq intron that overlaps again + overlap <- to(findOverlaps(junctions_gr[i], intron_ranges, + type = "start")) + expre <- sapply(overlap, function(j){ + elementMetadata(intron_ranges[j])$medianCount + }) + maxExpr <- which.max(expre) + return(compareEnds(junctions_gr, i, overlap[maxExpr], FALSE, + intron_ranges, exons)) + }) + junctions_dt[psi_positions[starts], + causesFrameshift:=start_results[2,]] + junctions_dt[psi_positions[starts], + potentialImpact := start_results[1,]] + + # end junctions + end_results <- sapply(ends, function(i){ + # find the most freq intron that overlaps again + overlap <- to(findOverlaps(junctions_gr[i], intron_ranges, + type = "end")) + expre <- sapply(overlap, function(j){ + elementMetadata(intron_ranges[j])$medianCount + }) + maxExpr <- which.max(expre) + return(compareStarts(junctions_gr, i, overlap[maxExpr], FALSE, + intron_ranges, exons)) + + }) + junctions_dt[psi_positions[ends], causesFrameshift:=end_results[2,]] + junctions_dt[psi_positions[ends], potentialImpact := end_results[1,]] + + # none junctions pt1 + none_results <- sapply(nones, function(i){ + # find most freq intron + # check start and end + + # find the most freq intron that overlaps again + overlap <- to(findOverlaps(junctions_gr[i], intron_ranges)) + if(length(overlap) == 0) return(c("noOverlap", "inconclusive")) + expre <- sapply(overlap, function(j){ + elementMetadata(intron_ranges[j])$medianCount + }) + maxExpr <- which.max(expre) + + # returns type of exon splicing, frameshift TRUE/FALSE, amount of shift + st = compareStarts(junctions_gr, i, overlap[maxExpr], TRUE, + intron_ranges, exons) + en = compareEnds(junctions_gr, i, overlap[maxExpr], TRUE, + intron_ranges, exons) + + # merge, start and end results + # merge exon elongation/truncation + # if both likely/unlikely fine + # if one is likely -> return likely + # if one is notYet -> return notYet + if((st[1] == "singleExonSkipping" & !(en[1] %in% + c("singleExonSkipping", "exonSkipping"))) || + (en[1] == "singleExonSkipping" & !(st[1] %in% + c("singleExonSkipping", "exonSkipping")))){ + ## only one is single exonSkipping, the other is trunc/elong + if((as.integer(st[3])+as.integer(en[3])) %% 3 != 0){ + frs = "likely" + }else{ frs = "unlikely"} + return(c("singleExonSkipping", frs)) + } + + if(st[1] %in% c("exonSkipping", "singleExonSkipping") || en[1] %in% + c("exonSkipping", "singleExonSkipping")){ + return(c("exonSkipping", "inconclusive")) + } + + if((as.integer(st[3])+as.integer(en[3]))%%3 != 0){ + frs = "likely" + }else{ frs = "unlikely"} + if( st[1] != en[1]){ + combined = "exonTruncation&Elongation" + }else{combined = st[1]} + return(c(combined,frs)) + + }) + junctions_dt[psi_positions[nones], causesFrameshift:=none_results[2,]] + junctions_dt[psi_positions[nones], potentialImpact := none_results[1,]] + + noLaps <-which(junctions_dt[psi_positions]$potentialImpact=="noOverlap") + refseq.genes<- genes(txdb) + + # none junctions pt2 + noLaps_results <- sapply(noLaps, function(i){ + overlap 
<- to(findOverlaps(junctions_gr[i], exons)) + # no overlap with an intron or an exon + if(length(overlap) == 0){ + return(checkIntergenic(junctions_gr, i, refseq.genes)) + } + + # for the exons, check if splice site is contained in the exon + for(j in overlap){ + exon_start = start(exons[j]) + exon_end = end(exons[j]) + if(exon_start <= start(junctions_gr[i]) & + exon_end >= end(junctions_gr[i])){ + if((end(junctions_gr[i]) - + start(junctions_gr[i]) + 1) %% 3 != 0){ + frs = "likely" + }else{ frs = "unlikely"} + return(c("exonTruncation", frs)) + } + } + + return(c("complex","inconclusive")) + }) + junctions_dt[psi_positions[noLaps], + causesFrameshift:=noLaps_results[2,]] + junctions_dt[psi_positions[noLaps], + potentialImpact := noLaps_results[1,]] + + # theta annotations + thetas <- which(junctions_dt$type == "theta") + junctions_gr <- makeGRangesFromDataFrame(junctions_dt[thetas,], + keep.extra.columns = TRUE) + + # specify default type for theta results as NA + junctions_dt[thetas, potentialImpact := NA] + junctions_dt[thetas, causesFrameshift := NA] + + # label all as intronic first if they have any intron overlap + intronic <- unique(from(findOverlaps(junctions_gr, introns_tmp))) + junctions_dt[thetas[intronic], potentialImpact := "intronicRegion"] + + # for exonic, check if theta is fully contained in an exon + # if one end is in an intron and the other in an exon it is a splice site + exonic <- unique(from(findOverlaps(junctions_gr, exons))) + within <- findOverlaps(junctions_gr, exons, type = "within") + all <- findOverlaps(junctions_gr, exons) + exonic_results <- sapply(exonic, function(i){ + w <- unique(to(within)[which(from(within) == i)]) + a <- unique(to(all)[which(from(all) == i)]) + if(length(a) == length(w)) return("exonicRegion") + return("annotatedSpliceSite") + }) + junctions_dt[thetas[exonic], potentialImpact := exonic_results] + + # check cases that don't overlap with an exon/intron + nones <- which(is.na(junctions_dt[thetas,]$potentialImpact)) + none_results <- sapply(nones, function(i){ + if(length(findOverlaps(junctions_gr[i], refseq.genes)) > 0) return(NA) + #return("intergenic") + if(start(refseq.genes[nearest(junctions_gr[i], + refseq.genes)]) > start(junctions_gr[i])){ + ifelse(strand(junctions_gr[i]) == "+", + return("upstreamOfNearestGene"), + return("downstreamOfNearestGene")) + }else{ + ifelse(strand(junctions_gr[i]) == "+", + return("downstreamOfNearestGene"), + return("upstreamOfNearestGene")) + } + }) + junctions_dt[thetas[nones], potentialImpact := none_results] + + # add distance to closest neighbour gene for intergenic results + # (both psi and theta) + message("adding distances to nearest gene ...") + up <- which(junctions_dt$potentialImpact == "upstreamOfNearestGene") + down <- which(junctions_dt$potentialImpact == "downstreamOfNearestGene") + + # create full grange object containing psi and theta + junctions_gr <- makeGRangesFromDataFrame(junctions_dt, + keep.extra.columns = TRUE) + + # Calculate distances + if(length(up) > 0){ + distanceNearestGene_up <- sapply(up, function(i){ + min(distance(junctions_gr[i], refseq.genes), na.rm = TRUE)}) + if(length(distanceNearestGene_up > 0)){ + junctions_dt[psi_positions[up], + distNearestGene := distanceNearestGene_up] + } else{ + junctions_dt[psi_positions[up], distNearestGene := NA] + message("No distances found for upstream") + } + }else{message("No upstream targets")} + + if(length(down) > 0){ + distanceNearestGene_down <- sapply(down, function(i){ + min(distance(junctions_gr[i], refseq.genes), na.rm = 
TRUE)}) + if(length(distanceNearestGene_down > 0)){ + junctions_dt[psi_positions[down], + distNearestGene := distanceNearestGene_down] + }else{ + junctions_dt[psi_positions[down], distNearestGene := NA] + message("No distances found for downstream") + } + }else{message("No downstream targets")} + + colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND" + message("done calculating splice types") + + # Add the subtypes for exonSkipping and inconclusive + junctions_dt <- checkExonSkipping(junctions_dt, txdb) + junctions_dt <- checkInconclusive(junctions_dt, txdb) + + return(junctions_dt) +} + +#' +#' @noRd +compareStarts <- function(junctions_gr, i, max_lap, shift_needed, + intron_ranges, exons){ + intron_start = start(intron_ranges[max_lap]) + ss_start = start(junctions_gr[i]) + + # found the most freq intron with same end again + # check if intron starts before splice site -> exon elongation -> FRS + if(intron_start < ss_start){ + if(((ss_start - intron_start) %% 3) != 0){ + frs = "likely" + }else{ frs = "unlikely"} + + ifelse(shift_needed, + return(c("exonElongation", frs, + (ss_start - intron_start))), + return(c("exonElongation", frs))) + } + + # check if splice site ends in following exon -> exon truncation -> FRS + if(intron_start > ss_start){ + + # create dummy exon find all exons starting from that intron end + dummy_exon <- GRanges( + seqnames = toString(seqnames(intron_ranges[max_lap])), + ranges = IRanges(intron_start-2, end = intron_start -1), + strand = toString(strand(intron_ranges[max_lap])) + ) + exonChoices <- to(findOverlaps(dummy_exon, exons, type = "end")) + for(j in exonChoices){ + exon_start = start(exons[j]) + if(exon_start < ss_start){ + if((end(exons[j]) - ss_start + 1)%%3 != 0){ + frs = "likely" + }else{frs = "unlikely"} + ifelse(shift_needed, + return(c("exonTruncation", frs, + (-1)*(end(exons[j]) - ss_start + 1))), + return(c("exonTruncation", frs))) + } + } + + # check for single exon skipping + if(length(exonChoices) == 1){ + + # check if there is no other exon within the first intron: + # splice site end until exon end + dummyFirstItr <- GRanges( + seqnames = toString(seqnames(intron_ranges[max_lap])), + ranges = IRanges(end(exons[exonChoices[1]]) + 1, + end(junctions_gr[i])), + strand = toString(strand(intron_ranges[max_lap])) + ) + + if(length(findOverlaps(exons, dummyFirstItr, + type = "within")) > 0){ + # another exon is contained within the most freq used intron + ifelse(shift_needed, + return(c("exonSkipping", "inconclusive", 0)), + return(c("exonSkipping", "inconclusive"))) + } + + + secondItr <- GRanges( + seqnames = toString(intron_ranges[max_lap]@seqnames@values), + strand = toString(intron_ranges[max_lap]@strand@values), + ranges = IRanges(ss_start, start(exons[exonChoices[1]]) - 1) + # end of exon + 1, end of aberrant junction + ) + secItrChoices <- to(findOverlaps(secondItr, intron_ranges, + type = "end")) + # only look at most used one + expre <- sapply(secItrChoices, function(j){ + elementMetadata(intron_ranges[j])$medianCount + }) + maxExpr <- which.max(expre) + + if(length(secItrChoices) == 0){ + ifelse(shift_needed, + return(c("exonSkipping", "inconclusive", 0)), + return(c("exonSkipping", "inconclusive"))) + } + + if(ss_start >= start(intron_ranges[secItrChoices[maxExpr]])){ + # check if there is no other exon in that range + if(length(findOverlaps(exons, + intron_ranges[secItrChoices[maxExpr]], + type = "within")) == 0){ + # clear exon skipping, only exon is skipped + # calculate frameshift, skipped exon plus possible 
exon + # elongation + + shift = (-1)*(end(exons[exonChoices[1]]) - + start(exons[exonChoices[1]]) + 1) + + ss_start - start(intron_ranges[secItrChoices[maxExpr]]) + + frs = ifelse(shift %% 3 == 0,"unlikely","likely") + ifelse(shift_needed, + return(c("singleExonSkipping", "inconclusive", + shift)), + return(c("singleExonSkipping", frs))) + } + } + } # single exon skipping end + + } + + # splice site longer than one intron + exon -> not defined for now + ifelse(shift_needed, + return(c("exonSkipping", "inconclusive", 0)), + return(c("exonSkipping", "inconclusive"))) +} + +#' +#' @noRd +compareEnds <- function(junctions_gr, i, max_lap, shift_needed, + intron_ranges, exons){ + intron_end = end(intron_ranges[max_lap]) + ss_end = end(junctions_gr[i]) + + # found the most freq intron with same start again + # check if intron ends after splice site -> exon elongation -> FRS -> done + if(intron_end > ss_end){ + if(((intron_end - ss_end) %% 3) != 0){ + frs = "likely" + }else{ frs = "unlikely"} + + ifelse(shift_needed, + return(c("exonElongation", frs, (intron_end - ss_end))), + return(c("exonElongation", frs))) + } + + # check if splice site ends in following exon -> exon truncation -> FRS + if(intron_end < ss_end){ + + # create dummy exon find all exons starting from that intron end + dummy_exon <- GRanges( + seqnames = toString(intron_ranges[max_lap]@seqnames@values), + ranges = IRanges(intron_end + 1, end = intron_end + 2), + strand = toString(intron_ranges[max_lap]@strand@values) + ) + exonChoices <- to(findOverlaps(dummy_exon, exons, type = "start")) + for(j in exonChoices){ + exon_end = end(exons[j]) + if(exon_end > ss_end){ + if((ss_end - start(exons[j]) + 1)%%3 != 0){ + frs = "likely" + }else{frs = "unlikely"} + ifelse(shift_needed, + return(c("exonTruncation",frs, + (-1)*(ss_end - start(exons[j]) + 1))), + return(c("exonTruncation",frs))) + } + } + + # check for single exon skipping + if(length(exonChoices) == 1){ + + # check if there is no other exon within the first intron: + # splice site end until exon end + dummyFirstItr <- GRanges( + seqnames = toString(seqnames(intron_ranges[max_lap])), + ranges = IRanges(start(junctions_gr[i]), + start(exons[exonChoices[1]]) - 1), + strand = toString(strand(intron_ranges[max_lap])) + ) + + if(length(findOverlaps(exons, dummyFirstItr, + type = "within")) > 0){ + # another exon is contained within the most freq used intron + ifelse(shift_needed, + return(c("exonSkipping", "inconclusive", 0)), + return(c("exonSkipping", "inconclusive"))) + } + + + secondItr <- GRanges( + seqnames = toString(intron_ranges[max_lap]@seqnames@values), + strand = toString(intron_ranges[max_lap]@strand@values), + ranges = IRanges(end(exons[exonChoices[1]]) + 1, ss_end) + # end of exon + 1, end of aberrant junction + ) + secItrChoices <- to(findOverlaps(secondItr, intron_ranges, + type = "start")) + # only look at most used one + expre <- sapply(secItrChoices, function(j){ + elementMetadata(intron_ranges[j])$medianCount + }) + maxExpr <- which.max(expre) + + if(length(secItrChoices) == 0){ + ifelse(shift_needed, + return(c("exonSkipping", "inconclusive", 0)), + return(c("exonSkipping", "inconclusive"))) + } + + if(ss_end <= end(intron_ranges[secItrChoices[maxExpr]])){ + # check if there is no other exon in that range + if(length(findOverlaps(exons, + intron_ranges[secItrChoices[maxExpr]], + type = "within")) == 0){ + # clear exon skipping, only exon is skipped + # calculate frameshift, skipped exon plus possible exon + # elongation at end + shift = 
(-1)*(end(exons[exonChoices[1]]) - + start(exons[exonChoices[1]]) + 1) + + end(intron_ranges[secItrChoices[maxExpr]]) - ss_end + frs = ifelse(shift%%3 == 0,"unlikely","likely") + ifelse(shift_needed, + return(c("singleExonSkipping", "inconclusive", + shift)), + return(c("singleExonSkipping", frs))) + } + } + } # single exon skipping end + + + } + + # splice site longer than one intron + exon -> not defined for now + ifelse(shift_needed, + return(c("exonSkipping", "inconclusive", 0)), + return(c("exonSkipping", "inconclusive"))) +} + +#' +#' @noRd +checkIntergenic <- function(junctions_gr, i, refseq.genes){ + # check if start > 1000 + # start - 1000, end + 1000 + start = start(junctions_gr[i]) + # ifelse(start > 1000, start = start - 1000, start = 1) + # if(start > 1000){ + # start = start - 1000 + # }else{start = 1} + + end = end(junctions_gr[i]) #+ 1000 + if(start + 2 < end){ + start = start + 1 + end = end - 1 + } + + test_junction <- GRanges( + seqnames = seqnames(junctions_gr[i]), + ranges = IRanges(start, end), + strand = strand(junctions_gr[i]) + ) + + # overlap with introns and exon + # IGNORE STRANDS? -> decided its not necessary + + # check if distance to nearest is > 1000 -> intergenic + # otherwise up/downstream + dist = min(distance(test_junction, refseq.genes), na.rm = TRUE) + if(dist > 0){ + # find nearest and compare starts + if(start(refseq.genes[nearest(junctions_gr[i], + refseq.genes)]) > start){ + ifelse(strand(junctions_gr[i]) == "+", + return(c("upstreamOfNearestGene", "unlikely")), + return(c("downstreamOfNearestGene", "unlikely"))) + }else{ + ifelse(strand(junctions_gr[i]) == "+", + return(c("downstreamOfNearestGene", "unlikely")), + return(c("upstreamOfNearestGene", "unlikely"))) + } + } + return(c("complex", "inconclusive")) +} + +#' +#' @noRd +checkExonSkipping <- function(junctions_dt, txdb){ + psi_positions <- which(junctions_dt$type != "theta") + colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" + junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions], + keep.extra.columns = TRUE) + seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr) + + refseq.genes<- genes(txdb) + + exonSkip <- which(junctions_dt[psi_positions]$potentialImpact %in% + c("exonSkipping", "singleExonSkipping")) + + message("start checking exonSkipping") + newSkip_results <- sapply(exonSkip, function(i){ + start = start(junctions_gr[i]) + end = end(junctions_gr[i]) + + # reduce the junction width so adjacent genes have a distance of 1 + if(start + 2 < end){ + start = start + 1 + end = end - 1 + } + + test_start <- GRanges( + seqnames = seqnames(junctions_gr[i]), + strand = strand(junctions_gr[i]), + ranges = IRanges(start, start + 1) + ) + + test_end <- GRanges( + seqnames = seqnames(junctions_gr[i]), + strand = strand(junctions_gr[i]), + ranges = IRanges(end - 1, end) + ) + + # check for which genes distance to start is 0 + start_genes <- which(distance(test_start, refseq.genes) == 0) + # start is not in a gene + if(length(start_genes) == 0) return("splicingBeyondGene") + + # start is in a gene -> is end in same gene + for(to in start_genes){ + # end is in same gene + if(distance(test_end, refseq.genes[to]) == 0){ + return("exonSkipping") + } + } + + end_genes <- which(distance(test_end, refseq.genes) == 0) + # end is not in a gene + if(length(end_genes) == 0) return("splicingBeyondGene") + # end is in a different gene + return("multigenicSplicing") + }) + + # checking exonSkipping done + if(length(exonSkip) > 0){ + junctions_dt[psi_positions[exonSkip], + 
potentialImpact2 := newSkip_results] + junctions_dt[potentialImpact2 == "splicingBeyondGene", + potentialImpact := "splicingBeyondGene"] + junctions_dt[potentialImpact2 == "splicingBeyondGene", + causesFrameshift := "inconclusive"] + junctions_dt[potentialImpact2 == "multigenicSplicing", + potentialImpact := "multigenicSplicing"] + junctions_dt[potentialImpact2 == "multigenicSplicing", + causesFrameshift := "inconclusive"] + junctions_dt[, potentialImpact2 := NULL] + } + + colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" + return(junctions_dt) +} + +#' +#' @noRd +checkInconclusive <- function(junctions_dt, txdb){ + psi_positions <- which(junctions_dt$type != "theta") + colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" + junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions], + keep.extra.columns = TRUE) + seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr) + + refseq.genes<- genes(txdb) + + inconclusive <- which(junctions_dt[psi_positions + ]$potentialImpact == "complex") + + inconclusive_results <- sapply(inconclusive, function(i){ + start = start(junctions_gr[i]) + end = end(junctions_gr[i]) + + # reduce the junction width so adjacent genes have a distance of 1 + if(start + 2 < end){ + start = start + 1 + end = end - 1 + } + + test_start <- GRanges( + seqnames = seqnames(junctions_gr[i]), + strand = strand(junctions_gr[i]), + ranges = IRanges(start, start + 1) + ) + + test_end <- GRanges( + seqnames = seqnames(junctions_gr[i]), + strand = strand(junctions_gr[i]), + ranges = IRanges(end - 1, end) + ) + + # check for which genes distance to start is 0 + start_genes <- which(distance(test_start, refseq.genes) == 0) + # start is not in a gene + if(length(start_genes) == 0) return("splicingBeyondGene") + + # start is in a gene -> is end in same gene + for(to in start_genes){ + # end is in same gene + if(distance(test_end, refseq.genes[to]) == 0){ + return("complex") + } + } + + end_genes <- which(distance(test_end, refseq.genes) == 0) + # end is not in a gene + if(length(end_genes) == 0) return("splicingBeyondGene") + # end is in a different gene + return("multigenicSplicing") + }) + + colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND" + + if(length(inconclusive) > 0){ + junctions_dt[psi_positions[inconclusive], + potentialImpact := inconclusive_results] + } + + return(junctions_dt) +} diff --git a/R/updateRho.R b/R/updateRho.R index f5a0d429..628b21fd 100644 --- a/R/updateRho.R +++ b/R/updateRho.R @@ -8,11 +8,15 @@ updateRho <- function(fds, type, rhoRange, BPPARAM, verbose){ n <- N(fds) y <- predictY(fds, noiseAlpha=currentNoiseAlpha(fds)) - fitparameters <- bplapply(seq_len(nrow(k)), estRho, nll=truncNLL_rho, - k=k, n=n, y=y, rhoRange=rhoRange, BPPARAM=BPPARAM) + # fitparameters <- bplapply(seq_len(nrow(k)), estRho, nll=truncNLL_rho, + # k=k, n=n, y=y, rhoRange=rhoRange, BPPARAM=BPPARAM) + fitparameters <- bplapply(seq_len(nrow(k)), estRho, + nll=fullNLLRho_penalized, + k=k, n=n, y=y, rhoRange=rhoRange, lambda=1e-4, + BPPARAM=BPPARAM) - rho(fds) <- vapply(fitparameters, "[[", - double(1), "minimum") + rho(fds) <- plogis(vapply(fitparameters, "[[", + double(1), "minimum")) if(isTRUE(verbose)){ stxt <- capture.output(summary(rho(fds))) @@ -23,16 +27,27 @@ updateRho <- function(fds, type, rhoRange, BPPARAM, verbose){ return(fds) } -estRho <- function(idx, k, n, y, rhoRange, nll, control=list()){ +estRho <- function(idx, k, n, y, rhoRange, nll, control=list(), lambda=1e-4){ ki <- k[idx,] ni <- n[idx,] 
yi <- y[idx,] - est <- optimize(f=nll, interval=rhoRange, yi=yi, ki=ki, ni=ni, + # est <- optimize(f=nll, interval=rhoRange, yi=yi, ki=ki, ni=ni, + # maximum=FALSE, tol=0.0000001) + # est + est <- optimize(f=nll, interval=rhoRange, + mui=plogis(yi), ki=ki, ni=ni, lambda=lambda, maximum=FALSE, tol=0.0000001) est } +fullNLLRho_penalized <- function(logit_rho, ki, ni, mui, lambda=1e-4){ + rho <- plogis(logit_rho) + nll <- -mean(dbetabinom(ki, ni, mui, rho, log=TRUE)) + nll <- nll + lambda * (logit_rho^2) + return(nll) +} + negLogLikelihoodRho <- function(rho, ki, ni, mui){ #-mean(dbetabinom(ki + 0.5, ni + 1, mu, rho, log=TRUE)) @@ -63,6 +78,20 @@ trunc_negLogLikelihoodRho <- function(rho, ki, ni, mui){ mean(alpha + beta - alphaK - betaNK ) } +trunc_negLogLikelihoodRho_penalized <- function(logit_rho, ki, ni, mui, lambda){ + #-mean(dbetabinom(ki, ni, mui, rho, log=TRUE)) + + rho <- plogis(logit_rho) + r <- (1-rho)/rho + alpha <- lgamma(mui*r) + alphaK <- lgamma(mui*r + ki) + beta <- lgamma((mui-1)*(-r)) + betaNK <- lgamma((mui-1)*(-r) + (ni - ki)) + + #mean negative log likelihood with pseudocounts + mean(alpha + beta - alphaK - betaNK ) + lambda * (logit_rho*logit_rho) +} + methodOfMomentsRho <- function(k, n, rhoRange=c(1e-5, 1 - 1e-5)){ # taken from wiki: diff --git a/R/variables.R b/R/variables.R index ccba65f5..d097201c 100644 --- a/R/variables.R +++ b/R/variables.R @@ -1,11 +1,11 @@ #' -#' Available psi types +#' Available splice metrics #' #' @examples -#' # to show available psi types: +#' # to show all available splice metrics: #' psiTypes #' +#' @rdname psiTypes #' @export -psiTypes <- c("psi5", "psi3", "theta") -names(psiTypes) <- psiTypes - +psiTypes <- c("jaccard", "psi5", "psi3", "theta") +names(psiTypes) <- c("Intron Jaccard Index", "psi5", "psi3", "theta") diff --git a/R/zzz.R b/R/zzz.R index 8f087a08..7f0b67d5 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -8,7 +8,7 @@ op.fraser <- list( `FRASER-hdf5-chunk-nrow` = 30000, `FRASER-hdf5-chunk-ncol` = 20, - `FRASER.pseudoCount` = 1, + `FRASER.pseudoCount` = 0.1, `FRASER.minSamplesForDelayed` = 1000, `FRASER.maxSamplesNoHDF5` = 20, `FRASER.maxJunctionsNoHDF5` = 1000) diff --git a/README.md b/README.md index 8e9443aa..ef17f68a 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,21 @@ Please cite our method paper if you use it in a publication: > Mertes, C., Scheller, I.F., Yépez, V.A. *et al.* Detection of aberrant splicing events in RNA-seq data using FRASER. *Nat Commun* **12**, 529 (2021). https://doi.org/10.1038/s41467-020-20573-7 +## What's new + +FRASER 2.0, an improved version of FRASER, is now available and used by default (version 1.99.0 and above). +FRASER 2.0 uses the Intron Jaccard Index as its splice metric instead of FRASER's +previous three metrics along with some other parameter optimizations of pseudocount, +filtering settings and default delta cutoff. + +To change the splice metric, set `fitMetrics(fds)` to one or more of the metrics +specified in `FRASER::psiTypes`. For FRASER 2.0 and the Intron Jaccard Index, the +new default delta cutoff is 0.1 instead of the previous value of 0.3. When using +the 3 previous metrics, the delta cutoff should be set manually to 0.3 +during results extraction, e.g. `results(fds, deltaPsiCutoff=0.3,...)`. + +The manuscript describing these changes in more detail will be available soon. 
+ ## Installation `FRASER` is an R/Bioconductor software package requiring a running diff --git a/inst/extdata/blacklist_regions/hg19-blacklist.v2.bed.gz b/inst/extdata/blacklist_regions/hg19-blacklist.v2.bed.gz new file mode 100644 index 00000000..5d87eb13 Binary files /dev/null and b/inst/extdata/blacklist_regions/hg19-blacklist.v2.bed.gz differ diff --git a/inst/extdata/blacklist_regions/hg38-blacklist.v2.bed.gz b/inst/extdata/blacklist_regions/hg38-blacklist.v2.bed.gz new file mode 100644 index 00000000..a4ec8581 Binary files /dev/null and b/inst/extdata/blacklist_regions/hg38-blacklist.v2.bed.gz differ diff --git a/man/FRASER.Rd b/man/FRASER.Rd index 22f3fff5..c4848052 100644 --- a/man/FRASER.Rd +++ b/man/FRASER.Rd @@ -5,11 +5,13 @@ \alias{calculateZscore} \alias{calculatePvalues} \alias{calculatePadjValues} +\alias{calculatePadjValuesOnSubset} \title{FRASER: Find RAre Splicing Events in RNA-seq data} \usage{ FRASER( fds, q, + type = fitMetrics(fds), implementation = c("PCA", "PCA-BB-Decoder", "AE-weighted", "AE", "BB"), iterations = 15, BPPARAM = bpparam(), @@ -28,7 +30,26 @@ calculatePvalues( capN = 5 * 1e+05 ) -calculatePadjValues(fds, type = currentType(fds), method = "BY") +calculatePadjValues( + fds, + type = currentType(fds), + method = "BY", + rhoCutoff = NA, + geneLevel = TRUE, + geneColumn = "hgnc_symbol", + subsets = NULL, + BPPARAM = bpparam() +) + +calculatePadjValuesOnSubset( + fds, + genesToTest, + subsetName, + type = currentType(fds), + method = "BY", + geneColumn = "hgnc_symbol", + BPPARAM = bpparam() +) } \arguments{ \item{fds}{A \code{\link{FraserDataSet}} object} @@ -38,6 +59,9 @@ Should be fitted using \code{\link{optimHyperParams}} if unknown. If a named vector is provided it is used for the different splicing types.} +\item{type}{The type of PSI (jaccard, psi5, psi3 or theta for theta/splicing +efficiency)} + \item{implementation}{The method that should be used to correct for confounders.} @@ -50,9 +74,6 @@ not yet converged after these number of iterations, the fit stops anyway.} \item{...}{Additional parameters passed on to the internal fit function} -\item{type}{The type of PSI (psi5, psi3 or theta for theta/splicing -efficiency)} - \item{logit}{Indicates if z scores are computed on the logit scale (default) or in the natural (psi) scale.} @@ -62,7 +83,31 @@ calculated. Possible are beta-binomial, binomial and normal.} \item{capN}{Counts are capped at this value to speed up the p-value calculation} -\item{method}{The p.adjust method that should be used.} +\item{method}{The p.adjust method that should be used for genome-wide +multiple testing correction.} + +\item{rhoCutoff}{The cutoff value on the fitted rho value +(overdispersion parameter of the betabinomial) above which junctions are +masked with NA during p value adjustment (default: NA, no masking).} + +\item{geneLevel}{Logical value indiciating whether gene-level p values +should be calculated. Defaults to TRUE.} + +\item{geneColumn}{The column name of the column that has the gene annotation +that will be used for gene-level pvalue computation.} + +\item{subsets}{A named list of named lists specifying any number of gene +subsets (can differ per sample). For each subset, FDR correction +will be limited to genes in the subset, and the FDR corrected +pvalues stored as an assay in the fds object in addition to the +transcriptome-wide FDR corrected pvalues. See the examples for +how to use this argument.} + +\item{genesToTest}{A named list with the subset of genes to test per sample. 
+The names must correspond to the sampleIDs in the given fds object.} + +\item{subsetName}{The name under which the resulting FDR corrected pvalues +will be stored in metadata(fds).} } \value{ FraserDataSet @@ -104,11 +149,21 @@ psi. \item \code{calculatePvalues}: This function calculates two-sided p-values based on the beta-binomial distribution (or binomial or normal if desired). The -returned p values are already adjusted with Holm's method per donor or +returned p values are not yet adjusted with Holm's method per donor or acceptor site, respectively. \item \code{calculatePadjValues}: This function adjusts the previously calculated -p-values per sample for multiple testing. +p-values per sample for multiple testing. First, the previoulsy calculated +junction-level p values are adjusted with Holm's method per donor or +acceptor site, respectively. Then, if gene symbols have been annotated to +junctions (and not otherwise requested), gene-level p values are computed. + +\item \code{calculatePadjValuesOnSubset}: This function does FDR correction only for all junctions +in a certain subset of genes which can differ per sample. Requires gene +symbols to have been annotated to junctions. As with the full FDR +correction across all junctions, first the previously calculated +junction-level p values are adjusted with Holm's method per donor or +acceptor site, respectively. Then, gene-level p values are computed. }} \examples{ @@ -132,16 +187,32 @@ fds <- FRASER(fds, q=2, implementation="PCA") # The functions run inside the FRASER function can also be directly # run themselves. # To directly run the fit function: -fds <- fit(fds, implementation="PCA", q=2, type="psi5") +fds <- fit(fds, implementation="PCA", q=2, type="jaccard") # To directly run the nomial and adjusted p value and z score # calculation, the following functions can be used: -fds <- calculatePvalues(fds, type="psi5") -head(pVals(fds, type="psi5")) -fds <- calculatePadjValues(fds, type="psi5", method="BY") -head(padjVals(fds, type="psi5")) -fds <- calculateZscore(fds, type="psi5") -head(zScores(fds, type="psi5")) +fds <- calculatePvalues(fds, type="jaccard") +head(pVals(fds, type="jaccard")) +fds <- calculatePadjValues(fds, type="jaccard", method="BY") +head(padjVals(fds, type="jaccard")) +fds <- calculateZscore(fds, type="jaccard") +head(zScores(fds, type="jaccard")) + +# example of restricting FDR correction to subsets of genes of interest +genesOfInterest <- list("sample1"=c("TIMMDC1"), "sample2"=c("MCOLN1")) +fds <- calculatePadjValues(fds, type="jaccard", + subsets=list("exampleSubset"=genesOfInterest)) +padjVals(fds, type="jaccard", subsetName="exampleSubset") +padjVals(fds, type="jaccard", level="gene", subsetName="exampleSubset") +fds <- calculatePadjValues(fds, type="jaccard", + subsets=list("anotherExampleSubset"=c("TIMMDC1"))) +padjVals(fds, type="jaccard", subsetName="anotherExampleSubset") + +# only adding FDR corrected pvalues on a subset without calculating +# transcriptome-wide FDR again: +fds <- calculatePadjValuesOnSubset(fds, genesToTest=genesOfInterest, + subsetName="setOfInterest", type="jaccard") +padjVals(fds, type="jaccard", subsetName="setOfInterest") } \seealso{ diff --git a/man/annotateRanges.Rd b/man/annotateRanges.Rd index c82a3211..e74728b2 100644 --- a/man/annotateRanges.Rd +++ b/man/annotateRanges.Rd @@ -20,7 +20,8 @@ annotateRangesWithTxDb( featureName = "hgnc_symbol", keytype = "ENTREZID", txdb = NULL, - orgDb = NULL + orgDb = NULL, + filter = list() ) } \arguments{ @@ -52,6 +53,11 @@ one is used, 
currently this is \item{orgDb}{An \code{orgDb} object or a data table to map the feature names. If this is NULL, then \code{org.Hs.eg.db} is used as the default.} + +\item{filter}{A named list specifying the filters which should be applied to +subset to e.g. only protein-coding genes for annotation. +\code{names(filter)} needs to be column names in the given +orgDb object (default: no filtering).} } \value{ FraserDataSet @@ -67,13 +73,13 @@ fds <- createTestFraserDataSet() # either using biomart with GRCh38 try({ fds <- annotateRanges(fds, GRCh=38) - rowRanges(fds, type="psi5")[,c("hgnc_symbol")] + rowRanges(fds, type="j")[,c("hgnc_symbol")] }) # either using biomart with GRCh37 try({ fds <- annotateRanges(fds, featureName="hgnc_symbol_37", GRCh=37) - rowRanges(fds, type="psi5")[,c("hgnc_symbol_37")] + rowRanges(fds, type="j")[,c("hgnc_symbol_37")] }) # or with a provided TxDb object @@ -82,6 +88,6 @@ txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene require(org.Hs.eg.db) orgDb <- org.Hs.eg.db fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb) -rowRanges(fds, type="psi5")[,"hgnc_symbol"] +rowRanges(fds, type="j")[,"hgnc_symbol"] } diff --git a/man/calculatePSIValues.Rd b/man/calculatePSIValues.Rd index 0128221d..9e0c14fa 100644 --- a/man/calculatePSIValues.Rd +++ b/man/calculatePSIValues.Rd @@ -15,7 +15,7 @@ calculatePSIValues( \item{fds}{A \code{\link{FraserDataSet}} object} \item{types}{A vector with the psi types which should be calculated. Default -is all of psi5, psi3 and theta.} +is all of jaccard, psi5, psi3 and theta.} \item{overwriteCts}{FALSE or TRUE (the default) the total counts (aka N) will be recalculated based on the existing junction counts (aka K)} @@ -31,7 +31,7 @@ based on the FraserDataSet object } \examples{ fds <- createTestFraserDataSet() - fds <- calculatePSIValues(fds, types="psi5") + fds <- calculatePSIValues(fds, types="jaccard") ### usually one would run this function for all psi types by using: # fds <- calculatePSIValues(fds) diff --git a/man/counts.Rd b/man/counts.Rd index 1ddac4d5..79beb47f 100644 --- a/man/counts.Rd +++ b/man/counts.Rd @@ -11,9 +11,14 @@ K(fds, type = currentType(fds)) N(fds, type = currentType(fds)) -\S4method{counts}{FraserDataSet}(object, type = NULL, side = c("ofInterest", "otherSide")) - -\S4method{counts}{FraserDataSet,ANY}(object, type = NULL, side = c("ofInterest", "otherSide"), ...) <- value +\S4method{counts}{FraserDataSet}(object, type = currentType(object), side = c("ofInterest", "otherSide")) + +\S4method{counts}{FraserDataSet,ANY}( + object, + type = currentType(object), + side = c("ofInterest", "otherSide"), + ... +) <- value } \arguments{ \item{fds, object}{FraserDataSet} @@ -39,8 +44,10 @@ setter for count data \examples{ fds <- createTestFraserDataSet() - counts(fds, type="psi5", side="ofInterest") - counts(fds, type="psi5", side="other") + counts(fds, side="ofInterest") + counts(fds, type="jaccard", side="other") + head(K(fds)) + head(K(fds, type="psi5")) head(K(fds, type="psi3")) head(N(fds, type="theta")) diff --git a/man/fds-methods.Rd b/man/fds-methods.Rd index 3e11e4e4..d6caf3fd 100644 --- a/man/fds-methods.Rd +++ b/man/fds-methods.Rd @@ -41,17 +41,7 @@ \alias{FRASER.mcols.get} \alias{FRASER.rowRanges.get} \alias{mapSeqlevels} -\title{Getter/Setter methods for the FraserDataSet - -The following methods are getter and setter methods to extract or set -certain values of a FraserDataSet object. 
- -\code{samples} sets or gets the sample IDs; \code{condition} ; -\code{} -\code{nonSplicedReads} return a RangedSummarizedExperiment object -containing the counts for the non spliced reads overlapping splice -sites in the fds. -\code{}} +\title{Getter/Setter methods for the FraserDataSet} \usage{ samples(object) @@ -151,10 +141,8 @@ passed to GenomeInfoDb::mapSeqlevels().} Getter method return the respective current value. } \description{ -Getter/Setter methods for the FraserDataSet - -The following methods are getter and setter methods to extract or set -certain values of a FraserDataSet object. +The following methods are getter and setter methods to extract +or set certain values of a FraserDataSet object. \code{samples} sets or gets the sample IDs; \code{condition} ; \code{} @@ -162,8 +150,6 @@ certain values of a FraserDataSet object. containing the counts for the non spliced reads overlapping splice sites in the fds. \code{} - -Mapping of chromosome names } \examples{ fds <- createTestFraserDataSet() diff --git a/man/filtering.Rd b/man/filtering.Rd index b40ad786..b99936dc 100644 --- a/man/filtering.Rd +++ b/man/filtering.Rd @@ -1,44 +1,52 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/filterExpression.R -\name{filtering} +% Please edit documentation in R/AllGenerics-definitions.R, R/filterExpression.R +\name{filterVariability} +\alias{filterVariability} \alias{filtering} \alias{filterExpressionAndVariability} \alias{filterExpression,FraserDataSet-method} -\alias{filterVariability} +\alias{filterVariability,FraserDataSet-method} \title{Filtering FraserDataSets} \usage{ +filterVariability(object, ...) + filterExpressionAndVariability( object, minExpressionInOneSample = 20, - quantile = 0.95, + quantile = 0.75, quantileMinExpression = 10, - minDeltaPsi = 0.05, + minDeltaPsi = 0, filter = TRUE, delayed = ifelse(ncol(object) <= 300, FALSE, TRUE), + filterOnJaccard = TRUE, BPPARAM = bpparam() ) \S4method{filterExpression}{FraserDataSet}( object, minExpressionInOneSample = 20, - quantile = 0.95, + quantile = 0.75, quantileMinExpression = 10, filter = TRUE, delayed = ifelse(ncol(object) <= 300, FALSE, TRUE), + filterOnJaccard = TRUE, BPPARAM = bpparam() ) -filterVariability( +\S4method{filterVariability}{FraserDataSet}( object, - minDeltaPsi = 0.05, + minDeltaPsi = 0, filter = TRUE, delayed = ifelse(ncol(object) <= 300, FALSE, TRUE), + filterOnJaccard = TRUE, BPPARAM = bpparam() ) } \arguments{ \item{object}{A \code{\link{FraserDataSet}} object} +\item{...}{Further parameters passed on to Rsubread::featureCounts.} + \item{minExpressionInOneSample}{The minimal read count in at least one sample that is required for an intron to pass the filter.} @@ -60,6 +68,10 @@ mcols.} otherwise the function works on the delayedMatrix representations. The default value depends on the number of samples in the fds-object.} +\item{filterOnJaccard}{If TRUE, the Intron Jaccard Metric is used to define +express introns during fitlering. Otherwise, the psi5, psi3 and theta +metrics are used (default: TRUE).} + \item{BPPARAM}{the BiocParallel parameters for the parallelization} } \value{ @@ -78,15 +90,15 @@ read support and introns that are not variable across samples. \item \code{filterExpression,FraserDataSet-method}: This function filters out introns and corresponding splice sites that have low read support in all samples. -\item \code{filterVariability}: This function filters out introns and corresponding -splice sites which do not show variablity across samples. 
+\item \code{filterVariability,FraserDataSet-method}: This function filters out introns and corresponding +splice sites that have low read support in all samples. }} \examples{ fds <- createTestFraserDataSet() fds <- filterExpressionAndVariability(fds, minDeltaPsi=0.1, filter=FALSE) -mcols(fds, type="psi5")[, c( - "maxCount", "passedExpression", "maxDPsi3", "passedVariability")] +mcols(fds, type="jaccard")[, c( + "maxCount", "passedExpression", "maxDJaccard", "passedVariability")] plotFilterExpression(fds) plotFilterVariability(fds) diff --git a/man/fit.Rd b/man/fit.Rd index eb68296c..dcbee2dc 100644 --- a/man/fit.Rd +++ b/man/fit.Rd @@ -10,8 +10,8 @@ implementation = c("PCA", "PCA-BB-Decoder", "AE", "AE-weighted", "PCA-BB-full", "fullAE", "PCA-regression", "PCA-reg-full", "PCA-BB-Decoder-no-weights", "BB"), q, - type = "psi3", - rhoRange = c(1e-08, 1 - 1e-08), + type = psiTypes, + rhoRange = c(-30, 30), weighted = FALSE, noiseAlpha = 1, convergence = 1e-05, @@ -35,7 +35,7 @@ Should be fitted using \code{\link{optimHyperParams}} if unknown. If a named vector is provided it is used for the different splicing types.} -\item{type}{The type of PSI (psi5, psi3 or theta for theta/splicing +\item{type}{The type of PSI (jaccard, psi5, psi3 or theta for theta/splicing efficiency)} \item{rhoRange}{Defines the range of values that rho parameter from the diff --git a/man/getter_setter_functions.Rd b/man/getter_setter_functions.Rd index bf56e32c..9b10f832 100644 --- a/man/getter_setter_functions.Rd +++ b/man/getter_setter_functions.Rd @@ -15,10 +15,13 @@ \alias{zScores} \alias{pVals} \alias{padjVals} +\alias{availableFDRsubsets} \alias{predictedMeans} \alias{deltaPsiValue} \alias{currentType} \alias{currentType<-} +\alias{fitMetrics} +\alias{fitMetrics<-} \alias{pseudocount} \alias{hyperParams} \alias{dontWriteHDF5} @@ -35,9 +38,26 @@ rho(fds, type = currentType(fds)) zScores(fds, type = currentType(fds), byGroup = FALSE, ...) -pVals(fds, type = currentType(fds), level = "site", dist = "BetaBinomial", ...) - -padjVals(fds, type = currentType(fds), dist = c("BetaBinomial"), ...) +pVals( + fds, + type = currentType(fds), + level = "site", + filters = list(), + dist = "BetaBinomial", + ... +) + +padjVals( + fds, + type = currentType(fds), + dist = c("BetaBinomial"), + level = "site", + subsetName = NULL, + filters = list(), + ... +) + +availableFDRsubsets(fds) predictedMeans(fds, type = currentType(fds)) @@ -47,6 +67,10 @@ currentType(fds) currentType(fds) <- value +fitMetrics(fds) + +fitMetrics(fds) <- value + pseudocount(value = NULL) hyperParams(fds, type = currentType(fds), all = FALSE) @@ -70,12 +94,16 @@ verbose(fds) <- value \item{byGroup}{If TRUE, aggregation by donor/acceptor site will be done.} -\item{...}{Internally used parameteres.} +\item{...}{Internally used parameters.} \item{level}{Indicates if the retrieved p values should be adjusted on the donor/acceptor site-level (default) or if unadjusted junction-level p values should be returned.} +\item{filters}{A named list giving the filters that were applied for masking +during p value correction. Used for storing and retrieving the +correct set of requested p values.} + \item{dist}{Distribution for which the p-values should be extracted.} \item{all}{Logical value indicating whether \code{hyperParams(fds)} should @@ -110,6 +138,9 @@ beta-binomial distribution \item \code{padjVals}: This returns the adjusted p-values. +\item \code{availableFDRsubsets}: This returns the names of FDR subsets +for which adjusted p values have been calculated. 
+ \item \code{predictedMeans}: This returns the fitted mu (i.e. psi) values. @@ -117,11 +148,18 @@ values. observed and the fitted psi values. \item \code{currentType}: Returns the psi type that is used -within several methods in the FRASER package. +within several methods in the FRASER package (defaults to jaccard). \item \code{currentType<-}: Sets the psi type that is to be used within several methods in the FRASER package. +\item \code{fitMetrics}: Returns the splice metrics that will be +fitted (defaults to jaccard, used within several methods in the +FRASER package). + +\item \code{fitMetrics<-}: Sets the splice metrics that will be +fitted (used within several methods in the FRASER package). + \item \code{pseudocount}: Sets and returns the pseudo count used within the FRASER fitting procedure. @@ -140,7 +178,7 @@ assays should be stored as hdf5 files. \item \code{dontWriteHDF5<-}: Sets whether the assays should be stored as hdf5 files. -\item \code{verbose}: Dependend on the level of verbosity +\item \code{verbose}: Dependent on the level of verbosity the algorithm reports more or less to the user. 0 means being quiet and 10 means everything. @@ -156,7 +194,7 @@ dontWriteHDF5(fds) dontWriteHDF5 <- TRUE # get/set the splice metric for which results should be retrieved -currentType(fds) <- "psi5" +currentType(fds) <- "jaccard" currentType(fds) # get fitted parameters @@ -167,6 +205,9 @@ rho(fds) # get statistics pVals(fds) padjVals(fds) + +# zscore not calculated by default +fds <- calculateZscore(fds, type="jaccard") zScores(fds) # set and get pseudocount @@ -174,9 +215,9 @@ pseudocount(4L) pseudocount() # retrieve or set a mask to exclude certain junctions in the fitting step -featureExclusionMask(fds, type="theta") <- sample( - c(FALSE, TRUE), nrow(mcols(fds, type="theta")), replace=TRUE) -featureExclusionMask(fds, type="theta") +featureExclusionMask(fds, type="jaccard") <- sample( + c(FALSE, TRUE), nrow(mcols(fds, type="jaccard")), replace=TRUE) +featureExclusionMask(fds, type="jaccard") # controlling the verbosity level of the output of some algorithms verbose(fds) <- 2 diff --git a/man/injectOutliers.Rd b/man/injectOutliers.Rd index 1f85338e..5132c6e4 100644 --- a/man/injectOutliers.Rd +++ b/man/injectOutliers.Rd @@ -6,7 +6,7 @@ \usage{ injectOutliers( fds, - type = c("psi5", "psi3", "theta"), + type = psiTypes, freq = 0.001, minDpsi = 0.2, minCoverage = 2, @@ -50,5 +50,6 @@ Inject artificial outliers in an existing fds \examples{ # A generic dataset fds <- makeSimulatedFraserDataSet() +fds <- calculatePSIValues(fds) fds <- injectOutliers(fds, minDpsi=0.2, freq=1E-3) } diff --git a/man/optimHyperParams.Rd b/man/optimHyperParams.Rd index f3d10622..e1333b4a 100644 --- a/man/optimHyperParams.Rd +++ b/man/optimHyperParams.Rd @@ -6,9 +6,9 @@ \usage{ optimHyperParams( fds, - type, + type = psiTypes, implementation = "PCA", - q_param = seq(2, min(40, ncol(fds)), by = 3), + q_param = getEncDimRange(fds), noise_param = 0, minDeltaPsi = 0.1, iterations = 5, @@ -24,7 +24,7 @@ optimHyperParams( \arguments{ \item{fds}{A \code{\link{FraserDataSet}} object} -\item{type}{The type of PSI (psi5, psi3 or theta for theta/splicing +\item{type}{The type of PSI (jaccard, psi5, psi3 or theta for theta/splicing efficiency)} \item{implementation}{The method that should be used to correct for @@ -71,13 +71,14 @@ ratios while maximizing the precision-recall curve. 
\examples{ # generate data fds <- makeSimulatedFraserDataSet(m=15, j=20) + fds <- calculatePSIValues(fds) # run hyperparameter optimization - fds <- optimHyperParams(fds, type="psi5", q_param=c(2, 5)) + fds <- optimHyperParams(fds, type="jaccard", q_param=c(2, 5)) # get estimated optimal dimension of the latent space - bestQ(fds, type="psi5") - hyperParams(fds, type="psi5") + bestQ(fds, type="jaccard") + hyperParams(fds, type="jaccard") } \seealso{ diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd index cbeddee6..db8025a0 100644 --- a/man/plotFunctions.Rd +++ b/man/plotFunctions.Rd @@ -1,6 +1,7 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plotMethods.R -\name{plotFunctions} +% Please edit documentation in R/AllGenerics-definitions.R, R/plotMethods.R +\name{plotManhattan} +\alias{plotManhattan} \alias{plotFunctions} \alias{plotAberrantPerSample} \alias{plotVolcano} @@ -10,23 +11,29 @@ \alias{plotFilterExpression} \alias{plotExpectedVsObservedPsi} \alias{plotEncDimSearch} +\alias{plotBamCoverage} +\alias{plotBamCoverageFromResultTable} \alias{plotVolcano,FraserDataSet-method} \alias{plotAberrantPerSample,FraserDataSet-method} +\alias{plotSpliceMetricRank} \alias{plotQQ,FraserDataSet-method} \alias{plotEncDimSearch,FraserDataSet-method} \alias{plotFilterVariability} \alias{plotCountCorHeatmap,FraserDataSet-method} +\alias{plotManhattan,FraserDataSet-method} \title{Visualization functions for FRASER} \usage{ +plotManhattan(object, ...) + \S4method{plotVolcano}{FraserDataSet}( object, sampleID, - type = c("psi3", "psi5", "theta"), + type = fitMetrics(object), basePlot = TRUE, aggregate = FALSE, main = NULL, label = NULL, - deltaPsiCutoff = 0.3, + deltaPsiCutoff = 0.1, padjCutoff = 0.1, ... ) @@ -34,10 +41,9 @@ \S4method{plotAberrantPerSample}{FraserDataSet}( object, main, - type = c("psi3", "psi5", "theta"), + type = fitMetrics(object), padjCutoff = 0.1, - zScoreCutoff = NA, - deltaPsiCutoff = 0.3, + deltaPsiCutoff = 0.1, aggregate = TRUE, BPPARAM = bpparam(), ... @@ -45,8 +51,20 @@ plotExpression( fds, - type = c("psi5", "psi3", "theta"), - site = NULL, + type = fitMetrics(fds), + idx = NULL, + result = NULL, + colGroup = NULL, + basePlot = TRUE, + main = NULL, + label = "aberrant", + ... +) + +plotSpliceMetricRank( + fds, + type = fitMetrics(fds), + idx = NULL, result = NULL, colGroup = NULL, basePlot = TRUE, @@ -57,7 +75,7 @@ plotExpression( plotExpectedVsObservedPsi( fds, - type = c("psi5", "psi3", "theta"), + type = fitMetrics(fds), idx = NULL, result = NULL, colGroup = NULL, @@ -83,11 +101,7 @@ plotExpectedVsObservedPsi( ... ) -\S4method{plotEncDimSearch}{FraserDataSet}( - object, - type = c("psi3", "psi5", "theta"), - plotType = c("auc", "loss") -) +\S4method{plotEncDimSearch}{FraserDataSet}(object, type = psiTypes, plotType = c("auc", "loss")) plotFilterExpression( fds, @@ -105,7 +119,7 @@ plotFilterVariability( \S4method{plotCountCorHeatmap}{FraserDataSet}( object, - type = c("psi5", "psi3", "theta"), + type = psiTypes, logit = FALSE, topN = 50000, topJ = 5000, @@ -126,10 +140,62 @@ plotFilterVariability( plotCov = TRUE, ... 
) + +plotBamCoverage( + fds, + gr, + sampleID, + control_samples = sample(samples(fds[, which(samples(fds) != sampleID)]), min(3, + ncol(fds) - length(sampleID))), + txdb = NULL, + min_junction_count = 20, + highlight_range = NULL, + highlight_range_color = "firebrick", + color_annotated = "gray", + color_novel = "goldenrod3", + color_sample_interest = "firebrick", + color_control_samples = "dodgerblue4", + toscale = c("exon", "gene", "none"), + mar = c(2, 10, 0.1, 5), + curvature_splicegraph = 1, + curvature_coverage = 1, + cex = 1, + splicegraph_labels = c("genomic_range", "id", "name", "none"), + splicegraph_position = c("top", "bottom"), + ... +) + +plotBamCoverageFromResultTable( + fds, + result, + show_full_gene = FALSE, + txdb = NULL, + orgDb = NULL, + res_gene_col = "hgncSymbol", + res_geneid_type = "SYMBOL", + txdb_geneid_type = "ENTREZID", + left_extension = 1000, + right_extension = 1000, + ... +) + +\S4method{plotManhattan}{FraserDataSet}( + object, + sampleID, + value = "pvalue", + type = fitMetrics(object), + chr = NULL, + main = paste0("sample: ", sampleID), + chrColor = c("black", "darkgrey"), + ... +) } \arguments{ \item{object, fds}{An \code{\link{FraserDataSet}} object.} +\item{...}{Additional parameters passed to plot() or plot_ly() if not stated +otherwise in the details for each plot function} + \item{sampleID}{A sample ID which should be plotted. Can also be a vector. Integers are treated as indices.} @@ -150,22 +216,19 @@ samples. Labelling can be turned off by setting \code{label=NULL}. The user can also provide a custom list of gene symbols or sampleIDs.} -\item{padjCutoff, zScoreCutoff, deltaPsiCutoff}{Significance, Z-score or delta +\item{padjCutoff, deltaPsiCutoff}{Significance or delta psi cutoff to mark outliers} -\item{...}{Additional parameters passed to plot() or plot_ly() if not stated -otherwise in the details for each plot function} - \item{BPPARAM}{BiocParallel parameter to use.} +\item{idx}{A junction site ID or gene ID or one of both, which +should be plotted. Can also be a vector. Integers are treated +as indices.} + \item{result}{The result table to be used by the method.} \item{colGroup}{Group of samples that should be colored.} -\item{idx, site}{A junction site ID or gene ID or one of both, which -should be plotted. Can also be a vector. 
Integers are treated -as indices.} - \item{global}{Flag to plot a global Q-Q plot, default FALSE} \item{conf.alpha}{If set, a confidence interval is plotted, defaults to 0.05} @@ -228,6 +291,108 @@ annotation of the heatmap.} \item{plotMeanPsi, plotCov}{If \code{TRUE}, then the heatmap is annotated with the mean psi values or the junction coverage.} + +\item{gr}{A GRanges object indicating the genomic range that should be shown +in \code{plotBamCoverage}.} + +\item{control_samples}{The sampleIDs of the samples used as control in +\code{plotBamCoverage}.} + +\item{txdb}{A TxDb object giving the gene/transcript annotation to use.} + +\item{min_junction_count}{The minimal junction count across samples required +for a junction to appear in the splicegraph and coverage tracks +of \code{plotBamCoverage}.} + +\item{highlight_range}{A \code{GenomicRanges} or \code{GenomicRangesList} +object of ranges to be highlighted in the splicegraph of +\code{plotBamCoverage}.} + +\item{highlight_range_color}{The color of highlighted ranges in +the splicegraph of \code{plotBamCoverage}.} + +\item{color_annotated}{The color for exons and junctions present in +the given annotation (in the splicegraph of +\code{plotBamCoverage}).} + +\item{color_novel}{The color for novel exons and junctions not present in +the given annotation (in the splicegraph of +\code{plotBamCoverage}).} + +\item{color_sample_interest}{The color in \code{plotBamCoverage} for the +sample of interest.} + +\item{color_control_samples}{The color in \code{plotBamCoverage} for the +samples used as controls.} + +\item{toscale}{In \code{plotBamCoverage}, indicates which part of the +plotted region should be drawn to scale. Possible values are +'exon' (exonic regions are drawn to scale), +'gene' (both exonic and intronic regions are drawn to scale) or +'none' (exonic and intronic regions have constant length) +(see SGSeq package).} + +\item{mar}{The margin of the plot area for \code{plotBamCoverage} +(b,l,t,r).} + +\item{curvature_splicegraph}{The curvature of the junction arcs in the +splicegraph in \code{plotBamCoverage}. Decrease this value +for flatter arcs and increase it for steeper arcs.} + +\item{curvature_coverage}{The curvature of the junction arcs in the +coverage tracks of \code{plotBamCoverage}. Decrease this +value for flatter arcs and increase it for steeper arcs.} + +\item{cex}{For controlling the size of text and numbers in +\code{plotBamCoverage}.} + +\item{splicegraph_labels}{Indicates the format of exon/splice junction +labels in the splicegraph of \code{plotBamCoverage}. +Possible values are 'genomic_range' (gives the start position +of the first exon and the end position of the last exon that +are shown), 'id' (format E1,... J1,...), 'name' (format +type:chromosome:start-end:strand for each feature), +'none' for no labels (see SGSeq package).} + +\item{splicegraph_position}{The position of the splicegraph relative to the +coverage tracks in \code{plotBamCoverage}. Possible values +are 'top' (default) and 'bottom'.} + +\item{show_full_gene}{Should the full genomic range of the gene be shown in +\code{plotBamCoverageFromResultTable} (default: FALSE)? 
+If FALSE, only a certain region (see parameters left_extension +and right_extension) around the outlier junction is shown.} + +\item{orgDb}{An OrgDb object giving the mapping of gene ids and symbols.} + +\item{res_gene_col}{The column name in the given results table that +contains the gene annotation.} + +\item{res_geneid_type}{The type of gene annotation in the results table in +\code{res_gene_col} (e.g. SYMBOL or ENTREZID etc.). This +information is needed for mapping between the results table and +the provided annotation in the txdb object.} + +\item{txdb_geneid_type}{The type of gene_id present in \code{genes(txdb)} +(e.g. ENTREZID). This information is needed for +mapping between the results table and the provided annotation +in the txdb object.} + +\item{left_extension}{Indicating how far the plotted range around the outlier +junction should be extended to the left in +\code{plotBamCoverageFromResultTable}.} + +\item{right_extension}{Indicating how far the plotted range around the +outlier junction should be extended to the right in +\code{plotBamCoverageFromResultTable}.} + +\item{value}{Indicates which assay is shown in the manhattan plot. Defaults +to 'pvalue'. Other options are 'deltaPsi' and 'zScore'.} + +\item{chr}{Vector of chromosome names to show in \code{plotManhattan}. The +default is to show all chromosomes.} + +\item{chrColor}{Interchanging colors by chromosome for \code{plotManhattan}.} } \value{ If base R graphics are used nothing is returned else the plotly or @@ -246,6 +411,9 @@ Plot the number of aberrant events per samples Plots the observed split reads of the junction of interest over all reads coming from the given donor/acceptor. +Plots the observed values of the splice metric across samples for a +junction of interest. + Plots the expected psi value over the observed psi value of the given junction. @@ -269,6 +437,10 @@ This is the list of all plotting function provided by FRASER: \item plotFilterExpression() \item plotFilterVariability() \item plotEncDimSearch() + \item plotBamCoverage() + \item plotBamCoverageFromResultTable() + \item plotManhattan() + \item plotSpliceMetricRank() } For a detailed description of each plot function please see the details. @@ -297,6 +469,9 @@ log10 space. \code{plotExpectedVsObservedPsi}: A scatter plot of the observed psi against the predicted psi for a given site. +\code{plotSpliceMetricRank}: This function plots for a given intron the +observed values of the selected splice metric against the sample rank. + \code{plotCountCorHeatmap}: The correlation heatmap of the count data either of the full data set (i.e. sample-sample correlations) or of the top x most variable junctions (i.e. junction-sample correlations). By default the values @@ -317,33 +492,81 @@ introns and for the filtered (i.e. non-variable) introns. It plots the encoding dimension against the achieved loss (area under the precision-recall curve). From this plot the optimum should be choosen for the \code{q} in fitting process. + +\code{plotManhattan}: A Manhattan plot showing the junction pvalues by +genomic position. Useful to identify if outliers cluster by genomic position. + +\code{plotBamCoverage}: A sashimi plot showing the read coverage from +the underlying bam files for a given genomic range and sampleIDs. + +\code{plotBamCoverageFromResultTable}: A sashimi plot showing the read +coverage from the underlying bam files for a row in the results table.
Can +either show the full range of the gene with the outlier junction or only a +certain region around the outlier. } \examples{ # create full FRASER object fds <- makeSimulatedFraserDataSet(m=40, j=200) fds <- calculatePSIValues(fds) fds <- filterExpressionAndVariability(fds, filter=FALSE) -# this step should be done for all splicing metrics and more dimensions -fds <- optimHyperParams(fds, "psi5", q_param=c(2,5,10,25)) +# this step should be done for more dimensions in practice +fds <- optimHyperParams(fds, "jaccard", q_param=c(2,5,10,25)) fds <- FRASER(fds) # QC plotting plotFilterExpression(fds) plotFilterVariability(fds) -plotCountCorHeatmap(fds, "theta") -plotCountCorHeatmap(fds, "theta", normalized=TRUE) -plotEncDimSearch(fds, type="psi5") +plotCountCorHeatmap(fds, "jaccard") +plotCountCorHeatmap(fds, "jaccard", normalized=TRUE) +plotEncDimSearch(fds, type="jaccard") # extract results plotAberrantPerSample(fds, aggregate=FALSE) -plotVolcano(fds, "sample1", "psi5") +plotVolcano(fds, "sample1", "jaccard") # dive into gene/sample level results res <- results(fds) res plotExpression(fds, result=res[1]) plotQQ(fds, result=res[1]) -plotExpectedVsObservedPsi(fds, type="psi5", res=res[1]) +plotExpectedVsObservedPsi(fds, res=res[1]) + +# create manhattan plot of pvalues by genomic position +if(require(ggbio)){ + plotManhattan(fds, type="jaccard", sampleID="sample10") +} +# plot splice graph and coverage from bam files in a given region +if(require(SGSeq)){ + fds <- createTestFraserSettings() + gr <- GRanges(seqnames="chr19", + IRanges(start=7587496, end=7598895), + strand="+") + plotBamCoverage(fds, gr=gr, sampleID="sample3", + control_samples="sample2", min_junction_count=5, + curvature_splicegraph=1, curvature_coverage=1, + mar=c(1, 7, 0.1, 3)) + + # plot coverage from bam file for a row in the result table + fds <- createTestFraserDataSet() + require(TxDb.Hsapiens.UCSC.hg19.knownGene) + txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene + require(org.Hs.eg.db) + orgDb <- org.Hs.eg.db + + res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA) + res_dt <- as.data.table(res) + res_dt <- res_dt[sampleID == "sample2",] + + # plot full range of gene containing outlier junction + plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=TRUE, + txdb=txdb, orgDb=orgDb, control_samples="sample3") + + # plot only certain range around outlier junction + plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, + control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb, + curvature_coverage=0.5, right_extension=5000, left_extension=5000, + splicegraph_labels="id") +} } diff --git a/man/potentialImpactAnnotations.Rd b/man/potentialImpactAnnotations.Rd new file mode 100644 index 00000000..282a6d09 --- /dev/null +++ b/man/potentialImpactAnnotations.Rd @@ -0,0 +1,135 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/resultAnnotations.R +\name{potentialImpactAnnotations} +\alias{potentialImpactAnnotations} +\alias{annotateIntronReferenceOverlap} +\alias{annotatePotentialImpact} +\alias{flagBlacklistRegions} +\title{Additional result annotations} +\usage{ +annotateIntronReferenceOverlap(fds, txdb, BPPARAM = bpparam()) + +annotatePotentialImpact( + result, + txdb, + fds, + addPotentialImpact = TRUE, + addUTRoverlap = TRUE, + minoverlap = 5, + BPPARAM = bpparam() +) + +flagBlacklistRegions( + result, + blacklist_regions = NULL, + assemblyVersion = c("hg19", "hg38"), + minoverlap = 5 +) +} +\arguments{ +\item{fds}{A FraserDataSet} + +\item{txdb}{A 
txdb object providing the reference annotation.} + +\item{BPPARAM}{For controlling parallelization behavior. Defaults to +\code{bpparam()}.} + +\item{result}{A result table as generated by FRASER, including the column +\code{annotatedJunction} as generated by the function +\code{annotateIntronReferenceOverlap}.} + +\item{addPotentialImpact}{Logical, indicating if the type of the potential +impact should be added to the results table. Defaults to \code{TRUE}.} + +\item{addUTRoverlap}{Logical, indicating if the overlap with UTR regions +should be checked and added to the results table. Defaults to \code{TRUE}.} + +\item{minoverlap}{Integer value defining the number of base pairs around the +splice site that need to overlap with UTR or blacklist region, +respectively, to be considered matching. Defaults to 5 bp.} + +\item{blacklist_regions}{A BED file that contains the blacklist regions. +If \code{NULL} (default), the BED files that are packaged with FRASER +are used (see Details for more information).} + +\item{assemblyVersion}{Indicates the genome assembly version of the intron +coordinates. Only used if blacklist_regions is NULL. For other versions, +please provide the BED file containing the blacklist regions directly.} +} +\value{ +An annotated FraserDataSet or results table, respectively +} +\description{ +These functions work on the result table and add additional + annotations to the reported introns: the type of potential impact on + splicing (e.g. exon skipping, exon truncation, ...), potential occurrence + of frameshift, overlap with UTR regions as well as a flag for introns + that are located in blacklist regions of the genome. + +\code{\link{annotateIntronReferenceOverlap}} adds basic annotations to the + fds for each intron based on the overlap of the intron's location with + the reference annotation. Has to be run before the result table is + created so that the new column can be included in it (see examples). + +\code{\link{annotatePotentialImpact}} annotates each intron in the results + table with the type of potential impact on splicing and potential + occurrence of frameshift (likely, unlikely, inconclusive). Can also + calculate overlap with annotated UTR regions. Potential impact can be: + annotatedIntron_increasedUsage, annotatedIntron_reducedUsage, + exonTruncation, exonElongation, exonTruncation&Elongation, + exonSkipping, splicingBeyondGene, + multigenicSplicing, downstreamOfNearestGene, upstreamOfNearestGene, + complex (everything else). + Splice sites (theta metric) annotations indicate how the splice site is + located with respect to the reference annotation. The annotated types + are: annotatedSpliceSite, exonicRegion, intronicRegion. + +\code{\link{flagBlacklistRegions}} flags introns in the results table on + whether or not they are located in a blacklist region of the genome. By + default, the blacklist regions as reported in + \cite{Amemiya, Kundaje & Boyle (2019)} and downloaded from + \href{https://www.encodeproject.org/annotations/ENCSR636HFF/}{here} + are used. +} +\section{Functions}{ +\itemize{ +\item \code{annotateIntronReferenceOverlap}: This method calculates basic annotations +based on overlap with the reference annotation (start, end, none, both) +for the full fds. The overlap type is added as a new column +\code{annotatedJunction} in \code{mcols(fds)}. + +\item \code{annotatePotentialImpact}: This method annotates the splice event +type to junctions in the given results table. 
+ +\item \code{flagBlacklistRegions}: This method flags all introns and +splice sites in the given results table for which at least one splice +site (donor or acceptor) is located in a blacklist region. Blacklist +regions of the genome are determined from the provided BED file. +}} + +\examples{ + # get data, fit and compute p-values and z-scores + fds <- createTestFraserDataSet() + + # load reference annotation + library(TxDb.Hsapiens.UCSC.hg19.knownGene) + txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene + + # add basic annotations for overlap with the reference annotation + # run this function before creating the results table + fds <- annotateIntronReferenceOverlap(fds, txdb) + + # extract results: for this small example dataset, no cutoffs used + # to get some results + res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA) + + # annotate the type of potential impact on splicing and UTR overlap + res <- annotatePotentialImpact(result=res, txdb=txdb, fds=fds) + + # annotate overlap with blacklist regions + res <- flagBlacklistRegions(result=res, assemblyVersion="hg19") + + # show results table containing additional annotations + res + +} diff --git a/man/psiTypes.Rd b/man/psiTypes.Rd index f07be433..9a4b9ca2 100644 --- a/man/psiTypes.Rd +++ b/man/psiTypes.Rd @@ -3,18 +3,18 @@ \docType{data} \name{psiTypes} \alias{psiTypes} -\title{Available psi types} +\title{Available splice metrics} \format{ -An object of class \code{character} of length 3. +An object of class \code{character} of length 4. } \usage{ psiTypes } \description{ -Available psi types +Available splice metrics } \examples{ - # to show available psi types: + # to show all available splice metrics: psiTypes } diff --git a/man/results.Rd b/man/results.Rd index 57682616..ceeb9fc8 100644 --- a/man/results.Rd +++ b/man/results.Rd @@ -2,7 +2,6 @@ % Please edit documentation in R/AllGenerics.R \name{results,FraserDataSet-method} \alias{results,FraserDataSet-method} -\alias{resultsByGenes} \alias{aberrant,FraserDataSet-method} \title{Extracting results and aberrant splicing events} \usage{ @@ -10,26 +9,31 @@ object, sampleIDs = samples(object), padjCutoff = 0.05, - zScoreCutoff = NA, - deltaPsiCutoff = 0.3, + deltaPsiCutoff = 0.1, + rhoCutoff = NA, + aggregate = FALSE, + collapse = FALSE, minCount = 5, - psiType = c("psi3", "psi5", "theta"), + psiType = psiTypes, + geneColumn = "hgnc_symbol", + all = FALSE, + returnTranscriptomewideResults = TRUE, additionalColumns = NULL, - BPPARAM = bpparam(), - ... + BPPARAM = bpparam() ) -resultsByGenes(res, geneColumn = "hgncSymbol", method = "BY") - \S4method{aberrant}{FraserDataSet}( object, - type = currentType(object), + type = fitMetrics(object), padjCutoff = 0.05, - deltaPsiCutoff = 0.3, - zScoreCutoff = NA, + deltaPsiCutoff = 0.1, minCount = 5, + rhoCutoff = NA, by = c("none", "sample", "feature"), aggregate = FALSE, + geneColumn = "hgnc_symbol", + subsetName = NULL, + all = FALSE, ... ) } @@ -41,16 +45,38 @@ retrieved} \item{padjCutoff}{The FDR cutoff to be applied or NA if not requested.} -\item{zScoreCutoff}{The z-score cutoff to be applied or NA if not requested.} - \item{deltaPsiCutoff}{The cutoff on delta psi or NA if not requested.} +\item{rhoCutoff}{The cutoff value on the fitted rho value +(overdispersion parameter of the betabinomial) above which +junctions are filtered} + +\item{aggregate}{If TRUE the returned object is aggregated to the feature +level (i.e. gene level).} + +\item{collapse}{Only takes effect if \code{aggregate=TRUE}. 
+If TRUE, collapses results across the different psi +types to return only one row per feature (gene) and sample.} + \item{minCount}{The minimum count value of the total coverage of an intron to be considered as significant. result} \item{psiType}{The psi types for which the results should be retrieved.} +\item{geneColumn}{The column name of the column that has the gene annotation +that will be used for gene-level pvalue computation.} + +\item{all}{By default FALSE, only significant introns (or genes) are listed +in the results. If TRUE, results are assembled for all +samples and introns/genes regardless of significance.} + +\item{returnTranscriptomewideResults}{If FDR corrected pvalues for subsets +of genes of interest have been calculated, this parameter +indicates whether additionally the transcriptome-wide results +should be returned as well (default), or whether only results +for those subsets should be retrieved.} + \item{additionalColumns}{Character vector containing the names of additional columns from mcols(fds) that should appear in the result table (e.g. ensembl_gene_id). Default is \code{NULL}, so no additional columns @@ -58,26 +84,15 @@ are included.} \item{BPPARAM}{The BiocParallel parameter.} -\item{...}{Further arguments can be passed to the method. If "zscores", -"padjVals" or "dPsi" is given, the values of those arguments -are used to define the aberrant events.} - -\item{res}{Result as created with \code{results()}} - -\item{geneColumn}{The name of the column in \code{mcols(res)} that contains -the gene symbols.} - -\item{method}{The p.adjust method that is being used to adjust p values per -sample.} - \item{type}{Splicing type (psi5, psi3 or theta)} \item{by}{By default \code{none} which means no grouping. But if \code{sample} or \code{feature} is specified the sum by sample or feature is returned} -\item{aggregate}{If TRUE the returned object is based on the grouped -features} +\item{...}{Further arguments can be passed to the method. If "n", +"padjVals", "dPsi" or "rhoVals" are given, the values of those +arguments are used to define the aberrant events.} } \value{ For \code{results}: GRanges object containing significant results. @@ -95,24 +110,36 @@ aberrant splicing events based on the given cutoffs. 
# get data, fit and compute p-values and z-scores fds <- createTestFraserDataSet() -# extract results: for this example dataset, z score cutoff of 2 is used to -# get at least one result and show the output -res <- results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05) +# extract results: for this example dataset, no cutoffs are used to +# show the output of the results function +res <- results(fds, all=TRUE) res # aggregate the results by genes (gene symbols need to be annotated first # using annotateRanges() function) -resultsByGenes(res) +results(fds, padjCutoff=NA, deltaPsiCutoff=0.1, aggregate=TRUE) + +# aggregate the results by genes and collapse over all psi types to obtain +# only one row per gene in the results table +results(fds, padjCutoff=NA, deltaPsiCutoff=0.1, aggregate=TRUE, + collapse=TRUE) # get aberrant events per sample: on the example data, nothing is aberrant # based on the adjusted p-value -aberrant(fds, type="psi5", by="sample") +aberrant(fds, type="jaccard", by="sample") # get aberrant events per gene (first annotate gene symbols) fds <- annotateRangesWithTxDb(fds) -aberrant(fds, type="psi5", by="feature", zScoreCutoff=2, padjCutoff=NA, - aggregate=TRUE) +aberrant(fds, type="jaccard", by="feature", padjCutoff=NA, aggregate=TRUE) # find aberrant junctions/splice sites -aberrant(fds, type="psi5") +aberrant(fds, type="jaccard") + +# retrieve results limiting FDR correction to only a subset of genes +# first, we need to create a list of genes per sample that will be tested +geneList <- list('sample1'=c("TIMMDC1"), 'sample2'=c("MCOLN1")) +fds <- calculatePadjValues(fds, type="jaccard", + subsets=list("exampleSubset"=geneList)) +results(fds, all=TRUE, returnTranscriptomewideResults=FALSE) + } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 461280c0..b03ef489 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -152,6 +152,21 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// truncNLL_rho_penalized +double truncNLL_rho_penalized(double logit_rho, arma::vec yi, arma::vec ki, arma::vec ni, double lambda); +RcppExport SEXP _FRASER_truncNLL_rho_penalized(SEXP logit_rhoSEXP, SEXP yiSEXP, SEXP kiSEXP, SEXP niSEXP, SEXP lambdaSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< double >::type logit_rho(logit_rhoSEXP); + Rcpp::traits::input_parameter< arma::vec >::type yi(yiSEXP); + Rcpp::traits::input_parameter< arma::vec >::type ki(kiSEXP); + Rcpp::traits::input_parameter< arma::vec >::type ni(niSEXP); + Rcpp::traits::input_parameter< double >::type lambda(lambdaSEXP); + rcpp_result_gen = Rcpp::wrap(truncNLL_rho_penalized(logit_rho, yi, ki, ni, lambda)); + return rcpp_result_gen; +END_RCPP +} // fullNLL arma::vec fullNLL(arma::mat y, arma::mat rho, arma::mat k, arma::mat n, arma::mat D, double lambda, bool byRows); RcppExport SEXP _FRASER_fullNLL(SEXP ySEXP, SEXP rhoSEXP, SEXP kSEXP, SEXP nSEXP, SEXP DSEXP, SEXP lambdaSEXP, SEXP byRowsSEXP) { @@ -215,6 +230,7 @@ static const R_CallMethodDef CallEntries[] = { {"_FRASER_truncNLL_e", (DL_FUNC) &_FRASER_truncNLL_e, 7}, {"_FRASER_truncGrad_e", (DL_FUNC) &_FRASER_truncGrad_e, 7}, {"_FRASER_truncNLL_rho", (DL_FUNC) &_FRASER_truncNLL_rho, 4}, + {"_FRASER_truncNLL_rho_penalized", (DL_FUNC) &_FRASER_truncNLL_rho_penalized, 5}, {"_FRASER_fullNLL", (DL_FUNC) &_FRASER_fullNLL, 7}, {"_FRASER_truncWeightedNLL_db", (DL_FUNC) &_FRASER_truncWeightedNLL_db, 7}, {"_FRASER_truncWeightedGrad_db", (DL_FUNC) &_FRASER_truncWeightedGrad_db, 7}, diff 
--git a/src/loss_n_gradient_functions.cpp b/src/loss_n_gradient_functions.cpp index 07eadcca..0078a270 100644 --- a/src/loss_n_gradient_functions.cpp +++ b/src/loss_n_gradient_functions.cpp @@ -6,7 +6,7 @@ using namespace Rcpp; const double MAX_EXP_VALUE = 700; -double PSEUDO_COUNT = 1; +double PSEUDO_COUNT = 0; // [[Rcpp::export(.setPseudoCount)]] double setPseudoCount(double pseudoCount){ @@ -142,6 +142,12 @@ double truncNLL_db(arma::vec par, arma::mat H, arma::vec k, arma::vec n, double infPosB = arma::find_nonfinite(beta); // beta.elem( infPosB ) = abs.elem( infPosB ); beta.elem( infPosB ) = estLgammaBeta(y, infPosB, rhob); + + arma::uvec infPosA2, infPosB2; + infPosA2 = arma::find_nonfinite(alphaK); + alphaK.elem( infPosA2 ) = estLgammaAlpha(y, infPosA2, rhoa); + infPosB2 = arma::find_nonfinite(betaNK); + betaNK.elem( infPosB2 ) = estLgammaBeta(y, infPosB2, rhob); nll = arma::accu(alpha + beta - alphaK - betaNK)/k.n_elem; @@ -266,6 +272,12 @@ double truncNLL_e(arma::vec par, arma::mat x, arma::mat D, arma::vec b, beta.elem( infPosB ) = abs.elem( infPosB ); // beta.elem( infPosB ) = estLgammaBeta(y, infPosB, rhob); + arma::uvec infPosA2, infPosB2; + infPosA2 = arma::find_nonfinite(alphaK); + alphaK.elem( infPosA2 ) = abs.elem( infPosA2 ); + infPosB2 = arma::find_nonfinite(betaNK); + betaNK.elem( infPosB2 ) = abs.elem( infPosB2 ); + nll = arma::accu(alpha + beta - alphaK - betaNK)/k.n_elem; return arma::as_scalar(nll); @@ -361,11 +373,58 @@ double truncNLL_rho(double rho, arma::vec yi, arma::vec ki, arma::vec ni){ // beta.elem( infPosB ) = abs.elem( infPosB ); beta.elem( infPosB ) = estLgammaBeta(yi, infPosB, rhob); + arma::uvec infPosA2, infPosB2; + infPosA2 = arma::find_nonfinite(alphaK); + alphaK.elem( infPosA2 ) = estLgammaAlpha(yi, infPosA2, rhoa); + infPosB2 = arma::find_nonfinite(betaNK); + betaNK.elem( infPosB2 ) = estLgammaBeta(yi, infPosB2, rhob); + + nll = arma::accu(alpha + beta - alphaK - betaNK + alphaBeta)/ki.n_elem; return arma::as_scalar(nll); } +// [[Rcpp::export()]] +double truncNLL_rho_penalized(double logit_rho, arma::vec yi, arma::vec ki, arma::vec ni, double lambda){ + arma::vec mui, u, alpha, alphaK, beta, betaNK, alphaBeta, nll; + double rho, rhoa, rhob; + + rho = exp(logit_rho)/(1 + exp(logit_rho)); + rhoa = (1 - rho)/rho; + rhob = (rho - 1)/rho; + mui = predictMuCpp(yi); + u = (mui-1) * rhob; + + alpha = arma::lgamma(mui * rhoa); + alphaK = arma::lgamma(mui * rhoa + ki + PSEUDO_COUNT); + beta = arma::lgamma(u); + betaNK = arma::lgamma(u + ni - ki + PSEUDO_COUNT); + alphaBeta = arma::lgamma(rhoa + ni + (2*PSEUDO_COUNT)) - lgamma(rhoa); + + // arma::vec abs; + arma::uvec infPosA, infPosB; + // abs = arma::abs(yi); + infPosA = arma::find_nonfinite(alpha); + // alpha.elem( infPosA ) = abs.elem( infPosA ); + alpha.elem( infPosA ) = estLgammaAlpha(yi, infPosA, rhoa); + infPosB = arma::find_nonfinite(beta); + // beta.elem( infPosB ) = abs.elem( infPosB ); + beta.elem( infPosB ) = estLgammaBeta(yi, infPosB, rhob); + + arma::uvec infPosA2, infPosB2; + infPosA2 = arma::find_nonfinite(alphaK); + alphaK.elem( infPosA2 ) = estLgammaAlpha(yi, infPosA2, rhoa); + infPosB2 = arma::find_nonfinite(betaNK); + betaNK.elem( infPosB2 ) = estLgammaBeta(yi, infPosB2, rhob); + + + nll = arma::accu(alpha + beta - alphaK - betaNK + alphaBeta)/ki.n_elem; + nll = nll + lambda * (logit_rho*logit_rho); + + return arma::as_scalar(nll); +} + // [[Rcpp::export()]] arma::vec fullNLL(arma::mat y, arma::mat rho, arma::mat k, arma::mat n, arma::mat D, double lambda, bool byRows=false){ arma::mat 
rhoa, rhob; @@ -395,6 +454,11 @@ arma::vec fullNLL(arma::mat y, arma::mat rho, arma::mat k, arma::mat n, arma::ma infPosB = arma::find_nonfinite(beta); beta.elem( infPosB ) = abs.elem( infPosB ); // beta.elem( infPosB ) = estLgammaBeta(y, infPosB, rhob); + arma::uvec infPosA2, infPosB2; + infPosA2 = arma::find_nonfinite(alphaK); + alphaK.elem( infPosA2 ) = abs.elem( infPosA2 ); + infPosB2 = arma::find_nonfinite(betaNK); + betaNK.elem( infPosB2 ) = abs.elem( infPosB2 ); if(byRows){ nll = rowMeans(alpha + beta - alphaK - betaNK + nonTruncTerms); @@ -471,6 +535,12 @@ double truncWeightedNLL_db(arma::vec par, arma::mat H, arma::vec k, arma::vec n, // beta.elem( infPosB ) = abs.elem( infPosB ); beta.elem( infPosB ) = estLgammaBeta(y, infPosB, rhob); + arma::uvec infPosA2, infPosB2; + infPosA2 = arma::find_nonfinite(alphaK); + alphaK.elem( infPosA2 ) = estLgammaAlpha(y, infPosA2, rhoa); + infPosB2 = arma::find_nonfinite(betaNK); + betaNK.elem( infPosB2 ) = estLgammaBeta(y, infPosB2, rhob); + nll = arma::accu((alpha + beta - alphaK - betaNK)%w)/k.n_elem; nll = nll + (lambda/k.n_elem) * arma::accu(d % d); diff --git a/tests/testthat/test_counting.R b/tests/testthat/test_counting.R index 4fcd0d61..1bdbfc37 100644 --- a/tests/testthat/test_counting.R +++ b/tests/testthat/test_counting.R @@ -51,7 +51,7 @@ test_that("test minAnchor", { "sample3", features, fds, minAnchor=25, recount=TRUE)) }) expect_equivalent(c(7, 8, 0, 0, 7), ctnNS5[,1]) - expect_equivalent(c(5, 8, 0, 0, 6), ctnNS25[,1]) + expect_equivalent(c(5, 8, 0, 0, 7), ctnNS25[,1]) }) test_that("Test psi values", { diff --git a/tests/testthat/test_fraser_pipeline.R b/tests/testthat/test_fraser_pipeline.R index 069a217d..45c9b2ff 100644 --- a/tests/testthat/test_fraser_pipeline.R +++ b/tests/testthat/test_fraser_pipeline.R @@ -4,8 +4,8 @@ test_that("FRASER function", { fds <- createTestFraserDataSet() expect_is(fds, "FraserDataSet") anames <- c(psiTypes, paste0(c("delta", "predictedMeans", - "pvaluesBetaBinomial", "padjBetaBinomial", "zScores"), "_", - rep(psiTypes, 5))) + "pvaluesBetaBinomial", "padjBetaBinomial"), "_", + rep(fitMetrics(fds), 5))) expect_equal(anames %in% assayNames(fds), !logical(length(anames))) }) diff --git a/tests/testthat/test_hyperParams.R b/tests/testthat/test_hyperParams.R index 26331409..fd8a961b 100644 --- a/tests/testthat/test_hyperParams.R +++ b/tests/testthat/test_hyperParams.R @@ -2,6 +2,7 @@ context("Test hyper param optimization") test_that("Test hyper param testing", { fds <- makeSimulatedFraserDataSet(m=15, j=20, dist="BB") + fds <- calculatePSIValues(fds) # test BB no hyper params and accessors fds <- optimHyperParams(fds, type="psi3", implementation="BB") diff --git a/tests/testthat/test_plotJunctionDist.R b/tests/testthat/test_plotJunctionDist.R index 85a4b9d3..43022359 100644 --- a/tests/testthat/test_plotJunctionDist.R +++ b/tests/testthat/test_plotJunctionDist.R @@ -3,15 +3,15 @@ context("Test distribution plots for given results/junction") test_that("Main junction distribution plot", { # get results fds <- getFraser() - res <- results(fds, padjCutoff=1, zScoreCutoff=NA, deltaPsiCutoff=NA) + res <- results(fds, padjCutoff=1, deltaPsiCutoff=NA) # plot distributions expect_silent(plotExpression(fds, result=res[1])) - expect_silent(plotVolcano(fds, "sample1", "psi5")) + expect_silent(plotVolcano(fds, "sample1", "jaccard")) expect_silent(plotExpectedVsObservedPsi(fds, result=res[2])) - expect_is(plotCountCorHeatmap(fds, "psi5", norm=FALSE), "pheatmap") - expect_is(plotCountCorHeatmap(fds, "psi5", 
norm=TRUE), "pheatmap") - expect_is(plotCountCorHeatmap(fds, "psi5", norm=TRUE, topN=10), "pheatmap") + expect_is(plotCountCorHeatmap(fds, "jaccard", norm=FALSE), "pheatmap") + expect_is(plotCountCorHeatmap(fds, "jaccard", norm=TRUE), "pheatmap") + expect_is(plotCountCorHeatmap(fds, "jaccard", norm=TRUE, topN=10), "pheatmap") }) diff --git a/tests/testthat/test_stats.R b/tests/testthat/test_stats.R index 8e76cc19..b1eedb8c 100644 --- a/tests/testthat/test_stats.R +++ b/tests/testthat/test_stats.R @@ -17,16 +17,94 @@ test_that("PSI value calculation", { expect_true(all(N(fds, "psi5")[ is.na(psiVal)] == 0)) }) -test_that("Zscore calculation", { - fds <- getFraser(clean = TRUE) +# test_that("Zscore calculation", { +# fds <- getFraser(clean = TRUE) +# +# # prepare zScore input for logit scale +# psiVal <- (K(fds, "jaccard") + pseudocount())/(N(fds, "jaccard") + 2*pseudocount()) +# mu <- predictedMeans(fds, "jaccard") +# residual <- qlogis(psiVal) - qlogis(mu) +# +# # compute zscore +# zscores <- (residual - rowMeans(residual)) / rowSds(residual) +# +# expect_equal(zscores, zScores(fds, "jaccard")) +# }) + +test_that("Gene p value calculation with NAs", { + fds <- getFraser() + fds <- fds[15:24,] + mcols(fds, type="j")$hgnc_symbol <- rep(c("geneA", "geneB", "geneC"), + times=c(3, 4, 3)) + mcols(fds, type="ss")$hgnc_symbol <- rep(c("geneA", "geneB", "geneC"), + times=c(4, 6, 4)) + + # simulate junction with bad rho fit to create partly NAs + rho <- rho(fds, type="jaccard") + rho[c(1, 4:7)] <- 0.5 + rho(fds, type="jaccard") <- rho + + # calc p values + fds <- calculatePadjValues(fds, type="jaccard", rhoCutoff=0.1) + + # check dimension of junction-, site- and gene-level pval matrices + expect_equal(nrow(pVals(fds, type="jaccard", level="junction")), nrow(fds)) + expect_equal(nrow(pVals(fds, type="jaccard", level="site", + filters=list(rho=0.1))), nrow(fds)) + expect_equal(nrow(pVals(fds, type="jaccard", level="gene", + filters=list(rho=0.1))), 3) + + # check jaccard pvals are partly NAs + expect_true(all(is.na(pVals(fds, type="jaccard", level="site", + filters=list(rho=0.1))[4:7,]))) + expect_true(all(is.na(pVals(fds, type="jaccard", level="gene", + filters=list(rho=0.1))["geneB",]))) + expect_true(all(is.na(padjVals(fds, type="jaccard", level="site", + filters=list(rho=0.1))[4:7,]))) + expect_true(all(is.na(padjVals(fds, type="jaccard", level="gene", + filters=list(rho=0.1))["geneB",]))) - # prepare zScore input for logit scale - psiVal <- (K(fds, "psi5") + pseudocount())/(N(fds, "psi5") + 2*pseudocount()) - mu <- predictedMeans(fds, "psi5") - residual <- qlogis(psiVal) - qlogis(mu) + # simulate junction with bad rho fit to create partly NAs + rho <- rho(fds, type="jaccard") + rho <- rep(0.5, length(rho)) + rho(fds, type="jaccard") <- rho + fds <- calculatePadjValues(fds, type="jaccard", rhoCutoff=0.1) + + # check jaccard pvals are all NAs + expect_true(all(is.na(pVals(fds, type="jaccard", level="site", + filters=list(rho=0.1))))) + expect_true(all(is.na(pVals(fds, type="jaccard", level="gene", + filters=list(rho=0.1))))) + expect_true(all(is.na(padjVals(fds, type="jaccard", level="site", + filters=list(rho=0.1))))) + expect_true(all(is.na(padjVals(fds, type="jaccard", level="gene", + filters=list(rho=0.1))))) +}) + +test_that("FDR on subset of genes", { + fds <- getFraser() + mcols(fds, type="j")$hgnc_symbol <- + rep(c("geneA", "geneB", "geneC", "geneD", "geneE"), + times=c(3, 7, 5, 4, 8)) - # compute zscore - zscores <- (residual - rowMeans(residual)) / rowSds(residual) + # define gene subset 
per sample + genes_per_sample <- list( + "sample1" = c("geneE", "geneC", "geneA"), + "sample2" = c("geneB"), + "sample3" = c("geneA", "geneB", "geneC", "geneD") + ) - expect_equal(zscores, zScores(fds, "psi5")) + subsetName <- "subset_test" + fds <- calculatePadjValuesOnSubset(fds, genesToTest=genes_per_sample, + subsetName=subsetName, type="jaccard") + subset_padj <- padjVals(fds, type="jaccard", subsetName=subsetName) + expect_true(is(subset_padj, "matrix")) + expect_true(nrow(subset_padj) == 27) + expect_true(ncol(subset_padj) == 3) + subset_padj_gene <- padjVals(fds, type="jaccard", level="gene", + subsetName=subsetName) + expect_true(is(subset_padj_gene, "matrix")) + expect_true(nrow(subset_padj_gene) == 5) + expect_true(ncol(subset_padj_gene) == 3) + }) diff --git a/vignettes/FRASER.Rnw b/vignettes/FRASER.Rnw index 11a7a599..70e80f21 100644 --- a/vignettes/FRASER.Rnw +++ b/vignettes/FRASER.Rnw @@ -1,4 +1,4 @@ -%\VignetteIndexEntry{FRASER: Find RAre Splicing Evens in RNA-seq Data} +%\VignetteIndexEntry{FRASER: Find RAre Splicing Events in RNA-seq Data} %\VignettePackage{FRASER} %\VignetteEngine{knitr::knitr} %\VignetteEncoding{UTF-8} @@ -36,7 +36,7 @@ opts_chunk$set( \newcommand{\fraser}{\Biocpkg{FRASER}} \newcommand{\fds}{\Rclass{FraserDataSet}} -\title{FRASER: Find RAre Splicing Events in RNA-seq} +\title{FRASER: Find RAre Splicing Events in RNA-seq Data} \author{ Christian Mertes$^{1}$, Ines Scheller$^{1}$, Julien Gagneur$^{1}$ \\ @@ -175,6 +175,32 @@ intron-exon boundary of acceptor A. While we calculate $\theta$ for the 5' and between $\theta_5$ and $\theta_3$ and hence call it jointly $\theta$ in the following. +From \fraser{} 2.0 on, only a single metric - the Intron Jaccard Index (Figure +\ref{IntronJaccardIndex_sketch}) - is used by default. The Intron Jaccard +Index is more robust and allows to focus more on functionally relevant +aberrant splicing events. It allows to detect all types of aberrant splicing +previously detected using the three metrics ($\psi_5$, $\psi_3$, $\theta$) +within a single metric. + +\incfig{IntronJaccardIndex_sketch}{1\textwidth}{Overview over the Intron +Jaccard Index, the splice metric used in \fraser{}2.}{ +The Intron Jaccard Index considers both split and nonsplit reads within a +single metric and allows to detect all different types of aberrant splicing +previously captured with either of the metrics $\psi_5$, $\psi_3$, $\theta$. 
+} + +The Intron Jaccard Index considers both split and nonsplit reads and is +defined as the jaccard index of the set of donor reads (reads sharing a donor +site with the intron of interest and nonsplit reads at that donor site) and +acceptor reads (reads sharing an acceptor site with the intron of interest and +nonsplit reads at that acceptor site): + +\begin{equation} + J(D,A) = \frac{n(D,A)}{\sum_{A'} n(D,A') + \sum_{D'} n(D',A) + n(D) + n(A) - n(D,A)} + \label{eq:jaccard} +\end{equation} + + \section{Quick guide to \fraser{}} Here we quickly show how to do an analysis with \fraser{}, starting from a @@ -204,10 +230,6 @@ fds <- calculatePSIValues(fds) fds <- filterExpressionAndVariability(fds, minExpressionInOneSample=20, minDeltaPsi=0.0, filter=TRUE) -# fit the splicing model for each metric -# with a specific latentsapce dimension -fds <- FRASER(fds, q=c(psi5=2, psi3=3, theta=3)) - # we provide two ways to anntoate introns with the corresponding gene symbols: # the first way uses TxDb-objects provided by the user as shown here library(TxDb.Hsapiens.UCSC.hg19.knownGene) @@ -216,17 +238,21 @@ txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene orgDb <- org.Hs.eg.db fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb) +# fit the splicing model for each metric +# with a specific latentspace dimension +fds <- FRASER(fds, q=c(jaccard=2)) + # alternatively, we also provide a way to use biomart for the annotation: # fds <- annotateRanges(fds) # get results: we recommend to use an FDR cutoff 0.05, but due to the small # dataset size we extract all events and their associated values -# eg: res <- results(fds, zScoreCutoff=NA, padjCutoff=0.05, deltaPsiCutoff=0.3) -res <- results(fds, zScoreCutoff=NA, padjCutoff=NA, deltaPsiCutoff=NA) +# eg: res <- results(fds, padjCutoff=0.05, deltaPsiCutoff=0.1) +res <- results(fds, all=TRUE) res # result visualization -plotVolcano(fds, sampleID="sample1", type="psi5", aggregate=TRUE) +plotVolcano(fds, sampleID="sample1", type="jaccard", aggregate=TRUE) @ @@ -386,9 +412,9 @@ be processed with the same protocol and origin from the same tissue. \label{sec:filtering} Before we can filter the data, we have to compute the main splicing metric: -the $\psi$-value (Percent Spliced In). +the $\psi$-value (Percent Spliced In) and the Intron Jaccard Index. -<>= +<>= fds <- calculatePSIValues(fds) fds @ @@ -439,7 +465,7 @@ transformed $\psi$ values to compute the correlation. <>= # Heatmap of the sample correlation -plotCountCorHeatmap(fds, type="psi5", logit=TRUE, normalized=FALSE) +plotCountCorHeatmap(fds, type="jaccard", logit=TRUE, normalized=FALSE) @ It is also possible to visualize the correlation structure of the logit @@ -447,7 +473,7 @@ transformed $\psi$ values of the $topJ$ most variable introns for all samples: <>= # Heatmap of the intron/sample expression -plotCountCorHeatmap(fds, type="psi5", logit=TRUE, normalized=FALSE, +plotCountCorHeatmap(fds, type="jaccard", logit=TRUE, normalized=FALSE, plotType="junctionSample", topJ=100, minDeltaPsi = 0.01) @ @@ -470,14 +496,14 @@ p-values and z-scores for all $\psi$ types. For more details see section <>= # This is computational heavy on real size datasets and can take awhile -fds <- FRASER(fds, q=c(psi5=3, psi3=5, theta=2)) +fds <- FRASER(fds, q=c(jaccard=3)) @ To check whether the correction worked, we can have a look at the correlation heatmap using the normalized $\psi$ values from the fit. 
<>= -plotCountCorHeatmap(fds, type="psi5", normalized=TRUE, logit=TRUE) +plotCountCorHeatmap(fds, type="jaccard", normalized=TRUE, logit=TRUE) @ \subsubsection{Calling splicing outliers} @@ -519,34 +545,58 @@ the following additional information: \item hgncSymbol: the gene symbol of the gene that contains the splice junction or site if available \item type: the metric for which the aberrant event was detected (either - psi5 for $\psi_5$, psi3 for $\psi_3$ or theta for $\theta$) - \item pValue, padjust, zScore: the p-value, adjusted p-value and z-score of - this event + jaccard for Intron Jaccard Index or psi5 for $\psi_5$, psi3 for $\psi_3$ or + theta for $\theta$) + \item pValue, padjust: the p-value and adjusted p-value (FDR) of + this event (at intron or splice site level depending on metric) + \item pValueGene, padjustGene: only present in the gene-level results table, + gives the p-value and FDR adjusted p-value at gene-level \item psiValue: the value of $\psi_5$, $\psi_3$ or $\theta$ metric (depending on the type column) of this junction or splice site for the sample in which it is detected as aberrant \item deltaPsi: the $\Delta\psi$-value of the event in this sample, which is the difference between the actual observed $\psi$ and the expected $\psi$ + \item counts, totalCounts: the count (k) and total count (n) of the splice + junction or site for the sample where it is detected as aberrant \item meanCounts: the mean count (k) of reads mapping to this splice junction or site over all samples \item meanTotalCounts: the mean total count (n) of reads mapping to the same donor or acceptor site as this junction or site over all samples - \item counts, totalCounts: the count (k) and total count (n) of the splice - junction or site for the sample where it is detected as aberrant + \item nonsplitCounts, nonsplitProportion: only present for the Intron + Jaccard Index. States the sum of nonsplit counts overlapping either the + donor or acceptor site of the outlier intron for the sample where it is + detected as aberrant; and their proportion out of the total counts (N). + A high nonsplitProportion indicates possible (partial) intron retention. + \item FDR\_set The set of genes on which FDR correction is applied. If not + otherwise specified, FDR correction is transcriptome-wide. \end{itemize} Please refer to section \ref{sec:Introduction} for more information about the -metrics $\psi_5$, $\psi_3$ and $\theta$ and their definition. In general, an +Intron Jaccard Index metric (or the previous metrics $\psi_5$, $\psi_3$ and +$\theta$) and their definition. In general, an aberrant $\psi_5$ value might indicate aberrant acceptor site usage of the junction where the event is detected; an aberrant $\psi_3$ value might indicate aberrant donor site usage of the junction where the event is detected; and an aberrant $\theta$ value might indicate partial or full intron retention, or -exon truncation or elongation. We recommend using a genome browser to -investigate interesting detected events in more detail. +exon truncation or elongation. As the Intron Jaccard Index combines the +previously described metrics, an aberrant Intron Jaccard value can indicate any +of the above described cases. We recommend using a genome browser to +investigate interesting detected events in more detail. \fraser{}2 also +provides the function \Rfunction{plotBamCoverageFromResultTable} to create a +sashimi plot for an outlier in the results table directly in R (if paths to +bam files are available in the \fds{} object). 
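A minimal R sketch of how the result columns described above can be used downstream: one can rank Intron Jaccard Index outliers by their nonsplitProportion to prioritize likely (partial) intron retention events. The 0.5 cutoff and the object names below are only illustrative choices, not package defaults.

library(data.table)
# convert the results GRanges into a data.table for easy filtering
res_dt <- as.data.table(results(fds, all=TRUE))
# keep Intron Jaccard Index outliers supported mostly by nonsplit reads
# (0.5 is an arbitrary illustrative cutoff)
retention_candidates <- res_dt[type == "jaccard" & nonsplitProportion > 0.5]
retention_candidates[, .(sampleID, hgncSymbol, nonsplitCounts, nonsplitProportion)]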
<>= -# to show result visualization functions for this tuturial, zScore cutoff used -res <- results(fds, zScoreCutoff=2, padjCutoff=NA, deltaPsiCutoff=0.1) +# to show result visualization functions for this tutorial, no cutoff used +res <- results(fds, all=TRUE) res + +# for the gene level pvalues, gene symbols need to be annotated the fds object +# before calling the calculatePadjValues function (part of FRASER() function) +# as we previously called FRASER() before annotating genes, we run it again here +fds <- calculatePadjValues(fds, type="jaccard", geneLevel=TRUE) +# generate gene-level results table (if gene symbols have been annotated) +res_gene <- results(fds, aggregate=TRUE, all=TRUE) +res_gene @ \subsection{Finding splicing candidates in patients} @@ -555,7 +605,7 @@ Let's hava a look at sample 10 and check if we got some splicing candidates for this sample. <>= -plotVolcano(fds, type="psi5", "sample10") +plotVolcano(fds, type="jaccard", "sample10") @ Which are the splicing events in detail? @@ -568,8 +618,9 @@ sampleRes To have a closer look at the junction level, use the following functions: <>= -plotExpression(fds, type="psi5", result=sampleRes[1]) -plotExpectedVsObservedPsi(fds, result=sampleRes[1]) +plotExpression(fds, type="jaccard", result=sampleRes[9]) +plotSpliceMetricRank(fds, type="jaccard", result=sampleRes[9]) +plotExpectedVsObservedPsi(fds, result=sampleRes[9]) @ \subsection{Saving and loading a \fds{}} @@ -619,7 +670,7 @@ confounders in the data. Currently the following methods are implemented: # Using an alternative way to correct splicing ratios # here: only 2 iteration to speed the calculation up # for the vignette, the default is 15 iterations -fds <- fit(fds, q=3, type="psi5", implementation="PCA-BB-Decoder", +fds <- fit(fds, q=3, type="jaccard", implementation="PCA-BB-Decoder", iterations=2) @ @@ -638,17 +689,17 @@ for a subset of the dataset: <>= set.seed(42) # hyperparameter opimization -fds <- optimHyperParams(fds, type="psi5", plot=FALSE) +fds <- optimHyperParams(fds, type="jaccard", plot=FALSE) # retrieve the estimated optimal dimension of the latent space -bestQ(fds, type="psi5") +bestQ(fds, type="jaccard") @ The results from this hyper parameter optimization can be visualized with the function \Rfunction{plotEncDimSearch}. <>= -plotEncDimSearch(fds, type="psi5") +plotEncDimSearch(fds, type="jaccard") @ \subsection{P-value calculation} @@ -669,8 +720,8 @@ computed as the product of the fitted correction values from the autoencoder and the fitted mean adjustements. <>= -fds <- calculatePvalues(fds, type="psi5") -head(pVals(fds, type="psi5")) +fds <- calculatePvalues(fds, type="jaccard") +head(pVals(fds, type="jaccard")) @ Afterwards, adjusted p-values can be calculated. Multiple testing correction is @@ -680,32 +731,21 @@ methods supported by \Rfunction{p.adjust} can be used via the \Robject{method} argument. <>= -fds <- calculatePadjValues(fds, type="psi5", method="BY") -head(padjVals(fds,type="psi5")) +fds <- calculatePadjValues(fds, type="jaccard", method="BY") +head(padjVals(fds,type="jaccard")) @ -\subsection{Z-score calculation} -\label{sec:Z-score-calculation} - -To calculate z-scores on the logit transformed $\Delta\psi$ values and to store -them in the \fds{} object, the function \Rfunction{calculateZScores} can be -called. The Z-scores can be used for visualization, filtering, and ranking of -samples. 
The Z-scores are calculated as follows: - -\begin{equation} - z_{ij} = \frac{\delta_{ij} - \bar{\delta_j}}{sd(\delta_j)} -\end{equation} -\begin{equation*} - \delta_{ij} = logit{(\frac{k_{ij} + 1}{n_{ij} + 2})} - logit{(\mu_{ij})}, -\end{equation*} - -where $\delta_{ij}$ is the difference on the logit scale between the measured -counts and the counts after correction for confounders and $\bar{\delta_j}$ is -the mean of intron $j$. +With FRASER 2.0 we introduce the option to limit FDR correction to a subset of +genes based on prior knowledge, e.g. genes that contain a rare variant per +sample. To use this option, provide a list of genes per sample during FDR +computation: -<>= -fds <- calculateZscore(fds, type="psi5") -head(zScores(fds, type="psi5")) +<>= +genesOfInterest <- list("sample1"=c("XAB2", "PNPLA6", "STXBP2", "ARHGEF18"), + "sample2"=c("ARHGEF18", "TRAPPC5")) +fds <- calculatePadjValues(fds, type="jaccard", + subsets=list("exampleSubset"=genesOfInterest)) +head(padjVals(fds, type="jaccard", subsetName="exampleSubset")) @ \subsection{Result visualization} @@ -713,6 +753,7 @@ head(zScores(fds, type="psi5")) In addition to the plotting methods \Rfunction{plotVolcano}, \Rfunction{plotExpression}, \Rfunction{plotExpectedVsObservedPsi}, +\Rfunction{plotSpliceMetricRank}, \Rfunction{plotFilterExpression} and \Rfunction{plotEncDimSearch} used above, the \fraser{} package provides two additional functions to visualize the results: @@ -731,6 +772,54 @@ plotQQ(fds, result=res[1]) plotQQ(fds, aggregate=TRUE, global=TRUE) @ +The \Rfunction{plotManhattan} function can be used to visualize the p-values +along with the genomic coordinates of the introns: +<>= +plotManhattan(fds, sampleID="sample10") +plotManhattan(fds, sampleID="sample10", chr="chr19") +@ + +Finally, when one has access to the bam files from which the split and unsplit +counts of FRASER were created, the \Rfunction{plotBamCoverage} and +\Rfunction{plotBamCoverageFromResultTable} functions use the \Rpackage{SGSeq} +package to allow visualizing the read coverage in the bam file a certain intron +from the results table or within a given genomic region as a sashimi plot: +<>= +### plot coverage from bam file for a certain genomic region +fds <- createTestFraserSettings() +vizRange <- GRanges(seqnames="chr19", + IRanges(start=7587496, end=7598895), + strand="+") +plotBamCoverage(fds, gr=vizRange, sampleID="sample3", + control_samples="sample2", min_junction_count=5, + curvature_splicegraph=1, curvature_coverage=1, + mar=c(1, 7, 0.1, 3)) + +### plot coverage from bam file for a row in the result table +fds <- createTestFraserDataSet() + +# load gene annotation +require(TxDb.Hsapiens.UCSC.hg19.knownGene) +txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene +require(org.Hs.eg.db) +orgDb <- org.Hs.eg.db + +# get results table +res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA) +res_dt <- as.data.table(res) +res_dt <- res_dt[sampleID == "sample2",] + +# plot full range of gene highlighting the outlier intron +plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=TRUE, + txdb=txdb, orgDb=orgDb, control_samples="sample3") + +# plot only certain range around the outlier intron +plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, + control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb, + curvature_coverage=0.5, right_extension=5000, left_extension=5000, + splicegraph_labels="id") +@ + \bibliography{bibliography} \section{Session Info} diff --git a/vignettes/IntronJaccardIndex_sketch.png 
b/vignettes/IntronJaccardIndex_sketch.png new file mode 100644 index 00000000..79cca6a8 Binary files /dev/null and b/vignettes/IntronJaccardIndex_sketch.png differ
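For intuition, the Intron Jaccard Index formula introduced in the vignette (equation eq:jaccard) can be worked through by hand. The following standalone R sketch uses made-up counts; all numbers are invented for illustration and are not package output.

# made-up counts for one intron with donor site D and acceptor site A
n_DA         <- 10  # split reads supporting the intron D-A itself
donor_total  <- 14  # sum over A' of n(D, A'): all split reads sharing donor D
accept_total <- 12  # sum over D' of n(D', A): all split reads sharing acceptor A
n_D          <- 3   # nonsplit reads overlapping the donor site D
n_A          <- 5   # nonsplit reads overlapping the acceptor site A

# J(D, A): reads supporting the intron relative to all reads touching
# either of its splice sites
jaccard <- n_DA / (donor_total + accept_total + n_D + n_A - n_DA)
jaccard  # 10 / 24, i.e. about 0.42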